In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import sklearn

In [2]:
# Record the exact library versions behind this run so the numbers below can
# be reproduced; one print call emits the three lines.
print(
    f'pandas=={pd.__version__}',
    f'numpy=={np.__version__}',
    f'sklearn=={sklearn.__version__}',
    sep='\n',
)

pandas==2.3.3
numpy==2.3.3
sklearn==1.6.1


In [7]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [9]:
# Load the raw Telco churn CSV; the path is relative to the notebook's
# working directory.
data_url = '../data/WA_Fn-UseC_-Telco-Customer-Churn.csv'
df = pd.read_csv(data_url)

# Normalise the headers: lowercase, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Every object-dtype column gets the same normalisation applied to its values.
categorical_columns = [
    name for name, dtype in df.dtypes.items() if dtype == 'object'
]

for c in categorical_columns:
    lowered = df[c].str.lower()
    df[c] = lowered.str.replace(' ', '_')

# Entries of totalcharges that cannot be parsed as numbers become NaN
# (errors='coerce') and are then replaced by 0.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Encode the target as an integer: 1 where churn == 'yes', otherwise 0.
df['churn'] = (df['churn'] == 'yes').astype(int)

In [10]:
# Training target: the integer-encoded churn column of the cleaned frame.
y_train = df['churn']

In [13]:
# Columns fed to the model as plain numbers.
numerical = ['tenure', 'monthlycharges', 'totalcharges']

# Columns treated as categorical features, grouped by theme; the order is
# unchanged from the original flat list.
categorical = [
    # customer profile
    'gender', 'seniorcitizen', 'partner', 'dependents',
    # phone
    'phoneservice', 'multiplelines',
    # internet and its add-ons
    'internetservice', 'onlinesecurity', 'onlinebackup',
    'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies',
    # account / billing
    'contract', 'paperlessbilling', 'paymentmethod',
]

In [14]:
# One feature dict per row, restricted to the selected feature columns.
train_dict = df.loc[:, categorical + numerical].to_dict(orient='records')

# The vectorizer learns the feature vocabulary from the training dicts and
# turns them into the design matrix in a single step.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dict)

# Fit the churn classifier on the vectorised features.
model = LogisticRegression(solver='liblinear')
model.fit(X_train, y_train)

In [15]:
# A single hand-written customer record used to smoke-test predictions.
# Keys, values and insertion order are the same as in the flat listing.
datapoint = {
    # customer profile
    'gender': 'female', 'seniorcitizen': 0, 'partner': 'yes', 'dependents': 'no',
    # phone
    'phoneservice': 'no', 'multiplelines': 'no_phone_service',
    # internet and its add-ons
    'internetservice': 'dsl',
    'onlinesecurity': 'no', 'onlinebackup': 'yes',
    'deviceprotection': 'no', 'techsupport': 'no',
    'streamingtv': 'no', 'streamingmovies': 'no',
    # account / billing
    'contract': 'month-to-month',
    'paperlessbilling': 'yes',
    'paymentmethod': 'electronic_check',
    # numeric features
    'tenure': 1, 'monthlycharges': 29.85, 'totalcharges': 29.85,
}

In [16]:
# Vectorise the single record with the already-fitted vectorizer; passing a
# one-element list is equivalent to passing the bare mapping.
X = dv.transform([datapoint])


In [17]:
# Probability of the second class column (index 1) for the first and only row.
model.predict_proba(X)[0][1]


np.float64(0.6638159290212889)

In [18]:
import pickle


In [19]:
# Make sure the target directory exists first: open(..., 'wb') raises
# FileNotFoundError when './model/' is missing (e.g. on a fresh checkout).
Path('./model').mkdir(parents=True, exist_ok=True)

# Persist the fitted vectorizer and classifier together as one tuple so they
# are always loaded as a matching pair.
with open('./model/model_test.bin', 'wb') as f_out:
    pickle.dump((dv, model), f_out)

In [20]:
# Read the (vectorizer, classifier) pair back from disk, rebinding both names.
# pickle can execute arbitrary code on load, so only open files you trust.
with open('./model/model_test.bin', 'rb') as f_in:
    dv, model = pickle.load(f_in)

In [21]:
from sklearn.pipeline import make_pipeline


In [22]:
# Same two steps as the manual dv + model flow, bundled into a single
# estimator: vectorise the feature dicts, then classify.
pipeline = make_pipeline(DictVectorizer(), LogisticRegression(solver='liblinear'))

In [23]:
# Fit both pipeline steps directly on the raw feature dicts and the target.
pipeline.fit(X=train_dict, y=y_train)


In [24]:
# Score the single record through the whole pipeline; a one-element list is
# equivalent to passing the bare mapping. Take row 0, class column 1.
pipeline.predict_proba([datapoint])[0, 1]


np.float64(0.6638159290212889)

In [25]:
import requests


In [28]:
# Endpoint of the prediction service, expected to be running locally.
url = 'http://localhost:8000/predict'

# NOTE(review): these are lead-scoring style fields, not the churn features
# (`categorical` + `numerical`) this notebook trains on, while the recorded
# response below is a churn prediction. Confirm which model the service at
# `url` serves and send a matching payload (e.g. `datapoint`) if it is the
# churn model.
customer = {
    "lead_source": "organic_search",
    "number_of_courses_viewed": 4,
    "annual_income": 80304.0,
}

# requests has no default timeout, so without one this cell would hang
# indefinitely if the service is down or unresponsive.
response = requests.post(url, json=customer, timeout=10)

# Fail here with a clear HTTPError on a 4xx/5xx reply instead of later while
# decoding the body.
response.raise_for_status()

In [30]:
# Decode the JSON body of the service reply and show it as the cell output.
# NOTE(review): the recorded output spells the key 'churn_probaility'; that
# spelling comes from the service, so client code must use it as-is until
# the service is fixed.
response.json()

{'churn_probaility': 0.534, 'churn': True}