In [11]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

### Data Preparation

In [12]:
# Read the raw Telco churn table and lowercase every column name so later
# cells can address columns without worrying about capitalisation.
df = pd.read_csv('../../Datasets/telco/telco.csv')
df.columns = [name.lower() for name in df.columns]
df.head(3)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [13]:
# Normalise every text (object-dtype) column so category values are uniform:
# lowercase, with spaces replaced by underscores.
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

# totalcharges is loaded as object dtype; entries that cannot be parsed as
# numbers become NaN (errors='coerce') and are then replaced with 0.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce').fillna(0)

# Encode the target as 1 (churned) / 0 (did not churn).
# Guarded so the cell is idempotent: once churn is already integer-coded,
# comparing it with 'yes' again would silently turn every label into 0.
if not pd.api.types.is_integer_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype(int)

In [26]:
# model
from sklearn.linear_model import LogisticRegression

# for calculating the accuracy
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error
from sklearn.metrics import auc, roc_auc_score

# for splitting the data
from sklearn.model_selection import train_test_split

# 60/20/20 split: hold out 20% for testing, then take 25% of the remaining
# 80% (i.e. 20% of the whole) for validation.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Replace the shuffled original row labels with a clean 0..n-1 index.
df_full_train, df_train, df_val, df_test = (
    part.reset_index(drop=True)
    for part in (df_full_train, df_train, df_val, df_test)
)

# df_full_train keeps its churn column (the EDA and cross-validation cells
# read it); the other three frames have the target popped out so it cannot
# end up among the features.
full_train_y = df_full_train.churn.values
train_y = df_train.pop('churn').values
val_y = df_val.pop('churn').values
test_y = df_test.pop('churn').values

In [27]:
# sanity check: row counts of the validation, test and training frames
len(df_val), len(df_test), len(df_train)

(1409, 1409, 4225)

### EDA

In [28]:
numerical_variables = ['tenure', 'monthlycharges', 'totalcharges']
# everything that is neither numerical, the target, nor the customer id
categorical_variables = [
    column
    for column in df_full_train.columns
    if column not in numerical_variables + ['churn', 'customerid']
]

# churn is coded 0/1, so its mean is the fraction of customers who churned
global_churn_rate = df_full_train.churn.mean()
global_churn_rate.round(2)

0.27

In [29]:
# Number of distinct values per categorical feature, computed on the
# train+validation frame only so the held-out test rows are not inspected
# during EDA (consistent with the other EDA cells, which use df_full_train).
df_full_train[categorical_variables].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

In [30]:
# count of missing values per column in the train+validation frame
df_full_train.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

### Model

In [31]:
# one hot encoding from sklearn
from sklearn.feature_extraction import DictVectorizer

# One dict per row, holding only the model's input features.
train_dicts = df_train[categorical_variables + numerical_variables].to_dict(orient='records')
val_dicts = df_val[categorical_variables + numerical_variables].to_dict(orient='records')

# Learn the feature mapping from the training rows only, then apply that same
# mapping to both sets. String-valued features are one-hot encoded; numeric
# values are passed through unchanged.
dv = DictVectorizer(sparse=False)
X_train = dv.fit(train_dicts).transform(train_dicts)
X_val = dv.transform(val_dicts)

In [32]:
model = LogisticRegression(max_iter=3000).fit(X_train, train_y)

# probability of the positive class (churn == 1) for every validation row,
# turned into a hard decision at the 0.5 threshold
y_pred = model.predict_proba(X_val)[:, 1]
churn_decision = y_pred >= 0.5

# side-by-side view of score, decision and ground truth
df_pred = pd.DataFrame({
    'probability': y_pred,
    'prediction': churn_decision.astype(int),
    'actual_val': val_y,
})
df_pred['correct'] = df_pred['prediction'] == df_pred['actual_val']
df_pred

Unnamed: 0,probability,prediction,actual_val,correct
0,0.006629,0,0,True
1,0.204776,0,0,True
2,0.217348,0,0,True
3,0.564584,1,1,True
4,0.218046,0,0,True
...,...,...,...,...
1404,0.293219,0,0,True
1405,0.041337,0,1,False
1406,0.151561,0,0,True
1407,0.788574,1,1,True


In [33]:
# share of validation rows whose thresholded prediction matches the label (accuracy)
df_pred.correct.mean()

0.8041163946061036

In [52]:
def train(data, y, C):
    """Fit a DictVectorizer and a logistic regression on the given rows.

    Only the columns listed in ``categorical_variables`` and
    ``numerical_variables`` are used as features.

    Parameters
    ----------
    data : DataFrame containing at least the feature columns.
    y : target values aligned with the rows of ``data``.
    C : value passed to ``LogisticRegression`` as its ``C`` argument.

    Returns
    -------
    (fitted LogisticRegression, fitted DictVectorizer)
    """
    feature_columns = categorical_variables + numerical_variables
    records = data[feature_columns].to_dict(orient='records')

    vectorizer = DictVectorizer(sparse=False)
    feature_matrix = vectorizer.fit_transform(records)

    classifier = LogisticRegression(max_iter=4000, C=C).fit(feature_matrix, y)
    return classifier, vectorizer
    
def get_auc(model_train, model_test, model_train_y, model_test_y, C):
    """Train on one set of rows and return the ROC AUC obtained on another.

    Parameters
    ----------
    model_train, model_train_y : rows and targets used for fitting.
    model_test, model_test_y : rows and targets used for scoring.
    C : value forwarded to ``train``.

    Returns
    -------
    The ``roc_auc_score`` of the predicted positive-class probabilities
    against ``model_test_y``.
    """
    fitted_model, fitted_vectorizer = train(model_train, model_train_y, C)
    # vectorise the evaluation rows with the fitted vectorizer and score them
    churn_probabilities = predict_results(fitted_model, model_test, fitted_vectorizer)
    return roc_auc_score(model_test_y, churn_probabilities)

def predict_results(final_model, testing_data, vectorizer):
    """Return the positive-class probability for every row of ``testing_data``.

    Parameters
    ----------
    final_model : fitted classifier exposing ``predict_proba``.
    testing_data : DataFrame containing at least the feature columns listed in
        ``categorical_variables`` and ``numerical_variables``.
    vectorizer : fitted vectorizer used to encode the rows.

    Returns
    -------
    The second column (index 1) of ``final_model.predict_proba``.
    """
    feature_columns = categorical_variables + numerical_variables
    records = testing_data[feature_columns].to_dict(orient='records')
    return final_model.predict_proba(vectorizer.transform(records))[:, 1]

In [42]:
from sklearn.model_selection import KFold
from tqdm.auto import tqdm

# 5-fold cross-validation over the train+validation frame.
# C is defined once so the value used for fitting and the value reported
# in the summary line can never drift apart.
C = 1.0
kfold = KFold(n_splits=5, shuffle=True, random_state=1)
fold_scores = []
for train_idx, test_idx in kfold.split(df_full_train):
    # rows belonging to this fold
    model_train = df_full_train.iloc[train_idx]
    model_test = df_full_train.iloc[test_idx]

    # targets for this fold
    model_train_y = model_train.churn.values
    model_test_y = model_test.churn.values

    # drop() builds new frames without the target, instead of deleting a
    # column in place on a slice of df_full_train
    model_train = model_train.drop(columns='churn')
    model_test = model_test.drop(columns='churn')

    # AUC obtained on the held-out part of this fold
    auc_score = get_auc(model_train, model_test, model_train_y, model_test_y, C)
    fold_scores.append(auc_score)

print('C=%s  %.3f +- %.3f' % (C, np.mean(fold_scores), np.std(fold_scores)))

C=1.0  0.842 +- 0.007


In [43]:
# the individual AUC value of each cross-validation fold
fold_scores

[0.8443963859299322,
 0.8449894881472653,
 0.8333373469797312,
 0.8347649005563726,
 0.8518480750525557]

# Lecture 5 - Deployment

In [53]:
# Refit on the whole train+validation frame, then score the untouched test
# split. The reported metric is ROC AUC on the predicted probabilities; plain
# accuracy could be obtained instead by comparing model.predict(...) with test_y.
model, fdv = train(df_full_train, full_train_y, C=1.0)
predictions = predict_results(model, df_test, fdv)
roc_auc_score(test_y, predictions)

0.8584736260521953

### Save the model

I will be using pickle to save (serialize) the model and its DictVectorizer to a file.

In [55]:
import pickle

output_file = f'model_C={1.0}.bin'

# open the file for binary writing
f_out = open(output_file, 'wb')
try:
    # store the model together with its DictVectorizer, as one tuple,
    # so both can be restored from a single file
    pickle.dump((model, fdv), f_out)
finally:
    # close the file even if pickling fails, so the handle is never leaked
    f_out.close()

A more idiomatic way to write the code above is to use a `with` statement, which closes the file automatically:

In [57]:
# Same result as a manual open()/close() pair: the context manager closes
# f_out as soon as the block is left, even if pickle.dump raises.
with open(output_file, 'wb') as f_out:
    pickle.dump((model, fdv), f_out)

### Load the model

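Note: Restart the kernel first, so that the model and vectorizer really come from the saved file rather than from variables still in memory.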

In [2]:
import pickle

model_file = 'model_C=1.0.bin'

# NOTE: unpickling can execute arbitrary code, so only load files you created
# yourself or otherwise trust.
# The file holds the (model, DictVectorizer) tuple written by pickle.dump,
# so it is unpacked in that same order.
with open(model_file, 'rb') as f_in:
    model, fdv = pickle.load(f_in)

model, fdv

(LogisticRegression(max_iter=4000), DictVectorizer(sparse=False))

In [11]:
# A single customer described with the same feature names the model was
# trained on. Text values are already in the normalised form produced during
# data preparation (lowercase, underscores instead of spaces).
customer = {
 'gender': 'female',
 'seniorcitizen': 0,
 'partner': 'yes',
 'dependents': 'no',
 'tenure': 10,
 'phoneservice': 'no',
 'multiplelines': 'no_phone_service',
 'internetservice': 'dsl',
 'onlinesecurity': 'no',
 'onlinebackup': 'yes',
 'deviceprotection': 'no',
 'techsupport': 'no',
 'streamingtv': 'no',
 'streamingmovies': 'no',
 'contract': 'month-to-month',
 'paperlessbilling': 'yes',
 'paymentmethod': 'electronic_check',
 'monthlycharges': 29.85,
 'totalcharges': 29.85,
}

# transform expects a list of dicts, hence the one-element list
X = fdv.transform([customer])
# column 1 of predict_proba is the probability of the positive class (churn == 1)
model.predict_proba(X)[:,1]

array([0.49114843])