In [1]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_csv('data-week-3.csv')
df.columns = df.columns.str.lower().str.replace(' ', '_')
categorical_columns  = list(df.dtypes[df.dtypes == 'object'].index)

#lets format the string names in each row in the categorical columns
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ','_')

#df.totalcharges have not been categorised correctly
df.totalcharges= df.totalcharges.fillna(0)
df.churn = (df.churn == 'yes').astype('int')

numerical_coulumns = list(df.dtypes[(df.dtypes == 'int64') | (df.dtypes == 'float64')].index)

In [3]:
df_full_train, df_test =train_test_split(df,test_size=0.2,random_state=1)
df_train,df_val = train_test_split(df_full_train,test_size=0.25, random_state=1)

#reset indices
df_train = df_train.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)
df_val = df_val.reset_index(drop = True)

y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values

del df_train['churn']
del df_val['churn']
del df_test['churn']

categorical_columns,numerical_coulumns


(['customerid',
  'gender',
  'partner',
  'dependents',
  'phoneservice',
  'multiplelines',
  'internetservice',
  'onlinesecurity',
  'onlinebackup',
  'deviceprotection',
  'techsupport',
  'streamingtv',
  'streamingmovies',
  'contract',
  'paperlessbilling',
  'paymentmethod',
  'totalcharges',
  'churn'],
 ['seniorcitizen', 'tenure', 'monthlycharges', 'churn'])

In [4]:
categorical = ['customerid',
  'gender',
  'partner',
  'dependents',
  'phoneservice',
  'multiplelines',
  'internetservice',
  'onlinesecurity',
  'onlinebackup',
  'deviceprotection',
  'techsupport',
  'streamingtv',
  'streamingmovies',
  'contract',
  'paperlessbilling',
  'paymentmethod',
  'seniorcitizen',               
  ]
numerical =  ['totalcharges', 'tenure', 'monthlycharges']

#lets create a function that with dictionarize a dataframe we give it
#extract the undelying feature matrix and fit train our model on the x_train and y_train.

from sklearn.feature_extraction import DictVectorizer
def train(df_train,y_train,C=1.0):
    dicts = df_train[categorical + numerical].to_dict(orient ='records')
    dv = DictVectorizer(sparse = False)
    X_train = dv.fit_transform(dicts)

    model = LogisticRegression(C=C, max_iter=1000)
    model.fit(X_train, y_train)
    return dv,model

In [5]:
#a function that will be used to make predictions for a dataframe we provide
def predict(df, dv,model):
    dicts = df[categorical + numerical].to_dict(orient='records')
    X = dv.transform(dicts)
    y_pred = model.predict_proba(X)[:,1]
    #make sure your predict function returns predictions not just printing them otherwise you return None
    return y_pred

In [6]:
#lets now implement kfold cross validation
from tqdm import tqdm
from IPython.display import display 

C=1.0
n_splits = 5
scores = []
#!pip install tqdm


#you need to instansiate it first and then you will use the instance inside te for loop 
kfold = KFold(n_splits=n_splits,shuffle=True,random_state=1)

#kfold.split(df_full_train) generates the (train_idx, val_idx) pairs.
#total=kfold.get_n_splits() tells tqdm how many iterations to expect — one per fold.
#iterates over the number of splits paying attention to eac value train and validation pairs


for train_idx,val_idx in tqdm(kfold.split(df_full_train)):
    df_train = df_full_train.iloc[train_idx]
    df_val = df_full_train.iloc[val_idx]
    y_train = df_train.churn.values
    y_val = df_val.churn.values
    dv, model = train(df_train, y_train, C=C)
    y_pred = predict(df_val, dv, model)
    display(y_pred)
    auc = roc_auc_score(y_val, y_pred)
    scores.append(auc)

#lets find out the mean scores and standard deviation among the auc across different folds
#In an f-string:
#Expressions go inside {}
#Formatting (like .3f for 3 decimal places) comes after a colon :
print(f'average score :{np.mean(scores):.3f} standard deviation: {np.std(scores):.3f} ')
    


0it [00:00, ?it/s]

array([0.05335015, 0.13467678, 0.15423019, ..., 0.43203902, 0.0096085 ,
       0.8115041 ])

1it [00:28, 28.03s/it]

array([0.0333105 , 0.03455345, 0.04776354, ..., 0.83937641, 0.09234492,
       0.39417371])

2it [01:00, 30.86s/it]

array([0.42338086, 0.20325296, 0.09895938, ..., 0.16992171, 0.11293355,
       0.00450234])

3it [01:23, 27.30s/it]

array([0.03039014, 0.76616616, 0.00468009, ..., 0.032292  , 0.62865994,
       0.04547961])

4it [01:52, 27.84s/it]

array([0.006677  , 0.12665392, 0.00675714, ..., 0.78579697, 0.0384051 ,
       0.0059824 ])

5it [02:16, 27.23s/it]

average score :0.840 standard deviation: 0.007 





In [7]:
#finally lets train the model on full traain data
dv, model = train(df_full_train,df_full_train.churn.values,C=1.0)
y_pred = predict(df_test,dv,model)
#y_test = df_test.churn.values
auc = roc_auc_score(y_test,y_pred)
auc

0.8572440334969179

In [10]:
#lets save the model
import pickle
output_file = f'model_C={C}.bin'

# f_out = open(output_file,'wb') #wb for read binary
# pickle.dump((dv,model),f_out)

# f_out.close()


In [11]:
#alternatively we can use the with statement to ensure we do not accidentally forget to close the file 
#with statement means -> “Open this file for writing in binary mode, call it f_out, and after I’m done saving the model, close it automatically — no matter what.”
with open(output_file, 'wb') as f_out: #Opens a file in write-binary mode ('wb') — that means you can write bytes (which is what pickle.dump produces).
    pickle.dump((dv, model), f_out)

CHECK OUT HOW WE LOAD THE MODEL IN 'LOAD.ipynb'

In [14]:
df.iloc[19]

customerid                4183-myfrb
gender                        female
seniorcitizen                      0
partner                           no
dependents                        no
tenure                            21
phoneservice                     yes
multiplelines                     no
internetservice          fiber_optic
onlinesecurity                    no
onlinebackup                     yes
deviceprotection                 yes
techsupport                       no
streamingtv                       no
streamingmovies                  yes
contract              month-to-month
paperlessbilling                 yes
paymentmethod       electronic_check
monthlycharges                 90.05
totalcharges                  1862.9
churn                              0
Name: 19, dtype: object

In [37]:
customer = df.iloc[51]
customer_dicts = customer.to_json()
customer_dicts

'{"customerid":"9420-lojkx","gender":"female","seniorcitizen":0,"partner":"no","dependents":"no","tenure":15,"phoneservice":"yes","multiplelines":"no","internetservice":"fiber_optic","onlinesecurity":"yes","onlinebackup":"yes","deviceprotection":"no","techsupport":"no","streamingtv":"yes","streamingmovies":"yes","contract":"month-to-month","paperlessbilling":"yes","paymentmethod":"credit_card_(automatic)","monthlycharges":99.1,"totalcharges":"1426.4","churn":1}'