## Final Notebook

This notebook is intended to show the final model we chose, the features we used, and the evaluation we made.

Ultimately, we wanted to predict which customers were likely to leave the telecom company soon. This is a binary classification problem: soon to leave versus not soon to leave.

The data can be downloaded from https://www.kaggle.com/becksddf/churn-in-telecoms-dataset, but the CSV is small enough that we simply included it in the data folder.

In [1]:
import os
import sys
module_path = os.path.abspath(os.path.join(os.pardir))
if module_path not in sys.path:
    sys.path.append(module_path)
    
import pandas as pd
import numpy as np

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

import src.data_cleaning as dc

In [2]:
# load the dataset through the project's data_cleaning helper, then
# separate the 'churn' label (y) from the remaining columns (X)

df = dc.create_df('bigml_59c28831336c6604c800002a.csv')
y = df['churn']
X = df.drop(columns='churn')

In [3]:
# hold out 25% of the rows (scikit-learn's default share) for the final
# evaluation; the fixed seed keeps the split reproducible across runs

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [4]:
# define new class for custom transformations in the pipeline

class Dataprep(BaseEstimator):
    """Stateless feature-engineering step used as the first pipeline stage.

    ``fit`` learns nothing; ``transform`` returns a modified copy of the
    input frame in which:

    * ``'international plan'`` and ``'voice mail plan'`` become booleans
      (``True`` where the original value equals ``'yes'``),
    * the aggregate columns ``'account months'``, ``'total minutes'``,
      ``'total calls'``, ``'total cost'`` and ``'cost per day'`` are added,
    * ``'state'``, ``'phone number'`` and ``'area code'`` are removed.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformation has no learned state."""
        return self

    def transform(self, X, y=None):
        """Return an engineered copy of ``X``; the caller's frame is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain every column referenced below; a missing column
            raises ``KeyError``.
        y : ignored
            Accepted only so the signature matches the original.

        Returns
        -------
        pandas.DataFrame
            A new frame with the plan flags converted, the aggregate
            features appended, and the three dropped columns removed.
        """
        df = X.copy()

        # 'yes' -> True, anything else -> False
        df['international plan'] = df['international plan'] == 'yes'
        df['voice mail plan'] = df['voice mail plan'] == 'yes'

        # account length expressed in 30-unit blocks, rounded up
        df['account months'] = np.ceil(df['account length'] / 30).astype(int)

        # totals across the day / evening / night / international columns
        df['total minutes'] = (
            df['total day minutes'] + df['total eve minutes']
            + df['total night minutes'] + df['total intl minutes']
        )
        df['total calls'] = (
            df['total day calls'] + df['total eve calls']
            + df['total night calls'] + df['total intl calls']
        )
        df['total cost'] = (
            df['total day charge'] + df['total eve charge']
            + df['total night charge'] + df['total intl charge']
        )

        # Clamp the divisor to at least 1 so a zero-length account cannot
        # produce inf/NaN, which the downstream estimator would reject.
        # Rows with account length >= 1 are unaffected.
        df['cost per day'] = df['total cost'] / df['account length'].clip(lower=1)

        # return a new frame instead of mutating in place
        return df.drop(columns=['state', 'phone number', 'area code'])

In [5]:
# instantiate the gradient booster; seed it (same seed as the train-test
# split) so the fit and the cross-validation scores are reproducible on a
# fresh Restart & Run All

gb = GradientBoostingClassifier(random_state=42)

In [6]:
# assemble the pipeline: custom feature preparation first, classifier second

pipeline_steps = [("dataprep", Dataprep()), ("model", gb)]
pipe = Pipeline(steps=pipeline_steps)

In [7]:
# train every pipeline step on the training split only

pipe.fit(X=X_train, y=y_train)

Pipeline(memory=None,
         steps=[('dataprep', Dataprep()),
                ('model',
                 GradientBoostingClassifier(ccp_alpha=0.0,
                                            criterion='friedman_mse', init=None,
                                            learning_rate=0.1, loss='deviance',
                                            max_depth=3, max_features=None,
                                            max_leaf_nodes=None,
                                            min_impurity_decrease=0.0,
                                            min_impurity_split=None,
                                            min_samples_leaf=1,
                                            min_samples_split=2,
                                            min_weight_fraction_leaf=0.0,
                                            n_estimators=100,
                                            n_iter_no_change=None,
                                            presort='deprecated',
           

In [9]:
# 5-fold cross-validated ROC AUC of the whole pipeline on the training split

cross_val_score(estimator=pipe, X=X_train, y=y_train, scoring='roc_auc', cv=5)

array([0.9012443 , 0.93960929, 0.96225987, 0.88820742, 0.92385152])

In [10]:
# per-class precision, recall and f1 of the fitted pipeline on the training data

train_predictions = pipe.predict(X_train)
print(classification_report(y_train, train_predictions))

              precision    recall  f1-score   support

       False       0.98      1.00      0.99      2141
        True       1.00      0.88      0.93       358

    accuracy                           0.98      2499
   macro avg       0.99      0.94      0.96      2499
weighted avg       0.98      0.98      0.98      2499



## Use on hold-out data

In [11]:
# per-class precision, recall and f1 of the fitted pipeline on the hold-out data

test_predictions = pipe.predict(X_test)
print(classification_report(y_test, test_predictions))

              precision    recall  f1-score   support

       False       0.98      1.00      0.99       709
        True       0.99      0.88      0.93       125

    accuracy                           0.98       834
   macro avg       0.99      0.94      0.96       834
weighted avg       0.98      0.98      0.98       834

