In [3]:
# importing libraries
import boto3
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_curve, classification_report
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# defining the bucket
s3 = boto3.resource('s3')
bucket_name = 'webster-data445-bucket'
bucket = s3.Bucket(bucket_name)

# defining the csv file
file_key = 'telecom_churn.csv'

bucket_object = bucket.Object(file_key)
file_object = bucket_object.get()
file_content_stream = file_object.get('Body')

# reading the csv file
churn_data = pd.read_csv(file_content_stream)
# dropping missing values
churn_data = churn_data.dropna()
churn_data.head()

Unnamed: 0,Churn,AccountWeeks,ContractRenewal,DataPlan,DataUsage,CustServCalls,DayMins,DayCalls,MonthlyCharge,OverageFee,RoamMins
0,0,128,1,1,2.7,1,265.1,110,89.0,9.87,10.0
1,0,107,1,1,3.7,1,161.6,123,82.0,9.78,13.7
2,0,137,1,0,0.0,0,243.4,114,52.0,6.06,12.2
3,0,84,0,0,0.0,2,299.4,71,57.0,3.1,6.6
4,0,75,0,0,0.0,3,166.7,113,41.0,7.42,10.1


In [4]:
X = churn_data[['AccountWeeks', 'ContractRenewal', 'CustServCalls', 'MonthlyCharge', 'DayMins']]
Y = churn_data['Churn']

# splitting into train and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, stratify = Y)

In [10]:
# running over-sampling
X_smote, Y_smote = SMOTE().fit_resample(X_train, Y_train)

In [11]:
# checking number of 1 and 0 before/after oversampling
print(Y_train.value_counts())
print(Y_smote.value_counts())

0    2280
1     386
Name: Churn, dtype: int64
1    2280
0    2280
Name: Churn, dtype: int64


In [14]:
# building the random forest
RF1 = RandomForestClassifier(n_estimators = 500, max_depth = 3).fit(X_smote, Y_smote)

# predicting on test
RF_pred = RF1.predict_proba(X_test)[:, 1]

# computing ROC curve
fpr, tpr, threshold = roc_curve(Y_test, RF_pred)

# creating a dataframe with unpacked roc values
cutoffs = pd.DataFrame({'False_Positive' : fpr, 'True_Positive' : tpr, 'Cutoff' : threshold})

# finding the optimal cutoff
cutoffs['True_Positive_minus_1'] = cutoffs['True_Positive'] - 1

# finding euclidean distance
cutoffs['Euclidian_dist'] = np.sqrt(cutoffs['False_Positive']**2 + cutoffs['True_Positive_minus_1']**2)
cutoffs = cutoffs.sort_values(by = 'Euclidian_dist').reset_index(drop = True)

# changing likelihoods to labels
RF_pred = np.where(RF_pred < cutoffs['Euclidian_dist'][0], 0, 1)

# printing classification report
print(classification_report(Y_test, RF_pred))

              precision    recall  f1-score   support

           0       0.96      0.62      0.75       570
           1       0.27      0.84      0.41        97

    accuracy                           0.65       667
   macro avg       0.61      0.73      0.58       667
weighted avg       0.86      0.65      0.70       667



In [15]:
# building the random forest
AB1 = AdaBoostClassifier(base_estimator = DecisionTreeClassifier(max_depth = 3), n_estimators = 500, learning_rate = 0.01).fit(X_smote, Y_smote)

# predicting on test
AB_pred = AB1.predict_proba(X_test)[:, 1]

# computing ROC curve
fpr2, tpr2, threshold2 = roc_curve(Y_test, AB_pred)

# creating a dataframe with unpacked roc values
cutoffs2 = pd.DataFrame({'False_Positive' : fpr2, 'True_Positive' : tpr2, 'Cutoff' : threshold2})

# finding the optimal cutoff
cutoffs2['True_Positive_minus_1'] = cutoffs2['True_Positive'] - 1

# finding euclidean distance
cutoffs2['Euclidian_dist'] = np.sqrt(cutoffs2['False_Positive']**2 + cutoffs2['True_Positive_minus_1']**2)
cutoffs2 = cutoffs2.sort_values(by = 'Euclidian_dist').reset_index(drop = True)

# changing likelihoods to labels
AB_pred = np.where(AB_pred < cutoffs2['Euclidian_dist'][0], 0, 1)

# printing classification report
print(classification_report(Y_test, AB_pred))

              precision    recall  f1-score   support

           0       1.00      0.04      0.08       570
           1       0.15      1.00      0.26        97

    accuracy                           0.18       667
   macro avg       0.58      0.52      0.17       667
weighted avg       0.88      0.18      0.10       667



In [None]:
# Based on my results, I would choose the AdaBoostClassifier to predict churn