In [161]:
# Import the library
import pandas as pd
pd.set_option('display.max_columns', None)
import warnings
warnings.filterwarnings('ignore')

In [162]:
data = pd.read_csv("Customer_churn.txt")
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


### Round 1

In [163]:
churnData = data

In [164]:
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [165]:
# Check the datatypes of all the columns in the data.You would see that the column TotalCharges is object type. 
# Convert this column into numeric type using pd.to_numeric function.

churnData["TotalCharges"] = pd.to_numeric(churnData["TotalCharges"],errors='coerce')

In [166]:
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [167]:
churnData.isnull().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [168]:
churnData['TotalCharges']=churnData['TotalCharges'].fillna(0)

In [169]:
churnData.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [170]:
X = churnData[['tenure','SeniorCitizen','MonthlyCharges','TotalCharges']]
y = churnData['Churn']

In [171]:
y = y.map({'Yes': 1, 'No': 0})
y

0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn, Length: 7043, dtype: int64

In [172]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [174]:
y_train.value_counts()

0    4138
1    1496
Name: Churn, dtype: int64

In [83]:
# scale for better models
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [84]:
X.dtypes

tenure              int64
SeniorCitizen       int64
MonthlyCharges    float64
TotalCharges      float64
dtype: object

In [85]:
churnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [86]:
churnData_dummy = pd.get_dummies(churnData.drop("Churn",axis=1))

In [87]:
churnData_dummy.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Female,gender_Male,Partner_No,Partner_Yes,Dependents_No,Dependents_Yes,PhoneService_No,PhoneService_Yes,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year
0,0,1,29.85,29.85,1,0,0,1,1,0,1,0,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
1,0,34,56.95,1889.5,0,1,1,0,1,0,0,1,0,0,1,1,0,0,0,0,1,1,0,0,1,0,0,1,0,0,0,1,0
2,0,2,53.85,108.15,0,1,1,0,1,0,0,1,0,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0
3,0,45,42.3,1840.75,0,1,1,0,1,0,1,0,0,0,1,1,0,0,0,0,1,0,0,1,1,0,0,1,0,0,0,1,0
4,0,2,70.7,151.65,1,0,1,0,1,0,0,1,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0,1,0,0


### Round 2

In [111]:
# (Optional) Fit a logistic Regression model on the training data.
# Log Reg is for classification

from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PowerTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.metrics import plot_confusion_matrix

model1 = LogisticRegression() 
trans=PowerTransformer()
trans.fit(X_train)
X_train_model1 = trans.transform(X_train)
X_test_model1  = trans.transform(X_test)

model1.fit(X_train_model1, y_train)

y_pred_train_model1 = model1.predict(X_train_model1)
y_pred_test_model1= model1.predict(X_test_model1)

In [112]:
# Fit a Knn Classifier (NOT KnnRegressor please!)model on the training data.

from sklearn.neighbors import KNeighborsClassifier
model2 = KNeighborsClassifier() 
model2.fit(X_train, y_train) # train model

y_pred_train_model2 = model2.predict(X_train)
y_pred_test_model2 = model2.predict(X_test)

In [113]:
# Fit a Decision Tree Classifier on the training data.

from sklearn.tree import DecisionTreeClassifier
model3 = DecisionTreeClassifier()
model3.fit(X_train, y_train)

y_pred_train_model3 = model3.predict(X_train)
y_pred_test_model3 = model3.predict(X_test)


In [114]:
# Compare the accuracy, precision, recall for the previous models on both the train and test sets.

# Log Reg
performance_model1 = pd.DataFrame({'Error_metric': ['Accuracy','Precision','Recall'],
                               'Train': [accuracy_score(y_train, y_pred_train_model1),
                                         precision_score(y_train, y_pred_train_model1),
                                         recall_score(y_train, y_pred_train_model1)],
                               'Test': [accuracy_score(y_test, y_pred_test_model1),
                                        precision_score(y_test, y_pred_test_model1),
                                        recall_score(y_test, y_pred_test_model1)]})

display(performance_model1)

Unnamed: 0,Error_metric,Train,Test
0,Accuracy,0.788427,0.805536
1,Precision,0.651093,0.698795
2,Recall,0.437834,0.466488


In [115]:
# KNN
performance_model2 = pd.DataFrame({'Error_metric': ['Accuracy','Precision','Recall'],
                               'Train': [accuracy_score(y_train, y_pred_train_model2),
                                         precision_score(y_train, y_pred_train_model2),
                                         recall_score(y_train, y_pred_train_model2)],
                               'Test': [accuracy_score(y_test, y_pred_test_model2),
                                        precision_score(y_test, y_pred_test_model2),
                                        recall_score(y_test, y_pred_test_model2)]})

display(performance_model2)

Unnamed: 0,Error_metric,Train,Test
0,Accuracy,0.835996,0.775727
1,Precision,0.737148,0.59164
2,Recall,0.594251,0.493298


In [116]:
# DT

performance_model3 = pd.DataFrame({'Error_metric': ['Accuracy','Precision','Recall'],
                               'Train': [accuracy_score(y_train, y_pred_train_model3),
                                         precision_score(y_train, y_pred_train_model3),
                                         recall_score(y_train, y_pred_train_model3)],
                               'Test': [accuracy_score(y_test, y_pred_test_model3),
                                        precision_score(y_test, y_pred_test_model3),
                                        recall_score(y_test, y_pred_test_model3)]})

display(performance_model3)

Unnamed: 0,Error_metric,Train,Test
0,Accuracy,0.99077,0.733144
1,Precision,0.991826,0.496084
2,Recall,0.973262,0.509383


### Round 3

In [117]:
# apply K-fold cross validation on your models built before, and check the model score. 
# Note: So far we have not balanced the data.

In [118]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

In [119]:
# for training set
# for Log Reg
scores1 = cross_val_score(model1, X_train_model1, y_train, cv=5)
print("Cross validation scores for Logistic Regression: ", scores1)
print("Score stats: %0.2f accuracy with a standard deviation of %0.2f" % (scores1.mean(), scores1.std()))

Cross validation scores for Logistic Regression:  [0.80212955 0.78793256 0.77462289 0.79503106 0.78330373]
Score stats: 0.79 accuracy with a standard deviation of 0.01


In [120]:
# KNN
scores2 = cross_val_score(model2, X_train, y_train, cv=5)
print("Cross validation scores for KNN: ", scores2)
print("Score stats: %0.2f accuracy with a standard deviation of %0.2f" % (scores2.mean(), scores2.std()))

Cross validation scores for KNN:  [0.77817214 0.75687666 0.7595386  0.78793256 0.75310835]
Score stats: 0.77 accuracy with a standard deviation of 0.01


In [121]:
# DT 
scores3 = cross_val_score(model3, X_train, y_train, cv=5)
print("Cross validation scores for DT: ", scores3)
print("Score stats: %0.2f accuracy with a standard deviation of %0.2f" % (scores3.mean(), scores3.std()))

Cross validation scores for DT:  [0.71872227 0.71872227 0.71251109 0.72670807 0.72468917]
Score stats: 0.72 accuracy with a standard deviation of 0.01


### Round 4

In [122]:
# fit a Random forest Classifier on the data and compare the accuracy.
# tune the hyper parameters with Gridsearch and check the results. retrain the final model with the best parameters found.

In [124]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

rfc_ops = {"max_depth":6,
           "min_samples_leaf":20,
           "max_features":None,
           "n_estimators":100,
           "bootstrap":True,
           "oob_score":True,
           "random_state":42}

clf = RandomForestClassifier(**rfc_ops)

        #max_depth=6,min_samples_leaf=20,max_features=None,n_estimators=100,
         #                    bootstrap=True,oob_score=True, random_state=RAND_STATE)
clf.fit(X_train, y_train)
print("train prediction accuracy score: %.2f" %(clf.score(X_train, y_train)))
print("test prediction accuracy score: %.2f"  %(clf.score(X_test, y_test)))

train prediction accuracy score: 0.81
test prediction accuracy score: 0.81


In [132]:
param_grid = {
    'n_estimators': [50, 100,200],
    'min_samples_split': [2, 4],
    'min_samples_leaf' : [1, 2],
    'max_features': ['sqrt'],
    ##'max_samples' : ['None', 0.5],
    'max_depth':[3,5,10],
    'bootstrap':[True,False],
    "oob_score":[True,False],
    "min_samples_leaf":[10,20,40]
    }
clf_opti = RandomForestClassifier(random_state=42)

In [133]:
grid_search = GridSearchCV(clf_opti, param_grid, cv=5,return_train_score=True,n_jobs=-1)

In [134]:
grid_search.fit(X_train,y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'bootstrap': [True, False], 'max_depth': [3, 5, 10],
                         'max_features': ['sqrt'],
                         'min_samples_leaf': [10, 20, 40],
                         'min_samples_split': [2, 4],
                         'n_estimators': [50, 100, 200],
                         'oob_score': [True, False]},
             return_train_score=True)

In [135]:
best_params = grid_search.best_params_ #To check the best set of parameters returned
best_params

{'bootstrap': True,
 'max_depth': 5,
 'max_features': 'sqrt',
 'min_samples_leaf': 40,
 'min_samples_split': 2,
 'n_estimators': 100,
 'oob_score': True}

In [136]:
import numpy as np

from sklearn.model_selection import cross_val_score
clf_best = RandomForestClassifier(random_state=42, **best_params)
cross_val_scores = cross_val_score(clf_best, X_test, y_test, cv=5)
print(np.mean(cross_val_scores))

0.7963125615204059


### Managing imbalance in the dataset

Check for the imbalance.
Use the resampling strategies used in class for upsampling and downsampling to create a balance between the two classes.
Each time fit the model and check the accuracy of the model.

In [140]:
# Check for the imbalance.
y.value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

In [141]:
NEG_CLASS_CNT = 5174

In [142]:
print("The majority class (negative cases) represents {:.2f}% of the data".format(NEG_CLASS_CNT/len(y)*100))

The majority class (negative cases) represents 73.46% of the data


### Downsampling

In [147]:
y.value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

In [156]:
def down_samp_rand(Xin, yin, ratio=1):
        from imblearn.under_sampling import RandomUnderSampler
        """Downsamples majority class using random sampling.
        Ratio argument is the ratio of minority class to the downsampled majority
        """
        rus = RandomUnderSampler(sampling_strategy=ratio, random_state=42)
        X_rus, y_rus = rus.fit_resample(Xin, yin)
        return X_rus, y_rus

In [157]:
X_train, y_train = down_samp_rand(X_train,y_train)

In [158]:
y_train.value_counts()

0    1496
1    1496
Name: Churn, dtype: int64

In [159]:
rfc_ops = {"max_depth":6,
           "min_samples_leaf":20,
           "max_features":None,
           "n_estimators":100,
           "bootstrap":True,
           "oob_score":True,
           "random_state":42}

clf = RandomForestClassifier(**rfc_ops)

        #max_depth=6,min_samples_leaf=20,max_features=None,n_estimators=100,
         #                    bootstrap=True,oob_score=True, random_state=RAND_STATE)
clf.fit(X_train, y_train)
print("train prediction accuracy score: %.2f" %(clf.score(X_train, y_train)))
print("test prediction accuracy score: %.2f"  %(clf.score(X_test, y_test)))

train prediction accuracy score: 0.77
test prediction accuracy score: 0.73
