In [123]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression

In [79]:
churnData = pd.read_csv('Customer-Churn.csv')
churnData.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [80]:
churnData.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [81]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'], errors='coerce')

In [82]:
churnData['Churn'] = churnData['Churn'].apply(lambda x: x.replace("Yes", "1"))

In [83]:
churnData['Churn'] = churnData['Churn'].apply(lambda x: x.replace("No", "0"))

In [84]:
churnData['Churn'] = pd.to_numeric(churnData['Churn'], errors='coerce')

In [85]:
numerical = churnData.select_dtypes(include= np.number)

In [86]:
numerical

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,0,1,29.85,29.85,0
1,0,34,56.95,1889.50,0
2,0,2,53.85,108.15,1
3,0,45,42.30,1840.75,0
4,0,2,70.70,151.65,1
...,...,...,...,...,...
7038,0,24,84.80,1990.50,0
7039,0,72,103.20,7362.90,0
7040,0,11,29.60,346.45,0
7041,1,4,74.40,306.60,1


In [88]:
numerical.isna().sum()

SeniorCitizen      0
tenure             0
MonthlyCharges     0
TotalCharges      11
Churn              0
dtype: int64

In [89]:
from sklearn.preprocessing import MinMaxScaler

In [90]:
MinMaxTransformer = MinMaxScaler().fit(numerical)    
num_normalized = MinMaxTransformer.transform(numerical)
num_norm = pd.DataFrame(num_normalized,columns=numerical.columns)

In [91]:
num_norm.dropna(subset = ["TotalCharges"], inplace=True)

In [92]:
num_norm.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,0.0,0.013889,0.115423,0.001275,0.0
1,0.0,0.472222,0.385075,0.215867,0.0
2,0.0,0.027778,0.354229,0.01031,1.0
3,0.0,0.625,0.239303,0.210241,0.0
4,0.0,0.027778,0.521891,0.01533,1.0


In [99]:
num_norm['Churn'] = num_norm['Churn'].astype(int)

In [100]:
num_norm.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,0.0,0.013889,0.115423,0.001275,0
1,0.0,0.472222,0.385075,0.215867,0
2,0.0,0.027778,0.354229,0.01031,1
3,0.0,0.625,0.239303,0.210241,0
4,0.0,0.027778,0.521891,0.01533,1


In [148]:
X = num_norm.drop(['Churn'],axis=1)
y = num_norm['Churn']

In [149]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=100)

In [150]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train, y_train)
LR.score(X_test, y_test)

0.7824386775684323

In [151]:
pred = LR.predict(X_test)
print('LR.score: ',LR.score(X_test, y_test))
print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

LR.score:  0.7824386775684323
precision:  0.6308851224105462
recall:  0.44607190412782954
f1:  0.5226209048361934


In [135]:
num_norm['Churn'].value_counts()

0    5163
1    1869
Name: Churn, dtype: int64

Downsampling and checking the score

In [136]:
from sklearn.utils import resample

category_0 = num_norm[num_norm['Churn'] == 0]
category_1 = num_norm[num_norm['Churn'] == 1]

In [137]:
category_0_undersampled = resample(category_0, 
                                   replace=False, 
                                   n_samples = len(category_1))

In [138]:
print(category_0_undersampled.shape)
print(category_1.shape)

(1869, 5)
(1869, 5)


In [139]:
data_downsampled = pd.concat([category_0_undersampled, category_1], axis=0)

In [140]:
data_downsampled.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
2189,0.0,0.958333,0.061692,0.1961,0
5162,0.0,0.111111,0.01194,0.016201,0
4148,0.0,0.402778,0.010945,0.067251,0
2910,0.0,0.486111,0.071144,0.107431,0
719,1.0,0.944444,0.61791,0.642799,0


In [146]:
X = data_downsampled.drop(['Churn'],axis=1)
y = data_downsampled['Churn']

In [147]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=100)

LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train, y_train)
pred = LR.predict(X_test)
print('LR.score: ',LR.score(X_test, y_test))
print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

LR.score:  0.7312834224598931
precision:  0.7239583333333334
recall:  0.7453083109919572
f1:  0.7344782034346103


Upsamling and checking the score

In [116]:
category_1_oversampled = resample(category_1, 
                                  replace=True, 
                                  n_samples = len(category_0))

In [117]:
print(category_0.shape)
print(category_1_oversampled.shape)

(5163, 5)
(5163, 5)


In [118]:
data_upsampled = pd.concat([category_0, category_1_oversampled], axis=0)

In [119]:
data_upsampled['Churn'].value_counts()

0    5163
1    5163
Name: Churn, dtype: int64

In [144]:
X = data_upsampled.drop(['Churn'],axis=1)
y = data_upsampled['Churn']

In [145]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=100)

LR = LogisticRegression(random_state=0, solver='lbfgs')
LR.fit(X_train, y_train)
pred = LR.predict(X_test)
print('LR.score: ',LR.score(X_test, y_test))
print("precision: ",precision_score(y_test,pred))
print("recall: ",recall_score(y_test,pred))
print("f1: ",f1_score(y_test,pred))

LR.score:  0.7257322682159284
precision:  0.7280114722753346
recall:  0.7297556300910397
f1:  0.7288825077769802
