In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix
import warnings
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.metrics import classification_report
warnings.filterwarnings('ignore')

1. read data

In [2]:
df = pd.read_csv('customer_churn.csv',sep=",")
df.head(5)

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [3]:
df.shape

(7043, 21)

In [4]:
df['Churn'].value_counts(normalize=True) # we can use "normalize = True" - to get the shape information in % Thanks @Laz! :D

No     0.73463
Yes    0.26537
Name: Churn, dtype: float64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
df_num = df.select_dtypes(include=np.number)
df_num.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
0,0,1,29.85
1,0,34,56.95
2,0,2,53.85
3,0,45,42.3
4,0,2,70.7


In [7]:
df_cat = df.select_dtypes(include=np.object_)
df_cat.head()

Unnamed: 0,customerID,gender,Partner,Dependents,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,TotalCharges,Churn
0,7590-VHVEG,Female,Yes,No,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,No
1,5575-GNVDE,Male,No,No,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,1889.5,No
2,3668-QPYBK,Male,No,No,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,108.15,Yes
3,7795-CFOCW,Male,No,No,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),1840.75,No
4,9237-HQITU,Female,No,No,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,151.65,Yes


3. extract target

In [8]:
y = df_cat['Churn']
df_cat = df_cat.drop(['Churn'], axis=1)

4. scale the independent values

In [9]:
transformer = StandardScaler().fit(df_num)
standard_x = transformer.transform(df_num)
X = pd.DataFrame(standard_x)
X.head()

Unnamed: 0,0,1,2
0,-0.439916,-1.277445,-1.160323
1,-0.439916,0.066327,-0.259629
2,-0.439916,-1.236724,-0.36266
3,-0.439916,0.514251,-0.746535
4,-0.439916,-1.236724,0.197365


5. Build log regression model 

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 30) #select the sample size and train the model

In [11]:
classifyin = LogisticRegression(random_state=0).fit(X_train, y_train) #classify the tested samples without random
predict = classifyin.predict(X_test)

6. Evaluate

In [12]:
len(X_test)

2113

In [13]:
confusion_matrix(y_test,predict) #make the prediction based on the Y_test sample and the trained x variables


array([[1372,  127],
       [ 347,  267]], dtype=int64)

It has an OK model, out of 2113 values, it managed to correctly identify 1372 NOs and 267 YESs

347 were classified as YES but are really NO

127 were classified as NO but are really YES

In [14]:
round_score = classifyin.score(X_test,y_test)
round_score = round(round_score, 4)
round_score


0.7757

In [15]:
print(classification_report(y_test, predict))

              precision    recall  f1-score   support

          No       0.80      0.92      0.85      1499
         Yes       0.68      0.43      0.53       614

    accuracy                           0.78      2113
   macro avg       0.74      0.68      0.69      2113
weighted avg       0.76      0.78      0.76      2113



a. high precision for nos

b. recall for no is high, no values are better predicted than yes values as the ratio of no to yes is higher in the test and trained models

7. Even a simple model will give us more than 70% accuracy. Why?

##

8. SMOTE. Is there an improvement?

1. we create new values for the value of yes to have the same sample as the value no 

In [16]:
smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X, y) #we fill the sample with more data with the nearest points
y_sm.value_counts()

No     5174
Yes    5174
Name: Churn, dtype: int64

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.3, random_state=50)
classification = LogisticRegression(random_state=0).fit(X_train, y_train)
predictions = classification.predict(X_test)

round_score = classification.score(X_test,y_test)
round_score = round(round_score, 4)
round_score
#result is 0,72 its a bit worse than the previous

0.7275

In [18]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          No       0.73      0.72      0.73      1565
         Yes       0.72      0.73      0.73      1540

    accuracy                           0.73      3105
   macro avg       0.73      0.73      0.73      3105
weighted avg       0.73      0.73      0.73      3105



a. we notice the results for yes have increased greatly overall

b. no results have slightly decreased, since the sample is now equal, the trained models have equal opportunites to come with true positives and true negatives. 

9. TOMEK LINKS. Is there an improvement?

In [19]:
tl = TomekLinks(sampling_strategy='majority')
X_tl, y_tl = tl.fit_resample(X, y)
y_tl.value_counts()
#no values has been reduced - expected as we are looking only for the 'majority' class

No     4697
Yes    1869
Name: Churn, dtype: int64

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.3, random_state=100)
classification = LogisticRegression(random_state=0, multi_class='ovr').fit(X_train, y_train)
predictions = classification.predict(X_test)

classification.score(X_test, y_test)
#the accuracy has gone up as the model is better at predicting no (lower population + true positives + true negatives)

0.8040609137055837

In [21]:
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

          No       0.82      0.92      0.87      1397
         Yes       0.73      0.52      0.61       573

    accuracy                           0.80      1970
   macro avg       0.78      0.72      0.74      1970
weighted avg       0.80      0.80      0.79      1970

