In [32]:
import pandas as pd
import numpy as np
pd.set_option("display.max_columns", None)
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE

## READ

In [33]:
churnData = pd.read_csv("/Users/jesuspetit/Desktop/Customer-Churn.csv")
churnData.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


## INFO

In [34]:
churnData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   OnlineSecurity    7043 non-null   object 
 7   OnlineBackup      7043 non-null   object 
 8   DeviceProtection  7043 non-null   object 
 9   TechSupport       7043 non-null   object 
 10  StreamingTV       7043 non-null   object 
 11  StreamingMovies   7043 non-null   object 
 12  Contract          7043 non-null   object 
 13  MonthlyCharges    7043 non-null   float64
 14  TotalCharges      7043 non-null   object 
 15  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(13)
memory

## To numeric

In [35]:
churnData = churnData.replace(' ', np.nan)

In [36]:
churnData['TotalCharges'] = pd.to_numeric(churnData['TotalCharges'])

In [37]:
churnData

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.50,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.50,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.90,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.60,Yes


In [38]:
churnData.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [39]:
churnData['TotalCharges'].fillna((churnData['TotalCharges'].mean()), inplace = True)

In [40]:
X = churnData.iloc[: , [4, 14, 13]]
y = churnData.iloc[:, 1]

## Scaler

In [41]:
scaler = StandardScaler()

X = scaler.fit_transform(X)

In [42]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state= 0)

## Logistic Regression

In [43]:
classifier = LogisticRegression()

classifier.fit(X_train, y_train)

In [44]:
pred_train = classifier.predict(X_train)
pred_test = classifier.predict(X_test)

In [45]:
print('The accuracy of the model in the train set is: ', accuracy_score(y_train, pred_train))
print('The accuracy of the model in the test set is: ', accuracy_score(y_test, pred_test))

The accuracy of the model in the train set is:  0.8405906853464596
The accuracy of the model in the test set is:  0.8296422487223168


In [46]:
y.value_counts()

0    5901
1    1142
Name: SeniorCitizen, dtype: int64

## Smote

In [47]:
sm = SMOTE(k_neighbors=3)

X_train_SMOTE, y_train_SMOTE = sm.fit_resample(X_train, y_train)

In [48]:
print(X_train.shape)
print(X_train_SMOTE.shape)

(5282, 3)
(8880, 3)


In [49]:
y_train.value_counts()/len(y_train)

0    0.840591
1    0.159409
Name: SeniorCitizen, dtype: float64

In [50]:
y_train_SMOTE.value_counts()/len(y_train_SMOTE)

0    0.5
1    0.5
Name: SeniorCitizen, dtype: float64

In [51]:
classifier.fit(X_train_SMOTE, y_train_SMOTE)

In [52]:
pred_train_SMOTE = classifier.predict(X_train_SMOTE)
pred_test_SMOTE = classifier.predict(X_test)

In [53]:
print('The accuracy of the model in the train set is: ', accuracy_score(y_train_SMOTE, pred_train_SMOTE))
print('The accuracy of the model in the test set is: ', accuracy_score(y_test, pred_test_SMOTE))

The accuracy of the model in the train set is:  0.6477477477477478
The accuracy of the model in the test set is:  0.565587734241908
