In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
# NOTE(review): load_boston is unused in the visible cells and is no longer
# shipped by recent scikit-learn releases -- confirm whether it can be dropped.
from sklearn.datasets import load_boston
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import cohen_kappa_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import plot_tree


In [52]:
# Load the raw customer-churn table from the CSV in the working directory.
csv_path = 'Customer-Churn.csv'
data = pd.read_csv(csv_path)
# Show the first five rows as the cell output.
data.head(n=5)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [53]:
# Rename every column to snake_case. Assignment is positional, so the order
# below must match the column order of the loaded CSV.
data.columns = [
    'gender',
    'senior_citizen',
    'partner',
    'dependents',
    'tenure',
    'phone_service',
    'online_security',
    'online_backup',
    'device_protection',
    'tech_support',
    'streaming_tv',
    'streaming_movies',
    'contract',
    'monthly_charges',
    'total_charges',
    'churn',
]
data.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,contract,monthly_charges,total_charges,churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [54]:
# Print column dtypes and non-null counts to spot columns that need conversion.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   gender             7043 non-null   object 
 1   senior_citizen     7043 non-null   int64  
 2   partner            7043 non-null   object 
 3   dependents         7043 non-null   object 
 4   tenure             7043 non-null   int64  
 5   phone_service      7043 non-null   object 
 6   online_security    7043 non-null   object 
 7   online_backup      7043 non-null   object 
 8   device_protection  7043 non-null   object 
 9   tech_support       7043 non-null   object 
 10  streaming_tv       7043 non-null   object 
 11  streaming_movies   7043 non-null   object 
 12  contract           7043 non-null   object 
 13  monthly_charges    7043 non-null   float64
 14  total_charges      7043 non-null   object 
 15  churn              7043 non-null   object 
dtypes: float64(1), int64(2),

In [55]:
# total_charges is read as text; parse it as numbers and let entries that
# cannot be parsed become NaN (errors='coerce') instead of raising.
total_charges_numeric = pd.to_numeric(data['total_charges'], errors='coerce')
data['total_charges'] = total_charges_numeric

In [56]:
# Count missing values per column (isnull is the pandas alias of isna).
data.isnull().sum()

gender                0
senior_citizen        0
partner               0
dependents            0
tenure                0
phone_service         0
online_security       0
online_backup         0
device_protection     0
tech_support          0
streaming_tv          0
streaming_movies      0
contract              0
monthly_charges       0
total_charges        11
churn                 0
dtype: int64

In [57]:
# Impute the NaNs produced by the numeric coercion with the column mean
# (the mean itself is computed over the non-missing values).
mean = data['total_charges'].mean()
data['total_charges'] = data['total_charges'].fillna(value=mean)

In [58]:
# Numeric feature columns used for modelling.
# The target is dropped explicitly: once `churn` has been mapped to 0/1 it is
# numeric as well, so re-running this cell after that mapping would otherwise
# pull the label into the feature matrix. errors='ignore' keeps the cell
# working while `churn` is still a text column and therefore not selected.
numerical = (
    data
    .select_dtypes(include=np.number)
    .drop(columns=['churn'], errors='ignore')
)
numerical

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges
0,0,1,29.85,29.85
1,0,34,56.95,1889.50
2,0,2,53.85,108.15
3,0,45,42.30,1840.75
4,0,2,70.70,151.65
...,...,...,...,...
7038,0,24,84.80,1990.50
7039,0,72,103.20,7362.90
7040,0,11,29.60,346.45
7041,1,4,74.40,306.60


In [59]:
# Encode the target as 1 (churned) / 0 (stayed).
churn_dict = {'Yes': 1, 'No': 0}
# Only map while the column still holds the text labels: mapping an already
# encoded 0/1 column through this dict would turn every value into NaN, so the
# guard makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(data['churn']):
    data['churn'] = data['churn'].map(churn_dict)

In [60]:
# Sanity check: list the distinct values of the encoded target.
pd.unique(data['churn'])

array([0, 1], dtype=int64)

In [61]:
# Class balance of the target: number of rows per churn value.
data.loc[:, 'churn'].value_counts()

0    5174
1    1869
Name: churn, dtype: int64

In [62]:
# Scale the numeric features, then split the dataset into train and test sets

In [63]:
# Standardize the numeric feature columns; `numerical` is replaced by the
# scaled array returned by the scaler.
# NOTE(review): the scaler is fitted on all rows before the train/test split
# that follows, so test-set statistics influence the scaling -- consider
# fitting on the training rows only.
scaler = StandardScaler()
numerical = scaler.fit_transform(numerical)

In [67]:
# Target is the churn column; features are the scaled numeric columns.
y = data['churn']
X = numerical

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [68]:
# Baseline logistic regression on the scaled numeric features.
# `multi_class='multinomial'` was dropped: the target is binary, and the
# parameter is deprecated in current scikit-learn, so the default handling is
# the appropriate (and forward-compatible) choice.
model = LogisticRegression(random_state=0, solver='lbfgs')
model.fit(X_train, y_train)

In [69]:
# Score the baseline model on the held-out rows.
predictions = model.predict(X_test)
# The original line called the string literal like a function (TypeError) and
# scored `classification`, which is only defined in a later cell. Format the
# accuracy of this cell's own predictions instead.
print("The accuracy of the model is: {:4.2f}".format(accuracy_score(y_test, predictions)))

The accuracy of the model is: 0.79


# Apply SMOTE for upsampling the data

In [78]:
# Rebuild X / y from the data frame for the resampling experiments.
# These are the unscaled numeric columns, not the standardized array used for
# the baseline model.
feature_columns = ['senior_citizen', 'tenure', 'monthly_charges', 'total_charges']
X = data.loc[:, feature_columns]
y = data.loc[:, 'churn']

In [82]:
# Oversample the minority class with SMOTE so both classes have equal counts.
# A fixed seed makes the synthetic samples reproducible between runs.
smote = SMOTE(random_state=42)

# `fit_resample` is the supported API; the old `fit_sample` alias has been
# removed from imbalanced-learn.
# NOTE(review): resampling happens before the train/test split, so the test
# set built from X_sm / y_sm also contains synthetic rows.
X_sm, y_sm = smote.fit_resample(X, y)

# Normalize the labels to a plain, unnamed Series with a fresh index. Going
# through np.asarray works whether the sampler returns a Series or an ndarray.
y_sm = pd.Series(np.asarray(y_sm).ravel())
y_sm.value_counts()

1    5174
0    5174
dtype: int64

In [90]:
# Train / evaluate logistic regression on the SMOTE-balanced data.
# NOTE(review): the split is taken from the already resampled data, so the
# reported scores are measured on a test set that includes synthetic rows.
X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.2, random_state=100)

# The deprecated `multi_class='ovr'` argument is omitted; for a binary target
# the default fits the same plain binary logistic regression.
classification = LogisticRegression(random_state=0, max_iter=10000)
classification.fit(X_train, y_train.values.ravel())

# Predict once and reuse the predictions for both metrics.
y_pred = classification.predict(X_test)
print("The accuracy: ", (accuracy_score(y_test, y_pred)))
print("The kappa: ", (cohen_kappa_score(y_pred, y_test)) )

The accuracy:{:4.2f}  0.7256038647342995
The kappa: {:4.2f} 0.4510073456763607


In [89]:
# Decision tree on the current train/test split. Note that this rebinds
# `model`, replacing the logistic-regression baseline fitted earlier.
# A fixed seed keeps the fitted tree reproducible between runs.
model = DecisionTreeClassifier(random_state=0)
model.fit(X_train, y_train)
# The original format string had no placeholder, so the score was silently
# dropped from the printed message.
print("The accuracy of the model is: {:4.2f}".format(model.score(X_test, y_test)))

The accuracy of the model is: 0.75


# Apply TomekLinks for downsampling

In [85]:
# Undersample with Tomek links, removing only majority-class members of each
# link. The strategy is passed by keyword because current imbalanced-learn
# makes the constructor arguments keyword-only, and `fit_resample` replaces
# the removed `fit_sample` alias.
tl = TomekLinks(sampling_strategy='majority')
X_tl, y_tl = tl.fit_resample(X, y)

# Print the class counts after undersampling as rows of [label, count].
unique, counts = np.unique(y_tl, return_counts=True)
print(np.asarray((unique, counts)).T)

[[   0 4620]
 [   1 1869]]




In [94]:
# Train / evaluate logistic regression on the Tomek-links-cleaned data.
# NOTE(review): the split is taken after undersampling, so the test set has
# also had majority-class rows removed.
X_train, X_test, y_train, y_test = train_test_split(X_tl, y_tl, test_size=0.2, random_state=100)

# The deprecated `multi_class='ovr'` argument is omitted; for a binary target
# the default fits the same plain binary logistic regression.
classification = LogisticRegression(random_state=0, max_iter=10000)
classification.fit(X_train, y_train)

# Predict once and reuse the predictions for both metrics.
y_pred = classification.predict(X_test)
print("The accuracy : ", (accuracy_score(y_test, y_pred)))
print("The kappa:", (cohen_kappa_score(y_pred, y_test)) )

The accuracy :  0.802773497688752
The kappa: 0.516687466001088
