In [19]:
#Import the required libraries and modules that you would need.
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import Normalizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, plot_confusion_matrix

In [3]:
churnData = pd.read_csv('churnData.csv')

In [17]:
churnData.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [5]:
x = churnData[['tenure','SeniorCitizen','MonthlyCharges','TotalCharges']]
# Normalizing data
transformer = Normalizer().fit(x)
x_normalized = transformer.transform(x)
x = pd.DataFrame(x_normalized)
y = churnData['Churn']

In [11]:
#Split the data into a training set and a test set.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=100)

Apply SMOTE for upsampling the data

- Use logistic regression to fit the model and compute the accuracy of the model.
- Use decision tree classifier to fit the model and compute the accuracy of the model.
- Compare the accuracies of the two models.

In [13]:
##Over Sampling
#over sampling using SMOTE
sm = SMOTE(random_state = 42)
X_res, Y_res = sm.fit_resample(X_train,y_train)
Y_res.value_counts()

No     4150
Yes    4150
Name: Churn, dtype: int64

In [20]:
#Use logistic regression to fit the model and compute the accuracy of the model.
#Use decision tree classifier to fit the model and compute the accuracy of the model.
model1 = DecisionTreeClassifier()
model2 = LogisticRegression()


In [24]:

model_pipeline = [model1, model2]
model_names = ['Decision Tree', 'Logistic Regression']
scores = {}
i=0
for model in model_pipeline:
    mean_score = np.mean(cross_val_score(model, X_res, Y_res, cv=25))
    scores[model_names[i]] = mean_score
    i = i+1
print(scores)

# We can use the result to choose the best performing model

{'Decision Tree': 0.782409638554217, 'Logistic Regression': 0.6743373493975904}


Apply TomekLinks for downsampling

- It is important to remember that it does not make the two classes equal but only removes the points from the majority class that are close to other points in minority class.
- Use logistic regression to fit the model and compute the accuracy of the model.
- Use decision tree classifier to fit the model and compute the accuracy of the model.
- Compare the accuracies of the two models.
- You can also apply this algorithm one more time and check the how the imbalance in the two classes changed from the last time.

In [22]:
#under sampling using TomekLinks
t1 = TomekLinks()
X_under,Y_under = t1.fit_resample(X_train,y_train)
Y_under.value_counts()

No     3739
Yes    1475
Name: Churn, dtype: int64

In [25]:

model_pipeline = [model1, model2]
model_names = ['Decision Tree', 'Logistic Regression']
scores = {}
i=0
for model in model_pipeline:
    mean_score = np.mean(cross_val_score(model,X_under,Y_under, cv=25))
    scores[model_names[i]] = mean_score
    i = i+1
print(scores)

# We can use the result to choose the best performing model

{'Decision Tree': 0.764308980493191, 'Logistic Regression': 0.7479775487670224}
