In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from scipy.stats import chi2_contingency
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report
import warnings
warnings.filterwarnings("ignore")

First, we read the data

In [2]:
df = pd.read_csv('Customer-Churn.csv')
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


Secondly, we clean up the column names, replacing the spaces with underscores and breaking out the two word columns.

In [3]:
import re

df.columns = df.columns.map(lambda x: re.sub(r'(?<!^)(?=[A-Z])', '_', x))
df.columns = df.columns.str.lower()
df.rename(columns={'streaming_t_v': 'streaming_tv'}, inplace=True)

In [4]:
df['total_charges'] = df['total_charges'].replace(' ', np.nan)
df['total_charges'] = pd.to_numeric(df['total_charges'], errors='coerce')

We fill the NaNs in total_charges with the median

In [5]:
median = df['total_charges'].median()
df['total_charges'].fillna(median, inplace=True)

We dummify the categorical variables

In [6]:
df = pd.get_dummies(df, columns=["gender","partner", "dependents", "phone_service", "online_security",
                                "online_backup", "device_protection", "tech_support",
                                "streaming_tv", "streaming_movies", "contract"])
df

Unnamed: 0,senior_citizen,tenure,monthly_charges,total_charges,churn,gender_Female,gender_Male,partner_No,partner_Yes,dependents_No,...,tech_support_Yes,streaming_tv_No,streaming_tv_No internet service,streaming_tv_Yes,streaming_movies_No,streaming_movies_No internet service,streaming_movies_Yes,contract_Month-to-month,contract_One year,contract_Two year
0,0,1,29.85,29.85,No,1,0,0,1,1,...,0,1,0,0,1,0,0,1,0,0
1,0,34,56.95,1889.50,No,0,1,1,0,1,...,0,1,0,0,1,0,0,0,1,0
2,0,2,53.85,108.15,Yes,0,1,1,0,1,...,0,1,0,0,1,0,0,1,0,0
3,0,45,42.30,1840.75,No,0,1,1,0,1,...,1,1,0,0,1,0,0,0,1,0
4,0,2,70.70,151.65,Yes,1,0,1,0,1,...,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,24,84.80,1990.50,No,0,1,0,1,0,...,1,0,0,1,0,0,1,0,1,0
7039,0,72,103.20,7362.90,No,1,0,0,1,0,...,0,0,0,1,0,0,1,0,1,0
7040,0,11,29.60,346.45,No,1,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
7041,1,4,74.40,306.60,Yes,0,1,0,1,1,...,0,1,0,0,1,0,0,1,0,0


We replace the target variable with 1s and 0s

In [7]:
df["churn"] = df["churn"].replace({'Yes': 1, 'No': 0})

We can observe that the target variable is imbalanced

In [8]:
df["churn"].value_counts()

0    5174
1    1869
Name: churn, dtype: int64

We split the X and y and complete the train test split

In [9]:
X = df.drop("churn", axis=1)
y = df[["churn"]]

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

We apply SMOTE to deal with the imbalance

In [11]:
smote = SMOTE()
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

We train the first model (logistic regression)

In [12]:
lr = LogisticRegression()
lr.fit(X_train_smote, y_train_smote)

We evaluate the logistic regression model. The accuracy is acceptable but it's still not great at predicting churn

In [13]:
y_pred_lr_smote = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr_smote)
lr_precision = precision_score(y_test, y_pred_lr_smote)
lr_recall = recall_score(y_test, y_pred_lr_smote)
lr_f1 = f1_score(y_test, y_pred_lr_smote)

print(f"Model Accuracy: {lr_accuracy}")
print(f"Precision: {lr_precision}")
print(f"Recall: {lr_recall}")
print(f"F1 Score: {lr_f1}")

Model Accuracy: 0.7970191625266146
Precision: 0.6018735362997658
Recall: 0.6890080428954424
F1 Score: 0.6425000000000001


We train a Decision Tree model

In [14]:
dt = DecisionTreeClassifier(criterion="entropy", max_depth=None, random_state=42)

In [15]:
dt.fit(X_train_smote, y_train_smote)

We evaluate and see it's worse than logistic regression

In [16]:
y_pred_dt_smote = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt_smote)
dt_precision = precision_score(y_test, y_pred_dt_smote)
dt_recall = recall_score(y_test, y_pred_dt_smote)
dt_f1 = f1_score(y_test, y_pred_dt_smote)

print(f"Model Accuracy: {dt_accuracy}")
print(f"Precision: {dt_precision}")
print(f"Recall: {dt_recall}")
print(f"F1 Score: {dt_f1}")

Model Accuracy: 0.7374024130589071
Precision: 0.5038759689922481
Recall: 0.5227882037533512
F1 Score: 0.5131578947368421


We try with Tomek Link, starting with Logistic Regression

In [17]:
tomek = TomekLinks()

X_train_tomek, y_train_tomek = tomek.fit_resample(X_train, y_train)

In [18]:
lr.fit(X_train_tomek, y_train_tomek)

We can see it's better than SMOTE, particularly at predicting churn

In [19]:
y_pred_lr_tomek = lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr_tomek)
lr_precision = precision_score(y_test, y_pred_lr_tomek)
lr_recall = recall_score(y_test, y_pred_lr_tomek)
lr_f1 = f1_score(y_test, y_pred_lr_tomek)

print(f"Model Accuracy: {lr_accuracy}")
print(f"Precision: {lr_precision}")
print(f"Recall: {lr_recall}")
print(f"F1 Score: {lr_f1}")

Model Accuracy: 0.8069552874378992
Precision: 0.6259351620947631
Recall: 0.6729222520107239
F1 Score: 0.648578811369509


We try Tomek Links with Decision Trees, and we see it's still worse than logistic regression

In [20]:
dt.fit(X_train_tomek, y_train_tomek)

In [21]:
y_pred_dt_tomek = dt.predict(X_test)
dt_accuracy = accuracy_score(y_test, y_pred_dt_tomek)
dt_precision = precision_score(y_test, y_pred_dt_tomek)
dt_recall = recall_score(y_test, y_pred_dt_tomek)
dt_f1 = f1_score(y_test, y_pred_dt_tomek)

print(f"Model Accuracy: {dt_accuracy}")
print(f"Precision: {dt_precision}")
print(f"Recall: {dt_recall}")
print(f"F1 Score: {dt_f1}")

Model Accuracy: 0.7338537970191625
Precision: 0.49764150943396224
Recall: 0.5656836461126006
F1 Score: 0.5294855708908406


### Lab | Random Forests

In [24]:
rf = DecisionTreeClassifier()
rf.fit(X_train_smote, y_train_smote)
y_pred_rf = rf.predict(X_test)

In [25]:
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_precision = precision_score(y_test, y_pred_rf)
rf_recall = recall_score(y_test, y_pred_rf)
rf_f1 = f1_score(y_test, y_pred_rf)

print(f"Model Accuracy: {rf_accuracy}")
print(f"Precision: {rf_precision}")
print(f"Recall: {rf_recall}")
print(f"F1 Score: {rf_f1}")

Model Accuracy: 0.7352732434350603
Precision: 0.5
Recall: 0.5388739946380697
F1 Score: 0.5187096774193548


We can see that Random Forests with SMOTE does not perform better than Logistic Regression