In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
data = pd.read_csv('/Users/giulianamiranda/Documents/Labs/lab-cross-validation/files_for_lab/Customer-Churn.csv')
data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [21]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
# Because it has 11 null values, I'm filling them. They are not actually empty, but spaces
mean_total_charges = data['TotalCharges'].astype(float).mean()
data['TotalCharges'].replace(' ', np.nan)
data['TotalCharges'].fillna(mean_total_charges, inplace=True)


In [22]:
# Dividing the numerical and categorical values
numerical = data.select_dtypes(include='number')
categorical = data.select_dtypes(include='object')

In [32]:
from sklearn.model_selection import train_test_split

X = numerical
y = data['Churn']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Applying SMOTE for upsampling the data

In [33]:
from imblearn.over_sampling import SMOTE

# Applying SMOTE to the training data only

smote = SMOTE(random_state=42)
X_train_sm, y_train_sm = smote.fit_resample(X_train, y_train)

In [34]:
y_train_sm.value_counts()

No     4138
Yes    4138
Name: Churn, dtype: int64

In [35]:
# Use logistic regression to fit the model and compute the accuracy of the model

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_sm)
X_test_scaled = scaler.transform(X_test)

lr = LogisticRegression(random_state=42)
lr.fit(X_train_scaled, y_train_sm)

# Predict on the test set
y_pred = lr.predict(X_test_scaled)

# Compute the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100}%')


Accuracy: 74.02413058907025%


In [47]:
# Use decision tree classifier to fit the model and compute the accuracy of the model.

from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier(max_depth = 7)

# fit the model
model.fit(X_train_scaled, y_train_sm) #com o smote

# evaluating the predictions, getting the accuracy


print(f'Accuracy:', model.score(X_test_scaled, y_test) * 100, '%')



Accuracy: 72.10787792760823 %


In [36]:
# Compare the accuracies of the two models

# The accuracy of the LR model is a little higher than the decision tree


In [43]:
# Apply TomekLinks for downsampling

# It is important to remember that it does not make the two classes equal but only removes the points from the majority class that 
# are close to other points in minority class.
from imblearn.under_sampling import TomekLinks

tl = TomekLinks()

X_tl, y_tl = tl.fit_resample(X_train,y_train)
print(y_tl.value_counts())



No     3675
Yes    1496
Name: Churn, dtype: int64


In [49]:
#  Use logistic regression to fit the model and compute the accuracy of the model

X_train_scaled_2 = scaler.fit_transform(X_tl)
X_test_scaled_2 = scaler.transform(X_test)

lr = LogisticRegression(random_state=42)
lr.fit(X_train_scaled_2, y_tl)

# Predict on the test set
y_pred_2 = lr.predict(X_test_scaled_2)

# Compute the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy * 100}%')




Accuracy: 79.20511000709723%


In [50]:
# Use decision tree classifier to fit the model and compute the accuracy of the model

model_2 = DecisionTreeClassifier(max_depth = 7)

# fit the model
model_2.fit(X_train_scaled_2, y_tl) #com o TL

# evaluating the predictions, getting the accuracy

print(f'Accuracy:', model.score(X_test_scaled_2, y_test) * 100, '%')



Accuracy: 63.94606103619588 %


In [None]:
# Compare the accuracies of the two models.

# Once more, the accuracy of the LR model is higher than whit the decision tree



In [None]:
# You can also apply this algorithm one more time and check the how the imbalance in the two classes 
# changed from the last time.

# The accuracy of the LR models were higher both times, but worked better with the under sampling technique
# With the decision tree, it was better with the oversamsling
