In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import cross_validate
from imblearn.under_sampling import TomekLinks
from sklearn.metrics import classification_report

In [2]:
df_data = pd.read_csv(r"C:\Users\joaoa\Desktop\Ironhack\Labs\lab-cross-validation\files_for_lab\Customer-Churn.csv")
df_data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [3]:
# Checking null values
df_data.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [4]:
# Checking " " values in all columns
for col in df_data.columns:
    if ' ' in df_data[col].values:
        print("The variable '" + col + "' have " + str(df_data[col].value_counts()[" "]) + " empty values (' ')")
        
# Since I don't know if these empty cells are zeros or nulls, I decided to remove the rows that contain them
df_data.drop(df_data[df_data["TotalCharges"] == " "].index, inplace = True)
df_data.reset_index(drop = True, inplace = True)
df_data

The variable 'TotalCharges' have 11 empty values (' ')


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7028,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7029,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7030,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [5]:
# Checking data types
print(df_data.dtypes)

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object


In [6]:
# Changing some data types ("TotalCharges" should be float)
df_data["TotalCharges"] = df_data["TotalCharges"].astype(float)
print(df_data.dtypes)

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object


In [7]:
# Creating a new dataframe with categorical values
df_categ = df_data.select_dtypes(include = object)

# One hot enconding
dummies = pd.get_dummies(df_categ.drop("Churn", axis = 1), drop_first = True)
dummies

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year
0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0
2,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,1,1,1,1,0,1,0,0,0,1,0,1,0,1,0,1,1,0
7028,0,1,1,1,0,0,0,1,0,1,0,0,0,1,0,1,1,0
7029,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
7030,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [8]:
# Creating a new dataframe with numerical values
df_num = df_data.select_dtypes(include = np.number)
df_num

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,0,1,29.85,29.85
1,0,34,56.95,1889.50
2,0,2,53.85,108.15
3,0,45,42.30,1840.75
4,0,2,70.70,151.65
...,...,...,...,...
7027,0,24,84.80,1990.50
7028,0,72,103.20,7362.90
7029,0,11,29.60,346.45
7030,1,4,74.40,306.60


In [9]:
# Gathering all variables together again
df_data = pd.concat([dummies, df_num, df_data["Churn"]], axis = 1)
df_data

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,29.85,29.85,No
1,1,0,0,1,0,1,0,0,0,1,...,0,0,0,1,0,0,34,56.95,1889.50,No
2,1,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,0,2,53.85,108.15,Yes
3,1,0,0,0,0,1,0,0,0,1,...,0,0,0,1,0,0,45,42.30,1840.75,No
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,2,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,1,1,1,1,0,1,0,0,0,1,...,1,0,1,1,0,0,24,84.80,1990.50,No
7028,0,1,1,1,0,0,0,1,0,1,...,1,0,1,1,0,0,72,103.20,7362.90,No
7029,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,0,11,29.60,346.45,No
7030,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,1,4,74.40,306.60,Yes


In [10]:
# Defining X and y variables
X = df_data.drop("Churn",axis = 1)
y = df_data["Churn"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)

# Train dataframe
df_train = pd.concat([X_train, y_train], axis = 1)
df_train = df_train.reset_index(drop = True)
df_train

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,0,0,0,0,0,0,0,0,0,1,...,1,0,1,0,0,0,43,49.05,2076.20,Yes
1,1,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,3,53.40,188.70,Yes
2,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,55,77.75,4458.15,Yes
3,1,1,1,1,0,1,0,0,0,0,...,0,0,0,0,1,0,45,54.65,2553.70,No
4,0,1,1,1,0,1,0,0,0,0,...,1,0,1,1,0,0,55,100.90,5448.60,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5620,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,3,50.40,137.25,No
5621,0,0,0,1,0,0,0,1,0,1,...,1,0,1,1,0,0,51,111.50,5703.25,No
5622,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,9,81.15,784.45,No
5623,0,0,0,1,1,0,1,0,1,0,...,0,1,0,1,0,0,50,19.75,989.05,No


In [11]:
# Test dataframe
df_test = pd.concat([X_test, y_test], axis = 1)
df_test = df_test.reset_index(drop = True)
df_test

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,1,1,0,1,1,0,1,0,1,0,...,0,1,0,0,1,1,61,25.00,1501.75,No
1,0,0,0,1,1,0,1,0,1,0,...,0,1,0,0,0,0,19,24.70,465.85,No
2,1,1,0,1,0,0,0,0,0,1,...,1,0,1,0,0,0,13,102.25,1359.00,Yes
3,1,1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,37,55.05,2030.75,No
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,6,29.45,161.45,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1402,1,0,0,1,1,0,1,0,1,0,...,0,1,0,0,0,0,1,19.05,19.05,No
1403,1,0,0,1,0,0,0,0,0,0,...,1,0,1,0,0,0,12,94.55,1173.55,No
1404,0,0,0,1,0,0,0,1,0,1,...,0,0,0,1,0,0,26,56.05,1553.20,No
1405,0,0,0,1,0,1,0,1,0,1,...,1,0,1,1,0,1,35,110.80,3836.30,No


In [12]:
# Normalizing X_train variables
scaler = MinMaxScaler() # This is the normalization process
X_train_norm = scaler.fit_transform(df_train.drop("Churn", axis = 1))
X_train_norm = pd.DataFrame(X_train_norm, columns = df_train.drop("Churn", axis = 1).columns)
X_train_norm

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.591549,0.306468,0.237406
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.028169,0.349751,0.019600
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.760563,0.592040,0.512269
3,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.619718,0.362189,0.292507
4,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.760563,0.822388,0.626561
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5620,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.028169,0.319900,0.013663
5621,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.704225,0.927861,0.655947
5622,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.112676,0.625871,0.088346
5623,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.690141,0.014925,0.111955


In [13]:
# Normalizing X_test variables
scaler = MinMaxScaler() # This is the normalization process
X_test_norm = scaler.fit_transform(df_test.drop("Churn", axis = 1))
X_test_norm = pd.DataFrame(X_test_norm, columns = df_test.drop("Churn", axis = 1).columns)
X_test_norm

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,...,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.845070,0.063063,0.171414
1,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.253521,0.060060,0.051674
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.169014,0.836336,0.154913
3,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.507042,0.363864,0.232560
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.070423,0.107608,0.016489
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1402,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.003504,0.000029
1403,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.154930,0.759259,0.133477
1404,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.352113,0.373874,0.177361
1405,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.478873,0.921922,0.441263


In [14]:
# Gathering all Train variables together again
df_train = pd.concat([X_train_norm, df_train["Churn"]], axis = 1)
df_train

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.591549,0.306468,0.237406,Yes
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.028169,0.349751,0.019600,Yes
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.760563,0.592040,0.512269,Yes
3,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.619718,0.362189,0.292507,No
4,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.760563,0.822388,0.626561,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5620,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.028169,0.319900,0.013663,No
5621,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.704225,0.927861,0.655947,No
5622,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.112676,0.625871,0.088346,No
5623,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.690141,0.014925,0.111955,No


In [15]:
# Gathering all Test variables together again
df_test = pd.concat([X_test_norm, df_test["Churn"]], axis = 1)
df_test

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,1.0,1.0,0.845070,0.063063,0.171414,No
1,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.253521,0.060060,0.051674,No
2,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.169014,0.836336,0.154913,Yes
3,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.507042,0.363864,0.232560,No
4,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.070423,0.107608,0.016489,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1402,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.003504,0.000029,No
1403,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.154930,0.759259,0.133477,No
1404,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.352113,0.373874,0.177361,No
1405,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.478873,0.921922,0.441263,No


In [16]:
# Gathering Train and Test variables together for cross validation
df_data = pd.concat([df_train, df_test], axis = 0)
df_data = df_data.reset_index(drop = True)
df_data

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.591549,0.306468,0.237406,Yes
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.028169,0.349751,0.019600,Yes
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.760563,0.592040,0.512269,Yes
3,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.619718,0.362189,0.292507,No
4,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.760563,0.822388,0.626561,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.000000,0.003504,0.000029,No
7028,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.154930,0.759259,0.133477,No
7029,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.352113,0.373874,0.177361,No
7030,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,1.0,0.478873,0.921922,0.441263,No


In [17]:
# Computing the frequency of the unique values of the target variable before oversampling or undersampling the train dataset
df_train["Churn"].value_counts()

No     4130
Yes    1495
Name: Churn, dtype: int64

# Oversampling

In [18]:
# Oversampling df_train with SMOTE method
smote = SMOTE()
x_sm, y_sm = smote.fit_resample(df_train.drop("Churn", axis = 1), df_train["Churn"])

# Creating a balanced dataframe
data_smote = pd.concat([x_sm, y_sm], axis = 1)
data_smote

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,1.000000,...,1.0,0.0,1.0,0.0,0.0,0.0,0.591549,0.306468,0.237406,Yes
1,1.0,0.0,0.0,1.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.028169,0.349751,0.019600,Yes
2,0.0,1.0,0.0,1.0,0.0,0.000000,0.0,0.0,0.0,1.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.760563,0.592040,0.512269,Yes
3,1.0,1.0,1.0,1.0,0.0,1.000000,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,1.0,0.0,0.619718,0.362189,0.292507,No
4,0.0,1.0,1.0,1.0,0.0,1.000000,0.0,0.0,0.0,0.000000,...,1.0,0.0,1.0,1.0,0.0,0.0,0.760563,0.822388,0.626561,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8255,1.0,1.0,0.0,1.0,0.0,0.000000,0.0,0.0,0.0,1.000000,...,1.0,0.0,1.0,0.0,0.0,1.0,0.662952,0.814516,0.570716,Yes
8256,0.0,0.0,0.0,1.0,0.0,1.000000,0.0,1.0,0.0,0.640531,...,0.0,0.0,1.0,0.0,0.0,0.0,0.343737,0.597553,0.217691,Yes
8257,1.0,0.0,0.0,1.0,0.0,0.321569,0.0,1.0,0.0,0.000000,...,0.0,0.0,1.0,0.0,0.0,0.0,0.251588,0.687966,0.195553,Yes
8258,0.0,1.0,0.0,1.0,0.0,0.000000,0.0,0.0,0.0,0.000000,...,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.524554,0.006014,Yes


In [19]:
# Computing the frequency of the unique values of the target variable after oversampling the train dataframe
data_smote["Churn"].value_counts()

Yes    4130
No     4130
Name: Churn, dtype: int64

In [20]:
# Building a logistic model with data_smote
X_train_smote = data_smote.drop("Churn", axis = 1)
y_train_smote = data_smote["Churn"]

LR_1 = LogisticRegression()
LR_1.fit(X_train_smote, y_train_smote)
pred_1 = LR_1.predict(df_test.drop("Churn", axis = 1))
print(classification_report(df_test["Churn"], pred_1))

              precision    recall  f1-score   support

          No       0.90      0.72      0.80      1033
         Yes       0.51      0.79      0.62       374

    accuracy                           0.74      1407
   macro avg       0.71      0.76      0.71      1407
weighted avg       0.80      0.74      0.75      1407



In [21]:
# Building a decision tree model with data_smote
DT_1 = DecisionTreeClassifier(max_depth = 7)
DT_1.fit(X_train_smote, y_train_smote)
print("Accuracy: ", DT_1.score(df_test.drop("Churn", axis = 1), df_test["Churn"]))

Accuracy:  0.7142857142857143


In [22]:
# K fold cross validation
results_LR_1 = cross_validate(LR_1, df_data.drop("Churn", axis = 1), df_data["Churn"], cv = 10)
results_DT_1 = cross_validate(DT_1, df_data.drop("Churn", axis = 1), df_data["Churn"], cv = 10)
print("Accuracy mean of logistic model: " + str(results_LR_1["test_score"].mean()))
print("Accuracy mean of decision tree model: " + str(results_DT_1["test_score"].mean()))

# In average the logistic model performed better

Accuracy mean of logistic model: 0.7987753378378379
Accuracy mean of decision tree model: 0.7862583651234967


# Undersampling 

In [23]:
# Undersampling with TomekLinks method
tl = TomekLinks()
X = df_train.drop("Churn", axis = 1)
y = df_train["Churn"]
x_tl, y_tl = tl.fit_resample(X,y)

# Creating a more balanced dataframe
data_tomeklinks = pd.concat([x_tl, y_tl], axis = 1)
data_tomeklinks

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.591549,0.306468,0.237406,Yes
1,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.028169,0.349751,0.019600,Yes
2,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.760563,0.592040,0.512269,Yes
3,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.619718,0.362189,0.292507,No
4,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.760563,0.822388,0.626561,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5179,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.028169,0.319900,0.013663,No
5180,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,1.0,0.0,1.0,1.0,0.0,0.0,0.704225,0.927861,0.655947,No
5181,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.112676,0.625871,0.088346,No
5182,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.690141,0.014925,0.111955,No


In [24]:
# Computing the frequency of the unique values of the target variable after undersampling the train dataset
data_tomeklinks["Churn"].value_counts()

No     3689
Yes    1495
Name: Churn, dtype: int64

In [25]:
# Building a logistic model with data_tomeklinks
X_train_TomekLinks = data_tomeklinks.drop("Churn", axis = 1)
y_train_TomekLinks = data_tomeklinks["Churn"]

LR_2 = LogisticRegression()
LR_2.fit(X_train_TomekLinks, y_train_TomekLinks)
pred_2 = LR_2.predict(df_test.drop("Churn", axis = 1))
print(classification_report(df_test["Churn"], pred_2))

              precision    recall  f1-score   support

          No       0.85      0.85      0.85      1033
         Yes       0.58      0.59      0.59       374

    accuracy                           0.78      1407
   macro avg       0.72      0.72      0.72      1407
weighted avg       0.78      0.78      0.78      1407



In [26]:
# Building a decision tree model with data_tomeklinks
DT_2 = DecisionTreeClassifier(max_depth = 7)
DT_2.fit(X_train_TomekLinks, y_train_TomekLinks)
print("Accuracy: ", DT_2.score(df_test.drop("Churn", axis = 1), df_test["Churn"]))

Accuracy:  0.7491115849324804


In [27]:
# K fold cross validation
results_LR_2 = cross_validate(LR_2, df_data.drop("Churn", axis = 1), df_data["Churn"], cv = 10)
results_DT_2 = cross_validate(DT_2, df_data.drop("Churn", axis = 1), df_data["Churn"], cv = 10)
print("Accuracy mean of logistic model: " + str(results_LR_2["test_score"].mean()))
print("Accuracy mean of decision tree model: " + str(results_DT_2["test_score"].mean()))

# In average the logistic model performed better

Accuracy mean of logistic model: 0.7987753378378379
Accuracy mean of decision tree model: 0.7866849055993794
