In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import cross_validate
from imblearn.under_sampling import TomekLinks

In [2]:
df_data = pd.read_csv(r"C:\Users\joaoa\Desktop\Ironhack\Labs\lab-cross-validation\files_for_lab\Customer-Churn.csv")
df_data

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [3]:
# Checking null values
df_data.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [4]:
# Checking " " values in all columns
for col in df_data.columns:
    if ' ' in df_data[col].values:
        print("The variable '" + col + "' have " + str(df_data[col].value_counts()[" "]) + " empty values (' ')")
        
# Since I don't know if these empty cells are zeros or nulls, I decided to remove the rows that contain them
df_data.drop(df_data[df_data["TotalCharges"] == " "].index, inplace = True)
df_data.reset_index(drop = True, inplace = True)
df_data

The variable 'TotalCharges' have 11 empty values (' ')


Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7028,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7029,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7030,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [5]:
# Checking data types
print(df_data.dtypes)

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object


In [6]:
# Changing some data types ("TotalCharges" should be float)
df_data["TotalCharges"] = df_data["TotalCharges"].astype(float)
print(df_data.dtypes)

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object


In [7]:
# Creating a new dataframe with categorical values
df_categ = df_data.select_dtypes(include = object)

# One hot enconding
dummies = pd.get_dummies(df_categ, drop_first = True)
dummies

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,Churn_Yes
0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
1,1,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0
2,1,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1
3,1,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,1,0,0
4,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,1,1,1,1,0,1,0,0,0,1,0,1,0,1,0,1,1,0,0
7028,0,1,1,1,0,0,0,1,0,1,0,0,0,1,0,1,1,0,0
7029,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
7030,1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [8]:
# Creating a new dataframe with numerical values
df_num = df_data.select_dtypes(include = np.number)

# Standardizing numerical variables
scaler = StandardScaler()
stand_data = scaler.fit_transform(df_num)
stand_data = pd.DataFrame(stand_data, columns = df_num.columns)
stand_data

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges
0,-0.440327,-1.280248,-1.161694,-0.994194
1,-0.440327,0.064303,-0.260878,-0.173740
2,-0.440327,-1.239504,-0.363923,-0.959649
3,-0.440327,0.512486,-0.747850,-0.195248
4,-0.440327,-1.239504,0.196178,-0.940457
...,...,...,...,...
7027,-0.440327,-0.343137,0.664868,-0.129180
7028,-0.440327,1.612573,1.276493,2.241056
7029,-0.440327,-0.872808,-1.170004,-0.854514
7030,2.271039,-1.158016,0.319168,-0.872095


In [9]:
# Gathering all variables together again
df_data = pd.concat([dummies.drop("Churn_Yes", axis = 1), stand_data, df_data["Churn"]], axis = 1)
df_data

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,0,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,-0.440327,-1.280248,-1.161694,-0.994194,No
1,1,0,0,1,0,1,0,0,0,1,...,0,0,0,1,0,-0.440327,0.064303,-0.260878,-0.173740,No
2,1,0,0,1,0,1,0,1,0,0,...,0,0,0,0,0,-0.440327,-1.239504,-0.363923,-0.959649,Yes
3,1,0,0,0,0,1,0,0,0,1,...,0,0,0,1,0,-0.440327,0.512486,-0.747850,-0.195248,No
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,-0.440327,-1.239504,0.196178,-0.940457,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7027,1,1,1,1,0,1,0,0,0,1,...,1,0,1,1,0,-0.440327,-0.343137,0.664868,-0.129180,No
7028,0,1,1,1,0,0,0,1,0,1,...,1,0,1,1,0,-0.440327,1.612573,1.276493,2.241056,No
7029,0,1,1,0,0,1,0,0,0,0,...,0,0,0,0,0,-0.440327,-0.872808,-1.170004,-0.854514,No
7030,1,1,0,1,0,0,0,0,0,0,...,0,0,0,0,0,2.271039,-1.158016,0.319168,-0.872095,Yes


In [10]:
# Defining X and y variables
X = df_data.drop("Churn",axis = 1)
y = df_data["Churn"]

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42, test_size = 0.2)

# Train dataset
df_train = pd.concat([X_train, y_train], axis = 1)
df_train

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
6021,0,0,0,0,0,0,0,0,0,1,...,1,0,1,0,0,-0.440327,0.430998,-0.523477,-0.091370,Yes
3404,1,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,-0.440327,-1.198760,-0.378881,-0.924111,Yes
5474,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,-0.440327,0.919926,0.430523,0.959516,Yes
5515,1,1,1,1,0,1,0,0,0,0,...,0,0,0,0,1,-0.440327,0.512486,-0.337331,0.119297,No
6328,0,1,1,1,0,1,0,0,0,0,...,1,0,1,1,0,-0.440327,0.919926,1.200040,1.396490,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3772,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,-0.440327,-1.198760,-0.478603,-0.946811,No
5191,0,0,0,1,0,0,0,1,0,1,...,1,0,1,1,0,-0.440327,0.756950,1.552388,1.508839,No
5226,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,-0.440327,-0.954296,0.543541,-0.661274,No
5390,0,0,0,1,1,0,1,0,1,0,...,0,1,0,1,0,-0.440327,0.716206,-1.497422,-0.571007,No


In [11]:
# Computing the frequency of the unique values from the the target variable before oversampling or undersampling the train dataset
df_train["Churn"].value_counts()

No     4130
Yes    1495
Name: Churn, dtype: int64

# Oversampling

In [12]:
# Oversampling with SMOTE method
smote = SMOTE()
X = df_train.drop("Churn", axis = 1)
y = df_train["Churn"]
x_sm, y_sm = smote.fit_resample(X, y)

# Creating a balanced dataframe
data_smote = pd.concat([x_sm, y_sm], axis = 1)
data_smote

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,0,0,0,0,0,0,0,0,0,1,...,1,0,1,0,0,-0.440327,0.430998,-0.523477,-0.091370,Yes
1,1,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,-0.440327,-1.198760,-0.378881,-0.924111,Yes
2,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,-0.440327,0.919926,0.430523,0.959516,Yes
3,1,1,1,1,0,1,0,0,0,0,...,0,0,0,0,1,-0.440327,0.512486,-0.337331,0.119297,No
4,0,1,1,1,0,1,0,0,0,0,...,1,0,1,1,0,-0.440327,0.919926,1.200040,1.396490,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8255,1,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,-0.440327,-1.133479,0.530547,-0.837137,Yes
8256,0,0,0,1,1,0,1,0,1,0,...,0,1,0,0,0,-0.440327,-1.280248,-1.476260,-0.998369,Yes
8257,0,1,0,1,0,0,0,0,0,1,...,1,0,1,0,0,2.271039,-0.952340,1.270682,-0.600392,Yes
8258,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,-0.440327,-1.254147,-0.217063,-0.969772,Yes


In [13]:
# Computing the frequency of the unique values from the the target variable after oversampling the train dataset
data_smote["Churn"].value_counts()

Yes    4130
No     4130
Name: Churn, dtype: int64

In [14]:
# Building a logistic model with data_smote
X_train = data_smote.drop("Churn", axis = 1)
y_train = data_smote["Churn"]
LR_1 = LogisticRegression()
LR_1.fit(X_train, y_train)
pred = LR_1.predict(X_test)
print("Accuracy: ", LR_1.score(X_test, y_test))

Accuracy:  0.7377398720682303


In [15]:
# Building a decision tree model with data_smote
X_train = data_smote.drop("Churn", axis = 1)
y_train = data_smote["Churn"]
DT_1 = DecisionTreeClassifier(max_depth = 7)
DT_1.fit(X_train, y_train)
print("Accuracy: ", DT_1.score(X_test, y_test))

Accuracy:  0.7213930348258707


In [16]:
# K fold cross validation
results_LR_1 = cross_validate(LR_1, df_data.drop("Churn", axis = 1), df_data["Churn"], cv = 10)
results_DT_1 = cross_validate(DT_1, df_data.drop("Churn", axis = 1), df_data["Churn"], cv = 10)
print("Accuracy mean of logistic model: " + str(results_LR_1["test_score"].mean()))
print("Accuracy mean of decision tree model: " + str(results_DT_1["test_score"].mean()))

# In average the logistic model performed better

Accuracy mean of logistic model: 0.799630237294711
Accuracy mean of decision tree model: 0.7828512947756369


# Undersampling 

In [17]:
# Undersampling with SMOTE method
tl = TomekLinks()
X = df_train.drop("Churn", axis = 1)
y = df_train["Churn"]
x_tl, y_tl = tl.fit_resample(X,y)

# Creating a more balanced dataframe
data_tomeklinks = pd.concat([x_tl, y_tl], axis = 1)
data_tomeklinks

Unnamed: 0,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No internet service,DeviceProtection_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,Churn
0,0,0,0,0,0,0,0,0,0,1,...,1,0,1,0,0,-0.440327,0.430998,-0.523477,-0.091370,Yes
1,1,0,0,1,0,0,0,0,0,0,...,1,0,0,0,0,-0.440327,-1.198760,-0.378881,-0.924111,Yes
2,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,-0.440327,0.919926,0.430523,0.959516,Yes
3,1,1,1,1,0,1,0,0,0,0,...,0,0,0,0,1,-0.440327,0.512486,-0.337331,0.119297,No
4,0,1,1,1,0,1,0,0,0,0,...,1,0,1,1,0,-0.440327,0.919926,1.200040,1.396490,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5204,1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,-0.440327,-1.198760,-0.478603,-0.946811,No
5205,0,0,0,1,0,0,0,1,0,1,...,1,0,1,1,0,-0.440327,0.756950,1.552388,1.508839,No
5206,1,0,0,1,0,1,0,0,0,0,...,0,0,0,0,0,-0.440327,-0.954296,0.543541,-0.661274,No
5207,0,0,0,1,1,0,1,0,1,0,...,0,1,0,1,0,-0.440327,0.716206,-1.497422,-0.571007,No


In [18]:
# Computing the frequency of the unique values from the the target variable after undersampling the train dataset
data_tomeklinks["Churn"].value_counts()

No     3714
Yes    1495
Name: Churn, dtype: int64

In [19]:
# Building a logistic model with data_smote
X_train = data_tomeklinks.drop("Churn", axis = 1)
y_train = data_tomeklinks["Churn"]
LR_2 = LogisticRegression()
LR_2.fit(X_train, y_train)
pred = LR_2.predict(X_test)
print("Accuracy: ", LR_2.score(X_test, y_test))

Accuracy:  0.7768301350390903


In [20]:
# Building a decision tree model with data_smote
X_train = data_tomeklinks.drop("Churn", axis = 1)
y_train = data_tomeklinks["Churn"]
DT_2 = DecisionTreeClassifier(max_depth = 7)
DT_2.fit(X_train, y_train)
print("Accuracy: ", DT_2.score(X_test, y_test))

Accuracy:  0.746268656716418


In [21]:
# K fold cross validation
results_LR_2 = cross_validate(LR_2, df_data.drop("Churn", axis = 1), df_data["Churn"], cv = 10)
results_DT_2 = cross_validate(DT_2, df_data.drop("Churn", axis = 1), df_data["Churn"], cv = 10)
print("Accuracy mean of logistic model: " + str(results_LR_2["test_score"].mean()))
print("Accuracy mean of decision tree model: " + str(results_DT_2["test_score"].mean()))

# In average the logistic model performed better

Accuracy mean of logistic model: 0.799630237294711
Accuracy mean of decision tree model: 0.7831353856847277
