In [35]:
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_california_housing
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler
from imblearn.under_sampling import TomekLinks


In [2]:
data = pd.read_csv('Customer-Churn.csv')
data = pd.DataFrame(data) #will be easier to manipulate in the future

### Some Cleaning and Organizing

In [3]:
data.columns #check format names

Index(['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure',
       'PhoneService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection',
       'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract',
       'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [None]:
data.dtypes #to use SMOTE we need to encode categoricals in the correct format

In [None]:
data.isna().sum() #no null values

In [None]:
#check duplicates
duplicates_count = data.duplicated().sum()
duplicates_count

In [4]:
data = data.drop_duplicates()

In [None]:
#identify how many empty values we have (not detected by the null values)
count = (data == ' ').any(axis=1).sum() + (data.eq('').all(axis=1) & (data != '')).sum() #checks every row in every column how many spaces there are
print(count) #takes a while (4min) but works

In [5]:
#although there where no null values, some values had an empty space; we have to consider empty spaces are null
'change them to be considered null and prepare to drop'
def replace_empty(data):
    for columns in data.columns:
        data[columns] = data[columns].replace(' ', None)

replace_empty(data)

In [None]:
data.isna().sum()

In [6]:
data['TotalCharges']= data['TotalCharges'].astype(float) #change data type from string to float

As we will upsample the data and we only have 11 null values, we will drop them from the dataset.

In [7]:
data = data.dropna(subset=['TotalCharges']) #the SeniorCitizen column seems like '0' and '1' are the equivalent of 'yes' and 'no', lets adapt the answers so we can encode correctly

In [8]:
data['SeniorCitizen']= data['SeniorCitizen'].astype(object)
data['SeniorCitizen'] = data['SeniorCitizen'].replace(0, 'No')
data['SeniorCitizen'] = data['SeniorCitizen'].replace(1, 'Yes')

In [None]:
data['SeniorCitizen'].unique()

In [9]:
num = data.select_dtypes(np.number)
cat = data.select_dtypes(np.object_)

#encode categoricals
cat_dum = pd.get_dummies(cat, drop_first=True)

In [10]:
#normalize and transform the data
transformer = MinMaxScaler().fit(num)
num_norm = transformer.transform(num)
num_norm = pd.DataFrame(num_norm, columns=num.columns)
num_norm.head()


Unnamed: 0,tenure,MonthlyCharges,TotalCharges
0,0.0,0.115423,0.001275
1,0.464789,0.385075,0.215867
2,0.014085,0.354229,0.01031
3,0.619718,0.239303,0.210241
4,0.014085,0.521891,0.01533


In [13]:
X = pd.concat([num_norm, cat_dum], axis=1)
y = X['Churn_Yes']
X = X.drop(['Churn_Yes'], axis=1)

In [14]:
#had an error in the model training part. To fix it I check for null values and duplicates in the df 'X'
duplicates_count = X.duplicated().sum()
duplicates_count

2

In [15]:
X = X.drop_duplicates()
y = y.drop_duplicates()

In [18]:
X = X.dropna()
y = y.dropna()

In [24]:
print(y.shape)
print(data.shape)
print(num_norm.shape)
print(cat_dum.shape)
print(X.shape)
print(y.shape)


(6983,)
(6983, 16)
(6983, 3)
(6983, 20)
(6983, 22)
(6983,)


In [20]:
# I now deleted too many values... I will create 60 sample rows to fill the gaps and move forward with this

random_samples = X.sample(n=60, replace=True)
X = X.append(random_samples, ignore_index=True)
X

  X = X.append(random_samples, ignore_index=True)


Unnamed: 0,tenure,MonthlyCharges,TotalCharges,gender_Male,SeniorCitizen_Yes,Partner_Yes,Dependents_Yes,PhoneService_Yes,OnlineSecurity_No internet service,OnlineSecurity_Yes,...,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year
0,0.000000,0.115423,0.001275,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.464789,0.385075,0.215867,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.014085,0.354229,0.010310,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.619718,0.239303,0.210241,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0.014085,0.521891,0.015330,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6978,0.042254,0.021891,0.007391,0.0,0.0,0.0,0.0,1.0,1.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0
6979,0.788732,0.826368,0.664759,1.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0
6980,1.000000,0.464677,0.522409,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
6981,0.140845,0.326866,0.059122,0.0,1.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0


In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)

In [31]:
#apply Log Regression and compare with Decision Tree Classifier

log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

dec_tree_class = DecisionTreeClassifier()
dec_tree_class.fit(X_train, y_train)

#predict test data
y_pred_1 = log_reg.predict(X_test)
y_pred_2 = dec_tree_class.predict(X_test)

#create model
log_accuracy = log_reg.score(X_test, y_test)
dec_tree_accuracy = accuracy_score(y_test, y_pred_2)

#results
print("LogReg Accuracy:", log_accuracy)
print("Dec_Tree Accuracy:", dec_tree_accuracy)



LogReg Accuracy: 0.7871121718377089
Dec_Tree Accuracy: 0.7116945107398568


The log regression model has a better accuracy score (when no upsampling / downsampling has been applied).

Let us see what happens when we up-sample the data

In [32]:
X_upsample, y_upsample = SMOTE().fit_resample(X,y)

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X_upsample, y_upsample, test_size=0.3, random_state=100)

In [34]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

dec_tree_class = DecisionTreeClassifier()
dec_tree_class.fit(X_train, y_train)

#predict test data
y_pred_1 = log_reg.predict(X_test)
y_pred_2 = dec_tree_class.predict(X_test)

#create model
log_accuracy = log_reg.score(X_test, y_test)
dec_tree_accuracy = accuracy_score(y_test, y_pred_2)

#results
print("LogReg Accuracy:", log_accuracy)
print("Dec_Tree Accuracy:", dec_tree_accuracy)

LogReg Accuracy: 0.7467532467532467
Dec_Tree Accuracy: 0.7464285714285714


After up sampling the data, both models become more equal in the type of predictions.

what happens if we downsample the data?

In [38]:
X_tomek, y_tomek = TomekLinks().fit_resample(X, y)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X_tomek, y_tomek, test_size=0.3, random_state=100)

In [40]:
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

dec_tree_class = DecisionTreeClassifier()
dec_tree_class.fit(X_train, y_train)

#predict test data
y_pred_1 = log_reg.predict(X_test)
y_pred_2 = dec_tree_class.predict(X_test)

#create model
log_accuracy = log_reg.score(X_test, y_test)
dec_tree_accuracy = accuracy_score(y_test, y_pred_2)

#results
print("LogReg Accuracy:", log_accuracy)
print("Dec_Tree Accuracy:", dec_tree_accuracy)

LogReg Accuracy: 0.794750656167979
Dec_Tree Accuracy: 0.7149606299212599


Log Regression becomes a better model, and also is more accurate than the model with no changes in the sampling of the data.