In [202]:
# 1. Import the required libraries and modules that you would need.
import pandas as pd
import numpy as np
import statsmodels.api as sm

In [203]:
# 2. Read that data into Python and call the dataframe churnData.
churndData = pd.read_csv('files_for_lab/Customer-Churn.csv')

In [166]:
churndData['TotalCharges'] = pd.DataFrame(list(map(lambda x: x.replace(' ','0'),churndData['TotalCharges'])))

In [167]:
churndData['TotalCharges'] = pd.to_numeric(churndData['TotalCharges'])

In [168]:
# There are not nan values.
churndData.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [169]:
# 5. Use the following features:
features = churndData[['tenure','SeniorCitizen','MonthlyCharges','TotalCharges']]
Y = churndData['Churn']

In [170]:
# Scale the features either by using normalizer or a standard scaler.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features) 

In [171]:
# Split the data into a training set and a test set.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features,Y, random_state=0)

In [172]:
# Fit a logistic regression model on the training data
from sklearn.linear_model import LogisticRegression
classification = LogisticRegression(random_state=0, solver='lbfgs',
                  multi_class='ovr').fit(X_train, y_train)

In [173]:
# Check the accuracy on the test data.
predictions = classification.predict(X_test)
churndData_raw = classification.score(X_test, y_test)
churndData_raw

0.7830777967064169

In [174]:
# Managing imbalance in the dataset
churndData['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

### Use the resampling strategies used in class for upsampling and downsampling to create a balance  between the two classes.

In [175]:
from sklearn.utils import resample

category_N = churndData[churndData['Churn'] == 'No']
category_Y = churndData[churndData['Churn'] == 'Yes']

### Downsampling

In [176]:

category_N_undersampled = resample(category_N, 
                                   replace=False, 
                                   n_samples = len(category_Y))

In [177]:
print(category_N_undersampled.shape)
print(category_Y.shape)

(1869, 16)
(1869, 16)


In [178]:
data_downsampled = pd.concat([category_N_undersampled, category_Y], axis=0)

### Upsampling

In [179]:

category_Y_oversampled = resample(category_Y, 
                                  replace=True, 
                                  n_samples = len(category_N))

In [180]:
print(category_Y_oversampled.shape)
print(category_N.shape)

(5174, 16)
(5174, 16)


In [181]:
data_upsampled = pd.concat([category_N, category_Y_oversampled], axis=0)

### Each time fit the model and see how the accuracy of the model is.

Downsampling data

In [182]:
Y = data_downsampled['Churn']
data_downsampled_f = data_downsampled[['tenure','SeniorCitizen','MonthlyCharges','TotalCharges']]

In [183]:
scaler = MinMaxScaler()
data_downsampled_scaled = scaler.fit_transform(data_downsampled_f) 

In [184]:
X_train, X_test, y_train, y_test = train_test_split(data_downsampled_scaled,Y, random_state=0)

In [185]:
classification = LogisticRegression(random_state=0, solver='lbfgs',
                  multi_class='ovr').fit(X_train, y_train)

In [186]:
predictions = classification.predict(X_test)
churndData_down = classification.score(X_test, y_test)
churndData_down

0.7262032085561497

Upsampling data

In [187]:
Y = data_upsampled['Churn']
data_upsampled_f = data_upsampled[['tenure','SeniorCitizen','MonthlyCharges','TotalCharges']]

In [188]:
scaler = MinMaxScaler()
data_upsampled_scaled = scaler.fit_transform(data_upsampled_f) 

In [189]:
X_train, X_test, y_train, y_test = train_test_split(data_upsampled_scaled,Y, random_state=0)

In [190]:
classification = LogisticRegression(random_state=0, solver='lbfgs',
                  multi_class='ovr').fit(X_train, y_train)

In [191]:
predictions = classification.predict(X_test)
churndData_up = classification.score(X_test, y_test)
churndData_up

0.73057595670661

### Sumary 1

In [192]:
pd.DataFrame([[churndData_raw,churndData_down,churndData_up]], 
             columns = ['Raw data','Downsampled data','Upsampled data'])

Unnamed: 0,Raw data,Downsampled data,Upsampled data
0,0.783078,0.726203,0.730576


So in this case it does not make too much sense to resample the data, probably because there is not too much invalance in the target variable

## 1. Apply the Random Forests algorithm but this time only by upscaling the data.

### With raw data

In [193]:
y = churndData['Churn']
X = churndData[['tenure','SeniorCitizen','MonthlyCharges','TotalCharges']]

In [194]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [195]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
clf.fit(X_train, y_train)

r_train_sc = clf.score(X_train, y_train)
r_test_sc = clf.score(X_test, y_test)

print(r_train_sc)
print(r_test_sc)

0.802626908058218
0.7806955287437899


In [196]:
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)

r_cross_sc = np.mean(cross_val_scores)

print(r_cross_sc)

0.7909083179018178


### With upsampled data

In [197]:
y = data_upsampled['Churn']
X = data_upsampled[['tenure','SeniorCitizen','MonthlyCharges','TotalCharges']]

In [198]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [199]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
clf.fit(X_train, y_train)

up_train_sc = clf.score(X_train, y_train)
up_test_sc = clf.score(X_test, y_test)

print(up_train_sc)
print(up_test_sc)

0.7575501328823387
0.7333333333333333


In [200]:
from sklearn.model_selection import cross_val_score
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf =20)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)

up_cross_sc = np.mean(cross_val_scores)

print(up_cross_sc)

0.7483692001238398


### Summary 2   

In [204]:
summary = {'Raw data':[r_train_sc, r_test_sc, r_cross_sc],
           'Upsampled data':[up_train_sc, up_test_sc, up_cross_sc]}
summary = pd.DataFrame(summary).T
summary.columns = 'Train score','Test score','Crosstest score'
summary 

Unnamed: 0,Train score,Test score,Crosstest score
Raw data,0.802627,0.780696,0.790908
Upsampled data,0.75755,0.733333,0.748369


We can see that the scores are better with the raw data, so in this case is better not oversample the data.