## 1. Import the required libraries and modules that you would need.

In [105]:
import pandas as pd
import numpy as np
import statsmodels.api as sm

## 2. Read that data into Python and call the dataframe churnData.

In [106]:
churndData = pd.read_csv('files_for_lab/Customer-Churn.csv')
churndData.head(3)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes


## 3. Check the datatypes of all the columns in the data. You would see that the column TotalCharges is object type.

In [107]:
churndData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [108]:
churndData['TotalCharges'] = pd.DataFrame(list(map(lambda x: x.replace(' ','0'),churndData['TotalCharges'])))

In [109]:
churndData['TotalCharges'] = pd.to_numeric(churndData['TotalCharges'])

## 4.Check for null values in the dataframe. Replace the null values.

In [110]:
# There are not nan values.
churndData.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

## 5. Use the following features:

In [111]:
features = churndData[['tenure','SeniorCitizen','MonthlyCharges','TotalCharges']]
Y = churndData['Churn']

In [112]:
features.head()

Unnamed: 0,tenure,SeniorCitizen,MonthlyCharges,TotalCharges
0,1,0,29.85,29.85
1,34,0,56.95,1889.5
2,2,0,53.85,108.15
3,45,0,42.3,1840.75
4,2,0,70.7,151.65


In [113]:
# Scale the features either by using normalizer or a standard scaler.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features) 

In [114]:
# Split the data into a training set and a test set.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(features,Y, random_state=0)

In [115]:
# Fit a logistic regression model on the training data
from sklearn.linear_model import LogisticRegression
classification = LogisticRegression(random_state=0, solver='lbfgs',
                  multi_class='ovr').fit(X_train, y_train)

In [116]:
# Check the accuracy on the test data.
predictions = classification.predict(X_test)
churndData_raw = classification.score(X_test, y_test)
churndData_raw

0.7830777967064169

## 6. Managing imbalance in the dataset

In [117]:
churndData['Churn'].value_counts()

No     5174
Yes    1869
Name: Churn, dtype: int64

## 7. Use the resampling strategies used in class for upsampling and downsampling to create a balance between the two classes.

In [118]:
from sklearn.utils import resample

category_N = churndData[churndData['Churn'] == 'No']
category_Y = churndData[churndData['Churn'] == 'Yes']

In [119]:
# Downsampling
category_N_undersampled = resample(category_N, 
                                   replace=False, 
                                   n_samples = len(category_Y))

In [120]:
print(category_N_undersampled.shape)
print(category_Y.shape)

(1869, 16)
(1869, 16)


In [121]:
data_downsampled = pd.concat([category_N_undersampled, category_Y], axis=0)

In [122]:
# Upsampling
category_Y_oversampled = resample(category_Y, 
                                  replace=True, 
                                  n_samples = len(category_N))

In [123]:
print(category_Y_oversampled.shape)
print(category_N.shape)

(5174, 16)
(5174, 16)


In [124]:
data_upsampled = pd.concat([category_N, category_Y_oversampled], axis=0)

## 8. Each time fit the model and see how the accuracy of the model is.

Downsampling data

In [125]:
Y = data_downsampled['Churn']
data_downsampled = data_downsampled[['tenure','SeniorCitizen','MonthlyCharges','TotalCharges']]

In [126]:
scaler = MinMaxScaler()
data_downsampled_scaled = scaler.fit_transform(data_downsampled) 

In [127]:
X_train, X_test, y_train, y_test = train_test_split(data_downsampled_scaled,Y, random_state=0)

In [128]:
classification = LogisticRegression(random_state=0, solver='lbfgs',
                  multi_class='ovr').fit(X_train, y_train)

In [129]:
predictions = classification.predict(X_test)
churndData_down = classification.score(X_test, y_test)
churndData_down

0.7037433155080214

Upsampling data

In [130]:
Y = data_upsampled['Churn']
data_upsampled = data_upsampled[['tenure','SeniorCitizen','MonthlyCharges','TotalCharges']]

In [131]:
scaler = MinMaxScaler()
data_upsampled_scaled = scaler.fit_transform(data_upsampled) 

In [132]:
X_train, X_test, y_train, y_test = train_test_split(data_upsampled_scaled,Y, random_state=0)

In [133]:
classification = LogisticRegression(random_state=0, solver='lbfgs',
                  multi_class='ovr').fit(X_train, y_train)

In [134]:
predictions = classification.predict(X_test)
churndData_up = classification.score(X_test, y_test)
churndData_up

0.7313490529570932

## Summary

In [135]:
pd.DataFrame([[churndData_raw,churndData_down,churndData_up]], columns = ['Raw data','Downsampled data','Upsampled data'])

Unnamed: 0,Raw data,Downsampled data,Upsampled data
0,0.783078,0.703743,0.731349


So in this case it does not make too much sense to resample the data, probably because there is not too much invalance in the target variable