### EDA

In [1]:
#  Scale the features either by using normalizer or a standard scaler.
# Split the data into a training set and a test set.
# Fit a logistic regression model on the training data.
# Check the accuracy on the test data.
# Note: So far we have not balanced the data.

# Managing imbalance in the dataset

# Check for the imbalance.
# Use the resampling strategies used in class for upsampling and downsampling to create a balance between the two classes.
# Each time fit the model and see how the accuracy of the model is.

In [5]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.tree import DecisionTreeClassifier

import warnings

warnings.filterwarnings('ignore')

In [6]:
churndata = pd.read_csv('Customer-Churn.csv')

In [7]:
churndata.shape

(7043, 16)

In [8]:
churndata.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [9]:
churndata.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [10]:
churndata['TotalCharges'].dtypes

dtype('O')

In [11]:
churndata['TotalCharges'] = pd.to_numeric(churndata['TotalCharges'], errors = "coerce")

In [12]:
churndata = churndata.dropna(axis = "index")

In [13]:
churndata.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [14]:
num = churndata.select_dtypes(include = ['float64','int64'])
cat = churndata.select_dtypes(include = 'object')
cat = cat.drop(['Churn'], axis = 1)

In [15]:
num.columns


Index(['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges'], dtype='object')

In [16]:
num.shape

(7032, 4)

In [17]:
cat_dum = pd.get_dummies(cat, drop_first = True)

In [19]:
data_dum = pd.concat([num, churndata['Churn'],cat_dum],axis=1)
y = data_dum['Churn']
X = data_dum.drop(['Churn'], axis=1)

In [20]:
 
transformer = StandardScaler().fit(X)
X_standarized = transformer.transform(X)
X_standarized = pd.DataFrame(X_standarized)
X_standarized.columns = X_standarized.columns.astype(str)
X_standarized.rename(columns={'0':'SeniorCitizen', '1':'tenure', '2':'MonthlyCharges', '3':'TotalCharges'}, inplace=True)
X_standarized.head()


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,-0.440327,-1.280248,-1.161694,-0.994194,-1.00943,1.035617,-0.652305,-3.056334,-0.52513,-0.633746,...,-0.52513,-0.723918,-0.52513,-0.63926,-0.52513,-0.790186,-0.52513,-0.796849,-0.514537,-0.561364
1,-0.440327,0.064303,-0.260878,-0.17374,0.990658,-0.965608,-0.652305,0.327189,-0.52513,1.577918,...,-0.52513,1.381372,-0.52513,-0.63926,-0.52513,-0.790186,-0.52513,-0.796849,1.943495,-0.561364
2,-0.440327,-1.239504,-0.363923,-0.959649,0.990658,-0.965608,-0.652305,0.327189,-0.52513,1.577918,...,-0.52513,-0.723918,-0.52513,-0.63926,-0.52513,-0.790186,-0.52513,-0.796849,-0.514537,-0.561364
3,-0.440327,0.512486,-0.74785,-0.195248,0.990658,-0.965608,-0.652305,-3.056334,-0.52513,1.577918,...,-0.52513,1.381372,-0.52513,1.564308,-0.52513,-0.790186,-0.52513,-0.796849,1.943495,-0.561364
4,-0.440327,-1.239504,0.196178,-0.940457,-1.00943,-0.965608,-0.652305,0.327189,-0.52513,-0.633746,...,-0.52513,-0.723918,-0.52513,-0.63926,-0.52513,-0.790186,-0.52513,-0.796849,-0.514537,-0.561364


In [21]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_standarized, y, test_size=0.3, random_state=100)

In [22]:
model = LogisticRegression()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.7943127962085308

In [23]:
model_dt = DecisionTreeClassifier()
model_dt.fit(X_train, y_train)
predictions_dt = model.predict(X_test)
print("The accuracy of the model is: {:4.2f}".format(model_dt.score(X_test, y_test)))

The accuracy of the model is: 0.72


###  SMOTE

In [32]:
smote = SMOTE()
X_sm, y_sm = smote.fit_resample(X, y)
unique, counts = np.unique(y_sm, return_counts=True)
print(np.asarray((unique, counts)).T)

[['No' 5163]
 ['Yes' 5163]]


In [33]:
 
transformer = StandardScaler().fit(X_sm)
X_sm_standarized = transformer.transform(X_sm)
X_sm_standarized = pd.DataFrame(X_sm_standarized)
X_sm_standarized.columns = X_sm_standarized.columns.astype(str)
X_sm_standarized.rename(columns={'0':'SeniorCitizen', '1':'tenure', '2':'MonthlyCharges', '3':'TotalCharges'}, inplace=True)
X_sm_standarized.head()


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,-0.393907,-1.105254,-1.323323,-0.916355,-0.867837,1.281248,-0.525729,-2.916897,-0.441119,-0.508732,...,-0.441119,-0.619226,-0.441119,-0.517166,-0.441119,-0.743022,-0.441119,-0.74821,-0.419302,-0.445497
1,-0.393907,0.269427,-0.380403,-0.066955,1.15229,-0.780489,-0.525729,0.34283,-0.441119,1.965673,...,-0.441119,1.61492,-0.441119,-0.517166,-0.441119,-0.743022,-0.441119,-0.74821,2.384916,-0.445497
2,-0.393907,-1.063597,-0.488265,-0.880591,1.15229,-0.780489,-0.525729,0.34283,-0.441119,1.965673,...,-0.441119,-0.619226,-0.441119,-0.517166,-0.441119,-0.743022,-0.441119,-0.74821,-0.419302,-0.445497
3,-0.393907,0.727654,-0.890137,-0.089222,1.15229,-0.780489,-0.525729,-2.916897,-0.441119,1.965673,...,-0.441119,1.61492,-0.441119,1.933616,-0.441119,-0.743022,-0.441119,-0.74821,2.384916,-0.445497
4,-0.393907,-1.063597,0.098015,-0.860723,-0.867837,-0.780489,-0.525729,0.34283,-0.441119,-0.508732,...,-0.441119,-0.619226,-0.441119,-0.517166,-0.441119,-0.743022,-0.441119,-0.74821,-0.419302,-0.445497


In [34]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_sm_standarized, y_sm, test_size=0.3, random_state=100)

In [35]:
model = LogisticRegression()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.8285990961910911

In [36]:
model_dt = DecisionTreeClassifier()
model_dt.fit(X_train, y_train)
predictions_dt = model.predict(X_test)
print("The accuracy of the model is: {:4.2f}".format(model_dt.score(X_test, y_test)))


The accuracy of the model is: 0.78


### TOMEKLINKS 

In [37]:
from imblearn.under_sampling import TomekLinks

tl = TomekLinks('majority')

X_tl, y_tl = tl.fit_resample(np.array(X), y)

y_tl.value_counts()

No     4596
Yes    1869
Name: Churn, dtype: int64

### TRANSFORM TOMELINKS

In [38]:
 
transformer = StandardScaler().fit(X_tl)
X_tl_standarized = transformer.transform(X_tl)
X_tl_standarized = pd.DataFrame(X_tl_standarized)
X_tl_standarized.columns = X_tl_standarized.columns.astype(str)
X_tl_standarized.rename(columns={'0':'SeniorCitizen', '1':'tenure', '2':'MonthlyCharges', '3':'TotalCharges'}, inplace=True)
X_tl_standarized.head()


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,-0.43734,0.035998,-0.23307,-0.181635,0.98969,-0.972987,-0.658876,0.328008,-0.545593,1.581824,...,-0.545593,1.386136,-0.545593,-0.636258,-0.545593,-0.781417,-0.545593,-0.791464,1.934739,-0.57872
1,-0.43734,-1.259172,-0.335207,-0.96225,0.98969,-0.972987,-0.658876,0.328008,-0.545593,1.581824,...,-0.545593,-0.72143,-0.545593,-0.636258,-0.545593,-0.781417,-0.545593,-0.791464,-0.516866,-0.57872
2,-0.43734,0.481213,-0.71575,-0.202998,0.98969,-0.972987,-0.658876,-3.048702,-0.545593,1.581824,...,-0.545593,1.386136,-0.545593,1.57169,-0.545593,-0.781417,-0.545593,-0.791464,1.934739,-0.57872
3,-0.43734,-1.259172,0.219958,-0.943188,-1.010418,-0.972987,-0.658876,0.328008,-0.545593,-0.632182,...,-0.545593,-0.72143,-0.545593,-0.636258,-0.545593,-0.781417,-0.545593,-0.791464,-0.516866,-0.57872
4,-0.43734,-1.016328,1.173787,-0.650087,-1.010418,-0.972987,-0.658876,0.328008,-0.545593,-0.632182,...,-0.545593,1.386136,-0.545593,-0.636258,-0.545593,1.279726,-0.545593,1.263482,-0.516866,-0.57872


In [39]:
X_train, X_test, y_train, y_test = train_test_split(X_tl_standarized, y_tl, test_size=0.3, random_state=100)

In [40]:
model = LogisticRegression()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.8103092783505155

In [41]:
model_dt = DecisionTreeClassifier()
model_dt.fit(X_train, y_train)
predictions_dt = model.predict(X_test)
print("The accuracy of the model is: {:4.2f}".format(model_dt.score(X_test, y_test)))

The accuracy of the model is: 0.73
