### EDA

In [1]:
# Instructions
# Apply SMOTE for upsampling the data

# Use logistic regression to fit the model and compute the accuracy of the model.
# Use decision tree classifier to fit the model and compute the accuracy of the model.
# Compare the accuracies of the two models.
# Apply TomekLinks for downsampling

# It is important to remember that it does not make the two classes equal but only removes the points from the majority class that are close to other points in minority class.
# Use logistic regression to fit the model and compute the accuracy of the model.
# Use decision tree classifier to fit the model and compute the accuracy of the model.
# Compare the accuracies of the two models.
# You can also apply this algorithm one more time and check the how the imbalance in the two classes changed from the last time.

In [20]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import  accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks
from sklearn.tree import DecisionTreeClassifier


warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv('Customer-Churn.csv')

In [4]:
data.shape

(7043, 16)

In [5]:
data.describe()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges
count,7043.0,7043.0,7043.0
mean,0.162147,32.371149,64.761692
std,0.368612,24.559481,30.090047
min,0.0,0.0,18.25
25%,0.0,9.0,35.5
50%,0.0,29.0,70.35
75%,0.0,55.0,89.85
max,1.0,72.0,118.75


In [6]:
data.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [7]:
data['TotalCharges'].dtypes

dtype('O')

In [8]:
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors = "coerce")

In [9]:
data = data.dropna(axis = "index")

In [10]:
data.isna().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64

In [11]:
num = data.select_dtypes(include = ['float64','int64'])
cat = data.select_dtypes(include = 'object')
cat = cat.drop(['Churn'], axis = 1)

In [12]:
num.shape

(7032, 4)

In [15]:
cat_dum = pd.get_dummies(cat, drop_first = True)

###  SMOTE

In [18]:
data_dum = pd.concat([num, data['Churn'],cat_dum],axis=1)
smote = SMOTE()
y = data_dum['Churn']
X = data_dum.drop(['Churn'], axis=1)
X_sm, y_sm = smote.fit_resample(X, y)

unique, counts = np.unique(y_sm, return_counts=True)
print(np.asarray((unique, counts)).T)

[['No' 5163]
 ['Yes' 5163]]


In [22]:
 
transformer = StandardScaler().fit(X_sm)
X_sm_standarized = transformer.transform(X_sm)
X_sm_standarized = pd.DataFrame(X_sm_standarized)
X_sm_standarized.columns = X_sm_standarized.columns.astype(str)
X_sm_standarized.rename(columns={'0':'SeniorCitizen', '1':'tenure', '2':'MonthlyCharges', '3':'TotalCharges'}, inplace=True)
X_sm_standarized.head()


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,-0.395218,-1.115055,-1.333039,-0.926634,-0.875751,1.277564,-0.527079,-2.961276,-0.439708,-0.512952,...,-0.439708,-0.616981,-0.439708,-0.515059,-0.439708,-0.743807,-0.439708,-0.75152,-0.421375,-0.443466
1,-0.395218,0.263834,-0.388269,-0.071767,1.141877,-0.78274,-0.527079,0.337692,-0.439708,1.949502,...,-0.439708,1.620795,-0.439708,-0.515059,-0.439708,-0.743807,-0.439708,-0.75152,2.37318,-0.443466
2,-0.395218,-1.07327,-0.496343,-0.89064,1.141877,-0.78274,-0.527079,0.337692,-0.439708,1.949502,...,-0.439708,-0.616981,-0.439708,-0.515059,-0.439708,-0.743807,-0.439708,-0.75152,-0.421375,-0.443466
3,-0.395218,0.723463,-0.899003,-0.094177,1.141877,-0.78274,-0.527079,-2.961276,-0.439708,1.949502,...,-0.439708,1.620795,-0.439708,1.941524,-0.439708,-0.743807,-0.439708,-0.75152,2.37318,-0.443466
4,-0.395218,-1.07327,0.091088,-0.870643,-0.875751,-0.78274,-0.527079,0.337692,-0.439708,-0.512952,...,-0.439708,-0.616981,-0.439708,-0.515059,-0.439708,-0.743807,-0.439708,-0.75152,-0.421375,-0.443466


In [28]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_sm_standarized, y_sm, test_size=0.3, random_state=100)

In [30]:
model = LogisticRegression()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.8308586184635248

In [32]:
model_dt = DecisionTreeClassifier()
model_dt.fit(X_train, y_train)
predictions_dt = model.predict(X_test)
print("The accuracy of the model is: {:4.2f}".format(model_dt.score(X_test, y_test)))


The accuracy of the model is: 0.77


### TOMEKLINKS 

In [33]:
from imblearn.under_sampling import TomekLinks

tl = TomekLinks('majority')

X_tl, y_tl = tl.fit_resample(np.array(X), y)

y_tl.value_counts()

No     4596
Yes    1869
Name: Churn, dtype: int64

### TRANSFORM TOMELINKS

In [34]:
 
transformer = StandardScaler().fit(X_tl)
X_tl_standarized = transformer.transform(X_tl)
X_tl_standarized = pd.DataFrame(X_tl_standarized)
X_tl_standarized.columns = X_tl_standarized.columns.astype(str)
X_tl_standarized.rename(columns={'0':'SeniorCitizen', '1':'tenure', '2':'MonthlyCharges', '3':'TotalCharges'}, inplace=True)
X_tl_standarized.head()


Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,-0.43734,0.035998,-0.23307,-0.181635,0.98969,-0.972987,-0.658876,0.328008,-0.545593,1.581824,...,-0.545593,1.386136,-0.545593,-0.636258,-0.545593,-0.781417,-0.545593,-0.791464,1.934739,-0.57872
1,-0.43734,-1.259172,-0.335207,-0.96225,0.98969,-0.972987,-0.658876,0.328008,-0.545593,1.581824,...,-0.545593,-0.72143,-0.545593,-0.636258,-0.545593,-0.781417,-0.545593,-0.791464,-0.516866,-0.57872
2,-0.43734,0.481213,-0.71575,-0.202998,0.98969,-0.972987,-0.658876,-3.048702,-0.545593,1.581824,...,-0.545593,1.386136,-0.545593,1.57169,-0.545593,-0.781417,-0.545593,-0.791464,1.934739,-0.57872
3,-0.43734,-1.259172,0.219958,-0.943188,-1.010418,-0.972987,-0.658876,0.328008,-0.545593,-0.632182,...,-0.545593,-0.72143,-0.545593,-0.636258,-0.545593,-0.781417,-0.545593,-0.791464,-0.516866,-0.57872
4,-0.43734,-1.016328,1.173787,-0.650087,-1.010418,-0.972987,-0.658876,0.328008,-0.545593,-0.632182,...,-0.545593,1.386136,-0.545593,-0.636258,-0.545593,1.279726,-0.545593,1.263482,-0.516866,-0.57872


In [35]:
X_train, X_test, y_train, y_test = train_test_split(X_tl_standarized, y_tl, test_size=0.3, random_state=100)

In [36]:
model = LogisticRegression()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.8103092783505155

In [37]:
model_dt = DecisionTreeClassifier()
model_dt.fit(X_train, y_train)
predictions_dt = model.predict(X_test)
print("The accuracy of the model is: {:4.2f}".format(model_dt.score(X_test, y_test)))

The accuracy of the model is: 0.73


In [38]:

X_tl_2, y_tl_2 = tl.fit_resample(np.array(X_tl), y_tl)

y_tl_2.value_counts()

No     4429
Yes    1869
Name: Churn, dtype: int64