In [1]:
# libraries

import pandas as pd

from imblearn.over_sampling import SMOTE

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.under_sampling import TomekLinks

import warnings
warnings.filterwarnings("ignore")

In [2]:
df=pd.read_csv("Customer-Churn.csv")
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.3,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.7,151.65,Yes


In [3]:
# We need to turn all data numerical /binary

In [4]:
df["SeniorCitizen"].value_counts()

0    5901
1    1142
Name: SeniorCitizen, dtype: int64

In [5]:
df["gender"].value_counts()

Male      3555
Female    3488
Name: gender, dtype: int64

In [6]:
# lets change to binary Male = 0 and Female = 1
df["gender"] = df["gender"].replace({"Female": 1, "Male": 0})
df["gender"].dtypes

dtype('int64')

In [7]:
# lets change yes/no columns
columns_yes_no=["Partner","Dependents","PhoneService","OnlineSecurity","OnlineBackup","DeviceProtection",
               "TechSupport","StreamingTV","StreamingMovies","Churn"]

for col in columns_yes_no:
    df[col] = df[col].replace({"Yes": 1, "No": 0,"No internet service":0})

In [8]:
df["Contract"].value_counts()

Month-to-month    3875
Two year          1695
One year          1473
Name: Contract, dtype: int64

In [9]:
# lets change Contract logically
df["Contract"] = df["Contract"].replace({"Month-to-month": 1, "One year": 12, "Two year":24})
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,1,0,1,0,1,0,0,1,0,0,0,0,1,29.85,29.85,0
1,0,0,0,0,34,1,1,0,1,0,0,0,12,56.95,1889.5,0
2,0,0,0,0,2,1,1,1,0,0,0,0,1,53.85,108.15,1
3,0,0,0,0,45,0,1,0,1,1,0,0,12,42.30,1840.75,0
4,1,0,0,0,2,1,0,0,0,0,0,0,1,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,0,1,1,24,1,1,0,1,1,1,1,12,84.80,1990.5,0
7039,1,0,1,1,72,1,0,1,1,0,1,1,12,103.20,7362.9,0
7040,1,0,1,1,11,0,1,0,0,0,0,0,1,29.60,346.45,0
7041,0,1,1,0,4,1,0,0,0,0,0,0,1,74.40,306.6,1


In [10]:
df.dtypes

gender                int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract              int64
MonthlyCharges      float64
TotalCharges         object
Churn                 int64
dtype: object

In [11]:
df["TotalCharges"] = df["TotalCharges"].replace({" ": 0})

In [12]:
df["TotalCharges"] = df["TotalCharges"].astype(float)

In [13]:
df.dtypes

gender                int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract              int64
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object

In [14]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,1,0,1,0,1,0,0,1,0,0,0,0,1,29.85,29.85,0
1,0,0,0,0,34,1,1,0,1,0,0,0,12,56.95,1889.50,0
2,0,0,0,0,2,1,1,1,0,0,0,0,1,53.85,108.15,1
3,0,0,0,0,45,0,1,0,1,1,0,0,12,42.30,1840.75,0
4,1,0,0,0,2,1,0,0,0,0,0,0,1,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,0,1,1,24,1,1,0,1,1,1,1,12,84.80,1990.50,0
7039,1,0,1,1,72,1,0,1,1,0,1,1,12,103.20,7362.90,0
7040,1,0,1,1,11,0,1,0,0,0,0,0,1,29.60,346.45,0
7041,0,1,1,0,4,1,0,0,0,0,0,0,1,74.40,306.60,1


In [15]:
#split X and y

X = df.drop('Churn', axis=1)
y = df['Churn']

##### SMOTE

In [16]:
# Apply SMOTE for upsampling the data
smote = SMOTE()
X_resampled, y_resampled = smote.fit_resample(X, y)

In [17]:
y_resampled

0        0
1        0
2        1
3        0
4        1
        ..
10343    1
10344    1
10345    1
10346    1
10347    1
Name: Churn, Length: 10348, dtype: int64

In [18]:
# Use logistic regression to fit the model and compute the accuracy of the model.
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)

y_pred_lr = logistic_regression.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
accuracy_lr

0.8198067632850241

In [19]:
# Use decision tree classifier to fit the model and compute the accuracy of the model.

decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

y_pred_dt = decision_tree.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
accuracy_dt

0.7951690821256039

- 1st and 2nd time: Logistic regression offers a better accuracy.

##### Tomeklinks

In [20]:
# Apply TomekLinks for downsampling
tomek_links = TomekLinks()
X_resampled, y_resampled = tomek_links.fit_resample(X, y)

In [21]:
# Use logistic regression to fit the model and compute the accuracy of the model.
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

logistic_regression = LogisticRegression()
logistic_regression.fit(X_train, y_train)

y_pred_lr = logistic_regression.predict(X_test)
accuracy_lr = accuracy_score(y_test, y_pred_lr)
accuracy_lr

0.8232129131437356

In [22]:
# Use decision tree classifier to fit the model and compute the accuracy of the model.
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)

y_pred_dt = decision_tree.predict(X_test)
accuracy_dt = accuracy_score(y_test, y_pred_dt)
accuracy_dt

0.7671022290545734

- 1st and 2nd time: Logistic regression offers a better accuracy.