In [63]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import TomekLinks

In [38]:
ChurnData = pd.read_csv('Customer-Churn.csv')
ChurnData

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [39]:
ChurnData.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

### changing data categorical into numerics 0 and 1 for the models

In [40]:
ChurnData["gender"] = ChurnData["gender"].replace({"Female": 1, "Male": 0})
ChurnData["Contract"] = ChurnData["Contract"].replace({"Month-to-month": 1, "One year": 12, "Two year":24})

In [43]:
ChurnData["TotalCharges"] = ChurnData["TotalCharges"].replace({" ": 0})
ChurnData["TotalCharges"] = ChurnData["TotalCharges"].astype(float)

In [48]:
columns_numerical = ["Partner","Dependents","PhoneService","OnlineSecurity","OnlineBackup","DeviceProtection",
                     "TechSupport","StreamingTV","StreamingMovies","Churn"]

for col in columns_numerical:
    
    ChurnData[col] = ChurnData[col].replace({"Yes": 1, "No": 0,"No internet service":0})

In [49]:
ChurnData.dtypes

gender                int64
SeniorCitizen         int64
Partner               int64
Dependents            int64
tenure                int64
PhoneService          int64
OnlineSecurity        int64
OnlineBackup          int64
DeviceProtection      int64
TechSupport           int64
StreamingTV           int64
StreamingMovies       int64
Contract              int64
MonthlyCharges      float64
TotalCharges        float64
Churn                 int64
dtype: object

In [50]:
X = ChurnData.drop('Churn', axis=1)
y = ChurnData['Churn']

### Now we can train and compare the models

In [51]:
# SMOTE for the models

smote = SMOTE(random_state = 42)
X_sm, y_sm = smote.fit_resample(X, y)
y_sm.value_counts()

0    5174
1    5174
Name: Churn, dtype: int64

In [57]:
# Logistic Regression

X_train, X_test, y_train, y_test = train_test_split(X_sm, y_sm, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

result = accuracy_score(y_test, y_pred)
result

0.8164251207729468

In [58]:
# Decision Tree Classification

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33)

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.7156989247311828

### By comparing the 2 models, the logistic regression gets a better score

In [65]:
# TomekLinks for models

Tomeklinks_ = TomekLinks()
X_resampled, y_resampled = Tomeklinks_.fit_resample(X, y)

In [67]:
# Logistic Regression

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)

result = accuracy_score(y_test, y_pred)
result

0.8201383551114527

In [69]:
# Decision Tree Classification

DTC = DecisionTreeClassifier()
DTC.fit(X_train, y_train)

y_predict = DTC.predict(X_test)
result_DTC = accuracy_score(y_test, y_predict)
result_DTC

0.7717140661029976

### and again by comparing the 2 models, the logistic regression still gets a better score.