In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import TomekLinks

In [2]:
df = pd.read_csv(r"./files_for_lab/Customer-Churn.csv")

In [3]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,No
1,Male,0,No,No,34,Yes,Yes,No,Yes,No,No,No,One year,56.95,1889.5,No
2,Male,0,No,No,2,Yes,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,Yes
3,Male,0,No,No,45,No,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,No
4,Female,0,No,No,2,Yes,No,No,No,No,No,No,Month-to-month,70.70,151.65,Yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,Yes,Yes,24,Yes,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.5,No
7039,Female,0,Yes,Yes,72,Yes,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.9,No
7040,Female,0,Yes,Yes,11,No,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,No
7041,Male,1,Yes,No,4,Yes,No,No,No,No,No,No,Month-to-month,74.40,306.6,Yes


In [5]:
col_to_target=["TotalCharges","MonthlyCharges","tenure"]

In [6]:
data_types = df.dtypes
data_types

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [7]:
df["TotalCharges"] = df["TotalCharges"].replace({" ":0})

In [8]:
df["TotalCharges"]=df["TotalCharges"].astype(float)

## Binarization of data

In [9]:
col_to_binary=["Partner","Dependents","PhoneService","Churn"]

In [10]:
for i in col_to_binary:
    df[i] = df[i].replace({"Yes":1,"No":0})

In [11]:
df

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,MonthlyCharges,TotalCharges,Churn
0,Female,0,1,0,1,0,No,Yes,No,No,No,No,Month-to-month,29.85,29.85,0
1,Male,0,0,0,34,1,Yes,No,Yes,No,No,No,One year,56.95,1889.50,0
2,Male,0,0,0,2,1,Yes,Yes,No,No,No,No,Month-to-month,53.85,108.15,1
3,Male,0,0,0,45,0,Yes,No,Yes,Yes,No,No,One year,42.30,1840.75,0
4,Female,0,0,0,2,1,No,No,No,No,No,No,Month-to-month,70.70,151.65,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,Male,0,1,1,24,1,Yes,No,Yes,Yes,Yes,Yes,One year,84.80,1990.50,0
7039,Female,0,1,1,72,1,No,Yes,Yes,No,Yes,Yes,One year,103.20,7362.90,0
7040,Female,0,1,1,11,0,Yes,No,No,No,No,No,Month-to-month,29.60,346.45,0
7041,Male,1,1,0,4,1,No,No,No,No,No,No,Month-to-month,74.40,306.60,1


In [12]:
df["gender"].unique()
df["Contract"].unique()

array(['Month-to-month', 'One year', 'Two year'], dtype=object)

## Categorical data encoding

In [13]:
categorical_df=df[["gender","Contract","OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies"]]

In [14]:
columns_to_drop = ["gender", "Contract","OnlineSecurity","OnlineBackup","DeviceProtection","TechSupport","StreamingTV","StreamingMovies"]
df=df.drop(columns_to_drop, axis=1)

In [15]:
encoded_categorical=pd.get_dummies(categorical_df)

In [16]:
encoded_categorical

Unnamed: 0,gender_Female,gender_Male,Contract_Month-to-month,Contract_One year,Contract_Two year,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,...,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes
0,1,0,1,0,0,1,0,0,0,0,...,0,1,0,0,1,0,0,1,0,0
1,0,1,0,1,0,0,0,1,1,0,...,1,1,0,0,1,0,0,1,0,0
2,0,1,1,0,0,0,0,1,0,0,...,0,1,0,0,1,0,0,1,0,0
3,0,1,0,1,0,0,0,1,1,0,...,1,0,0,1,1,0,0,1,0,0
4,1,0,1,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,0,1,0,1,0,0,0,1,1,0,...,1,0,0,1,0,0,1,0,0,1
7039,1,0,0,1,0,1,0,0,0,0,...,1,1,0,0,0,0,1,0,0,1
7040,1,0,1,0,0,0,0,1,1,0,...,0,1,0,0,1,0,0,1,0,0
7041,0,1,1,0,0,1,0,0,1,0,...,0,1,0,0,1,0,0,1,0,0


In [17]:
merged_df=pd.concat([df,encoded_categorical], axis=1)

In [18]:
merged_df['Churn'].value_counts()

0    5174
1    1869
Name: Churn, dtype: int64

## X-Y split

In [19]:
X = merged_df.drop("Churn",axis=1)
Y = merged_df["Churn"]

In [20]:
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.2, random_state=42)

## Balancing

In [21]:
smote=SMOTE(random_state=42)

In [22]:
X_train_resampled, y_train_resampled = smote.fit_resample(X_train,y_train)

In [23]:
y_train_resampled.value_counts()

0    4138
1    4138
Name: Churn, dtype: int64

## Model train - Logistic Regression

In [24]:
classification = LogisticRegression(random_state=42, max_iter=100000)
classification.fit(X_train_resampled, y_train_resampled)

In [25]:
predictions = classification.predict(X_test)
print(classification_report(y_test, predictions))

              precision    recall  f1-score   support

           0       0.88      0.86      0.87      1036
           1       0.64      0.66      0.65       373

    accuracy                           0.81      1409
   macro avg       0.76      0.76      0.76      1409
weighted avg       0.81      0.81      0.81      1409



## Model train - Decision Tree

In [26]:
model = DecisionTreeClassifier()
model.fit(X_train_resampled, y_train_resampled)
model.score(X_test, y_test)

0.7224982256919801

## TomekLinks for downsampling

In [27]:
tl = TomekLinks(sampling_strategy='majority')

X_tl, y_tl = tl.fit_resample(X, Y)


In [28]:
X_train_tl, X_test_tl, y_train_tl, y_test_tl = train_test_split(X_tl, y_tl, test_size=0.2, random_state=11)

## Model train - Logistic Regression

In [29]:
classification = LogisticRegression(random_state=42, max_iter=100000)
classification.fit(X_train_tl, y_train_tl)
predictions = classification.predict(X_test_tl)
print(classification_report(y_test_tl, predictions))

              precision    recall  f1-score   support

           0       0.86      0.91      0.89       903
           1       0.76      0.67      0.71       393

    accuracy                           0.84      1296
   macro avg       0.81      0.79      0.80      1296
weighted avg       0.83      0.84      0.83      1296



## Model train - Decision Tree

In [30]:
model = DecisionTreeClassifier()
model.fit(X_train_tl,y_train_tl )
model.score(X_test_tl, y_test_tl)

0.7600308641975309