In [226]:
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [227]:
# Lift pandas' display truncation: None removes the cap on the number of
# columns shown and on the width of each cell.
for display_option in ('display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

In [228]:
# Load the preprocessed dataset from the notebook's working directory.
df = pd.read_csv(filepath_or_buffer="data_preprocessed.csv")

In [229]:
# Separate the prediction target from the features: `customers` keeps every
# column except 'Churn', `labels` is a one-column frame holding 'Churn'.
customers = df.drop(columns='Churn')
labels = df['Churn'].to_frame()

In [230]:
# Preview the first five feature rows before encoding.
customers.head(n=5)

Unnamed: 0,Tenure,City_Tier,CC_Contacted_LY,Payment,Gender,Service_Score,Account_user_count,account_segment,CC_Agent_Score,Marital_Status,rev_per_month,Complain_ly,rev_growth_yoy,coupon_used_for_payment,Day_Since_CC_connect,cashback,Login_device
0,4.0,tier_3,6.0,Debit Card,Female,OK,3.0,Super,Bad,Single,9.0,Yes,11.0,1.0,5.0,160.0,Mobile
1,0.0,tier_1,8.0,UPI,Male,OK,4.0,Regular Plus,OK,Single,7.0,Yes,15.0,0.0,0.0,121.0,Mobile
2,0.0,tier_1,30.0,Debit Card,Male,Bad,4.0,Regular Plus,OK,Single,6.0,Yes,14.0,0.0,3.0,196.235376,Mobile
3,0.0,tier_3,15.0,Debit Card,Male,Bad,4.0,Super,Good,Single,8.0,No,23.0,0.0,3.0,134.0,Mobile
4,0.0,tier_1,12.0,Credit Card,Male,Bad,3.0,Regular Plus,Good,Single,3.0,No,11.0,1.0,3.0,130.0,Mobile


In [231]:
# Preview the first five target rows.
labels.head(n=5)

Unnamed: 0,Churn
0,1
1,1
2,1
3,1
4,1


In [232]:
# Feature columns passed to the one-hot encoding step, listed in the same
# order in which they appear in the `customers` frame.
categorical_cols = [
    'City_Tier',
    'Payment',
    'Gender',
    'Service_Score',
    'account_segment',
    'CC_Agent_Score',
    'Marital_Status',
    'Complain_ly',
    'Login_device',
]

In [233]:
# One-hot encode the categorical features.
# The columns are named explicitly instead of passing only a `prefix` list and
# letting pandas choose the columns by dtype: a prefix list is matched
# positionally against whatever columns pandas selects, so a dtype change in
# the input could misalign the prefixes or raise. With `columns=` given, the
# default prefix is each column's own name, so the dummy column names are the
# same as before (e.g. 'City_Tier_tier_1').
# NOTE: this rebinds `customers`, so the cell is meant to run once per kernel
# session; re-running it without reloading the data raises because the
# original categorical columns are gone.
customers = pd.get_dummies(customers, columns=categorical_cols)

In [234]:
# Preview the first five rows of the feature frame after encoding.
customers.head(n=5)

Unnamed: 0,Tenure,CC_Contacted_LY,Account_user_count,rev_per_month,rev_growth_yoy,coupon_used_for_payment,Day_Since_CC_connect,cashback,City_Tier_NOT_FILL,City_Tier_tier_1,City_Tier_tier_2,City_Tier_tier_3,Payment_Cash on Delivery,Payment_Credit Card,Payment_Debit Card,Payment_E wallet,Payment_NOT_FILL,Payment_UPI,Gender_Female,Gender_Male,Gender_NOT_FILL,Service_Score_Bad,Service_Score_Good,Service_Score_NOT_FILL,Service_Score_OK,account_segment_HNI,account_segment_NOT_FILL,account_segment_Regular,account_segment_Regular Plus,account_segment_Super,account_segment_Super Plus,CC_Agent_Score_Bad,CC_Agent_Score_Good,CC_Agent_Score_NOT_FILL,CC_Agent_Score_OK,Marital_Status_Divorced,Marital_Status_Married,Marital_Status_NOT_FILL,Marital_Status_Single,Complain_ly_NOT_FILL,Complain_ly_No,Complain_ly_Yes,Login_device_Computer,Login_device_Mobile,Login_device_NOT_FILL
0,4.0,6.0,3.0,9.0,11.0,1.0,5.0,160.0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,1,0
1,0.0,8.0,4.0,7.0,15.0,0.0,0.0,121.0,0,1,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0
2,0.0,30.0,4.0,6.0,14.0,0.0,3.0,196.235376,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,1,0,0,1,0,1,0
3,0.0,15.0,4.0,8.0,23.0,0.0,3.0,134.0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0
4,0.0,12.0,3.0,3.0,11.0,1.0,3.0,130.0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,1,0,1,0,0,1,0


In [235]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# labels and uses a fixed seed so it is repeatable.
customers_train, customers_test, labels_train, labels_test = train_test_split(
    customers,
    labels,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

6. Train model

In [236]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [237]:
# Churn is a class label, so the tree must be a classifier. A regression tree
# predicts leaf means, which the classification metrics used below can only
# score while every prediction happens to be exactly 0 or 1.
decision_tree_model = DecisionTreeClassifier(random_state=42)
# max_iter is raised to 3000 for the lbfgs solver; both models share seed 42.
lg_model = LogisticRegression(random_state=42, max_iter=3000, solver='lbfgs')


In [238]:
# Fit both estimators on the training split.
# The logistic regression receives the labels flattened to a 1-D array; the
# tree is given the one-column label frame as-is.
train_target_flat = labels_train.to_numpy().ravel()
decision_tree_model.fit(customers_train, labels_train)
lg_model.fit(customers_train, train_target_flat)

In [239]:
# Generate test-set predictions from each fitted model
# (pred_lg: logistic regression, pred_dt: decision tree).
pred_lg = lg_model.predict(customers_test)
pred_dt = decision_tree_model.predict(customers_test)

In [240]:
def print_score(labels, pred):
    """Print accuracy, precision, recall and F1 for a set of predictions.

    Each metric is written to stdout on its own line as
    ``"<Metric> score <value>"``. Every metric function is called with its
    default arguments.

    Args:
        labels: Ground-truth target values.
        pred: Predicted target values, in the same order as ``labels``.

    Returns:
        None. The scores are only printed.
    """
    # Label spelling fixed: this line previously printed "Accurancy score".
    print("Accuracy score", accuracy_score(labels, pred))
    print("Precision score", precision_score(labels, pred))
    print("Recall score", recall_score(labels, pred))
    print("F1 score", f1_score(labels, pred))

In [241]:
# Report the test-set metrics for the decision tree predictions.
print_score(labels_test, pred_dt)

Accurancy score 0.94449378330373
Precision score 0.8207070707070707
Recall score 0.8575197889182058
F1 score 0.8387096774193549


In [242]:
# Report the test-set metrics for the logistic regression predictions.
print_score(labels_test, pred_lg)

Accurancy score 0.8872113676731794
Precision score 0.7659574468085106
Recall score 0.47493403693931396
F1 score 0.5863192182410424
