## CHURN PREDICTION PROJECT

In [20]:
# i. Import necessary Libraries
# Import necessary libraries
import pandas as pd
import numpy as np
# import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
# ii. Import the dataset
 
df = pd.read_csv("Telco-Customer-Churn.csv")
df.sample()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
4411,3621-CHYVB,Female,0,Yes,No,57,No,No phone service,DSL,Yes,...,Yes,Yes,No,Yes,Month-to-month,Yes,Bank transfer (automatic),49.9,2782.4,No


# Data Preparation

In [3]:
df.columns = df.columns.str.lower().str.replace(' ', '_')           # This would convert all columns to lowercase

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)  # Create a variable for all categorical columns

for c in categorical_columns:            # Convert all values in the categorical columns to lowercase for uniformity                         
    df[c] = df[c].str.lower().str.replace(' ', '_')


df.sample()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
5212,3511-bftjw,male,0,yes,yes,72,no,no_phone_service,dsl,yes,...,yes,no,no,no,two_year,no,credit_card_(automatic),38.5,2763,no


In [4]:
# Check the column type and prescence of null values in dataset

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerid        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   seniorcitizen     7043 non-null   int64  
 3   partner           7043 non-null   object 
 4   dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   phoneservice      7043 non-null   object 
 7   multiplelines     7043 non-null   object 
 8   internetservice   7043 non-null   object 
 9   onlinesecurity    7043 non-null   object 
 10  onlinebackup      7043 non-null   object 
 11  deviceprotection  7043 non-null   object 
 12  techsupport       7043 non-null   object 
 13  streamingtv       7043 non-null   object 
 14  streamingmovies   7043 non-null   object 
 15  contract          7043 non-null   object 
 16  paperlessbilling  7043 non-null   object 


In [5]:
# Change total charges to integer type

df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')
df['totalcharges'] = df['totalcharges'].fillna(df.totalcharges.mean())
df['totalcharges'].dtypes


dtype('float64')

In [8]:
# Change total charges to integer type

df['churn'] = (df['churn']=='yes').astype(int)

df['churn'].isnull().sum()

0

In [12]:
df['churn'].value_counts(normalize=True)

0.7346301292063041

Feature Importance

In [10]:
from IPython.display import display

main_churn_rate = df.churn.mean()

In [11]:
for col in categorical_columns:
    print(col)
    df_groupby = df.groupby(col).churn.agg(['mean', 'count'])
    df_groupby['diff'] = df_groupby['mean'] - main_churn_rate
    df_groupby['risk'] = df_groupby['mean'] / main_churn_rate
    display(df_groupby)
    print()

customerid


Unnamed: 0_level_0,mean,count,diff,risk
customerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0002-orfbo,0.0,1,-0.26537,0.000000
0003-mknfe,0.0,1,-0.26537,0.000000
0004-tlhlj,1.0,1,0.73463,3.768325
0011-igkff,1.0,1,0.73463,3.768325
0013-exchz,1.0,1,0.73463,3.768325
...,...,...,...,...
9987-lutyd,0.0,1,-0.26537,0.000000
9992-rramn,1.0,1,0.73463,3.768325
9992-ujoel,0.0,1,-0.26537,0.000000
9993-lhieb,0.0,1,-0.26537,0.000000



gender


Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.269209,3488,0.003839,1.014466
male,0.261603,3555,-0.003766,0.985807



partner


Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.32958,3641,0.06421,1.241964
yes,0.196649,3402,-0.068721,0.741038



dependents


Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.312791,4933,0.047422,1.1787
yes,0.154502,2110,-0.110868,0.582215



phoneservice


Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.249267,682,-0.016103,0.939319
yes,0.267096,6361,0.001726,1.006506



multiplelines


Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.250442,3390,-0.014927,0.943749
no_phone_service,0.249267,682,-0.016103,0.939319
yes,0.286099,2971,0.020729,1.078114



internetservice


Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.189591,2421,-0.075779,0.714441
fiber_optic,0.418928,3096,0.153558,1.578656
no,0.07405,1526,-0.19132,0.279044



onlinesecurity


Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.417667,3498,0.152297,1.573906
no_internet_service,0.07405,1526,-0.19132,0.279044
yes,0.146112,2019,-0.119258,0.550597



onlinebackup


Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.399288,3088,0.133918,1.504645
no_internet_service,0.07405,1526,-0.19132,0.279044
yes,0.215315,2429,-0.050055,0.811377



deviceprotection


Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.391276,3095,0.125906,1.474456
no_internet_service,0.07405,1526,-0.19132,0.279044
yes,0.225021,2422,-0.040349,0.847951



techsupport


Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.416355,3473,0.150985,1.56896
no_internet_service,0.07405,1526,-0.19132,0.279044
yes,0.151663,2044,-0.113706,0.571517



streamingtv


Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.335231,2810,0.069861,1.263261
no_internet_service,0.07405,1526,-0.19132,0.279044
yes,0.300702,2707,0.035332,1.133143



streamingmovies


Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.336804,2785,0.071434,1.269188
no_internet_service,0.07405,1526,-0.19132,0.279044
yes,0.299414,2732,0.034044,1.128291



contract


Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.427097,3875,0.161727,1.60944
one_year,0.112695,1473,-0.152675,0.424672
two_year,0.028319,1695,-0.237051,0.106714



paperlessbilling


Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.163301,2872,-0.102069,0.615371
yes,0.335651,4171,0.070281,1.264842



paymentmethod


Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.167098,1544,-0.098271,0.629681
credit_card_(automatic),0.152431,1522,-0.112939,0.57441
electronic_check,0.452854,2365,0.187484,1.706502
mailed_check,0.191067,1612,-0.074303,0.720003



totalcharges


Unnamed: 0_level_0,mean,count,diff,risk
totalcharges,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
18.80,0.0,1,-0.26537,0.000000
18.85,0.5,2,0.23463,1.884163
18.90,0.0,1,-0.26537,0.000000
19.00,0.0,1,-0.26537,0.000000
19.05,0.0,1,-0.26537,0.000000
...,...,...,...,...
8564.75,0.0,1,-0.26537,0.000000
8594.40,0.0,1,-0.26537,0.000000
8670.10,0.0,1,-0.26537,0.000000
8672.45,0.0,1,-0.26537,0.000000



churn


Unnamed: 0_level_0,mean,count,diff,risk
churn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.0,5174,-0.26537,0.0
1,1.0,1869,0.73463,3.768325





One-hot encoding

In [12]:
# Choose feature variables, X and target variable,y.

X = df.drop('churn', axis=1)

y= df.churn

In [15]:
# Encode the categorical variables and split te dataset to train and test sets

from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split

dv = DictVectorizer(sparse=False)

X = df.to_dict(orient='records')

x_train, x_test, y_train, y_test = train_test_split(X,y,train_size=0.8, random_state=42)

x_train = dv.fit_transform(x_train)
x_test = dv.transform(x_test)

In [16]:
# Scale the features

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
x_train =scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

In [21]:
!pip install xgboost
import xgboost



In [22]:
# Now, let us define the models
models = {
    'Logistic Regression': LogisticRegression(C = 10, solver = 'liblinear'),
    'Random Forest': RandomForestClassifier(criterion = 'entropy', n_estimators = 100),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XG Boost': xgboost.XGBClassifier(objective="binary:logistic", random_state=42)
}

# Also, we will initiate an empty dataframe to store our metrics
results_df = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

# Then, we will loop through the models to train, make predictions, and calculate the metrics
for model_name, model in models.items():
    # Training the model
    model.fit(x_train, y_train)

    # Predict
    y_pred = model.predict(x_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # create dictionary with results
    results_dict = {
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

    # Store metrics in results_df
    results_df = pd.concat([results_df, pd.DataFrame([results_dict])], ignore_index=True)
    # as at new release of pandas, the append function has been removed

# # Print out results dataframe
# print("\nMetrics Comparison")
# print(results_df)
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.996451,1.0,0.986595,0.993252
1,Random Forest,0.976579,0.997076,0.914209,0.953846
2,Gradient Boosting,1.0,1.0,1.0,1.0
3,XG Boost,1.0,1.0,1.0,1.0


In [26]:
model = LogisticRegression(C = 10, solver = 'liblinear')

model.fit(x_train, y_train)
ypred = model.predict(x_test)

rep = classification_report(y_test,ypred)
print(rep)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1036
           1       1.00      0.99      0.99       373

    accuracy                           1.00      1409
   macro avg       1.00      0.99      1.00      1409
weighted avg       1.00      1.00      1.00      1409



In [27]:
# Save Machine Learning Model

import pickle as pkl
with open('churn_modell.pkl', 'wb') as file:
    pkl.dump((model, dv, scaler), file)
