### Step one: Importing libraries

In [1]:
#importing libraries
# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, f1_score, recall_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Import the Telco Churn Dataset

df = pd.read_csv(r"C:\Users\HP\Desktop\data_talks\Telco-Customer-Churn.csv")
df.sample()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
2940,6313-GIDIT,Male,1,No,No,53,No,No phone service,DSL,No,...,Yes,No,Yes,Yes,Month-to-month,Yes,Electronic check,54.45,2854.55,Yes


In [3]:
# Check the dataset columns

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [4]:
df.columns = df.columns.str.lower() # this will keep all the columns in lowercase

In [5]:
# Check the unique values of each dataset

for col in df.columns:
    print(f"The unique values in column {col} are: ")
    print(df[col].unique())
    print(" ")

The unique values in column customerid are: 
['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']
 
The unique values in column gender are: 
['Female' 'Male']
 
The unique values in column seniorcitizen are: 
[0 1]
 
The unique values in column partner are: 
['Yes' 'No']
 
The unique values in column dependents are: 
['No' 'Yes']
 
The unique values in column tenure are: 
[ 1 34  2 45  8 22 10 28 62 13 16 58 49 25 69 52 71 21 12 30 47 72 17 27
  5 46 11 70 63 43 15 60 18 66  9  3 31 50 64 56  7 42 35 48 29 65 38 68
 32 55 37 36 41  6  4 33 67 23 57 61 14 20 53 40 59 24 44 19 54 51 26  0
 39]
 
The unique values in column phoneservice are: 
['No' 'Yes']
 
The unique values in column multiplelines are: 
['No phone service' 'No' 'Yes']
 
The unique values in column internetservice are: 
['DSL' 'Fiber optic' 'No']
 
The unique values in column onlinesecurity are: 
['No' 'Yes' 'No internet service']
 
The unique values in column onlinebackup are: 
['Yes' 'No'

In [6]:
# Map the target variable, churn, to a numerical value as well as the totalcharges column 

df['churn'] = df['churn'].map({'Yes': 1, 'No': 0})
df['totalcharges'] = pd.to_numeric(df.totalcharges, errors="coerce")

df['totalcharges'].isnull().sum()

11

In [7]:
df['totalcharges'] = df['totalcharges'].fillna(0)

In [8]:
categorical_columns = list(df.dtypes[df.dtypes=="object"].index)

In [9]:
# Check for correlation between the target variable and the categorical columns

from sklearn.metrics import mutual_info_score

def mutual_churn(series):
    return mutual_info_score(series, df.churn)

cat_features_importance = df[categorical_columns].apply(mutual_churn)
cat_features_importance.sort_values(ascending=False)

customerid          0.578599
contract            0.098453
onlinesecurity      0.064677
techsupport         0.063021
internetservice     0.055574
onlinebackup        0.046792
paymentmethod       0.044519
deviceprotection    0.043917
streamingmovies     0.032001
streamingtv         0.031908
paperlessbilling    0.019194
dependents          0.014467
partner             0.011454
multiplelines       0.000801
phoneservice        0.000072
gender              0.000037
dtype: float64

In [10]:
# Check for correlation between the target variable and the numerical columns

numerical_columns = df.select_dtypes(exclude="object").columns

df[numerical_columns].corr(method="pearson")

Unnamed: 0,seniorcitizen,tenure,monthlycharges,totalcharges,churn
seniorcitizen,1.0,0.016567,0.220173,0.103006,0.150889
tenure,0.016567,1.0,0.2479,0.826178,-0.352229
monthlycharges,0.220173,0.2479,1.0,0.651174,0.193356
totalcharges,0.103006,0.826178,0.651174,1.0,-0.198324
churn,0.150889,-0.352229,0.193356,-0.198324,1.0


In [11]:
# Encode Categorical columns

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

df['contract_n']      = le.fit_transform(df['contract'])
df['onlinesecurity_n'] = le.fit_transform(df['onlinesecurity'])
df['techsupport_n']  = le.fit_transform(df['techsupport'])
df['internetservice_n']    = le.fit_transform(df['internetservice'])
df['onlinebackup_n']  = le.fit_transform(df['onlinebackup'])
df['paymentmethod_n']    = le.fit_transform(df['paymentmethod'])

In [12]:
# Choose he churn as dependent variable and the top correlated columns as independent variables

X = df[['paymentmethod_n', 'onlinebackup_n','internetservice_n','techsupport_n',
        'onlinesecurity_n','contract_n','seniorcitizen','tenure','monthlycharges','totalcharges']]

y = df["churn"]

In [13]:
# Standardize the numerical columns of X

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

col = ['seniorcitizen', 'tenure', 'monthlycharges', 'totalcharges']
X.loc[:, col] = scaler.fit_transform(X.loc[:, col])

In [14]:
x_train,x_test,y_train,y_test = train_test_split(X, y, train_size=0.8)
x_train.shape

(5634, 10)

In [15]:
!pip install xgboost
import xgboost



In [16]:
# Now, let us define the models
models = {
    'Logistic Regression': LogisticRegression(C = 10, solver = 'liblinear'),
    'Random Forest': RandomForestClassifier(criterion = 'entropy', n_estimators = 100),
    'Gradient Boosting': GradientBoostingClassifier(),
    'XG Boost': xgboost.XGBClassifier(objective="binary:logistic", random_state=42)
}

# Also, we will initiate an empty dataframe to store our metrics
results_df = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

# Then, we will loop through the models to train, make predictions, and calculate the metrics
for model_name, model in models.items():
    # Training the model
    model.fit(x_train, y_train)

    # Predict
    y_pred = model.predict(x_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # create dictionary with results
    results_dict = {
        'Model': model_name,
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1 Score': f1
    }

    # Store metrics in results_df
    results_df = pd.concat([results_df, pd.DataFrame([results_dict])], ignore_index=True)
    # as at new release of pandas, the append function has been removed

# # Print out results dataframe
# print("\nMetrics Comparison")
# print(results_df)
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,Logistic Regression,0.797729,0.616352,0.545961,0.579025
1,Random Forest,0.782825,0.582043,0.523677,0.55132
2,Gradient Boosting,0.793471,0.618881,0.493036,0.548837
3,XG Boost,0.782115,0.577381,0.54039,0.558273


In [17]:
# Using Logisitic Regression
log_model = LogisticRegression(C = 10, solver = 'liblinear')
log_model.fit(x_train,y_train)

y_predict = log_model.predict(x_test)

print(confusion_matrix(y_test,y_predict))
print(" ")
print(classification_report(y_test, y_predict))

[[928 122]
 [163 196]]
 
              precision    recall  f1-score   support

           0       0.85      0.88      0.87      1050
           1       0.62      0.55      0.58       359

    accuracy                           0.80      1409
   macro avg       0.73      0.71      0.72      1409
weighted avg       0.79      0.80      0.79      1409



In [18]:
#Save the model using joblib 

!pip install joblib
import joblib

joblib.dump(log_model, 'churn_prediction.joblib')   # save the random forest algorithm for this task



['churn_prediction.joblib']