## Customer Churn Prediction Model

Using the [Telco customer churn data](https://www.kaggle.com/code/mechatronixs/telco-churn-prediction-feature-engineering-eda/data) from Kaggle, train a machine learning model to predict customer churn.

In [189]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import category_encoders as ce
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn import tree
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.neural_network import MLPClassifier


from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from math import sqrt

from sklearn.model_selection import cross_validate, train_test_split
from sklearn.tree import DecisionTreeRegressor

# Raise pandas' display limits (row count, column count, line width)
# so that wide frames are not truncated when shown in the notebook.
for option_name, option_value in (
    ('display.max_rows', 5000),
    ('display.max_columns', 5000),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [191]:
# Training data: read the CSV and discard customerID, which is an
# identifier rather than a feature to train on.
train = pd.read_csv("./data/training_data.csv").drop(columns="customerID")

# Validation data: read as-is.
val = pd.read_csv("./data/validation_data.csv")

In [192]:
# Some rows hold a single space in TotalCharges instead of a number.
# Replace those blanks with '0.0', then convert the whole column to float.
blank_total = train['TotalCharges'] == " "
spaces = train[blank_total]  # the affected rows, kept for inspection

# Assign through one .loc call on the frame itself. The chained form
# train['TotalCharges'].loc[mask] = ... writes to an intermediate Series:
# pandas warns about it, and with copy-on-write the write is lost, which
# would leave the blanks to be coerced to NaN below.
train.loc[blank_total, 'TotalCharges'] = '0.0'

# Anything else that is not parseable as a number becomes NaN.
train['TotalCharges'] = pd.to_numeric(train['TotalCharges'], errors='coerce')
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train['TotalCharges'].loc[train['TotalCharges'] == " "] = '0.0'


In [193]:
# Names of the categorical columns to encode (the list also contains
# the 'Churn' column).
category_columns = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaperlessBilling',
    'PaymentMethod',
    'Churn',
]


In [194]:
# One-hot encode each categorical column with its own fitted encoder.
#
# col_mapper maps column name -> fitted OneHotEncoder, so each column's
# encoding can be reused or inspected later.
#
# The result is stored in `train_encoded`; `train` itself is left untouched.
# Overwriting `train` here removed the raw categorical columns that later
# cells still select by name (train[category_columns]), and made this cell
# fail with a KeyError when run a second time.
col_mapper = {}
encoded_parts = []
for col in category_columns:
    # drop='if_binary' yields a single 0/1 column for two-valued features.
    ohe = OneHotEncoder(drop='if_binary')
    encoded_column = ohe.fit_transform(train[[col]])
    col_mapper[col] = ohe
    encoded_parts.append(
        pd.DataFrame(
            encoded_column.toarray(),
            columns=ohe.get_feature_names_out([col]),
            index=train.index,  # align on train's index, not a fresh RangeIndex
        )
    )

# Concatenate once instead of rebuilding the frame on every iteration.
# Column order: the non-categorical columns first, then the encoded columns
# in category_columns order.
train_encoded = pd.concat(
    [train.drop(columns=category_columns), *encoded_parts], axis=1
)
train_encoded.head()

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No,MultipleLines_No phone service,MultipleLines_Yes,InternetService_DSL,InternetService_Fiber optic,InternetService_No,OnlineSecurity_No,OnlineSecurity_No internet service,OnlineSecurity_Yes,OnlineBackup_No,OnlineBackup_No internet service,OnlineBackup_Yes,DeviceProtection_No,DeviceProtection_No internet service,DeviceProtection_Yes,TechSupport_No,TechSupport_No internet service,TechSupport_Yes,StreamingTV_No,StreamingTV_No internet service,StreamingTV_Yes,StreamingMovies_No,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_Month-to-month,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Bank transfer (automatic),PaymentMethod_Credit card (automatic),PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,0,5,75.15,392.65,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0,66,63.85,4264.6,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0,42,73.15,3088.25,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0,19,69.6,1394.55,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,0,59,20.2,1192.3,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0


In [None]:
# One-hot encode all categorical columns in a single pass.
# Requires `train` to still contain the raw columns listed in
# category_columns, since they are selected by name below.

# One encoder for every column; two-valued features collapse to one 0/1 column.
ohe = OneHotEncoder(drop='if_binary')

# Fit on the categorical columns and densify the encoder's output.
encoded_sparse = ohe.fit_transform(train[category_columns])
encoded_columns = encoded_sparse.toarray()

# Wrap the encoded matrix in a frame labelled with the generated feature names.
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=ohe.get_feature_names_out(category_columns),
)

# Remaining (non-categorical) columns followed by the encoded ones.
remaining_df = train.drop(columns=category_columns)
final_df = pd.concat([remaining_df, encoded_df], axis=1)

final_df.head()

In [195]:
# Separate the target (Churn_Yes) from the feature columns.
target_column = 'Churn_Yes'
y = final_df[target_column]
X = final_df.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the seed is fixed for repeatable splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [196]:
# Decision tree classifier, depth capped at 5 (author's rule of thumb),
# Gini impurity as the split criterion, fixed seed for repeatable results.
dtc = tree.DecisionTreeClassifier(max_depth=5, random_state=42, criterion='gini')

# Fit on the training split and predict the held-out split.
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

# Report accuracy on the held-out split.
# (A mean_squared_error call used to sit here, but its result was neither
# stored nor displayed, so it has been removed.)
accuracy = accuracy_score(y_test, y_pred)
print('DecisionTreeClassifier accuracy score: {}'.format(accuracy))

DecisionTreeClassifier accuracy score: 0.7975402081362346


## THE EXTRA CELLS 

In [None]:
# Show how many columns `train` currently has, then preview its first rows.
print('colum number: {}'.format(train.shape[1]))
train.head()

In [None]:
# Label-encode every categorical column of `train` in place.
#
# Each column gets its own LabelEncoder, kept in label_encoders
# (column name -> fitted encoder). Re-fitting one shared encoder on every
# column keeps only the last column's class mapping, so the integer codes
# of the other columns could not be decoded or applied to other data.
label_encoders = {}
for col in category_columns:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    label_encoders[col] = le

# The encoded categorical columns as a DataFrame.
encoded_columns = train[category_columns]

train.head()

In [None]:
# Gradient-boosted tree classifier; hyper-parameters gathered in one place.
xgb_params = {
    'n_estimators': 1000,
    'learning_rate': 0.06,
    'n_jobs': 4,
    'max_depth': 5,
    'random_state': 42,
}
xgb_model = xgb.XGBClassifier(**xgb_params)

# Fit on the training split.
xgb_model.fit(X_train, y_train)

# Score on the held-out split and report it.
accuracy = xgb_model.score(X_test, y_test)
print('Accuracy:', accuracy)

In [None]:
# Create a neural network with two hidden layers
clf = MLPClassifier(hidden_layer_sizes=(10, 10), max_iter=1000, random_state=42)

# Train the neural network on the training data
clf.fit(X_train, y_train)

# Evaluate the performance of the neural network on the testing data
score = clf.score(X_test, y_test)
print("Accuracy:", score)

In [None]:
# List the columns currently present in `train`.
column_names = train.columns.tolist()
print(column_names)

# Split the column names into groups: Contract is listed as an ordered
# column, the remaining string-valued columns as plain categories.
category_columns = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'PaperlessBilling',
    'PaymentMethod',
    'Churn',
]
ordered_columns = ['Contract']
numeric_columns = [
    'SeniorCitizen',
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]

In [None]:
# Alternative grouping: the add-on service columns are listed with
# Contract as ordered columns instead of plain categories.
category_columns = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'MultipleLines',
    'InternetService',
    'PaperlessBilling',
    'PaymentMethod',
    'Churn',
]
ordered_columns = [
    'Contract',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
]
numeric_columns = [
    'SeniorCitizen',
    'tenure',
    'MonthlyCharges',
    'TotalCharges',
]

In [None]:
# Keep a single "No internet service" indicator: the OnlineSecurity one is
# renamed to a generic name so it no longer matches the suffix pattern below.
no_internet_rename = {'OnlineSecurity_No internet service': 'No internet service'}
final_df = final_df.rename(columns=no_internet_rename)

# Every other column ending in "_No internet service" is then removed.
cols_to_drop = final_df.filter(regex='_No internet service$').columns
final_df = final_df.drop(columns=cols_to_drop)

# Preview the result.
final_df.head()