# **CUSTOMER CHURN PREDICTION**

## LIBRARIES

In [None]:
import pandas as pd
import numpy as np
import missingno as msno
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
warnings.filterwarnings('ignore')

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.metrics import recall_score, confusion_matrix, precision_score, f1_score, accuracy_score, classification_report
#from imblearn.combine import SMOTEENN
from sklearn.feature_selection import SelectKBest
from collections import Counter
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout, GRU, Bidirectional
from keras.optimizers import SGD

## Undertanding the data

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
df = pd.read_csv("/content/drive/MyDrive/WA_Fn-UseC_-Telco-Customer-Churn.csv")
df.head(5)

FileNotFoundError: [Errno 2] No such file or directory: '/content/drive/MyDrive/WA_Fn-UseC_-Telco-Customer-Churn.csv'

In [None]:
df.shape

In [None]:
df.info()

In [None]:
df.columns.values

## DATA MANIPULATION

In [None]:
df = df.drop(['customerID'], axis = 1)
df.head()

Here we see that the TotalCharges has 11 missing values


In [None]:
df['TotalCharges'] = pd.to_numeric(df.TotalCharges, errors='coerce')
df.isnull().sum()

 selecting rows from the DataFrame df where the 'TotalCharges' column contains NaN values.


In [None]:
df[np.isnan(df['TotalCharges'])]

the index labels of the rows where the 'tenure' column has a value of 0 in the DataFrame df

In [None]:
df[df['tenure'] == 0].index

There are no additional missing values in the Tenure column.
Let's delete the rows with missing values in Tenure columns since there are only 11 rows and deleting them will not affect the data.
1st-> code drops the rows from the df where the 'tenure' column has a value of 0

2nd->index verifies if there are still rows with 'tenure' equal to 0.

In [None]:
df.drop(labels=df[df['tenure'] == 0].index, axis=0, inplace=True)
df[df['tenure'] == 0].index

 then verify that there are no missing values left by checking df.isnull().sum() again

In [None]:
print(df.isnull().sum())
print(df.shape)

mapping the senior citizen
No - 0
yes - 1

In [None]:
df["SeniorCitizen"]= df["SeniorCitizen"].map({0: "No", 1: "Yes"})
df.head()

In [None]:
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
df[numerical_cols].describe()

## DATA VISUALIZATION

In [None]:
g_labels = ['Male', 'Female']
c_labels = ['No', 'Yes']
# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=g_labels, values=df['gender'].value_counts(), name="Gender"),
              1, 1)
fig.add_trace(go.Pie(labels=c_labels, values=df['Churn'].value_counts(), name="Churn"),
              1, 2)

# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.4, hoverinfo="label+percent+name", textfont_size=16)


*   26.6 % of customers switched to another firm.

*   Customers are 49.5 % female and 50.5 % male.



In [None]:
plt.figure(figsize=(6, 6))
labels =["Churn: Yes","Churn:No"]
values = [1869,5163]
labels_gender = ["F","M","F","M"]
sizes_gender = [939,930 , 2544,2619]
colors = ['#ff6666', '#66b3ff']
colors_gender = ['#c2c2f0','#ffb3e6', '#c2c2f0','#ffb3e6']
explode = (0.3,0.3)
explode_gender = (0.1,0.1,0.1,0.1)
textprops = {"fontsize":15}
plt.pie(values, labels=labels,autopct='%1.1f%%',pctdistance=1.08, labeldistance=0.8,colors=colors, startangle=90,frame=True, explode=explode,radius=10, textprops =textprops, counterclock = True, )
plt.pie(sizes_gender,labels=labels_gender,colors=colors_gender,startangle=90, explode=explode_gender,radius=7, textprops =textprops, counterclock = True, )
#Draw circle
centre_circle = plt.Circle((0,0),5,color='black', fc='white',linewidth=0)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.title('Churn Distribution on Gender: Male(M), Female(F)', fontsize=15, y=1.1)
plt.axis('equal')
plt.tight_layout()
plt.show()

There is negligible difference in customer percentage/ count who chnaged the service provider. Both genders behaved in similar fashion when it comes to migrating to another service provider/firm.

In [None]:
fig = px.histogram(df, x="Churn", color="Contract", barmode="group", title="<b>Customer contract distribution<b>")
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

About 75% of customer with Month-to-Month Contract opted to move out as compared to 13% of customrs with One Year Contract and 3% with Two Year Contract

In [None]:
fig = px.histogram(df, x="Churn", color="PaymentMethod", title="<b>Customer Payment Method distribution Churn</b>")
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()



1.   Major customers who moved out were having Electronic Check as Payment Method.
2.   Customers who opted for Credit-Card automatic transfer or Bank Automatic Transfer and Mailed Check as Payment Method were less likely to move out.

In [None]:
fig = go.Figure()

fig.add_trace(go.Bar(
  x = ['Churn:No', 'Churn:Yes'],
  y = [965+992, 219+240],
  name = 'DSL',
))

fig.add_trace(go.Bar(
  x = ['Churn:No', 'Churn:Yes'],
  y = [889+910, 664+633],
  name = 'Fiber optic',
))

fig.add_trace(go.Bar(
  x = ['Churn:No', 'Churn:Yes'],
  y = [690+717, 56+57],
  name = 'No Internet',
))

fig.update_layout(width=700, height=500, bargap=0.1, title_text="<b>Churn Distribution on Internet Service and Gender</b>")

fig.show()



1.   A lot of customers choose the Fiber optic service and it's also evident that the customers who use Fiber optic have high churn rate, this might suggest a dissatisfaction with this type of internet service.


In [None]:
color_map = {"Yes", "No"}
fig = px.histogram(df, x="Churn", color="Dependents", barmode="group", title="<b>Dependents distribution</b>")
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

Customers without dependents are more likely to churn

1.  It can be observed that the fraction of senior citizen is very less.

2.   Most of the senior citizens churn



## Data Preprocessing
# Splitting the data into train and test sets

This function `object_to_int` takes a series df and checks if its data type is 'object'. If it is, it uses LabelEncoder to transform categorical data into integers. Finally, it returns the modified series.

In [None]:
def object_to_int(dataframe_series):
    if dataframe_series.dtype=='object':
        dataframe_series = LabelEncoder().fit_transform(dataframe_series)
    return dataframe_series

This code applies `object_to_int` to each column of df converting categorical data into integers using LabelEncoder. Then, it display first few rows

In [None]:
df = df.apply(lambda x: object_to_int(x))
df.head()

In [None]:
X = df.drop(columns = ['Churn'])
y = df['Churn'].values
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.8, random_state = 4, stratify=y)

In [None]:
def distplot(feature, frame, color='r'):
    plt.figure(figsize=(8,3))
    plt.title("Distribution for {}".format(feature))
    ax = sns.distplot(frame[feature], color= color)
num_cols = ["tenure", 'MonthlyCharges', 'TotalCharges']
for feat in num_cols: distplot(feat, df)

In [None]:
cat_cols_ohe =['PaymentMethod', 'Contract', 'InternetService'] # those that need one-hot encoding
cat_cols_le = list(set(X_train.columns)- set(num_cols) - set(cat_cols_ohe)) #those that need label encoding
scaler= StandardScaler()

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

## LSTM

In [None]:
regressor = Sequential()
# First LSTM layer with Dropout regularisation
regressor.add(LSTM(units=50, return_sequences=True, input_shape=(X_train.shape[1], 1)))
regressor.add(Dropout(0.2))
# Second LSTM layer
regressor.add(LSTM(units=50, return_sequences=True))
regressor.add(Dropout(0.2))
# Third LSTM layer
regressor.add(LSTM(units=50, return_sequences=True))
regressor.add(Dropout(0.2))
# Fourth LSTM layer
regressor.add(LSTM(units=50))
regressor.add(Dropout(0.2))
# The output layer
regressor.add(Dense(units=1))

# Compile the model
regressor.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
regressor.fit(X_train, y_train, epochs=10, batch_size=32)

# Evaluate the model
loss, accuracy = regressor.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

regressor.evaluate(X_test, y_test) indicates that the model was evaluated on 176 samples from the test dataset.

Loss: The loss calculated during evaluation is approximately 4.09.
Accuracy: The accuracy achieved on the test dataset is around  0.73.

In [None]:
regressor.evaluate(X_test, y_test)

here the model's predictions for the first three samples from the test dataset

In [None]:
yp = regressor.predict(X_test)
yp[:3]

 predicted by the model (yp) into binary churn predictions (0 or 1) based on a threshold of 0.5.

 For each predicted probability, if it's greater than 0.5, it means the model predicts churn (1), so it adds 1 to y_pred. Otherwise, if the probability is less than or equal to 0.5, it means the model predicts no churn (0), so it adds 0 to y_pred.

In [None]:
#converting the yp into 0,1
y_pred = []
for element in yp:
    if element > 0.5:
        y_pred.append(1)
    else:
        y_pred.append(0)

the first 10 elements
 1 for positive predictions and 0 for negative predictions

In [None]:
y_pred[:10]

 the first 10 elements of the true churn labels stored in y_test.

In [None]:
y_test[:10]

In [None]:
round((841+502)/(861+158+166+242),2)

accuracy_score(y_test, y_pred) calculates the accuracy score by comparing the predicted labels (y_pred) with the true labels (y_test).
print(f'LSTM Accuracy score : {accuracy_score(y_test, y_pred)}')prints the accuracy score of the LSTM model.

In [None]:
accuracy_lstm = accuracy_score(y_test,y_pred)
print(f'LSTM Accuracy score : {accuracy_score(y_test,y_pred)}')

it generates a classification report, including precision, recall, F1-score, and support metrics, by comparing the true labels (y_test) with the predicted labels (y_pred).

It then prints the classification report, displaying detailed metrics for each class

In [None]:
print(f'Classification report :\n {classification_report(y_test,y_pred)}')

In [None]:
plt.figure(figsize=(4,3))
sns.heatmap(confusion_matrix(y_pred, y_test),annot=True,fmt = "d",linecolor="k",linewidths=3)

plt.title("LSTM CONFUSION MATRIX",fontsize=14)
plt.xlabel('Predicted')
plt.ylabel('Truth')
plt.show()

In [None]:
algorithms = ['Precision', 'Recall', 'F1 Score', 'Accuracy']
accuracy_scores = [0.71, 0.74, 0.72, 0.84]
plt.figure(figsize=(8, 6))
plt.bar(algorithms, accuracy_scores, color=['red', 'yellow', 'green', 'blue'])
plt.title('Performance metrics')
plt.ylim(0.5, 1.0)
for i in range(len(algorithms)):
    plt.text(i, accuracy_scores[i], f'{accuracy_scores[i]:.2f}', ha='center', va='bottom')
plt.tight_layout()
plt.show()

## LOGISTIC REGRESSION

In [None]:
lr_model = LogisticRegression()
lr_model.fit(X_train,y_train)
accuracy_lr = lr_model.score(X_test,y_test)
print("Logistic Regression accuracy is :",accuracy_lr)

In [None]:
lr_pred= lr_model.predict(X_test)
report = classification_report(y_test,lr_pred)
print(report)

In [None]:
plt.figure(figsize=(4,3))
sns.heatmap(confusion_matrix(y_test, lr_pred),
                annot=True,fmt = "d",linecolor="k",linewidths=3)

plt.title("LOGISTIC REGRESSION CONFUSION MATRIX",fontsize=14)
plt.show()

## DECISION TREE

In [None]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train,y_train)
predictdt_y = dt_model.predict(X_test)
accuracy_dt = dt_model.score(X_test,y_test)
print("Decision Tree accuracy is :",accuracy_dt)


In [None]:
print(classification_report(y_test, predictdt_y))

In [None]:
plt.figure(figsize=(4,3))
sns.heatmap(confusion_matrix(y_test, predictdt_y),
                annot=True,fmt = "d",linecolor="k",linewidths=3)

plt.title("DECISION TREE CONFUSION MATRIX",fontsize=14)
plt.show()

## KNN

In [None]:
knn_model = KNeighborsClassifier(n_neighbors = 11)
knn_model.fit(X_train,y_train)
predicted_y = knn_model.predict(X_test)
accuracy_knn = knn_model.score(X_test,y_test)
print("KNN accuracy:",accuracy_knn)

In [None]:
print(classification_report(y_test, predicted_y))

In [None]:
plt.figure(figsize=(4,3))
sns.heatmap(confusion_matrix(y_test, y_pred),
                annot=True,fmt = "d",linecolor="k",linewidths=3)

plt.title(" KNN CONFUSION MATRIX",fontsize=14)
plt.show()

## RANDOM FOREST

In [None]:
model_rf = RandomForestClassifier(n_estimators=500 , oob_score = True, n_jobs = -1,
                                  random_state =50, max_features = "auto",
                                  max_leaf_nodes = 30)
model_rf.fit(X_train, y_train)
prediction_test = model_rf.predict(X_test)
accuracy_rf = model_rf.score(X_test,y_test)
print("Random Forest accuracy:",accuracy_rf)

In [None]:
print(classification_report(y_test, prediction_test))

In [None]:
plt.figure(figsize=(4,3))
sns.heatmap(confusion_matrix(y_test, prediction_test),
                annot=True,fmt = "d",linecolor="k",linewidths=3)

plt.title("RANDOM FOREST CONFUSION MATRIX",fontsize=14)
plt.show()

## COMPARISION

In [None]:
import matplotlib.pyplot as plt

# Define algorithm names and their corresponding accuracy scores
algorithms = ["LSTM", "Logistic Regression", "Decision Tree", "K-Nearest Neighbors", "Random Forest"]
accuracy_scores = [accuracy_lstm, accuracy_lr, accuracy_dt, accuracy_knn, accuracy_rf]

# Create bar graph
plt.figure(figsize=(10, 6))
plt.bar(algorithms, accuracy_scores, color=['blue', 'green', 'red', 'purple', 'orange'])

# Display values on top of bars
for i in range(len(algorithms)):
    plt.text(i, accuracy_scores[i], round(accuracy_scores[i], 4), ha='center', va='bottom')

# Add titles and labels
plt.title('Accuracy Scores of Different Algorithms')
plt.xlabel('Algorithms')
plt.ylabel('Accuracy')
plt.ylim(0, 1)  # Set y-axis limit to ensure accuracy scores are within the plot range
plt.xticks(rotation=45)  # Rotate x-axis labels for better readability
plt.tight_layout()

# Show the plot
plt.show()

In [None]:
models = pd.DataFrame({'Model': ['LSTM', 'Logistic Regression', 'Decision tree','KNN', 'Random forest'],
                       'Score': [accuracy_lstm, accuracy_lr, accuracy_dt, accuracy_knn, accuracy_rf]})

models.sort_values(by = 'Score', ascending = False, ignore_index = True)