### Introduction ###
Customer Churn Prediction

Customer churn prediction aims to identify customers likely to leave a service. Key factors include tenure (the duration of a customer's relationship with the company), monthly charges (the regular fees customers pay), and total charges (the accumulated amount paid by a customer). By analyzing these metrics, businesses can detect patterns and trends that signal potential churn. Accurate predictions enable companies to implement targeted retention strategies, improving customer satisfaction and reducing turnover. Leveraging data analytics, companies can proactively address issues and enhance customer loyalty.



In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as ex
import plotly.graph_objs as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots
import plotly.offline as pyo
# pyo.init_notebook_mode()
# sns.set_style('darkgrid')
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score as f1
from sklearn.metrics import confusion_matrix
# import scikitplot as skplt

In [3]:
#load dataset
df =pd.read_csv(r'C:\Users\Theodore\PycharmProjects\Customer_churn1\customer_churn_data.csv')

df.head()




Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,CUST0000,Male,0,No,Yes,23,No,No phone service,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Bank transfer,49.85,1146.55,No
1,CUST0001,Female,0,Yes,No,43,No,No phone service,DSL,Yes,...,Yes,No,Yes,No,Month-to-month,No,Mailed check,100.7,4330.1,Yes
2,CUST0002,Male,1,No,No,51,Yes,No,DSL,No,...,Yes,Yes,No,No,One year,No,Electronic check,97.33,4963.83,Yes
3,CUST0003,Male,1,No,No,72,Yes,Yes,DSL,Yes,...,Yes,No,No,No,Month-to-month,No,Credit card,101.38,7299.36,No
4,CUST0004,Male,1,No,No,25,Yes,Yes,DSL,No,...,No,Yes,No,Yes,Month-to-month,No,Electronic check,52.22,1305.5,Yes


In [4]:
df.shape

(5880, 21)

In [5]:
df.info

<bound method DataFrame.info of      customerID  gender  SeniorCitizen Partner Dependents  tenure  \
0      CUST0000    Male              0      No        Yes      23   
1      CUST0001  Female              0     Yes         No      43   
2      CUST0002    Male              1      No         No      51   
3      CUST0003    Male              1      No         No      72   
4      CUST0004    Male              1      No         No      25   
...         ...     ...            ...     ...        ...     ...   
5875   CUST5875    Male              0     Yes        Yes      71   
5876   CUST5876    Male              0      No         No      22   
5877   CUST5877  Female              0      No         No      68   
5878   CUST5878  Female              0     Yes        Yes      14   
5879   CUST5879  Female              1     Yes         No      23   

     PhoneService     MultipleLines InternetService       OnlineSecurity  ...  \
0              No  No phone service              No  No in

In [6]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
Churn                object
dtype: object

In [7]:
df["SeniorCitizen"]= df["SeniorCitizen"].map({0: "No", 1: "Yes"})
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,CUST0000,Male,No,No,Yes,23,No,No phone service,No,No internet service,...,No internet service,No internet service,No internet service,No internet service,Month-to-month,Yes,Bank transfer,49.85,1146.55,No
1,CUST0001,Female,No,Yes,No,43,No,No phone service,DSL,Yes,...,Yes,No,Yes,No,Month-to-month,No,Mailed check,100.7,4330.1,Yes
2,CUST0002,Male,Yes,No,No,51,Yes,No,DSL,No,...,Yes,Yes,No,No,One year,No,Electronic check,97.33,4963.83,Yes
3,CUST0003,Male,Yes,No,No,72,Yes,Yes,DSL,Yes,...,Yes,No,No,No,Month-to-month,No,Credit card,101.38,7299.36,No
4,CUST0004,Male,Yes,No,No,25,Yes,Yes,DSL,No,...,No,Yes,No,Yes,Month-to-month,No,Electronic check,52.22,1305.5,Yes


In [8]:
g_labels = ['Male', 'Female']
c_labels = ['No', 'Yes']
# Create subplots: use 'domain' type for Pie subplot
fig = make_subplots(rows=1, cols=2, specs=[[{'type':'domain'}, {'type':'domain'}]])
fig.add_trace(go.Pie(labels=g_labels, values=df['gender'].value_counts(), name="Gender"),
              1, 1)
fig.add_trace(go.Pie(labels=c_labels, values=df['Churn'].value_counts(), name="Churn"),
              1, 2)

# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=.4, hoverinfo="label+percent+name", textfont_size=16)

fig.update_layout(
    title_text="Gender and Churn Distributions",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text='Gender', x=0.16, y=0.5, font_size=20, showarrow=False),
                 dict(text='Churn', x=0.84, y=0.5, font_size=20, showarrow=False)])
fig.show()

In [9]:
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
df[numerical_cols].describe()

Unnamed: 0,tenure,MonthlyCharges,TotalCharges
count,5880.0,5880.0,5880.0
mean,36.54915,70.157779,2566.813165
std,20.909674,28.804615,1910.017743
min,1.0,20.0,20.03
25%,18.0,45.7175,1020.2175
50%,37.0,70.155,2136.445
75%,55.0,95.4575,3767.665
max,72.0,119.99,8589.6


In [10]:
df["Churn"][df["Churn"]=="No"].groupby(by=df["gender"]).count()

gender
Female    1498
Male      1484
Name: Churn, dtype: int64

In [11]:
df["Churn"][df["Churn"]=="Yes"].groupby(by=df["gender"]).count()


gender
Female    1432
Male      1466
Name: Churn, dtype: int64

In [12]:
# customer payment method

labels = df['PaymentMethod'].unique()
values = df['PaymentMethod'].value_counts()

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=.3)])
fig.update_layout(title_text="<b>Payment Method Distribution</b>")
fig.show()

In [13]:
# Customer payment Method Churn

fig = ex.histogram(df, x="Churn", color="PaymentMethod", title="<b>Customer Payment Method distribution  Churn</b>")
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

In [14]:
df["InternetService"].unique()

array(['No', 'DSL', 'Fiber optic'], dtype=object)

In [15]:

 df[df["gender"]=="Male"][["InternetService", "Churn"]].value_counts()

InternetService  Churn
DSL              No       513
No               No       498
                 Yes      496
DSL              Yes      485
Fiber optic      Yes      485
                 No       473
Name: count, dtype: int64

In [16]:
fig = go.Figure()

fig.add_trace(go.Bar(
  x = [['Churn:No', 'Churn:No', 'Churn:Yes', 'Churn:Yes'],
       ["Female", "Male", "Female", "Male"]],
  y = [965, 992, 219, 240],
  name = 'DSL',
))

fig.add_trace(go.Bar(
  x = [['Churn:No', 'Churn:No', 'Churn:Yes', 'Churn:Yes'],
       ["Female", "Male", "Female", "Male"]],
  y = [889, 910, 664, 633],
  name = 'Fiber optic',
))

fig.add_trace(go.Bar(
  x = [['Churn:No', 'Churn:No', 'Churn:Yes', 'Churn:Yes'],
       ["Female", "Male", "Female", "Male"]],
  y = [690, 717, 56, 57],
  name = 'No Internet',
))

fig.update_layout(title_text="<b>Churn Distribution  Internet Service and Gender</b>")

fig.show()



Many customers choose Fiber optic service, but it has a high churn rate, indicating possible dissatisfaction. In contrast, DSL service has more customers and a lower churn rate compared to Fiber optic service.
