# Churnobyl
### Data exploration

In [2]:
# Importing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pathlib import Path
import plotly.express as px
from IPython.display import Image
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [3]:
# Load the Telco customer-churn CSV, report its dimensions and preview three rows
csv_path = "../data/WA_Fn-UseC_-Telco-Customer-Churn.csv"
df = pd.read_csv(csv_path)
print(f"Shape of the data: {df.shape}")
df.head(3)

Shape of the data: (7043, 21)


Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes


In [4]:
# Is there a missing (NaN/None) cell anywhere in the frame?
# axis=None reduces over rows and columns at once and yields a single boolean.
df.isna().any(axis=None)

False

In [5]:
# Print each column's name, non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [6]:
# Dropping customerID column (it is not used anywhere in the analysis below).
# errors="ignore" keeps the cell re-runnable once the column is already gone;
# without it a second execution raises KeyError.
df = df.drop(columns=["customerID"], errors="ignore")
df.head(2)

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,0,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,0,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No


In [7]:
# Shape of the sub-frame whose TotalCharges entry is a single blank character
blank_total = df["TotalCharges"].eq(" ")
df.loc[blank_total].shape

(11, 20)

In [8]:
# Parse TotalCharges as numbers (entries that cannot be parsed become NaN),
# then tally the missing values per column
total_charges_numeric = pd.to_numeric(df["TotalCharges"], errors="coerce")
df["TotalCharges"] = total_charges_numeric
df.isna().sum()

gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64

In [9]:
# Shape of the sub-frame of customers whose tenure is zero
zero_tenure = df["tenure"].eq(0)
df.loc[zero_tenure].shape

(11, 20)

In [10]:
# Remove the zero-tenure rows by index label, then make sure TotalCharges is numeric
zero_tenure_rows = df.index[df["tenure"] == 0]
df = df.drop(index=zero_tenure_rows)
df["TotalCharges"] = pd.to_numeric(df["TotalCharges"], errors="coerce")

In [11]:
# Distinct values stored in the SeniorCitizen column
df["SeniorCitizen"].unique()

array([0, 1], dtype=int64)

In [12]:
# Encoding values: rewrite the 0/1 SeniorCitizen flag as "No"/"Yes".
# `replace` leaves values that are not keys of the mapping untouched, so re-running
# this cell is safe; `map` would turn the already-converted "No"/"Yes" strings
# into NaN on a second run.
df["SeniorCitizen"] = df["SeniorCitizen"].replace({0: "No", 1: "Yes"})
df.head()

Unnamed: 0,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,Female,No,Yes,No,1,No,No phone service,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,Male,No,No,No,34,Yes,No,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,Male,No,No,No,2,Yes,No,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,Male,No,No,No,45,No,No phone service,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,Female,No,No,No,2,Yes,No,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [13]:
# Summary (count / unique / top / freq) of the InternetService column.
# The `include=` argument was dropped: pandas ignores it when describing a Series.
df["InternetService"].describe()

count            7032
unique              3
top       Fiber optic
freq             3096
Name: InternetService, dtype: object

In [15]:
# Churn class balance as a donut chart.
# Labels and values both come from the same value_counts() result, so every slice
# is paired with its own label regardless of which class is more frequent.
churn_counts = df["Churn"].value_counts()

fig = go.Figure(
    go.Pie(
        labels=churn_counts.index.tolist(),
        values=churn_counts.tolist(),
        name="Churn",
    )
)

# Use `hole` to create a donut-like pie chart
fig.update_traces(hole=0.4, hoverinfo="label+percent+name", textfont_size=16)

fig.update_layout(
    title_text="Churn Distributions",
    # Add annotations in the center of the donut pies.
    annotations=[dict(text="Churn", x=0.5, y=0.5, font_size=20, showarrow=False)],
)

# Create the output folder first so the export does not fail on a fresh checkout
figures_dir = Path("../figures")
figures_dir.mkdir(parents=True, exist_ok=True)
fig.write_html(str(figures_dir / "target_dist_pie.html"))
fig

ValueError: Cannot embed the 'html' image format

In [None]:
# Number of customers who did not churn, per gender
stayed = df["Churn"] == "No"
df.loc[stayed, "Churn"].groupby(df["gender"]).count()

In [None]:
# Number of customers who churned, per gender
churned = df["Churn"] == "Yes"
df.loc[churned, "Churn"].groupby(df["gender"]).count()

In [None]:
# Nested donut: outer ring = churn classes, inner ring = gender split inside each class.
# The counts are computed from `df` instead of being typed in by hand, so the figure
# stays in sync with the data. Order: (Yes, F), (Yes, M), (No, F), (No, M).
churn_order = ["Yes", "No"]
gender_order = ["Female", "Male"]
gender_by_churn = (
    df.groupby(["Churn", "gender"])
    .size()
    # fix the slice order and report 0 for any combination that is absent
    .reindex(pd.MultiIndex.from_product([churn_order, gender_order]), fill_value=0)
)

plt.figure(figsize=(6, 6))
labels = ["Churn: Yes", "Churn:No"]
values = [int(gender_by_churn.loc[churn].sum()) for churn in churn_order]
labels_gender = ["F", "M", "F", "M"]
sizes_gender = gender_by_churn.tolist()
colors = ["#ff6666", "#66b3ff"]
colors_gender = ["#c2c2f0", "#ffb3e6", "#c2c2f0", "#ffb3e6"]
explode = (0.3, 0.3)
explode_gender = (0.1, 0.1, 0.1, 0.1)
textprops = {"fontsize": 15}

# Outer ring: churn classes
plt.pie(
    values,
    labels=labels,
    autopct="%1.1f%%",
    pctdistance=1.08,
    labeldistance=0.8,
    colors=colors,
    startangle=90,
    frame=True,
    explode=explode,
    radius=10,
    textprops=textprops,
    counterclock=True,
)
# Inner ring: gender within each churn class (same start angle so the rings line up)
plt.pie(
    sizes_gender,
    labels=labels_gender,
    colors=colors_gender,
    startangle=90,
    explode=explode_gender,
    radius=7,
    textprops=textprops,
    counterclock=True,
)
# Draw a white circle in the middle to turn the pies into a donut
centre_circle = plt.Circle((0, 0), 5, color="black", fc="white", linewidth=0)
fig = plt.gcf()
fig.gca().add_artist(centre_circle)

plt.title("Churn Distribution w.r.t Gender: Male(M), Female(F)", fontsize=15, y=1.1)

# show plot

plt.axis("equal")
plt.tight_layout()
plt.show()

In [None]:
# Churn counts split by contract type (grouped bars).
# The title's bold tag is now closed properly (`</b>` instead of a second `<b>`).
fig = px.histogram(
    df,
    x="Churn",
    color="Contract",
    barmode="group",
    title="<b>Customer contract distribution</b>",
)
fig.update_layout(width=700, height=500, bargap=0.2)
fig

In [None]:
# Share of each payment method.
# Labels and values are taken from ONE value_counts() result: `unique()` returns
# values in order of appearance while `value_counts()` sorts by frequency, so mixing
# the two can attach the wrong label to a slice.
payment_counts = df["PaymentMethod"].value_counts()
labels = payment_counts.index.tolist()
values = payment_counts.tolist()

fig = go.Figure(data=[go.Pie(labels=labels, values=values, hole=0.3)])
fig.update_layout(title_text="<b>Payment Method Distribution</b>")
fig.show()

# Churn counts broken down by payment method
fig = px.histogram(
    df,
    x="Churn",
    color="PaymentMethod",
    title="<b>Customer Payment Method distribution w.r.t. Churn</b>",
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

In [None]:
# (InternetService, Churn) combination counts among male customers
is_male = df["gender"] == "Male"
df.loc[is_male, ["InternetService", "Churn"]].value_counts()

In [None]:
# (InternetService, Churn) combination counts among female customers
is_female = df["gender"] == "Female"
df.loc[is_female, ["InternetService", "Churn"]].value_counts()

In [None]:
# Churn by internet service and gender: one bar trace per service, bars grouped on a
# two-level x axis (churn class, gender). Bar heights are computed from `df` rather
# than copied by hand from earlier cell outputs.
service_trace_names = {"DSL": "DSL", "Fiber optic": "Fiber optic", "No": "No Internet"}
churn_levels = ["No", "Yes"]
gender_levels = ["Female", "Male"]

# Every (service, churn, gender) combination in plotting order; absent ones count as 0
full_index = pd.MultiIndex.from_product(
    [list(service_trace_names), churn_levels, gender_levels],
    names=["InternetService", "Churn", "gender"],
)
service_counts = (
    df.groupby(["InternetService", "Churn", "gender"])
    .size()
    .reindex(full_index, fill_value=0)
)

# Two-level category axis shared by all traces
x_axis = [
    [f"Churn:{churn}" for churn in churn_levels for _ in gender_levels],
    [gender for _ in churn_levels for gender in gender_levels],
]

fig = go.Figure()

for service, trace_name in service_trace_names.items():
    fig.add_trace(
        go.Bar(
            x=x_axis,
            y=service_counts.loc[service].tolist(),
            name=trace_name,
        )
    )

fig.update_layout(
    title_text="<b>Churn Distribution w.r.t. Internet Service and Gender</b>"
)

fig.show()

In [None]:
# Churn counts split by whether the customer has dependents (grouped bars)
color_map = {"Yes": "#FF97FF", "No": "#AB63FA"}
histogram_options = dict(
    x="Churn",
    color="Dependents",
    barmode="group",
    title="<b>Dependents distribution</b>",
    color_discrete_map=color_map,
)
fig = px.histogram(df, **histogram_options)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

In [None]:
# Churn counts split by whether the customer has a partner (grouped bars).
# Title typo fixed: "Chrun" -> "Churn".
color_map = {"Yes": "#FFA15A", "No": "#00CC96"}
fig = px.histogram(
    df,
    x="Churn",
    color="Partner",
    barmode="group",
    title="<b>Churn distribution w.r.t. Partners</b>",
    color_discrete_map=color_map,
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

In [None]:
# Churn counts coloured by senior-citizen status.
# Title typo fixed: "Chrun" -> "Churn".
color_map = {"Yes": "#00CC96", "No": "#B6E880"}
fig = px.histogram(
    df,
    x="Churn",
    color="SeniorCitizen",
    title="<b>Churn distribution w.r.t. Senior Citizen</b>",
    color_discrete_map=color_map,
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

In [None]:
# Churn counts split by the OnlineSecurity column (grouped bars)
color_map = {"Yes": "#FF97FF", "No": "#AB63FA"}
histogram_options = dict(
    x="Churn",
    color="OnlineSecurity",
    barmode="group",
    title="<b>Churn w.r.t Online Security</b>",
    color_discrete_map=color_map,
)
fig = px.histogram(df, **histogram_options)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

In [None]:
# Churn counts coloured by paperless-billing status.
# Title typo fixed: "Chrun" -> "Churn".
color_map = {"Yes": "#FFA15A", "No": "#00CC96"}
fig = px.histogram(
    df,
    x="Churn",
    color="PaperlessBilling",
    title="<b>Churn distribution w.r.t. Paperless Billing</b>",
    color_discrete_map=color_map,
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

In [None]:
# Churn counts split by the TechSupport column (grouped bars).
# Title typo fixed: "Chrun" -> "Churn".
fig = px.histogram(
    df,
    x="Churn",
    color="TechSupport",
    barmode="group",
    title="<b>Churn distribution w.r.t. TechSupport</b>",
)
fig.update_layout(width=700, height=500, bargap=0.1)
fig.show()

In [None]:
# Density of monthly charges for retained vs. churned customers.
# `fill=True` replaces the deprecated `shade=True`; each curve carries its own
# `label`, so the legend no longer depends on the order the curves were drawn in.
sns.set_context("paper", font_scale=1.1)
ax = sns.kdeplot(
    df.loc[df["Churn"] == "No", "MonthlyCharges"],
    color="Red",
    fill=True,
    label="Not Churn",
)
ax = sns.kdeplot(
    df.loc[df["Churn"] == "Yes", "MonthlyCharges"],
    ax=ax,
    color="Blue",
    fill=True,
    label="Churn",
)
ax.legend(loc="upper right")
ax.set_ylabel("Density")
ax.set_xlabel("Monthly Charges")
ax.set_title("Distribution of monthly charges by churn")

In [None]:
# Density of total charges for retained vs. churned customers.
# `fill=True` replaces the deprecated `shade=True`; each curve carries its own
# `label` (this also fixes the "Not Chuurn" legend typo).
ax = sns.kdeplot(
    df.loc[df["Churn"] == "No", "TotalCharges"],
    color="Gold",
    fill=True,
    label="Not Churn",
)
ax = sns.kdeplot(
    df.loc[df["Churn"] == "Yes", "TotalCharges"],
    ax=ax,
    color="Green",
    fill=True,
    label="Churn",
)
ax.legend(loc="upper right")
ax.set_ylabel("Density")
ax.set_xlabel("Total Charges")
ax.set_title("Distribution of total charges by churn")

In [None]:
# Box plot of tenure for each churn class
fig = px.box(df, x="Churn", y="tenure")

# Axis captions for the single subplot
fig.update_xaxes(title_text="Churn", row=1, col=1)
fig.update_yaxes(title_text="Tenure (Months)", row=1, col=1)

# Figure size and title styling
layout_options = dict(
    autosize=True,
    width=750,
    height=600,
    title_font=dict(size=25, family="Courier"),
    title="<b>Tenure vs Churn</b>",
)
fig.update_layout(**layout_options)

fig.show()

In [None]:
# Label-encode every object column that has two or fewer unique values.
# (The original comment was wrapped onto a second line, leaving a stray `values`
# statement in the cell; it is merged back into this comment.)
# NOTE(review): df.columns[1:] skips the first column (gender at this point), so it
# stays as strings even though it is binary. Confirm that this is intended.
le = LabelEncoder()
le_count = 0
for col in df.columns[1:]:
    is_binary_object = (
        df[col].dtype == "object" and df[col].nunique(dropna=False) <= 2
    )
    if is_binary_object:
        # fit_transform learns the classes and encodes the column in one step
        df[col] = le.fit_transform(df[col])
        le_count += 1
print("{} columns were label encoded.".format(le_count))

In [None]:
# Columns whose correlation with the Churn column is charted
candidate_columns = [
    "SeniorCitizen",
    "Partner",
    "Dependents",
    "tenure",
    "PhoneService",
    "PaperlessBilling",
    "MonthlyCharges",
    "TotalCharges",
]
data2 = df[candidate_columns]

# Correlation of every selected column with Churn; entries exactly equal to 1 are dropped
correlations = data2.corrwith(df.Churn)
correlations = correlations[correlations.ne(1)]
positive_correlations = correlations[correlations.gt(0)].sort_values(ascending=False)
negative_correlations = correlations[correlations.lt(0)].sort_values(ascending=False)

bar_options = dict(figsize=(18, 10), fontsize=15, color="grey", rot=45, grid=True)
correlations.plot.bar(**bar_options)

title_options = dict(
    horizontalalignment="center",
    fontstyle="normal",
    fontsize="22",
    fontfamily="sans-serif",
)
plt.title("Correlation with Churn Rate \n", **title_options)

In [None]:
sns.set(style="white")
plt.figure(figsize=(18, 15))

# Integer-encode only the categorical (object) columns before correlating.
# Numeric columns keep their real values: pd.factorize numbers values by order of
# first appearance, which would destroy the ordering of numeric data and make its
# correlations meaningless.
encoded = df.copy()
for col in encoded.select_dtypes(include="object").columns:
    encoded[col] = pd.factorize(encoded[col])[0]
corr = encoded.corr()

# Hide the upper triangle (including the diagonal); the matrix is symmetric
mask = np.triu(np.ones_like(corr, dtype=bool))

ax = sns.heatmap(
    corr,
    mask=mask,
    xticklabels=corr.columns,
    yticklabels=corr.columns,
    annot=True,
    linewidths=0.2,
    cmap="coolwarm",
    vmin=-1,
    vmax=1,
)

In [None]:
# Set and compute the Correlation Matrix:
sns.set(style="white")
corr = data2.corr()

# Generate a mask for the upper triangle (diagonal included).
# The builtin `bool` is used because the `np.bool` alias was removed in NumPy 1.24
# and raises AttributeError there.
mask = np.triu(np.ones_like(corr, dtype=bool))

# Set up the matplotlib figure and a diverging colormap:
f, ax = plt.subplots(figsize=(18, 15))
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio:
sns.heatmap(
    corr,
    mask=mask,
    cmap=cmap,
    vmax=0.3,
    center=0,
    square=True,
    annot=True,
    linewidths=0.5,
    cbar_kws={"shrink": 0.5},
)