# Analyzing Telco Customer Churn

## Import Telco Churn Dataset

In [3]:
# import required libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
import imblearn
from imblearn.over_sampling import RandomOverSampler

In [None]:
# Load the Telco churn workbook into the DataFrame used by all later cells
df = pd.read_excel(io='Telco_customer_churn.xlsx')


## Basic Descriptive Analysis
This helps us become familiar with the data and identify missing values, data types, and other basic properties.

In [None]:
# Preview the first five rows to see what the columns and values look like
df.head(n=5)

In [None]:
# Print a structural overview of the dataset: column names, non-null counts and dtypes
df.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numerical columns
df.describe()

In [None]:
# Count the missing (NaN/None) entries in every column
df.isna().sum()

## Generate demographics distribution plots

In [None]:
# 2 by 2 descriptive plots: percentage share of each category for the four
# demographic columns, one annotated bar chart per column.

# Calculate proportions (normalize=True yields fractions; * 100 turns them into percent)
proportions_gender = df['Gender'].value_counts(normalize=True) * 100
proportions_senior_citizen = df['Senior Citizen'].value_counts(normalize=True) * 100
proportions_partner = df['Partner'].value_counts(normalize=True) * 100
proportions_dependents = df['Dependents'].value_counts(normalize=True) * 100

# Create a 2x2 grid of subplots
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))

# (target axes, panel title, proportions to plot) for each of the four panels
panels = [
    (axes[0, 0], 'Gender Distribution', proportions_gender),
    (axes[0, 1], 'Senior Citizen Distribution', proportions_senior_citizen),
    (axes[1, 0], 'Partner Distribution', proportions_partner),
    (axes[1, 1], 'Dependents Distribution', proportions_dependents),
]

for ax, title, proportions in panels:
    # Pin the bar order to the value_counts order so that bar i is always the
    # category whose percentage is annotated at x position i below.
    sns.barplot(
        x=proportions.index,
        y=proportions.values,
        order=list(proportions.index),
        ax=ax,
    )
    ax.set_title(title)
    ax.set_ylabel('Proportion (%)')
    # Write the percentage just above each bar
    for i, proportion in enumerate(proportions):
        ax.annotate(f'{proportion:.1f}%', (i, proportion), ha='center', va='bottom')

# Adjust spacing between subplots
plt.tight_layout()

# Display the plots
plt.show()

In [None]:
# 1 by 2 descriptive plots: percentage share of each category for
# Gender and Senior Citizen, one annotated bar chart per column.

# Calculate proportions (normalize=True yields fractions; * 100 turns them into percent)
proportions_gender = df['Gender'].value_counts(normalize=True) * 100
proportions_senior_citizen = df['Senior Citizen'].value_counts(normalize=True) * 100

# Create a 1x2 grid of subplots
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

# (target axes, panel title, proportions to plot) for each panel
panels = [
    (axes[0], 'Gender Distribution', proportions_gender),
    (axes[1], 'Senior Citizen Distribution', proportions_senior_citizen),
]

for ax, title, proportions in panels:
    # Pin the bar order to the value_counts order so that bar i is always the
    # category whose percentage is annotated at x position i below.
    sns.barplot(
        x=proportions.index,
        y=proportions.values,
        order=list(proportions.index),
        ax=ax,
    )
    ax.set_title(title)
    ax.set_ylabel('Proportion (%)')
    # Write the percentage just above each bar
    for i, proportion in enumerate(proportions):
        ax.annotate(f'{proportion:.1f}%', (i, proportion), ha='center', va='bottom')

# Adjust spacing between subplots
plt.tight_layout()

# Display the plots
plt.show()


In [None]:
# 1 by 2 descriptive plots: percentage share of each category for
# Partner and Dependents, one annotated bar chart per column.

# Calculate proportions (normalize=True yields fractions; * 100 turns them into percent)
proportions_partner = df['Partner'].value_counts(normalize=True) * 100
proportions_dependents = df['Dependents'].value_counts(normalize=True) * 100

# Create a 1x2 grid of subplots
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))

# (target axes, panel title, proportions to plot) for each panel
panels = [
    (axes[0], 'Partner Distribution', proportions_partner),
    (axes[1], 'Dependents Distribution', proportions_dependents),
]

for ax, title, proportions in panels:
    # Pin the bar order to the value_counts order so that bar i is always the
    # category whose percentage is annotated at x position i below.
    sns.barplot(
        x=proportions.index,
        y=proportions.values,
        order=list(proportions.index),
        ax=ax,
    )
    ax.set_title(title)
    ax.set_ylabel('Proportion (%)')
    # Write the percentage just above each bar
    for i, proportion in enumerate(proportions):
        ax.annotate(f'{proportion:.1f}%', (i, proportion), ha='center', va='bottom')

# Adjust spacing between subplots
plt.tight_layout()

# Display the plots
plt.show()

## Exploratory Data Analysis

In [None]:
# Churn counts broken down by three service attributes, one figure per attribute.
# Each entry: (column to group by, figure title, whether to rotate the x tick labels)
churn_breakdowns = [
    ('Internet Service', 'Churn Distribution by Internet Service', False),
    ('Contract', 'Churn Distribution by Contract Type', False),
    ('Payment Method', 'Churn Distribution by Payment Method', True),
]

for column, title, rotate_labels in churn_breakdowns:
    ax = sns.countplot(x=column, hue='Churn Label', data=df)
    ax.set_title(title)
    if rotate_labels:
        # Restyle the existing tick labels in place. Calling
        # ax.set_xticklabels(ax.get_xticklabels(), ...) re-assigns the label
        # texts, which matplotlib only supports after the tick positions
        # have been fixed (it warns otherwise).
        plt.setp(ax.get_xticklabels(), rotation=30, ha='right')
    plt.show()

In [None]:
# Churn counts by Internet Service, Contract and Payment Method combined in
# one 2x2 figure; the fourth panel is left blank.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(10, 10))

# Plotting Churn distribution by Internet Service
ax1 = axes[0, 0]
sns.countplot(x='Internet Service', hue='Churn Label', data=df, ax=ax1)
ax1.set_title('Churn Distribution by Internet Service')

# Plotting Churn distribution by Contract type
ax2 = axes[0, 1]
sns.countplot(x='Contract', hue='Churn Label', data=df, ax=ax2)
ax2.set_title('Churn Distribution by Contract Type')

# Plotting Churn distribution by Payment Method
ax3 = axes[1, 0]
sns.countplot(x='Payment Method', hue='Churn Label', data=df, ax=ax3)
ax3.set_title('Churn Distribution by Payment Method')
# Restyle the existing tick labels in place. Calling
# ax3.set_xticklabels(ax3.get_xticklabels(), ...) re-assigns the label texts,
# which matplotlib only supports after the tick positions have been fixed
# (it warns otherwise).
plt.setp(ax3.get_xticklabels(), rotation=30, ha='right')

# Remove empty subplot
axes[1, 1].axis('off')

# Adjust spacing between subplots
plt.tight_layout()

# Display the plots
plt.show()

In [None]:
# Histogram of Monthly Charges, with churned and retained customers drawn
# as side-by-side ("dodge") bars in each bin.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=df,
    x='Monthly Charges',
    hue='Churn Label',
    multiple='dodge',
    kde=False,
    ax=ax,
)
ax.set_title('Distribution of Monthly Charges by Churn')
ax.set_xlabel('Monthly Charges')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Histogram of Tenure Months, with churned and retained customers drawn
# as side-by-side ("dodge") bars in each bin.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(
    data=df,
    x='Tenure Months',
    hue='Churn Label',
    multiple='dodge',
    kde=False,
    ax=ax,
)
ax.set_title('Distribution of Tenure Months by Churn')
ax.set_xlabel('Tenure Months')
ax.set_ylabel('Count')
plt.show()


In [None]:
# Bar chart of how often each churn reason occurs, most frequent reason first
reasons_by_frequency = df['Churn Reason'].value_counts().index

fig, ax = plt.subplots(figsize=(10, 6))
sns.countplot(data=df, x='Churn Reason', order=reasons_by_frequency, ax=ax)
ax.set_title('Churn Reasons')
ax.set_xlabel('Reason')
ax.set_ylabel('Count')
# Turn the tick labels vertical
plt.setp(ax.get_xticklabels(), rotation=90)
plt.show()


In [None]:
# Categorical columns whose distinct values we want to inspect
categorical_columns = [
    'Gender',
    'Senior Citizen',
    'Partner',
    'Dependents',
    'Phone Service',
    'Multiple Lines',
    'Internet Service',
    'Online Security',
    'Online Backup',
    'Device Protection',
    'Tech Support',
    'Streaming TV',
    'Streaming Movies',
    'Contract',
    'Paperless Billing',
    'Payment Method',
    'Churn Label',
]

# Show the distinct values found in each of those columns
for column in categorical_columns:
    unique_values = pd.unique(df[column])
    print(f"Unique values in {column}: {unique_values}")


In [None]:
# Columns to replace with integer codes
categorical_columns = [
    'Gender',
    'Senior Citizen',
    'Partner',
    'Dependents',
    'Phone Service',
    'Paperless Billing',
]

# One encoder instance, re-fitted on each column in turn
label_encoder = LabelEncoder()

# Build df_numeric as a copy of df in which every selected column is replaced
# by its integer codes; df itself is left untouched.
df_numeric = df.assign(
    **{
        column: label_encoder.fit_transform(df[column])
        for column in categorical_columns
    }
)


In [None]:
# Preview only the numeric columns of the encoded frame
df_numeric.select_dtypes(include=['number'])

In [None]:
# Create the DataFrame used for the correlation heatmap: numeric columns only,
# minus a fixed list of columns that should not appear in the heatmap.

# Convert Total Charges to numeric; entries that cannot be parsed become NaN
df_numeric['Total Charges'] = pd.to_numeric(df_numeric['Total Charges'], errors='coerce')

# Keep numeric columns only
df_numeric = df_numeric.select_dtypes(include='number')

# List of variables to exclude
exclude_variables = ['Count', 'Zip Code', 'Latitude', 'Longitude', 'Churn Score', 'CLTV']

# Drop the excluded variables. errors='ignore' keeps this cell re-runnable:
# on a second run the columns are already gone, and a plain drop() would
# raise KeyError.
df_numeric = df_numeric.drop(columns=exclude_variables, errors='ignore')

In [None]:
# Pairwise correlations between all columns of df_numeric
corr_matrix = df_numeric.corr()

# Boolean mask that is True on and above the diagonal, so the heatmap
# draws only the lower triangle
mask = np.triu(np.ones(corr_matrix.shape, dtype=bool))

# Draw the annotated, masked heatmap on its own 10x10 figure
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(corr_matrix, mask=mask, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap of Numerical and Dichotomous Variables")
plt.show()
