# **Telco Customer Churn Analysis and Predictive Models**

Khanh Tran

## Exploratory Data Analysis

In [1]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Load the telecom customer churn CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer='../01.Data/telecom_customer_churn.csv')

# Preview the first five rows to get a feel for the columns and their values
df.head(n=5)

Unnamed: 0,Customer ID,Gender,Age,Married,Number of Dependents,City,Zip Code,Latitude,Longitude,Number of Referrals,...,Payment Method,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue,Customer Status,Churn Category,Churn Reason
0,0002-ORFBO,Female,37,Yes,0,Frazier Park,93225,34.827662,-118.999073,2,...,Credit Card,65.6,593.3,0.0,0,381.51,974.81,Stayed,,
1,0003-MKNFE,Male,46,No,0,Glendale,91206,34.162515,-118.203869,0,...,Credit Card,-4.0,542.4,38.33,10,96.21,610.28,Stayed,,
2,0004-TLHLJ,Male,50,No,0,Costa Mesa,92627,33.645672,-117.922613,0,...,Bank Withdrawal,73.9,280.85,0.0,0,134.6,415.45,Churned,Competitor,Competitor had better devices
3,0011-IGKFF,Male,78,Yes,0,Martinez,94553,38.014457,-122.115432,1,...,Bank Withdrawal,98.0,1237.85,0.0,0,361.66,1599.51,Churned,Dissatisfaction,Product dissatisfaction
4,0013-EXCHZ,Female,75,Yes,0,Camarillo,93010,34.227846,-119.079903,3,...,Credit Card,83.9,267.4,0.0,0,22.14,289.54,Churned,Dissatisfaction,Network reliability


In [3]:
# Dimensions of the dataset as a (rows, columns) tuple
df.shape

(7043, 38)

In [4]:
# Data type of each variable (column) as inferred when the CSV was read
df.dtypes

Customer ID                           object
Gender                                object
Age                                    int64
Married                               object
Number of Dependents                   int64
City                                  object
Zip Code                               int64
Latitude                             float64
Longitude                            float64
Number of Referrals                    int64
Tenure in Months                       int64
Offer                                 object
Phone Service                         object
Avg Monthly Long Distance Charges    float64
Multiple Lines                        object
Internet Service                      object
Internet Type                         object
Avg Monthly GB Download              float64
Online Security                       object
Online Backup                         object
Device Protection Plan                object
Premium Tech Support                  object
Streaming 

In [5]:
# Columns that hold labels rather than measurements; they are stored with the
# pandas 'category' dtype instead of the generic 'object' dtype.
categorical_columns = [
    'Gender', 'Married',
    'Phone Service', 'Multiple Lines', 'Internet Service',
    'Online Security', 'Online Backup',
    'Device Protection Plan', 'Premium Tech Support',
    'Streaming TV', 'Streaming Movies', 'Streaming Music',
    'Unlimited Data', 'Paperless Billing',
    'Payment Method', 'Customer Status',
    'Churn Category', 'Churn Reason', 'Contract',
    'Internet Type', 'Offer', 'City',
]

# Cast all listed columns in a single call by mapping each name to 'category'
df = df.astype({name: 'category' for name in categorical_columns})

# Re-inspect the dtypes to confirm the conversion took effect
df.dtypes


Customer ID                            object
Gender                               category
Age                                     int64
Married                              category
Number of Dependents                    int64
City                                 category
Zip Code                                int64
Latitude                              float64
Longitude                             float64
Number of Referrals                     int64
Tenure in Months                        int64
Offer                                category
Phone Service                        category
Avg Monthly Long Distance Charges     float64
Multiple Lines                       category
Internet Service                     category
Internet Type                        category
Avg Monthly GB Download               float64
Online Security                      category
Online Backup                        category
Device Protection Plan               category
Premium Tech Support              

In [6]:
# Count the missing entries in every column
df.isna().sum()

Customer ID                             0
Gender                                  0
Age                                     0
Married                                 0
Number of Dependents                    0
City                                    0
Zip Code                                0
Latitude                                0
Longitude                               0
Number of Referrals                     0
Tenure in Months                        0
Offer                                3877
Phone Service                           0
Avg Monthly Long Distance Charges     682
Multiple Lines                        682
Internet Service                        0
Internet Type                        1526
Avg Monthly GB Download              1526
Online Security                      1526
Online Backup                        1526
Device Protection Plan               1526
Premium Tech Support                 1526
Streaming TV                         1526
Streaming Movies                  

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) for numerical features
# NOTE(review): the output of this cell reports a minimum Monthly Charge of -10.0;
# negative charges look like a data-quality issue — confirm before modelling.
df.describe()

Unnamed: 0,Age,Number of Dependents,Zip Code,Latitude,Longitude,Number of Referrals,Tenure in Months,Avg Monthly Long Distance Charges,Avg Monthly GB Download,Monthly Charge,Total Charges,Total Refunds,Total Extra Data Charges,Total Long Distance Charges,Total Revenue
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,6361.0,5517.0,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0
mean,46.509726,0.468692,93486.070567,36.197455,-119.756684,1.951867,32.386767,25.420517,26.189958,63.596131,2280.381264,1.962182,6.860713,749.099262,3034.379056
std,16.750352,0.962802,1856.767505,2.468929,2.154425,3.001199,24.542061,14.200374,19.586585,31.204743,2266.220462,7.902614,25.104978,846.660055,2865.204542
min,19.0,0.0,90001.0,32.555828,-124.301372,0.0,1.0,1.01,2.0,-10.0,18.8,0.0,0.0,0.0,21.36
25%,32.0,0.0,92101.0,33.990646,-121.78809,0.0,9.0,13.05,13.0,30.4,400.15,0.0,0.0,70.545,605.61
50%,46.0,0.0,93518.0,36.205465,-119.595293,0.0,29.0,25.69,21.0,70.05,1394.55,0.0,0.0,401.44,2108.64
75%,60.0,0.0,95329.0,38.161321,-117.969795,3.0,55.0,37.68,30.0,89.75,3786.6,0.0,0.0,1191.1,4801.145
max,80.0,9.0,96150.0,41.962127,-114.192901,11.0,72.0,49.99,85.0,118.75,8684.8,49.79,150.0,3564.72,11979.34


In [None]:
# Comprehensive exploratory data analysis (EDA) overview. The plan covers:
# 1. Basic information about the dataset: shape, data types, and missing values
# 2. Summary statistics for numerical features
# 3. Distribution of key categorical features
# 4. Distribution of numerical features
# 5. Correlation analysis among numerical features
# 6. Analysis of churn rate
#
# This cell computes items 1, 2 and 5 and declares the feature lists for 3 and 4.
# It operates on `df`, the DataFrame loaded at the top of the notebook; the
# previous version referenced an undefined name (`data`) and raised NameError
# on a fresh kernel.

# 1. Basic information about the dataset
basic_info = {
    "Shape": df.shape,
    "Data Types": df.dtypes,
    "Missing Values": df.isnull().sum()
}

# 2. Summary statistics for numerical features
summary_statistics = df.describe()

# For distributions and correlations, we selectively explore key variables.
# 3. Key categorical features whose distributions are of interest
categorical_features = ['Gender', 'Married', 'Internet Service', 'Contract', 'Customer Status']

# 4. Key numerical features whose distributions are of interest
numerical_features = ['Age', 'Tenure in Months', 'Monthly Charge', 'Total Charges', 'Total Revenue']

# 5. Pairwise correlation among the selected numerical features
correlation_matrix = df[numerical_features].corr()

# Display basic info, summary statistics, and correlation matrix for now.
basic_info, summary_statistics, correlation_matrix
