In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
df=pd.read_csv('WA_Fn-UseC_-Telco-Customer-Churn.csv')

In [4]:
df.head()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [5]:
print(df.shape)
print('Row {} columns {}'.format(df.shape[0],df.shape[1]))

(7043, 21)
Row 7043 columns 21


In [6]:
df.dtypes

customerID           object
gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
Churn                object
dtype: object

In [7]:
df['churn_class'] = df['Churn'].apply(lambda x:1 if x == "Yes" else 0)
df['churn_class'].value_counts(normalize=True)

df['SeniorCitizen'] = df['SeniorCitizen'].apply(lambda x:"Yes" if x == 1 else "No")
df['SeniorCitizen'].value_counts()

No     5901
Yes    1142
Name: SeniorCitizen, dtype: int64

In [8]:
df['TotalCharges'].describe()

count     7043
unique    6531
top           
freq        11
Name: TotalCharges, dtype: object

In [9]:
df['TotalCharges']=df['TotalCharges'].replace(' ',np.nan)
df['TotalCharges']=pd.to_numeric(df['TotalCharges'])

In [10]:
value = (df['TotalCharges']/df['MonthlyCharges']).median()*df['MonthlyCharges']
df['TotalCharges'] = value.where(df['TotalCharges'] == np.nan, other =df['TotalCharges'])

In [11]:
df['TotalCharges'].describe()

count    7032.000000
mean     2283.300441
std      2266.771362
min        18.800000
25%       401.450000
50%      1397.475000
75%      3794.737500
max      8684.800000
Name: TotalCharges, dtype: float64

In [12]:
#categorical features
categorical = df.select_dtypes(include =[np.object])
print("Categorical Features in DataSet:",categorical.shape[1])
print(categorical.columns)

#numerical features
numerical= df.select_dtypes(include =[np.float64,np.int64])
print("Numerical Features in DataSet:",numerical.shape[1])
print(numerical.columns)

Categorical Features in DataSet: 18
Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod',
       'Churn'],
      dtype='object')
Numerical Features in DataSet: 4
Index(['tenure', 'MonthlyCharges', 'TotalCharges', 'churn_class'], dtype='object')


In [13]:
df.isna().sum()

customerID           0
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
churn_class          0
dtype: int64

In [21]:
numerical.fillna(0,inplace=True)
numerical.isna().sum()

tenure            0
MonthlyCharges    0
TotalCharges      0
churn_class       0
dtype: int64

In [22]:
from statsmodels.stats.outliers_influence import variance_inflation_factor as vif

In [23]:
x_data = numerical.drop(columns= ['churn_class'],axis=1)
x_data['Intercept'] = 1

In [24]:
VIF = pd.DataFrame()
VIF['Independent_Variables'] = x_data.columns
VIF['VIF'] = [vif(x_data.values, i) for i in range(x_data.shape[1])]
VIF = VIF.set_index('Independent_Variables').drop(index = 'Intercept')
VIF

Unnamed: 0_level_0,VIF
Independent_Variables,Unnamed: 1_level_1
tenure,5.836728
MonthlyCharges,3.21673
TotalCharges,9.510931


We can see that the VIF is more than 5 for tenure and TotalCharges which indicates the presence of multicollinearity. As we already have Monthly Charges so we can remove Total Charges.

In [25]:
x_data = numerical.drop(columns= ['churn_class','TotalCharges'],axis=1)
x_data['Intercept'] = 1

In [26]:
VIF = pd.DataFrame()
VIF['Independent_Variables'] = x_data.columns
VIF['VIF'] = [vif(x_data.values, i) for i in range(x_data.shape[1])]
VIF = VIF.set_index('Independent_Variables').drop(index = 'Intercept')
VIF

Unnamed: 0_level_0,VIF
Independent_Variables,Unnamed: 1_level_1
tenure,1.065478
MonthlyCharges,1.065478


Now the VIF is less than 2 which indicates that no multicolinearity is present which is a good thing for our model.