In [51]:
# Preprocessing for the churn model.
# It is preferable to have features like 'Churn' encoded as 0 and 1 instead of
# 'no' and 'yes', so that they can be fed into machine learning algorithms that
# only accept numeric values.
# Besides 'Churn', other features of type object can also be converted into 0s and 1s.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Display options can only be set once pandas has been imported; calling
# pd.set_option before the import fails with NameError on a fresh kernel.
pd.set_option('display.max_columns', 10)

# Load the raw data and inspect the column dtypes to see which features
# still need encoding (the object-typed ones).
telco = pd.read_csv('Churn.csv')
print(telco.dtypes)

Account_Length      int64
Vmail_Message       int64
Day_Mins          float64
Eve_Mins          float64
Night_Mins        float64
Intl_Mins         float64
CustServ_Calls      int64
Churn              object
Intl_Plan          object
Vmail_Plan         object
Day_Calls           int64
Day_Charge        float64
Eve_Calls           int64
Eve_Charge        float64
Night_Calls         int64
Night_Charge      float64
Intl_Calls          int64
Intl_Charge       float64
State              object
Area_Code           int64
Phone              object
dtype: object


In [9]:
# Replace 'no' with 0 and 'yes' with 1 in 'Vmail_Plan' and 'Churn'.
YES_NO_CODES = {'no': 0, 'yes': 1}
for column in ['Vmail_Plan', 'Churn']:
    # replace() leaves values that are already 0/1 untouched, so re-running
    # this cell is harmless. The explicit cast pins the result to int64
    # instead of relying on replace() downcasting an object column, which
    # newer pandas releases deprecate. The cast raises if any value other
    # than 'no'/'yes' (or 0/1) is still present.
    telco[column] = telco[column].replace(YES_NO_CODES).astype('int64')

# Print the results to verify
print(telco['Vmail_Plan'].head())
print(telco['Churn'].head())

0    1
1    1
2    0
3    0
4    0
Name: Vmail_Plan, dtype: int64
0    0
1    0
2    0
3    0
4    0
Name: Churn, dtype: int64


In [16]:
# Use pd.get_dummies() to one-hot encode the 'State' feature of telco:
# one indicator column per distinct state code.
# The dtype is pinned to uint8 so the indicators are numeric 0/1 on every
# pandas version (since pandas 2.0 the default indicator dtype is bool).
telco_state = pd.get_dummies(telco['State'], dtype='uint8')
print(telco_state.head())

   AK  AL  AR  AZ  CA ...  VT  WA  WI  WV  WY
0   0   0   0   0   0 ...   0   0   0   0   0
1   0   0   0   0   0 ...   0   0   0   0   0
2   0   0   0   0   0 ...   0   0   0   0   0
3   0   0   0   0   0 ...   0   0   0   0   0
4   0   0   0   0   0 ...   0   0   0   0   0

[5 rows x 51 columns]


In [23]:
# Summary statistics of the 'Intl_Calls' column.
intl_calls_summary = telco['Intl_Calls'].describe()
print(intl_calls_summary)

count    3333.000000
mean        4.479448
std         2.461214
min         0.000000
25%         3.000000
50%         4.000000
75%         6.000000
max        20.000000
Name: Intl_Calls, dtype: float64


In [52]:
# Feature scaling
# Import StandardScaler
from sklearn.preprocessing import StandardScaler

# Select the features to standardise.
telco1 = telco[['Intl_Calls', 'Night_Mins']]

# Scale the selected features using StandardScaler.
telco_scaled = StandardScaler().fit_transform(telco1)

# fit_transform returns a bare array, so add the labels back for readability.
# Taking both the column names and the index from telco1 keeps the scaled
# frame aligned with the source rows and avoids repeating the column list.
telco_scaled_df = pd.DataFrame(
    telco_scaled,
    columns=telco1.columns,
    index=telco1.index,
)

# Print summary statistics. The reported std is slightly above 1 because
# describe() uses the sample standard deviation (ddof=1) while StandardScaler
# scales by the population standard deviation (ddof=0).
print(telco_scaled_df.describe())

         Intl_Calls    Night_Mins
count  3.333000e+03  3.333000e+03
mean  -1.264615e-16  6.602046e-17
std    1.000150e+00  1.000150e+00
min   -1.820289e+00 -3.513648e+00
25%   -6.011951e-01 -6.698545e-01
50%   -1.948306e-01  6.485803e-03
75%    6.178983e-01  6.808485e-01
max    6.307001e+00  3.839081e+00


In [54]:
# Drop the unnecessary features.
# The labels are passed directly via `columns=` instead of slicing out a
# DataFrame just to name them. errors='ignore' skips labels that are already
# gone, so re-running this cell does not raise KeyError.
telco = telco.drop(columns=['Area_Code', 'Phone'], errors='ignore')

# Verify dropped features
print(telco.columns)

Index(['Account_Length', 'Vmail_Message', 'Day_Mins', 'Eve_Mins', 'Night_Mins',
       'Intl_Mins', 'CustServ_Calls', 'Churn', 'Intl_Plan', 'Vmail_Plan',
       'Day_Calls', 'Day_Charge', 'Eve_Calls', 'Eve_Charge', 'Night_Calls',
       'Night_Charge', 'Intl_Calls', 'Intl_Charge', 'State'],
      dtype='object')


In [55]:
# Create the new feature.
# Despite its name, 'Avg_Night_Calls' is Night_Mins divided by Night_Calls,
# i.e. the average number of minutes per night call.
# Zero call counts are masked to NaN first so that such rows yield NaN rather
# than inf, which downstream scalers and models cannot handle.
night_calls = telco['Night_Calls'].where(telco['Night_Calls'] != 0)
telco['Avg_Night_Calls'] = telco['Night_Mins'] / night_calls

# Print the first five rows of 'Avg_Night_Calls'
print(telco['Avg_Night_Calls'].head())

0    2.689011
1    2.469903
2    1.563462
3    2.212360
4    1.544628
Name: Avg_Night_Calls, dtype: float64
