# Initializing

In [7]:
#Active environment should be aws_env
!conda info | grep 'active env'

     active environment : aws_env
    active env location : /home/hassan101/anaconda3/envs/aws_env


In [8]:
#Get AWS credentials from environment
import os
import boto3
aws_akid = os.environ['AWS_KID']
aws_sak = os.environ['AWS_AK']

client = boto3.client('iam', aws_access_key_id=aws_akid, aws_secret_access_key= aws_sak)
users = client.list_users()
for key in users['Users']:
    print('Active User:', key['UserName'])

Active User: usr_hassan


In [9]:
#Importing libraries
import pandas as pd
import io
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

%matplotlib inline

# Getting processed data from S3 bucket

In [10]:
#Authentication
res_s3 = boto3.resource('s3', aws_access_key_id=aws_akid, aws_secret_access_key= aws_sak)
client_s3 = boto3.client('s3', aws_access_key_id=aws_akid, aws_secret_access_key= aws_sak) 

In [11]:
# List all objects in bucket
bucket_name = 'class-dataset-processed'

response = client_s3.list_objects_v2(Bucket=bucket_name)
for obj in response['Contents']:
    print(obj)

{'Key': 'customer-churn-processed.csv', 'LastModified': datetime.datetime(2023, 2, 15, 1, 28, 41, tzinfo=tzutc()), 'ETag': '"e400bc059fcd5a5c9eb4371eb0a49426"', 'Size': 971184, 'StorageClass': 'STANDARD'}


In [12]:
response = client_s3.get_object(Bucket=bucket_name, Key="customer-churn-processed.csv")

status = response.get("ResponseMetadata", {}).get("HTTPStatusCode")

if status == 200:
    print(f"Successful S3 get_object response. Status - {status}")
    df = pd.read_csv(response.get("Body"))
else:
    print(f"Unsuccessful S3 get_object response. Status - {status}")

Successful S3 get_object response. Status - 200


In [13]:
df.head(20)

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,7590-vhveg,female,no,yes,no,1,no,no_phone_service,dsl,no,...,no,no,no,no,month-to-month,yes,electronic_check,29.85,29.85,0
1,5575-gnvde,male,no,no,no,34,yes,no,dsl,yes,...,yes,no,no,no,one_year,no,mailed_check,56.95,1889.5,0
2,3668-qpybk,male,no,no,no,2,yes,no,dsl,yes,...,no,no,no,no,month-to-month,yes,mailed_check,53.85,108.15,1
3,7795-cfocw,male,no,no,no,45,no,no_phone_service,dsl,yes,...,yes,yes,no,no,one_year,no,bank_transfer_(automatic),42.3,1840.75,0
4,9237-hqitu,female,no,no,no,2,yes,no,fiber_optic,no,...,no,no,no,no,month-to-month,yes,electronic_check,70.7,151.65,1
5,9305-cdskc,female,no,no,no,8,yes,yes,fiber_optic,no,...,yes,no,yes,yes,month-to-month,yes,electronic_check,99.65,820.5,1
6,1452-kiovk,male,no,no,yes,22,yes,yes,fiber_optic,no,...,no,no,yes,no,month-to-month,yes,credit_card_(automatic),89.1,1949.4,0
7,6713-okomc,female,no,no,no,10,no,no_phone_service,dsl,yes,...,no,no,no,no,month-to-month,no,mailed_check,29.75,301.9,0
8,7892-pookp,female,no,yes,no,28,yes,yes,fiber_optic,no,...,yes,yes,yes,yes,month-to-month,yes,electronic_check,104.8,3046.05,1
9,6388-tabgu,male,no,no,yes,62,yes,no,dsl,yes,...,no,no,no,no,one_year,no,bank_transfer_(automatic),56.15,3487.95,0


In [14]:
df.dtypes

customerid           object
gender               object
seniorcitizen        object
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

## Separate numerical and categorical columns

In [16]:
from pandas.api.types import is_object_dtype, is_numeric_dtype, is_bool_dtype

num_col = []
cat_col = []

#Scanning all columns except the label
for col in df.columns[:-1]: 
    if is_object_dtype(df[col]):
        cat_col.append(col)
    else:
        num_col.append(col)

print('Num Col:',num_col)
print('Cat Col:', cat_col)

Num Col: ['tenure', 'monthlycharges', 'totalcharges']
Cat Col: ['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents', 'phoneservice', 'multiplelines', 'internetservice', 'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport', 'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling', 'paymentmethod']


In [17]:
#Count distinct values in cat columns
df[cat_col].nunique()

customerid          7043
gender                 2
seniorcitizen          2
partner                2
dependents             2
phoneservice           2
multiplelines          3
internetservice        3
onlinesecurity         3
onlinebackup           3
deviceprotection       3
techsupport            3
streamingtv            3
streamingmovies        3
contract               3
paperlessbilling       2
paymentmethod          4
dtype: int64

Remove customerid from cat_col list. It is an outlier.

In [18]:
cat_col.remove('customerid')
cat_col

['gender',
 'seniorcitizen',
 'partner',
 'dependents',
 'phoneservice',
 'multiplelines',
 'internetservice',
 'onlinesecurity',
 'onlinebackup',
 'deviceprotection',
 'techsupport',
 'streamingtv',
 'streamingmovies',
 'contract',
 'paperlessbilling',
 'paymentmethod']

## EDA within groups

We will look at the churn rate in the global dataset before splitting into train set sets

In [15]:
print('Number of values:', df.churn.value_counts())
print('\nFraction:', df.churn.value_counts(normalize=True))
global_churn_rate=df.churn.value_counts(normalize=True)[1]
print('\nglobal churn rate:', round(global_churn_rate,2))

Number of values: 0    5174
1    1869
Name: churn, dtype: int64

Fraction: 0    0.73463
1    0.26537
Name: churn, dtype: float64

global churn rate: 0.27


Check the churn rates for different categories

In [19]:
male_churn_rate = df[df.gender == 'male'].churn.value_counts(normalize=True)[1]
female_churn_rate = df[df.gender == 'female'].churn.value_counts(normalize=True)[1]

print('Male churn rate:', round(male_churn_rate,3))
print('Female churn rate:', round(female_churn_rate,3))
print('Global churn rate:', round(global_churn_rate,3))

Male churn rate: 0.262
Female churn rate: 0.269
Global churn rate: 0.265


In [20]:
wpartner_churn_rate = df[df.partner == 'yes'].churn.value_counts(normalize=True)[1]
wopartner_churn_rate = df[df.partner == 'no'].churn.value_counts(normalize=True)[1]

print('With partner churn rate:', round(wpartner_churn_rate,3))
print('Without partner churn rate:', round(wopartner_churn_rate,3))
print('Global churn rate:', round(global_churn_rate,3))

With partner churn rate: 0.197
Without partner churn rate: 0.33
Global churn rate: 0.265


It seems churn rate isn't impacted by gender, but impacted significantly by partner.

`churn.value_counts(normalize=True)[1]` and `churn.mean()` will give the same results for churn rate, so we can use groupby function to process data effectively.


In [21]:
df.groupby('gender').churn.mean()


gender
female    0.269209
male      0.261603
Name: churn, dtype: float64

In [22]:
df_group = df.groupby('gender').churn.agg(['mean', 'count'])
df_group['diff'] = df_group['mean'] - global_churn_rate
df_group['risk'] = df_group['mean'] / global_churn_rate
df_group

Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.269209,3488,0.003839,1.014466
male,0.261603,3555,-0.003766,0.985807


df_group = df.groupby('gender').churn.agg(['mean', 'count'])
df_group['diff'] = df_group['mean'] - global_churn_rate
df_group['risk'] = df_group['mean'] / global_churn_rate

There are two metric that we have made, "diff" and "risk", former is the difference between group churn rate vs global churn rate and latter is group churn rate divided by global churn rate.

If diff > 0, group more likely to churn, if < 0 then less likely to churn. It is an absolute measure.

If risk > 1, more likely to churn. Risk is relative measure, which shows how significant are churn chances for the group. e.g. People without partner have 22% higher chances to churn compared to average. People with partner have 25% less chances to churn.

In [23]:
# Lets loop this over all categorical columns
from IPython.display import display
for cat in cat_col:
    df_group = df.groupby(cat).churn.agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - global_churn_rate
    df_group['risk'] = df_group['mean'] / global_churn_rate
    display(df_group)

Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.269209,3488,0.003839,1.014466
male,0.261603,3555,-0.003766,0.985807


Unnamed: 0_level_0,mean,count,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.236062,5901,-0.029308,0.889557
yes,0.416813,1142,0.151443,1.570686


Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.32958,3641,0.06421,1.241964
yes,0.196649,3402,-0.068721,0.741038


Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.312791,4933,0.047422,1.1787
yes,0.154502,2110,-0.110868,0.582215


Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.249267,682,-0.016103,0.939319
yes,0.267096,6361,0.001726,1.006506


Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.250442,3390,-0.014927,0.943749
no_phone_service,0.249267,682,-0.016103,0.939319
yes,0.286099,2971,0.020729,1.078114


Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.189591,2421,-0.075779,0.714441
fiber_optic,0.418928,3096,0.153558,1.578656
no,0.07405,1526,-0.19132,0.279044


Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.417667,3498,0.152297,1.573906
no_internet_service,0.07405,1526,-0.19132,0.279044
yes,0.146112,2019,-0.119258,0.550597


Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.399288,3088,0.133918,1.504645
no_internet_service,0.07405,1526,-0.19132,0.279044
yes,0.215315,2429,-0.050055,0.811377


Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.391276,3095,0.125906,1.474456
no_internet_service,0.07405,1526,-0.19132,0.279044
yes,0.225021,2422,-0.040349,0.847951


Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.416355,3473,0.150985,1.56896
no_internet_service,0.07405,1526,-0.19132,0.279044
yes,0.151663,2044,-0.113706,0.571517


Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.335231,2810,0.069861,1.263261
no_internet_service,0.07405,1526,-0.19132,0.279044
yes,0.300702,2707,0.035332,1.133143


Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.336804,2785,0.071434,1.269188
no_internet_service,0.07405,1526,-0.19132,0.279044
yes,0.299414,2732,0.034044,1.128291


Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.427097,3875,0.161727,1.60944
one_year,0.112695,1473,-0.152675,0.424672
two_year,0.028319,1695,-0.237051,0.106714


Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.163301,2872,-0.102069,0.615371
yes,0.335651,4171,0.070281,1.264842


Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.167098,1544,-0.098271,0.629681
credit_card_(automatic),0.152431,1522,-0.112939,0.57441
electronic_check,0.452854,2365,0.187484,1.706502
mailed_check,0.191067,1612,-0.074303,0.720003


Contract seems the most important factor for churn, as people with month-to-month contract have 60% higher risk to churn, while one year and two year contracts are unlikely to churn.

To get more insights, we check mutual information between variables.

## Mutual information - Categorical features vs label

Mutual info describes how much we can learn about one variable looking at the other. It shows how much information one variable has on the other.

In [26]:
from sklearn.metrics import mutual_info_score
display(mutual_info_score(df.contract, df.churn))
display(mutual_info_score(df.gender, df.churn))

0.09845305342598942

3.7082914405128786e-05

The above shows that we learn more about the churn rate if we look at the contract variable rather than gender variable

Now we apply this for all categorical columns

In [45]:
def mutual_info_churn_score(series):
    return mutual_info_score(series, df.churn)

interactions = df[cat_col].apply(mutual_info_churn_score)
interactions.sort_values(ascending=False)

contract            0.098453
onlinesecurity      0.064677
techsupport         0.063021
internetservice     0.055574
onlinebackup        0.046792
paymentmethod       0.044519
deviceprotection    0.043917
streamingmovies     0.032001
streamingtv         0.031908
paperlessbilling    0.019194
dependents          0.014467
partner             0.011454
seniorcitizen       0.010577
multiplelines       0.000801
phoneservice        0.000072
gender              0.000037
dtype: float64

The above shows us the ranking of features in terms of importance towards churn. These are the signals ML model will pick.

## Mutual information - Numerical features vs label

We will look at correlation coefficient r for numerical features

In [46]:
df[num_col].corrwith(df.churn)

tenure           -0.352229
monthlycharges    0.193356
totalcharges     -0.198324
dtype: float64

Seems tenure is most strongly and inversely correlated with churn rate. It indicates the more time customers spend with company, less likely they churn.

In [50]:
print('Churn rate for customers less than 2 years with company', df[df.tenure <= 2].churn.mean())
print('Churn rate for customers between 2-10 years with company', df[(df.tenure  > 2) & (df.tenure <= 10)].churn.mean())
print('Churn rate for customers more than 10 years with company', df[df.tenure  > 10].churn.mean())

Churn rate for customers less than 2 years with company 0.5835266821345708
Churn rate for customers between 2-10 years with company 0.4196750902527076
Churn rate for customers more than 10 years with company 0.17760693869505223


We can see a trend with monthly charges too, but it is posistive correlation

In [53]:
print('Churn rate for customers with less than $20 monthly charges', df[df.monthlycharges <= 20].churn.mean())
print('Churn rate for customers between $20-50 monthly charges', df[(df.monthlycharges  > 20) & (df.monthlycharges <= 50)].churn.mean())
print('Churn rate for customers more than $50 monthly charges', df[df.monthlycharges  > 50].churn.mean())

Churn rate for customers with less than $20 monthly charges 0.08841463414634146
Churn rate for customers between $20-50 monthly charges 0.18441874619598295
Churn rate for customers more than $50 monthly charges 0.3178752107925801


## Split into test/train sets, and do analysis on that too

In [74]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.iloc[:,:-1], df.iloc[:,[-1]], test_size=0.3, random_state=1)

print(df.shape)
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

Xy_train = pd.concat([X_train, y_train], axis = 1)
Xy_test = pd.concat([X_test, y_test], axis = 1)

(7043, 21)
(4930, 20)
(4930, 1)
(2113, 20)
(2113, 1)


In [75]:
print('Churn fraction - Global:', df.churn.value_counts(normalize=True)[1])
print('Churn fraction - Train:', Xy_train.churn.value_counts(normalize=True)[1])

local_churn_rate = y_train.value_counts(normalize=True)[1]

Churn fraction - Global: 0.2653698707936959
Churn fraction - Train: 0.27200811359026367


In [76]:
from IPython.display import display
for cat in cat_col:
    df_group_local = Xy_train.groupby(cat).churn.agg(['mean', 'count'])
    df_group_local['diff'] = df_group_local['mean'] - local_churn_rate
    df_group_local['risk'] = df_group_local['mean'] / local_churn_rate
    display(df_group_local)

Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.27703,2451,0.005022,1.018461
male,0.267043,2479,-0.004965,0.981747


Unnamed: 0_level_0,mean,count,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.242798,4131,-0.02921,0.892614
yes,0.423029,799,0.151021,1.555206


Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.331656,2584,0.059648,1.219288
yes,0.206309,2346,-0.0657,0.758465


Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.315245,3483,0.043237,1.158956
yes,0.167934,1447,-0.104074,0.617385


Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.246835,474,-0.025173,0.907456
yes,0.274686,4456,0.002678,1.009844


Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.255804,2369,-0.016204,0.940428
no_phone_service,0.246835,474,-0.025173,0.907456
yes,0.296119,2087,0.024111,1.08864


Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.197024,1680,-0.074984,0.724331
fiber_optic,0.424437,2177,0.152429,1.560385
no,0.080149,1073,-0.191859,0.294657


Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.422873,2457,0.150865,1.554635
no_internet_service,0.080149,1073,-0.191859,0.294657
yes,0.154286,1400,-0.117722,0.56721


Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.404412,2176,0.132404,1.486764
no_internet_service,0.080149,1073,-0.191859,0.294657
yes,0.223081,1681,-0.048927,0.820128


Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.39843,2166,0.126422,1.464774
no_internet_service,0.080149,1073,-0.191859,0.294657
yes,0.231815,1691,-0.040193,0.852237


Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.420859,2445,0.148851,1.547229
no_internet_service,0.080149,1073,-0.191859,0.294657
yes,0.160057,1412,-0.111951,0.588426


Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.344196,1964,0.072187,1.265387
no_internet_service,0.080149,1073,-0.191859,0.294657
yes,0.305864,1893,0.033856,1.124465


Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.342009,1921,0.070001,1.25735
no_internet_service,0.080149,1073,-0.191859,0.294657
yes,0.308884,1936,0.036876,1.13557


Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.431452,2728,0.159443,1.586172
one_year,0.122605,1044,-0.149403,0.450742
two_year,0.031088,1158,-0.24092,0.114291


Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.17515,2004,-0.096858,0.643914
yes,0.338346,2926,0.066338,1.243882


Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.173132,1057,-0.098877,0.636494
credit_card_(automatic),0.166974,1084,-0.105034,0.613857
electronic_check,0.455918,1656,0.18391,1.676119
mailed_check,0.19594,1133,-0.076068,0.720346


In [77]:
# Check mutual info for cat features
def mutual_info_churn_score(series):
    return mutual_info_score(series, Xy_train.churn)

interactions = Xy_train[cat_col].apply(mutual_info_churn_score)
interactions.sort_values(ascending=False)

contract            0.095584
onlinesecurity      0.062868
techsupport         0.061018
internetservice     0.054367
onlinebackup        0.045686
deviceprotection    0.043365
paymentmethod       0.042050
streamingtv         0.031593
streamingmovies     0.031427
paperlessbilling    0.016845
dependents          0.012096
seniorcitizen       0.010384
partner             0.010012
multiplelines       0.001090
phoneservice        0.000173
gender              0.000063
dtype: float64

In [78]:
# Check mutual info for num features
Xy_train[num_col].corrwith(Xy_train.churn)

tenure           -0.347831
monthlycharges    0.194976
totalcharges     -0.191520
dtype: float64

In [79]:
print('Churn rate for customers less than 2 years with company', Xy_train[Xy_train.tenure <= 2].churn.mean())
print('Churn rate for customers between 2-10 years with company', Xy_train[(Xy_train.tenure  > 2) & (df.tenure <= 10)].churn.mean())
print('Churn rate for customers more than 10 years with company', Xy_train[Xy_train.tenure  > 10].churn.mean())

print('\nChurn rate for customers with less than $20 monthly charges', Xy_train[Xy_train.monthlycharges <= 20].churn.mean())
print('Churn rate for customers between $20-50 monthly charges', Xy_train[(Xy_train.monthlycharges  > 20) & (df.monthlycharges <= 50)].churn.mean())
print('Churn rate for customers more than $50 monthly charges', Xy_train[Xy_train.monthlycharges  > 50].churn.mean())

Churn rate for customers less than 2 years with company 0.5933884297520661
Churn rate for customers between 2-10 years with company 0.415929203539823
Churn rate for customers more than 10 years with company 0.18477645727221279

Churn rate for customers with less than $20 monthly charges 0.0911062906724512
Churn rate for customers between $20-50 monthly charges 0.18826619964973731
Churn rate for customers more than $50 monthly charges 0.3258190562067929


  print('Churn rate for customers between 2-10 years with company', Xy_train[(Xy_train.tenure  > 2) & (df.tenure <= 10)].churn.mean())
  print('Churn rate for customers between $20-50 monthly charges', Xy_train[(Xy_train.monthlycharges  > 20) & (df.monthlycharges <= 50)].churn.mean())


Local trends in train datasets are consistent with global dataset

## Storing training and testing datasets locally

In [80]:
Xy_train.to_csv('Xy_train.csv')
Xy_test.to_csv('Xy_test.csv')