# ***Classification***
---

# Churn prediction project

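$g(x_i) \approx y_i$

where $y_i \in \{0, 1\}$ is the target for customer $i$ (1 = churned, 0 = did not churn) and $g(x_i)$ is the model's prediction from that customer's features $x_i$.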

In [30]:
import pandas as pd
import numpy as np

In [31]:
import matplotlib.pyplot as plt

In [32]:
# Source URL of the Telco customer-churn CSV used in this notebook.
data = "https://raw.githubusercontent.com/alexeygrigorev/mlbookcamp-code/master/chapter-03-churn-prediction/WA_Fn-UseC_-Telco-Customer-Churn.csv"
# Uncomment to download the CSV under the local file name that the loading cell reads:
# !wget $data -O data-week-3.csv

In [33]:
# Load the Telco churn CSV from the local copy. If it has not been downloaded yet
# (the wget command above is commented out), fall back to the source URL in `data`
# and cache the file so that later runs work offline.
try:
    df = pd.read_csv('data-week-3.csv')
except FileNotFoundError:
    df = pd.read_csv(data)
    df.to_csv('data-week-3.csv', index=False)
df.head().T # transpose so that every column is visible as a row

Unnamed: 0,0,1,2,3,4
customerID,7590-VHVEG,5575-GNVDE,3668-QPYBK,7795-CFOCW,9237-HQITU
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No


In [34]:
# Normalise the column names: lower-case, spaces replaced by underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_')

# Apply the same normalisation to the values of every object-dtype column.
is_object = df.dtypes == 'object'
categorial_columns = df.dtypes[is_object].index.tolist()

for c in categorial_columns:
    lowered = df[c].str.lower()
    df[c] = lowered.str.replace(' ', '_')

In [35]:
df.head().T

Unnamed: 0,0,1,2,3,4
customerid,7590-vhveg,5575-gnvde,3668-qpybk,7795-cfocw,9237-hqitu
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no


df.dtypes

df.totalcharges should be a number but has dtype = object

In [36]:
# pd.to_numeric(df.totalcharges)
# raises an error, so totalcharges contains values that cannot be parsed as numbers:
# the blank entries held a single space, which the cleaning cell above replaced with '_'

In [37]:
tc = pd.to_numeric(df.totalcharges, errors = 'coerce') #if can't read replace with NaN

In [38]:
tc.isnull().sum()

np.int64(11)

In [39]:
# Convert totalcharges to numbers; entries that cannot be parsed become NaN.
df['totalcharges'] = pd.to_numeric(df['totalcharges'], errors='coerce')

In [40]:
# Replace the NaN values left by the numeric conversion with 0.
df['totalcharges'] = df['totalcharges'].fillna(0)

### Convert the churn variable to a binary column

In [41]:
# Encode the target as 1 (churned) / 0 (did not churn).
# The guard makes the cell safe to re-run: once churn is already an integer column,
# comparing it with 'yes' again would silently turn every label into 0.
if not pd.api.types.is_integer_dtype(df['churn']):
    df['churn'] = (df['churn'] == 'yes').astype('int')

In [42]:
df.churn.head()

0    0
1    0
2    1
3    0
4    1
Name: churn, dtype: int64

# Setting up the validation framework

In [43]:
from sklearn.model_selection import train_test_split

In [44]:
train_test_split?

[31mSignature:[39m
train_test_split(
    *arrays,
    test_size=[38;5;28;01mNone[39;00m,
    train_size=[38;5;28;01mNone[39;00m,
    random_state=[38;5;28;01mNone[39;00m,
    shuffle=[38;5;28;01mTrue[39;00m,
    stratify=[38;5;28;01mNone[39;00m,
)
[31mDocstring:[39m
Split arrays or matrices into random train and test subsets.

Quick utility that wraps input validation,
``next(ShuffleSplit().split(X, y))``, and application to input data
into a single call for splitting (and optionally subsampling) data into a
one-liner.

Read more in the :ref:`User Guide <cross_validation>`.

Parameters
----------
*arrays : sequence of indexables with same length / shape[0]
    Allowed inputs are lists, numpy arrays, scipy-sparse
    matrices or pandas dataframes.

test_size : float or int, default=None
    If float, should be between 0.0 and 1.0 and represent the proportion
    of the dataset to include in the test split. If int, represents the
    absolute number of test samples. If None

In [45]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the split reproducible.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state = 1) 

In [46]:
len(df_full_train), len(df_test)

(5634, 1409)

In [49]:
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state = 1) # validation = 20% of all rows, i.e. 0.20 / 0.80 = 25% of full_train

In [50]:
# Sizes of the three final splits: train / validation / test
len(df_train), len(df_val), len(df_test)

(4225, 1409, 1409)

In [52]:
# Discard the shuffled original row labels so each split is indexed 0..n-1.
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [53]:
# Extract the target of each split as a NumPy array.
y_train, y_val, y_test = (
    part.churn.values for part in (df_train, df_val, df_test)
)

In [55]:
# Remove the target column from every feature frame (in place).
for part in (df_train, df_val, df_test):
    del part['churn']

# EDA (Exploratory data analysis)

- Check missing values
- Look at the target variable(churn)
- Look at numerical and categorical variables

In [56]:
df_full_train = df_full_train.reset_index(drop = True)

In [57]:
df_full_train.isnull().sum()

customerid          0
gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

In [63]:
df_full_train.churn.value_counts(normalize = True) # normalize=True returns proportions (fractions summing to 1) instead of raw counts

churn
0    0.730032
1    0.269968
Name: proportion, dtype: float64

In [65]:
df_full_train.churn.mean() # as values are only 0 and 1 

np.float64(0.26996805111821087)

In [66]:
global_churn_rate = df_full_train.churn.mean()

In [67]:
numerical = ['tenure', 'monthlycharges', 'totalcharges'] # look at dtypes for numerical columns

In [68]:
df_full_train.columns

Index(['customerid', 'gender', 'seniorcitizen', 'partner', 'dependents',
       'tenure', 'phoneservice', 'multiplelines', 'internetservice',
       'onlinesecurity', 'onlinebackup', 'deviceprotection', 'techsupport',
       'streamingtv', 'streamingmovies', 'contract', 'paperlessbilling',
       'paymentmethod', 'monthlycharges', 'totalcharges', 'churn'],
      dtype='object')

In [69]:
categorial = ['gender', 'seniorcitizen', 'partner', 'dependents',
              'phoneservice', 'multiplelines', 'internetservice',
              'onlinesecurity', 'onlinebackup', 'deviceprotection',
              'techsupport', 'streamingtv', 'streamingmovies',
              'contract', 'paperlessbilling', 'paymentmethod']

In [70]:
df_full_train[categorial].nunique()

gender              2
seniorcitizen       2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
dtype: int64

# Feature importance: churn rate and risk ratio 

### churn rate

In [72]:
df_full_train.head()

Unnamed: 0,customerid,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,...,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,5442-pptjy,male,0,yes,yes,12,yes,no,no,no_internet_service,...,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.7,258.35,0
1,6261-rcvns,female,0,no,no,42,yes,no,dsl,yes,...,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.9,3160.55,1
2,2176-osjuv,male,0,yes,no,71,yes,yes,dsl,yes,...,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75,0
3,6161-erdgd,male,0,yes,yes,71,yes,yes,dsl,yes,...,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85,0
4,2364-ufrom,male,0,no,no,30,yes,no,dsl,yes,...,no,yes,yes,no,one_year,no,electronic_check,70.4,2044.75,0


In [73]:
# Churn rate among female customers
is_female = df_full_train['gender'] == 'female'
churn_female = df_full_train.loc[is_female, 'churn'].mean()
churn_female

np.float64(0.27682403433476394)

In [74]:
# Churn rate among male customers
is_male = df_full_train['gender'] == 'male'
churn_male = df_full_train.loc[is_male, 'churn'].mean()
churn_male

np.float64(0.2632135306553911)

In [75]:
df_full_train.partner.value_counts()

partner
no     2932
yes    2702
Name: count, dtype: int64

In [78]:
# Churn rate among customers who have a partner
has_partner = df_full_train['partner'] == 'yes'
churn_partner = df_full_train.loc[has_partner, 'churn'].mean()
churn_partner

np.float64(0.20503330866025166)

In [80]:
# Churn rate among customers who do not have a partner
no_partner = df_full_train['partner'] == 'no'
churn_no_partner = df_full_train.loc[no_partner, 'churn'].mean()
churn_no_partner

np.float64(0.3298090040927694)

1. Difference
    - global churn rate − group churn rate
    - if > 0 → the group is less likely to churn than average
    - if < 0 → the group is more likely to churn than average

In [86]:
global_churn_rate - churn_female

np.float64(-0.006855983216553063)

In [87]:
global_churn_rate - churn_male

np.float64(0.006754520462819769)

In [88]:
global_churn_rate - churn_partner

np.float64(0.06493474245795922)

In [89]:
global_churn_rate - churn_no_partner

np.float64(-0.05984095297455855)

2. Risk ratio
   - group churn rate / global churn rate
   - if > 1 → the group is more likely to churn than average
   - if < 1 → the group is less likely to churn than average

In [90]:
churn_no_partner / global_churn_rate

np.float64(1.2216593879412643)

In [91]:
churn_partner / global_churn_rate

np.float64(0.7594724924338315)