In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df = pd.read_csv("../datasets/telco.csv")

In [3]:
del df['customerID']

In [4]:
# We can show all columns with a simple transpose
df.head().T

Unnamed: 0,0,1,2,3,4
gender,Female,Male,Male,Male,Female
SeniorCitizen,0,0,0,0,0
Partner,Yes,No,No,No,No
Dependents,No,No,No,No,No
tenure,1,34,2,45,2
PhoneService,No,Yes,Yes,No,Yes
MultipleLines,No phone service,No,No,No phone service,No
InternetService,DSL,DSL,DSL,DSL,Fiber optic
OnlineSecurity,No,Yes,Yes,Yes,No
OnlineBackup,Yes,No,Yes,No,No


### DATA PREPARATION

In [5]:
# Normalize the column names & categorical values

df.columns = df.columns.str.lower()
categorical_columns = list(df.dtypes[df.dtypes=="object"].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [6]:
df.head().T

Unnamed: 0,0,1,2,3,4
gender,female,male,male,male,female
seniorcitizen,0,0,0,0,0
partner,yes,no,no,no,no
dependents,no,no,no,no,no
tenure,1,34,2,45,2
phoneservice,no,yes,yes,no,yes
multiplelines,no_phone_service,no,no,no_phone_service,no
internetservice,dsl,dsl,dsl,dsl,fiber_optic
onlinesecurity,no,yes,yes,yes,no
onlinebackup,yes,no,yes,no,no


In [7]:
df.dtypes
# See that seniorcitizen column is not categorical, and totalcharges is not numeric

gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges         object
churn                object
dtype: object

In [8]:
tc = pd.to_numeric(df.totalcharges, errors="coerce")

In [9]:
df[tc.isnull()][['totalcharges']]

Unnamed: 0,totalcharges
488,_
753,_
936,_
1082,_
1340,_
3331,_
3826,_
4380,_
5218,_
6670,_


In [10]:
df.totalcharges = tc
df.totalcharges = df.totalcharges.fillna(0)
df.totalcharges.isnull().sum()

np.int64(0)

In [11]:
df.churn = (df.churn=="yes").astype(int)

In [12]:
df.dtypes

gender               object
seniorcitizen         int64
partner              object
dependents           object
tenure                int64
phoneservice         object
multiplelines        object
internetservice      object
onlinesecurity       object
onlinebackup         object
deviceprotection     object
techsupport          object
streamingtv          object
streamingmovies      object
contract             object
paperlessbilling     object
paymentmethod        object
monthlycharges      float64
totalcharges        float64
churn                 int64
dtype: object

### VALIDATION FRAMEWORK

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
len(df_full_train), len(df_test)

(5634, 1409)

In [15]:
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)
len(df_train), len(df_val)

(4225, 1409)

In [16]:
df_full_train = df_full_train.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [17]:
y_full_train = df_full_train.churn.values
y_train = df_train.churn.values
y_val = df_val.churn.values
y_test = df_test.churn.values

In [18]:
# del df_full_train['churn']
# del df_train['churn']
# del df_val['churn']
# del df_test['churn'] 

### EDA

In [19]:
df_train.isnull().sum()

gender              0
seniorcitizen       0
partner             0
dependents          0
tenure              0
phoneservice        0
multiplelines       0
internetservice     0
onlinesecurity      0
onlinebackup        0
deviceprotection    0
techsupport         0
streamingtv         0
streamingmovies     0
contract            0
paperlessbilling    0
paymentmethod       0
monthlycharges      0
totalcharges        0
churn               0
dtype: int64

<p> Checking churn rate

In [20]:
print("Churn")
print((y_full_train == 1).sum())
print("Not churn")
print((y_full_train == 0).sum())

Churn
1521
Not churn
4113


<p> In binary classification, the mean of the target variable is the same as the proportion of ones in the variable, cause of binary properties

In [21]:
print("Churn rate")
print(round(y_full_train.mean(),2))

print((y_full_train == 1).sum()/((y_full_train == 1).sum() + (y_full_train == 0).sum()))

Churn rate
0.27
0.26996805111821087


In [22]:
numeric_columns = list(df_full_train.select_dtypes(include=['int64', 'float64']).columns)

categorical_columns = list(df_full_train.select_dtypes(include=["object"]).columns)

In [23]:
del numeric_columns[0] # remove seniorcitizen

In [24]:
categorical_columns.append('seniorcitizen') # add seniorcitizen

In [25]:
df_full_train[categorical_columns].nunique()

gender              2
partner             2
dependents          2
phoneservice        2
multiplelines       3
internetservice     3
onlinesecurity      3
onlinebackup        3
deviceprotection    3
techsupport         3
streamingtv         3
streamingmovies     3
contract            3
paperlessbilling    2
paymentmethod       4
seniorcitizen       2
dtype: int64

### FEATURE IMPORTANCE
<p>we can check which feature holds more weight by checking its churn, grouped by its feature

In [26]:
global_churn = y_full_train.mean()
global_churn

np.float64(0.26996805111821087)

In [27]:
churn_female = y_full_train[df_full_train.gender == 'female'].mean()
churn_female

np.float64(0.27682403433476394)

In [28]:
churn_male = y_full_train[df_full_train.gender == 'male'].mean()
churn_male

np.float64(0.2632135306553911)

<p> Female groups are more likely to churn, now lets check the partner feature

In [29]:
churn_partner = y_full_train[df_full_train.partner == "yes"].mean()
churn_partner

np.float64(0.20503330866025166)

In [30]:
churn_nopartner = y_full_train[df_full_train.partner == "no"].mean()
churn_nopartner

np.float64(0.3298090040927694)

#### RISK RATIO
##### In the context of machine learning and classification, the “risk ratio” typically refers to a statistical measure used to assess the likelihood or probability of a certain event occurring in one group compared to another

In [31]:
churn_nopartner/global_churn

np.float64(1.2216593879412643)

In [32]:
df_full_train.groupby('gender').churn.mean()

gender
female    0.276824
male      0.263214
Name: churn, dtype: float64

In [33]:
df_full_train.groupby('gender').churn.agg(['mean', 'count'])

Unnamed: 0_level_0,mean,count
gender,Unnamed: 1_level_1,Unnamed: 2_level_1
female,0.276824,2796
male,0.263214,2838


In [34]:
df_group = df_full_train.groupby('gender').churn.agg(['mean', 'count'])
df_group['diff'] = df_group['mean'] - global_churn
df_group['risk'] = df_group['mean'] / global_churn
df_group

Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.276824,2796,0.006856,1.025396
male,0.263214,2838,-0.006755,0.97498


<p> Let's do this for all categorical variable

In [35]:
from IPython.display import display

for c in categorical_columns:
    df_group = df_full_train.groupby(c).churn.agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - global_churn
    df_group['risk'] = df_group['mean'] / global_churn
    display(df_group)
    print()
    print()

Unnamed: 0_level_0,mean,count,diff,risk
gender,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.276824,2796,0.006856,1.025396
male,0.263214,2838,-0.006755,0.97498






Unnamed: 0_level_0,mean,count,diff,risk
partner,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.329809,2932,0.059841,1.221659
yes,0.205033,2702,-0.064935,0.759472






Unnamed: 0_level_0,mean,count,diff,risk
dependents,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.31376,3968,0.043792,1.162212
yes,0.165666,1666,-0.104302,0.613651






Unnamed: 0_level_0,mean,count,diff,risk
phoneservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.241316,547,-0.028652,0.89387
yes,0.273049,5087,0.003081,1.011412






Unnamed: 0_level_0,mean,count,diff,risk
multiplelines,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.257407,2700,-0.012561,0.953474
no_phone_service,0.241316,547,-0.028652,0.89387
yes,0.290742,2387,0.020773,1.076948






Unnamed: 0_level_0,mean,count,diff,risk
internetservice,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
dsl,0.192347,1934,-0.077621,0.712482
fiber_optic,0.425171,2479,0.155203,1.574895
no,0.077805,1221,-0.192163,0.288201






Unnamed: 0_level_0,mean,count,diff,risk
onlinesecurity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.420921,2801,0.150953,1.559152
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.153226,1612,-0.116742,0.56757






Unnamed: 0_level_0,mean,count,diff,risk
onlinebackup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.404323,2498,0.134355,1.497672
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.217232,1915,-0.052736,0.80466






Unnamed: 0_level_0,mean,count,diff,risk
deviceprotection,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.395875,2473,0.125907,1.466379
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.230412,1940,-0.039556,0.85348






Unnamed: 0_level_0,mean,count,diff,risk
techsupport,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.418914,2781,0.148946,1.551717
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.159926,1632,-0.110042,0.59239






Unnamed: 0_level_0,mean,count,diff,risk
streamingtv,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.342832,2246,0.072864,1.269897
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.302723,2167,0.032755,1.121328






Unnamed: 0_level_0,mean,count,diff,risk
streamingmovies,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.338906,2213,0.068938,1.255358
no_internet_service,0.077805,1221,-0.192163,0.288201
yes,0.307273,2200,0.037305,1.138182






Unnamed: 0_level_0,mean,count,diff,risk
contract,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
month-to-month,0.431701,3104,0.161733,1.599082
one_year,0.120573,1186,-0.149395,0.446621
two_year,0.028274,1344,-0.241694,0.10473






Unnamed: 0_level_0,mean,count,diff,risk
paperlessbilling,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
no,0.172071,2313,-0.097897,0.637375
yes,0.338151,3321,0.068183,1.25256






Unnamed: 0_level_0,mean,count,diff,risk
paymentmethod,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bank_transfer_(automatic),0.168171,1219,-0.101797,0.622928
credit_card_(automatic),0.164339,1217,-0.10563,0.608733
electronic_check,0.45589,1893,0.185922,1.688682
mailed_check,0.19387,1305,-0.076098,0.718121






Unnamed: 0_level_0,mean,count,diff,risk
seniorcitizen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.24227,4722,-0.027698,0.897403
1,0.413377,912,0.143409,1.531208






### MUTUAL INFORMATION -> FOR CATEGORICAL VARIABLE
<p> We need to know the correlation from one feature to the other, how important the feature compared to the other

In [36]:
from sklearn.metrics import mutual_info_score

In [37]:
mutual_info_score(df_full_train.churn, df_full_train.contract)

np.float64(0.0983203874041556)

In [38]:
mutual_info_score(df_full_train.churn, df_full_train.gender)

np.float64(0.0001174846211139946)

In [39]:
mutual_info_score(df_full_train.churn, df_full_train.partner)

np.float64(0.009967689095399745)

In [40]:
def mutual_info_churn_score(series):
    return mutual_info_score(series, df_full_train.churn)

In [41]:
df_full_train[categorical_columns].apply(mutual_info_churn_score).sort_values(ascending=False)

contract            0.098320
onlinesecurity      0.063085
techsupport         0.061032
internetservice     0.055868
onlinebackup        0.046923
deviceprotection    0.043453
paymentmethod       0.043210
streamingtv         0.031853
streamingmovies     0.031581
paperlessbilling    0.017589
dependents          0.012346
partner             0.009968
seniorcitizen       0.009410
multiplelines       0.000857
phoneservice        0.000229
gender              0.000117
dtype: float64

### CORRELATION COEFFICIENT -> NUMERICAL VARIABLE
<p> Pearson's correlation coefficient

In [42]:
numeric_columns = numeric_columns[:-1]

In [43]:
df_full_train[numeric_columns].corrwith(df_full_train.churn).abs().sort_values(ascending=False)

tenure            0.351885
monthlycharges    0.196805
totalcharges      0.196353
dtype: float64

In [44]:
df_full_train.tenure.unique()

array([12, 42, 71, 30,  9, 72, 28,  6, 47, 22,  2,  3,  4, 60, 69, 26, 33,
        8, 27, 25, 23, 34, 59, 66,  5, 45, 49, 24,  1, 40, 51, 68, 32, 44,
       67, 11, 53,  7, 55, 31, 13, 63, 64, 10, 62, 35, 57, 20, 65, 18, 46,
       29, 37, 48, 15, 61, 14, 70, 52, 54, 19, 50, 43, 16, 36, 38, 56, 17,
       41, 39, 58, 21,  0])

In [45]:
df_full_train[df_full_train.tenure <= 4].churn.mean()

np.float64(0.5561122244488977)

In [46]:
df_full_train[df_full_train.tenure > 10].churn.mean()

np.float64(0.18211020509019027)

### ONE HOT ENCODING

In [47]:
dicts = df_train[['gender', 'contract']].iloc[:100].to_dict(orient='records')
dicts

[{'gender': 'female', 'contract': 'two_year'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'two_year'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'two_year'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'two_year'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'two_year'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'male', 'contract': 'month-to-month'},
 {'gender': 'female', 'contract': 'two_year'},
 {'gender': 'female', 'contract': 'month-to-month'},
 {'gender': 'male', 'contract': 'one_year'},
 {'gender': 'male', 'contract': 'two_year'},
 {

In [48]:
from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer()

dv.fit(dicts)

In [49]:
dv.transform(dicts)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 200 stored elements and shape (100, 5)>

In [50]:
dv = DictVectorizer(sparse=False)
dv.fit(dicts)

dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'gender=female', 'gender=male'], dtype=object)

In [51]:
dv.transform(dicts)

array([[0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 1.],
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 0., 1.],
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 1., 0.],
       [0., 1., 0., 0., 1.],
       [0., 0., 1., 0., 1.],
       [1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0.],
       [1., 0., 0., 1., 0.],
       [0., 0., 1., 1., 0.],
       [1., 0., 0., 0., 1.],
       [0., 0., 1., 0., 1.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0.],
       [1., 0., 0., 1., 0.],
       [0., 1., 0., 1., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0.],
       [0., 1., 0., 1., 0.],
       [1., 0.

In [52]:
train_dicts = df_train[categorical_columns + numeric_columns].to_dict(orient="records")
train_dicts[0]

{'gender': 'female',
 'partner': 'yes',
 'dependents': 'yes',
 'phoneservice': 'yes',
 'multiplelines': 'yes',
 'internetservice': 'fiber_optic',
 'onlinesecurity': 'yes',
 'onlinebackup': 'yes',
 'deviceprotection': 'yes',
 'techsupport': 'yes',
 'streamingtv': 'yes',
 'streamingmovies': 'yes',
 'contract': 'two_year',
 'paperlessbilling': 'yes',
 'paymentmethod': 'electronic_check',
 'seniorcitizen': 0,
 'tenure': 72,
 'monthlycharges': 115.5,
 'totalcharges': 8425.15}

In [53]:
dv = DictVectorizer(sparse=False)
dv.fit(train_dicts)

In [54]:
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',

In [55]:
X_train = dv.transform(train_dicts)
# instead of last two lines, you can also use
# X_train = dv.fit_transform(train_dicts)
 
X_train.shape

(4225, 45)

In [56]:
val_dict = df_val[categorical_columns + numeric_columns].to_dict(orient="records")

X_val = dv.transform(val_dict)

### LOGISTIC REGRESSION

In [93]:
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

In [57]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [61]:
# To get the weight term
model.coef_[0].round(3)

array([ 0.471, -0.174, -0.404, -0.028, -0.079,  0.063, -0.088, -0.082,
       -0.034, -0.073, -0.334,  0.315, -0.088,  0.004, -0.257,  0.14 ,
        0.01 ,  0.062, -0.088, -0.081,  0.265, -0.088, -0.284, -0.231,
        0.124, -0.164,  0.057, -0.087, -0.032,  0.072, -0.06 ,  0.14 ,
       -0.246,  0.215, -0.12 , -0.088,  0.101, -0.071, -0.088,  0.051,
        0.213, -0.088, -0.232, -0.071,  0.   ])

In [63]:
# TO get the bias term
model.intercept_[0]

np.float64(-0.10819072735550182)

In [65]:
model.predict_proba(X_train)

array([[0.903815  , 0.096185  ],
       [0.32091512, 0.67908488],
       [0.36627175, 0.63372825],
       ...,
       [0.47110439, 0.52889561],
       [0.95699186, 0.04300814],
       [0.30148157, 0.69851843]])

In [66]:
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

In [74]:
churn_decision = y_pred > 0.6

df_val[churn_decision]

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
12,male,0,no,no,4,yes,no,fiber_optic,no,no,no,no,yes,yes,month-to-month,yes,electronic_check,88.45,370.65,1
19,female,0,no,no,7,yes,yes,fiber_optic,no,no,no,no,no,no,month-to-month,yes,electronic_check,74.90,541.15,1
24,female,0,no,no,12,yes,yes,fiber_optic,no,no,yes,no,yes,yes,month-to-month,yes,bank_transfer_(automatic),100.15,1164.30,1
30,male,0,no,no,5,yes,yes,fiber_optic,no,no,no,no,no,no,month-to-month,yes,bank_transfer_(automatic),75.90,357.75,1
40,male,1,yes,no,2,yes,no,fiber_optic,no,no,yes,no,no,no,month-to-month,yes,electronic_check,74.20,140.10,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1397,male,0,yes,no,10,yes,no,fiber_optic,no,no,no,no,no,no,month-to-month,yes,electronic_check,70.30,738.20,1
1398,male,1,no,no,24,yes,yes,fiber_optic,no,yes,yes,no,yes,yes,month-to-month,yes,credit_card_(automatic),102.95,2496.70,1
1399,female,1,no,no,2,yes,no,fiber_optic,no,no,no,no,no,no,month-to-month,yes,electronic_check,69.60,131.65,1
1407,male,0,no,yes,1,yes,yes,fiber_optic,no,no,no,no,yes,yes,month-to-month,yes,electronic_check,93.30,93.30,1


In [75]:
y_val

array([0, 0, 0, ..., 0, 1, 1])

In [77]:
(y_val == churn_decision).mean()
# THe model perform with accuracy of 79.5 %

np.float64(0.794889992902768)

### TRAINING WITH SMALLER SET OF FEATURES

In [78]:
dv.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'dependents=no', 'dependents=yes',
       'deviceprotection=no', 'deviceprotection=no_internet_service',
       'deviceprotection=yes', 'gender=female', 'gender=male',
       'internetservice=dsl', 'internetservice=fiber_optic',
       'internetservice=no', 'monthlycharges', 'multiplelines=no',
       'multiplelines=no_phone_service', 'multiplelines=yes',
       'onlinebackup=no', 'onlinebackup=no_internet_service',
       'onlinebackup=yes', 'onlinesecurity=no',
       'onlinesecurity=no_internet_service', 'onlinesecurity=yes',
       'paperlessbilling=no', 'paperlessbilling=yes', 'partner=no',
       'partner=yes', 'paymentmethod=bank_transfer_(automatic)',
       'paymentmethod=credit_card_(automatic)',
       'paymentmethod=electronic_check', 'paymentmethod=mailed_check',
       'phoneservice=no', 'phoneservice=yes', 'seniorcitizen',
       'streamingmovies=no', 'streamingmovies=no_internet_service',

In [79]:
list(zip(dv.get_feature_names_out(), model.coef_[0].round(3)))

[('contract=month-to-month', np.float64(0.471)),
 ('contract=one_year', np.float64(-0.174)),
 ('contract=two_year', np.float64(-0.404)),
 ('dependents=no', np.float64(-0.028)),
 ('dependents=yes', np.float64(-0.079)),
 ('deviceprotection=no', np.float64(0.063)),
 ('deviceprotection=no_internet_service', np.float64(-0.088)),
 ('deviceprotection=yes', np.float64(-0.082)),
 ('gender=female', np.float64(-0.034)),
 ('gender=male', np.float64(-0.073)),
 ('internetservice=dsl', np.float64(-0.334)),
 ('internetservice=fiber_optic', np.float64(0.315)),
 ('internetservice=no', np.float64(-0.088)),
 ('monthlycharges', np.float64(0.004)),
 ('multiplelines=no', np.float64(-0.257)),
 ('multiplelines=no_phone_service', np.float64(0.14)),
 ('multiplelines=yes', np.float64(0.01)),
 ('onlinebackup=no', np.float64(0.062)),
 ('onlinebackup=no_internet_service', np.float64(-0.088)),
 ('onlinebackup=yes', np.float64(-0.081)),
 ('onlinesecurity=no', np.float64(0.265)),
 ('onlinesecurity=no_internet_service',

<p> Trying model with little subset of feature

In [80]:
small = ['contract', 'tenure', 'monthlycharges']
 
df_train[small].iloc[:10]

Unnamed: 0,contract,tenure,monthlycharges
0,two_year,72,115.5
1,month-to-month,10,95.25
2,month-to-month,5,75.55
3,month-to-month,5,80.85
4,two_year,18,20.1
5,month-to-month,4,30.5
6,month-to-month,1,75.1
7,month-to-month,1,70.3
8,two_year,72,19.75
9,month-to-month,6,109.9


In [82]:
dicts_train_small = df_train[small].to_dict(orient="records")
dicts_val_small = df_val[small].to_dict(orient="records")

In [83]:
dv_small = DictVectorizer(sparse=False)
dv_small.fit(dicts_train_small)

In [84]:
dv_small.get_feature_names_out()

array(['contract=month-to-month', 'contract=one_year',
       'contract=two_year', 'monthlycharges', 'tenure'], dtype=object)

In [85]:
X_train_small = dv_small.transform(dicts_train_small)
model_small = LogisticRegression()
model_small.fit(X_train_small, y_train)

In [87]:
w0 = model_small.intercept_[0]
w = model_small.coef_[0]

In [88]:
w0, w

(np.float64(-2.4779575969847394),
 array([ 0.9711394 , -0.02379507, -0.94828863,  0.02748534, -0.03619005]))

In [92]:
dict(zip(dv_small.get_feature_names_out(), w.round(3)))

{'contract=month-to-month': np.float64(0.971),
 'contract=one_year': np.float64(-0.024),
 'contract=two_year': np.float64(-0.948),
 'monthlycharges': np.float64(0.027),
 'tenure': np.float64(-0.036)}

### TRAINING MODEL WITH TRAIN+VAL DATASET

In [94]:
df_full_train

Unnamed: 0,gender,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,churn
0,male,0,yes,yes,12,yes,no,no,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.70,258.35,0
1,female,0,no,no,42,yes,no,dsl,yes,yes,yes,yes,no,yes,one_year,no,credit_card_(automatic),73.90,3160.55,1
2,male,0,yes,no,71,yes,yes,dsl,yes,yes,no,yes,no,no,two_year,no,bank_transfer_(automatic),65.15,4681.75,0
3,male,0,yes,yes,71,yes,yes,dsl,yes,no,yes,yes,yes,yes,one_year,no,electronic_check,85.45,6300.85,0
4,male,0,no,no,30,yes,no,dsl,yes,yes,no,yes,yes,no,one_year,no,electronic_check,70.40,2044.75,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5629,male,1,no,no,9,yes,yes,fiber_optic,no,no,yes,no,yes,yes,month-to-month,yes,electronic_check,100.50,918.60,1
5630,male,0,no,yes,60,yes,no,no,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,no_internet_service,two_year,no,mailed_check,19.95,1189.90,0
5631,male,0,no,no,28,yes,yes,fiber_optic,no,yes,yes,no,yes,yes,month-to-month,yes,electronic_check,105.70,2979.50,1
5632,male,0,no,no,2,yes,yes,dsl,no,yes,no,no,no,no,month-to-month,yes,mailed_check,54.40,114.10,1


In [95]:
dicts_full_train = df_full_train[categorical_columns+numeric_columns].to_dict(orient="records")

In [101]:
#OHE
full_dv = DictVectorizer(sparse=False)
X_full_train = full_dv.fit_transform(dicts_full_train)

In [102]:
model = LogisticRegression()
model.fit(X_full_train, y_full_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [103]:
dicts_test = df_test[categorical_columns+numeric_columns].to_dict(orient="records")
X_test = full_dv.transform(dicts_test)

In [104]:
y_pred = model.predict_proba(X_test)[:,1]

In [106]:
churn_decision = (y_pred >= 0.5)
(churn_decision == y_test).mean()

np.float64(0.8147622427253371)