### Import Libraries & Data

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [4]:
# Read the training data and discard the 'id' column in a single expression.
df = pd.read_csv('train.csv').drop(columns='id')

### Data Cleaning

In [5]:
# Specify features
primary_x = ['claim.ind', 'sales.channel', 'dwelling.type', 'coverage.type', 'credit', 'tenure']

# Response variable: keep only rows whose `cancel` value is not -1, then drop
# records with a null in any primary feature.
# The trailing .copy() makes the result an independent frame, so the column
# assignments below never write into a slice of the original data
# (which is what triggers pandas' SettingWithCopyWarning).
df = (
    df[df['cancel'] != -1]
    .dropna(axis=0, subset=primary_x)
    .copy()
)

# Impute missing premiums with the mean premium of the remaining rows.
# NOTE(review): this step was previously labelled "median" while the code used
# the mean; the mean is kept so the recorded model outputs stay reproducible.
df['premium'] = df['premium'].fillna(df['premium'].mean())

# Change data types:
#df['cancel'] = df['cancel'].astype('category')
df = df.astype({
    'claim.ind': 'int',
    'tenure': 'int',
    'sales.channel': 'category',
    'dwelling.type': 'category',
    'coverage.type': 'category',
    'credit': 'category',
    'year': 'category',
})

# Create a readable Y/N version of the claim indicator for plotting purposes
# (vectorised instead of a row-wise apply).
df['claim'] = np.where(df['claim.ind'] == 1, 'Y', 'N')
df['claim'] = df['claim'].astype('category')

In [6]:
# Specify features
secondary_x = ['ni.gender', 'ni.marital.status', 'n.adults', 'n.children']

# Drop records with a null in any secondary feature.
# Reassigning a copy (instead of dropna(inplace=True)) keeps this cell from
# mutating a frame that may itself be a slice of earlier data.
df = df.dropna(axis=0, subset=secondary_x).copy()

# Change data types (marital status stays integer until 'married' is derived):
df = df.astype({
    'ni.marital.status': 'int',
    'n.adults': 'int',
    'n.children': 'int',
    'ni.gender': 'category',
})

# Create a readable Y/N version of marital status for plotting purposes.
# It is derived from the integer codes and cast to category explicitly, so its
# dtype does not depend on how pandas maps a function over a categorical column
# and it matches the handling of the 'claim' column.
df['married'] = np.where(df['ni.marital.status'] == 1, 'Y', 'N')
df['married'] = df['married'].astype('category')

df['ni.marital.status'] = df['ni.marital.status'].astype('category')

In [7]:
# Specify features
tertiary_x = ['zip.code', 'ni.age', 'len.at.res']

# Drop records with a null in any tertiary feature (reassigned copy rather than
# an in-place drop, so the cell does not mutate a possibly sliced frame).
df = df.dropna(axis=0, subset=tertiary_x).copy()

# Change data type: numeric ZIP -> integer -> string.
# The integer round-trip strips leading zeros (e.g. 2134 for "02134"), so the
# string is left-padded back to five digits; already 5-digit codes are unchanged.
df['zip.code'] = df['zip.code'].astype('int').astype('str').str.zfill(5)

# Zonal info: the first two digits of the (padded) ZIP code.
df['zone'] = df['zip.code'].str[:2].astype('category')

## Dummy Variable Encoding

In [8]:
# Give the dotted source columns the short names used in the model formulas below.
df = df.rename(
    columns={
        'coverage.type': 'plan',
        'dwelling.type': 'property',
        'sales.channel': 'channel',
        'ni.gender': 'gender',
    }
)

In [12]:
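# Manual one-hot encoding, left disabled.
# NOTE(review): presumably disabled because the statsmodels formulas below refer to
# these columns by name, and pd.get_dummies(columns=...) would replace them with
# indicator columns - confirm before re-enabling or deleting.
#dummy_cols = ['credit','plan','property','channel','claim','zone','married','gender','year']
#df = pd.get_dummies(df, columns = dummy_cols, drop_first = True)
#df.info()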

# Logistic Regression: Hypothesis Testing for Interactions

In [9]:
from sklearn.linear_model import LogisticRegression
import statsmodels.formula.api as smf
import sklearn

**Test 1: Interactions with Credit**

In [10]:
# Baseline model: credit is the only predictor of cancel.
formula = "cancel ~ credit"
model = smf.mnlogit(formula, data=df).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.733412
         Iterations 6
                          MNLogit Regression Results                          
Dep. Variable:                 cancel   No. Observations:              1032625
Model:                        MNLogit   Df Residuals:                  1032619
Method:                           MLE   Df Model:                            4
Date:                Sat, 30 Apr 2022   Pseudo R-squ.:                 0.04240
Time:                        22:43:35   Log-Likelihood:            -7.5734e+05
converged:                       True   LL-Null:                   -7.9087e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
        cancel=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -2.8024      0.006   -476.376      0.000      -2.814      -2.791
credit[T.lo

In [11]:
# Interaction 1: credit and channel main effects plus their interaction.
formula = "cancel ~ credit*channel"
model = smf.mnlogit(formula, data=df).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.701195
         Iterations 7
                          MNLogit Regression Results                          
Dep. Variable:                 cancel   No. Observations:              1032625
Model:                        MNLogit   Df Residuals:                  1032607
Method:                           MLE   Df Model:                           16
Date:                Sat, 30 Apr 2022   Pseudo R-squ.:                 0.08446
Time:                        22:44:50   Log-Likelihood:            -7.2407e+05
converged:                       True   LL-Null:                   -7.9087e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          cancel=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                             -4.0613      0.014   -

In [None]:
# Bare list literal: running this cell only displays the list; nothing is assigned.
# NOTE(review): several names here ('adults', 'children', 'len_residence', 'house_color',
# 'age') do not match the column names used elsewhere in this notebook ('n.adults',
# 'n.children', 'len.at.res', 'house.color', 'ni.age') - presumably a scratch list for a
# renamed schema; confirm whether it is still needed.
['id','tenure','claim','adults','children','gender','married','premium','channel',
'plan','property','len_residence','credit','house_color','age','year','zone']

In [27]:
# Preview the first rows of df2.
# NOTE(review): df2 is not created in any cell visible in this notebook, so this cell
# depends on state from elsewhere (another cell or an earlier kernel session) - confirm
# where df2 is built so the notebook can run top to bottom.
df2.head()

Unnamed: 0,cancel,year,zip.code,house.color,ni.age,len.at.res,credit,plan,property,premium,...,n.adults,n.children,tenure,claim.ind,claim,married,zone,age_bins,num_children,num_adults
0,0,2013,98004,blue,37.0,18.0,high,C,Tenant,950.507336,...,2,0,15,0,N,N,98,35-45,0,2
1,0,2013,85017,white,40.0,17.0,medium,A,Condo,909.346046,...,5,0,15,0,N,Y,85,35-45,0,G2
2,0,2013,80021,white,45.0,14.0,high,A,House,897.084502,...,1,0,14,0,N,N,80,35-45,0,1
3,0,2013,85002,red,45.0,24.366136,high,A,House,979.039007,...,5,0,22,1,Y,N,85,35-45,0,G2
4,0,2013,50001,yellow,36.0,16.0,high,C,House,932.379027,...,2,4,4,0,N,Y,50,35-45,G3,2


In [38]:
# Year x zone: year and zone main effects plus their interaction.
# This model is fitted on df2 rather than df.
# NOTE(review): df2 is not defined in the cells visible here - confirm its origin.
formula = "cancel ~ year*zone"
model = smf.mnlogit(formula, data=df2).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.751424
         Iterations 7
                          MNLogit Regression Results                          
Dep. Variable:                 cancel   No. Observations:              1032625
Model:                        MNLogit   Df Residuals:                  1032577
Method:                           MLE   Df Model:                           46
Date:                Sat, 30 Apr 2022   Pseudo R-squ.:                 0.01888
Time:                        23:21:07   Log-Likelihood:            -7.7594e+05
converged:                       True   LL-Null:                   -7.9087e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
               cancel=1       coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------
Intercept                  -3.0973      0.029   -105.735      0.000      -3.155   

In [149]:
# Interaction 2: full three-way crossing of credit, channel and gender
# (all main effects, two-way terms and the three-way term).
formula = "cancel ~ credit*channel*gender"
model = smf.mnlogit(formula, data=df).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.701182
         Iterations 7
                          MNLogit Regression Results                          
Dep. Variable:                 cancel   No. Observations:              1032625
Model:                        MNLogit   Df Residuals:                  1032589
Method:                           MLE   Df Model:                           34
Date:                Sat, 30 Apr 2022   Pseudo R-squ.:                 0.08448
Time:                        02:01:10   Log-Likelihood:            -7.2406e+05
converged:                       True   LL-Null:                   -7.9087e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                      cancel=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------------------
Intercept                           

In [151]:
# Interaction 3: credit x channel, with property added as a main effect only.
formula = "cancel ~ credit*channel + property"
model = smf.mnlogit(formula, data=df).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.700441
         Iterations 7
                          MNLogit Regression Results                          
Dep. Variable:                 cancel   No. Observations:              1032625
Model:                        MNLogit   Df Residuals:                  1032603
Method:                           MLE   Df Model:                           20
Date:                Sat, 30 Apr 2022   Pseudo R-squ.:                 0.08545
Time:                        02:02:58   Log-Likelihood:            -7.2329e+05
converged:                       True   LL-Null:                   -7.9087e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          cancel=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                             -3.9837      0.015   -

In [152]:
# Interaction 4: credit x channel, with property and claim added as main effects only.
formula = "cancel ~ credit*channel + property + claim"
model = smf.mnlogit(formula, data=df).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.697598
         Iterations 8
                          MNLogit Regression Results                          
Dep. Variable:                 cancel   No. Observations:              1032625
Model:                        MNLogit   Df Residuals:                  1032601
Method:                           MLE   Df Model:                           22
Date:                Sat, 30 Apr 2022   Pseudo R-squ.:                 0.08916
Time:                        02:06:19   Log-Likelihood:            -7.2036e+05
converged:                       True   LL-Null:                   -7.9087e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          cancel=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                             -4.0916      0.015   -

In [157]:
# Additive Model 1: credit x channel and tenure x married interactions, plus
# property, plan, claim, zone and year as main effects.
formula = "cancel ~ credit*channel + property + plan + claim + tenure*married + zone + year"
model = smf.mnlogit(formula, data=df).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.680857
         Iterations 8
                          MNLogit Regression Results                          
Dep. Variable:                 cancel   No. Observations:              1032625
Model:                        MNLogit   Df Residuals:                  1032575
Method:                           MLE   Df Model:                           48
Date:                Sat, 30 Apr 2022   Pseudo R-squ.:                  0.1110
Time:                        02:24:11   Log-Likelihood:            -7.0307e+05
converged:                       True   LL-Null:                   -7.9087e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          cancel=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                             -4.9822      0.027   -

In [183]:
# Additive Model 2: credit x channel interaction, plus property, married, plan,
# premium, claim, zone and year as main effects (tenure is not included here).
formula = "cancel ~ credit*channel + property + married + plan + premium + claim + zone + year"
model = smf.mnlogit(formula, data=df).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.680383
         Iterations 8
                          MNLogit Regression Results                          
Dep. Variable:                 cancel   No. Observations:              1032625
Model:                        MNLogit   Df Residuals:                  1032577
Method:                           MLE   Df Model:                           46
Date:                Sat, 30 Apr 2022   Pseudo R-squ.:                  0.1116
Time:                        02:50:30   Log-Likelihood:            -7.0258e+05
converged:                       True   LL-Null:                   -7.9087e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          cancel=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                             -3.9908      0.052    