### Import Libraries & Data

In [170]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

In [171]:
df = pd.read_csv('train.csv')
df.drop(columns = 'id', inplace = True)

In [172]:
df2 = pd.read_csv('test.csv')
df2.drop(columns = 'id', inplace = True)

### Data Cleaning

In [173]:
# response variable
df = df[df['cancel'] != -1]

# Specify features
primary_x = ['claim.ind', 'sales.channel','dwelling.type','coverage.type','credit','tenure']

# Drop null records for a primary set of features
df.dropna(axis = 0, subset = primary_x, inplace = True)

# Impute missing premiums as median of premiums:
df['premium'] = df['premium'].fillna(df['premium'].mean())

# Change Data Type:
#df['cancel'] = df['cancel'].astype('category')
df['claim.ind'] = df['claim.ind'].astype('int')
df['tenure'] = df['tenure'].astype('int')
df['sales.channel'] = df['sales.channel'].astype('category')
df['dwelling.type'] = df['dwelling.type'].astype('category')
df['coverage.type'] = df['coverage.type'].astype('category')
df['credit'] = df['credit'].astype('category')
df['year'] = df['year'].astype('category')

# Create corrected columns for plotting purposes:
df['claim'] = df['claim.ind'].apply(lambda x: 'Y' if x == 1 else 'N')
df['claim'] = df['claim'].astype('category')

In [174]:
# response variable
#df2 = df[df['cancel'] != -1]

# Specify features
primary_x = ['claim.ind', 'sales.channel','dwelling.type','coverage.type','credit','tenure']

# Drop null records for a primary set of features
df2.dropna(axis = 0, subset = primary_x, inplace = True)

# Impute missing premiums as median of premiums:
df2['premium'] = df2['premium'].fillna(df2['premium'].mean())

# Change Data Type:
#df['cancel'] = df['cancel'].astype('category')
df2['claim.ind'] = df2['claim.ind'].astype('int')
df2['tenure'] = df2['tenure'].astype('int')
df2['sales.channel'] = df2['sales.channel'].astype('category')
df2['dwelling.type'] = df2['dwelling.type'].astype('category')
df2['coverage.type'] = df2['coverage.type'].astype('category')
df2['credit'] = df2['credit'].astype('category')
df2['year'] = df2['year'].astype('category')

# Create corrected columns for plotting purposes:
df2['claim'] = df2['claim.ind'].apply(lambda x: 'Y' if x == 1 else 'N')
df2['claim'] = df2['claim'].astype('category')

In [175]:
# Specify features
secondary_x = ['ni.gender','ni.marital.status','n.adults','n.children']

# Drop null records for a secondary set of features
df.dropna(axis = 0, subset = secondary_x, inplace = True)

# Change Data Type:
df['ni.marital.status'] = df['ni.marital.status'].astype('int').astype('category')
df['n.adults'] = df['n.adults'].astype('int')
df['n.children'] = df['n.children'].astype('int')
df['ni.gender'] = df['ni.gender'].astype('category')

# Create corrected columns for plotting purposes:
df['married'] = df['ni.marital.status'].apply(lambda x: 'Y' if x == 1 else 'N')

In [176]:
# Specify features
secondary_x = ['ni.gender','ni.marital.status','n.adults','n.children']

# Drop null records for a secondary set of features
df2.dropna(axis = 0, subset = secondary_x, inplace = True)

# Change Data Type:
df2['ni.marital.status'] = df2['ni.marital.status'].astype('int').astype('category')
df2['n.adults'] = df2['n.adults'].astype('int')
df2['n.children'] = df2['n.children'].astype('int')
df2['ni.gender'] = df2['ni.gender'].astype('category')

# Create corrected columns for plotting purposes:
df2['married'] = df2['ni.marital.status'].apply(lambda x: 'Y' if x == 1 else 'N')

In [177]:
# Specify features
tertiary_x = ['zip.code','ni.age','len.at.res']

# Drop null records for a tertiary set of features
df.dropna(axis = 0, subset = tertiary_x, inplace = True)

# Change Data Type:
df['zip.code'] = df['zip.code'].astype('int').astype('str')

# Zonal Info
df['zone'] = df['zip.code'].astype(str).str[0:2]
df['zone'] = df['zone'].astype('category')

In [178]:
# Specify features
tertiary_x = ['zip.code','ni.age','len.at.res']

# Drop null records for a tertiary set of features
df2.dropna(axis = 0, subset = tertiary_x, inplace = True)

# Change Data Type:
df2['zip.code'] = df2['zip.code'].astype('int').astype('str')

# Zonal Info
df2['zone'] = df2['zip.code'].astype(str).str[0:2]
df2['zone'] = df2['zone'].astype('category')

## Dummy Variable Encoding

In [179]:
df.rename(columns = {'coverage.type': 'plan',
                     'dwelling.type': 'property',
                     'sales.channel': 'channel',
                     'ni.gender': 'gender'}, inplace = True)

In [180]:
df2.rename(columns = {'coverage.type': 'plan',
                     'dwelling.type': 'property',
                     'sales.channel': 'channel',
                     'ni.gender': 'gender'}, inplace = True)

In [134]:
#dummy_cols = ['credit','plan','property','channel','claim','zone','married','gender','year']
#df = pd.get_dummies(df, columns = dummy_cols, drop_first = True)
#df.info()

# Logistic Regression: Hypothesis Testing for Interactions

In [182]:
from sklearn.linear_model import LogisticRegression
import statsmodels.formula.api as smf
import sklearn
df2.head()

Unnamed: 0,tenure,claim.ind,n.adults,n.children,gender,ni.marital.status,premium,channel,plan,property,len.at.res,credit,house.color,ni.age,year,zip.code,claim,married,zone
0,5,0,1,3,F,1,913.623201,Broker,C,Tenant,18.0,medium,blue,35.0,2017,85066,N,Y,85
1,15,1,1,4,F,1,1018.426114,Phone,A,House,15.0,medium,white,59.0,2017,98013,Y,Y,98
2,1,1,3,2,M,1,928.418533,Broker,B,Condo,13.0,medium,blue,27.0,2017,15049,Y,Y,15
3,12,0,4,0,M,0,877.649917,Broker,B,Condo,19.0,high,blue,35.0,2017,15026,N,N,15
4,11,0,1,9,F,1,947.281913,Broker,A,Condo,18.0,high,red,34.0,2017,15025,N,Y,15


**Test 1: Interactions with Credit**

In [141]:
# Base Model

formula = "cancel ~ credit"
model = smf.mnlogit(formula=formula, data=df).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.733412
         Iterations 6
                          MNLogit Regression Results                          
Dep. Variable:                 cancel   No. Observations:              1032625
Model:                        MNLogit   Df Residuals:                  1032619
Method:                           MLE   Df Model:                            4
Date:                Sat, 30 Apr 2022   Pseudo R-squ.:                 0.04240
Time:                        01:51:46   Log-Likelihood:            -7.5734e+05
converged:                       True   LL-Null:                   -7.9087e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
        cancel=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -2.8024      0.006   -476.376      0.000      -2.814      -2.791
credit[T.lo

In [146]:
# Interaction 1: 

formula = "cancel ~ credit*channel"
model = smf.mnlogit(formula=formula, data=df).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.701195
         Iterations 7
                          MNLogit Regression Results                          
Dep. Variable:                 cancel   No. Observations:              1032625
Model:                        MNLogit   Df Residuals:                  1032607
Method:                           MLE   Df Model:                           16
Date:                Sat, 30 Apr 2022   Pseudo R-squ.:                 0.08446
Time:                        01:59:03   Log-Likelihood:            -7.2407e+05
converged:                       True   LL-Null:                   -7.9087e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          cancel=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                             -4.0613      0.014   -

In [149]:
# Interaction 2: 

formula = "cancel ~ credit*channel*gender"
model = smf.mnlogit(formula=formula, data=df).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.701182
         Iterations 7
                          MNLogit Regression Results                          
Dep. Variable:                 cancel   No. Observations:              1032625
Model:                        MNLogit   Df Residuals:                  1032589
Method:                           MLE   Df Model:                           34
Date:                Sat, 30 Apr 2022   Pseudo R-squ.:                 0.08448
Time:                        02:01:10   Log-Likelihood:            -7.2406e+05
converged:                       True   LL-Null:                   -7.9087e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                                      cancel=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------------------
Intercept                           

In [151]:
# Interaction 3: 

formula = "cancel ~ credit*channel + property"
model = smf.mnlogit(formula=formula, data=df).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.700441
         Iterations 7
                          MNLogit Regression Results                          
Dep. Variable:                 cancel   No. Observations:              1032625
Model:                        MNLogit   Df Residuals:                  1032603
Method:                           MLE   Df Model:                           20
Date:                Sat, 30 Apr 2022   Pseudo R-squ.:                 0.08545
Time:                        02:02:58   Log-Likelihood:            -7.2329e+05
converged:                       True   LL-Null:                   -7.9087e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          cancel=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                             -3.9837      0.015   -

In [152]:
# Interaction 4: 

formula = "cancel ~ credit*channel + property + claim"
model = smf.mnlogit(formula=formula, data=df).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.697598
         Iterations 8
                          MNLogit Regression Results                          
Dep. Variable:                 cancel   No. Observations:              1032625
Model:                        MNLogit   Df Residuals:                  1032601
Method:                           MLE   Df Model:                           22
Date:                Sat, 30 Apr 2022   Pseudo R-squ.:                 0.08916
Time:                        02:06:19   Log-Likelihood:            -7.2036e+05
converged:                       True   LL-Null:                   -7.9087e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          cancel=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                             -4.0916      0.015   -

In [157]:
# Additive Model 1: 

formula = "cancel ~ credit*channel + property + plan + claim + tenure*married + zone + year"
model = smf.mnlogit(formula=formula, data=df).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.680857
         Iterations 8
                          MNLogit Regression Results                          
Dep. Variable:                 cancel   No. Observations:              1032625
Model:                        MNLogit   Df Residuals:                  1032575
Method:                           MLE   Df Model:                           48
Date:                Sat, 30 Apr 2022   Pseudo R-squ.:                  0.1110
Time:                        02:24:11   Log-Likelihood:            -7.0307e+05
converged:                       True   LL-Null:                   -7.9087e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          cancel=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                             -4.9822      0.027   -

In [183]:
# Additive Model 2: 

formula = "cancel ~ credit*channel + property + married + plan + premium + claim + zone + year"
model = smf.mnlogit(formula=formula, data=df).fit()
print(model.summary())

Optimization terminated successfully.
         Current function value: 0.680383
         Iterations 8
                          MNLogit Regression Results                          
Dep. Variable:                 cancel   No. Observations:              1032625
Model:                        MNLogit   Df Residuals:                  1032577
Method:                           MLE   Df Model:                           46
Date:                Sat, 30 Apr 2022   Pseudo R-squ.:                  0.1116
Time:                        02:50:30   Log-Likelihood:            -7.0258e+05
converged:                       True   LL-Null:                   -7.9087e+05
Covariance Type:            nonrobust   LLR p-value:                     0.000
                          cancel=1       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------------------
Intercept                             -3.9908      0.052    

In [188]:
df2 = df2[df2['property'] != 'Landlord']

In [191]:
df2[df2['property'] == 'Landlord']

Unnamed: 0,tenure,claim.ind,n.adults,n.children,gender,ni.marital.status,premium,channel,plan,property,len.at.res,credit,house.color,ni.age,year,zip.code,claim,married,zone


In [192]:
X = ['credit', 'channel', 'property', 'married', 'plan', 'premium', 'claim', 'zone', 'year']
model.predict(df2[X])

PatsyError: predict requires that you use a DataFrame when predicting from a model
that was created using the formula api.

The original error message returned by patsy is:
mismatching levels: expected ('Condo', 'House', 'Tenant'), got ('Condo', 'House', 'Landlord', 'Tenant')
    cancel ~ credit*channel + property + married + plan + premium + claim + zone + year
                              ^^^^^^^^