# Proof of concept claims model

## Imports (keep in sync with requirements.txt)

In [86]:
from pathlib import Path

import numpy as np
import pandas as pd
import statsmodels.api as sm
from statsmodels.iolib.smpickle import load_pickle

## Read in data

In [2]:
# Load the raw insurance dataset from the project's data directory into a DataFrame.
df = pd.read_csv("data/insurance.csv")

## Process data
- Skip feature scaling for now; use a parametric model (GLM).

### One-hot encoding

In [13]:
# Name of the response (target) column; every other column is treated as a feature.
RESPONSE = "charges"

In [40]:
# Split the predictors (every column except the response) by dtype.
# Numeric columns are used as-is; everything else is treated as categorical
# and one-hot encoded further down.
# Testing for "numeric" rather than `dtype == "O"` also classifies
# category-typed columns as categorical; the object-only check would have put
# them in num_features.
feature_columns = [col for col in df.columns if col != RESPONSE]
num_features = [col for col in feature_columns if pd.api.types.is_numeric_dtype(df[col])]
cat_features = [col for col in feature_columns if col not in num_features]

In [46]:
# One-hot encode the categorical features: one indicator column per level.
# The indicator dtype is pinned to an integer type because pandas >= 2.0
# returns boolean indicators by default. Mixed with the numeric columns they
# turn the design matrix into dtype=object, which statsmodels refuses to fit.
# uint8 is what get_dummies produced by default before pandas 2.0.
cat_features_ohe = pd.get_dummies(df[cat_features], dtype="uint8").reset_index(drop=True)

In [62]:
# Full ordered list of model inputs: the numeric features first, then every
# one-hot indicator column.
total_features_ohe = [*num_features, *cat_features_ohe.columns]

In [49]:
# Assemble the modelling frame column-wise: numeric features, one-hot
# indicators, and finally the response.
ohe_parts = [df[num_features], cat_features_ohe, df[RESPONSE]]
df_ohe = pd.concat(ohe_parts, axis="columns")

## Model

In [108]:
# Design matrix: the numeric features plus the one-hot indicator columns.
# NOTE: despite the variable name, no separate intercept column is added here.
# Every level of each categorical feature is encoded, so the indicators of
# each feature sum to 1 and the matrix is not full rank; individual indicator
# coefficients are therefore not separately identifiable.
# TODO: consider dropping one level per feature and adding an explicit
# constant. That changes the feature names a prediction payload must supply.
df_features_intercept = df_ohe[total_features_ohe]

# Gamma GLM with a log link. `links.Log` is the link class; the lowercase
# `links.log` spelling is a deprecated alias for it.
gamma_model = sm.GLM(
    df_ohe[RESPONSE],
    df_features_intercept,
    family=sm.families.Gamma(link=sm.families.links.Log()),
)
glm_results = gamma_model.fit()

In [109]:
# Display the fitted model's summary table (fit statistics and per-feature coefficients).
glm_results.summary()

0,1,2,3
Dep. Variable:,charges,No. Observations:,1338.0
Model:,GLM,Df Residuals:,1329.0
Model Family:,Gamma,Df Model:,8.0
Link Function:,log,Scale:,0.46695
Method:,IRLS,Log-Likelihood:,-13307.0
Date:,"Tue, 19 Apr 2022",Deviance:,337.73
Time:,00:10:06,Pearson chi2:,621.0
No. Iterations:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
age,0.0286,0.001,21.351,0.000,0.026,0.031
bmi,0.0141,0.003,4.381,0.000,0.008,0.020
children,0.0842,0.016,5.419,0.000,0.054,0.115
sex_female,3.2372,0.047,68.646,0.000,3.145,3.330
sex_male,3.1801,0.048,66.376,0.000,3.086,3.274
smoker_no,2.4585,0.047,51.978,0.000,2.366,2.551
smoker_yes,3.9589,0.052,76.725,0.000,3.858,4.060
region_northeast,1.6906,0.037,45.153,0.000,1.617,1.764
region_northwest,1.6327,0.038,43.380,0.000,1.559,1.706


## Export

In [110]:
# Persist the fitted results so they can be reloaded for scoring.
# Create the target directory first: save() does not create missing parent
# directories, so a fresh checkout without model/ would otherwise fail here.
Path("model").mkdir(parents=True, exist_ok=True)
glm_results.save("model/claims_model.pickle")

## Load the saved model and predict

In [111]:
# Reload the fitted results from the pickle written in the export step.
# NOTE: unpickling can execute arbitrary code - only load model files from a trusted source.
loaded_model = load_pickle("model/claims_model.pickle")

In [127]:
# Example scoring payload: one observation expressed directly in the model's
# feature space (numeric features plus one indicator per categorical level,
# with exactly one indicator set per categorical feature).
json_call = dict(
    age=19,
    bmi=30,
    children=0,
    sex_female=1,
    sex_male=0,
    smoker_no=1,
    smoker_yes=0,
    region_northeast=0,
    region_northwest=0,
    region_southeast=0,
    region_southwest=1,
)

In [139]:
# Turn the payload into the 1-D feature vector the model expects.
# Values are ordered by the column names stored on the fitted model rather
# than by dict insertion order, so a payload with reordered keys cannot
# silently line values up against the wrong coefficients. A payload that
# lacks a required feature fails loudly with KeyError.
feature_order = loaded_model.model.exog_names
json_call_formatted = np.array([json_call[name] for name in feature_order])

In [140]:
# Score the single formatted observation with the reloaded model.
loaded_model.predict(json_call_formatted)

array([3672.7133053])