In [101]:
%%capture
#### Load Libraries (without cell output)

import autograd.numpy as np # import autograd wrapped numpy
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import sklearn
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import neighbors, metrics
from sklearn.preprocessing import scale

import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Gamma

In [102]:
claims = pd.read_csv("./Qualification_Package/Claims_Years_1_to_3.csv")
print(claims['claim_amount'].mean())

print(claims.shape)

1187.758451280604
(14173, 24)


In [103]:
claims['pol_pay_freq'] = claims['pol_pay_freq'].replace( {'Biannual': 2, 'Yearly': 1, 'Monthly': 12, 'Quarterly': 4} )
claims['pol_payd'] = claims['pol_payd'].replace( {'No': 0, 'Yes': 1} )
claims['drv_sex1'] = claims['drv_sex1'].replace( {'M': 1, 'F': 0} )
claims['vh_type'] = claims['vh_type'].replace( {'Tourism': 1, 'Commercial': 0} )
claims['drv_drv2'] = claims['drv_drv2'].replace( {'No': 0, 'Yes': 1} )

  claims['pol_pay_freq'] = claims['pol_pay_freq'].replace( {'Biannual': 2, 'Yearly': 1, 'Monthly': 12, 'Quarterly': 4} )
  claims['pol_payd'] = claims['pol_payd'].replace( {'No': 0, 'Yes': 1} )
  claims['drv_sex1'] = claims['drv_sex1'].replace( {'M': 1, 'F': 0} )
  claims['vh_type'] = claims['vh_type'].replace( {'Tourism': 1, 'Commercial': 0} )
  claims['drv_drv2'] = claims['drv_drv2'].replace( {'No': 0, 'Yes': 1} )


In [104]:
print(claims.dtypes)
print(claims.columns.to_list())


id_policy                  object
year                        int64
pol_no_claims_discount    float64
pol_duration                int64
pol_pay_freq                int64
pol_payd                    int64
pol_usage                  object
drv_sex1                    int64
drv_age1                    int64
drv_age_lic1                int64
drv_drv2                    int64
drv_sex2                   object
drv_age2                  float64
drv_age_lic2              float64
vh_make_model              object
vh_age                      int64
vh_fuel                    object
vh_type                     int64
vh_speed                  float64
vh_value                  float64
vh_weight                 float64
population                  int64
town_surface_area         float64
claim_amount              float64
dtype: object
['id_policy', 'year', 'pol_no_claims_discount', 'pol_duration', 'pol_pay_freq', 'pol_payd', 'pol_usage', 'drv_sex1', 'drv_age1', 'drv_age_lic1', 'drv_drv2', 'drv_sex2', '

In [105]:
objects = claims.select_dtypes(['object'])
categorical = claims.select_dtypes(['int64'])
continuous = claims.select_dtypes(['float64'])

In [135]:
from sklearn.impute import SimpleImputer

objects_filled = SimpleImputer(missing_values=np.nan, strategy='most_frequent').fit_transform(objects)
categorial_filled = SimpleImputer(missing_values=np.nan, strategy='median').fit_transform(categorical)
continuous_filled = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(continuous)

objects_filled = pd.DataFrame(objects_filled, columns=objects.columns.to_list())
categorial_filled = pd.DataFrame(categorial_filled, columns=categorical.columns.to_list())
continuous_filled = pd.DataFrame(continuous_filled, columns=continuous.columns.to_list())

In [149]:
design_matrix = pd.get_dummies(objects_filled, columns=['vh_make_model', 'pol_usage', 'drv_sex2', 'vh_fuel'], dtype=int)
design_matrix

objects_design = design_matrix.drop(columns=['id_policy'])

In [159]:
df = pd.concat([objects_design, categorial_filled, continuous_filled], axis = 1)

In [160]:
print(df.shape)
df.head(10)


(14173, 498)


Unnamed: 0,vh_make_model_aawqanlavsjfqrne,vh_make_model_abacekzzrkhtgpcp,vh_make_model_abcepdrvvynjsufa,vh_make_model_aceqpjprqgzhffuw,vh_make_model_adhoqfsfdpetomvs,vh_make_model_adzzjitkyqlberpu,vh_make_model_aewtdnpoiopumymt,vh_make_model_aifsqdniwqmcuqpv,vh_make_model_aivacsqryguqpdib,vh_make_model_ajktbllxjzfdtwpy,...,vh_type,population,pol_no_claims_discount,drv_age2,drv_age_lic2,vh_speed,vh_value,vh_weight,town_surface_area,claim_amount
0,0,0,0,0,0,0,0,0,0,0,...,1.0,320.0,0.0,48.031279,26.131633,200.0,25000.0,1300.0,58.3,339.84
1,0,0,0,0,0,0,0,0,0,0,...,1.0,530.0,0.0,48.031279,26.131633,158.0,13390.0,885.0,385.6,1236.0
2,0,0,0,0,0,0,0,0,0,0,...,1.0,960.0,0.0,42.0,24.0,160.0,2088.0,0.0,701.5,224.33
3,0,0,0,0,0,0,0,0,0,0,...,1.0,1290.0,0.0,49.0,27.0,182.0,11785.0,1080.0,49.8,1343.41
4,0,0,0,0,0,0,0,0,0,0,...,1.0,70.0,0.07,48.031279,26.131633,250.0,25250.0,1450.0,30.8,130.23
5,0,0,0,0,0,0,0,0,0,0,...,1.0,10.0,0.0,34.0,14.0,158.0,13390.0,885.0,52.9,390.03
6,0,0,0,0,0,0,0,0,0,0,...,1.0,240.0,0.0,58.0,39.0,168.0,11876.0,1225.0,103.7,699.54
7,0,0,0,0,0,0,0,0,0,0,...,1.0,1980.0,0.0,48.031279,26.131633,170.0,11129.0,927.0,218.2,381.15
8,0,0,0,0,0,0,0,0,0,0,...,0.0,1300.0,0.319,48.031279,26.131633,148.0,16702.0,1350.0,53.0,87.53
9,0,0,0,0,0,0,0,0,0,0,...,1.0,390.0,0.0,34.0,16.0,200.0,25000.0,1300.0,396.5,670.91


In [161]:
x = df.iloc[:,:-1]
y = df.iloc[:,-1:]

In [162]:
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y, 
    test_size = 0.25, # train is 75%, test is 25% 
    random_state = 0, # stratify = y,
)

In [165]:
from sklearn.tree import DecisionTreeRegressor

evaluation = {}

for crit in ['squared_error', 'friedman_mse', 'absolute_error', 'poisson']:
    tree = DecisionTreeRegressor(criterion=crit)
    tree.fit(x_train, y_train)
    y_pred = tree.predict(x_test)
    err = round(np.sqrt(metrics.mean_squared_error(y_test, y_pred)), 2)
    evaluation[crit] = err

evaluation


{'squared_error': 1951.75,
 'friedman_mse': 1984.85,
 'absolute_error': 2690.9,
 'poisson': 1920.65}