In [47]:
%%capture
#### Load Libraries (without cell output)

import autograd.numpy as np # import autograd wrapped numpy
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import sklearn
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import neighbors, metrics
from sklearn.preprocessing import scale

import statsmodels.api as sm
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Gamma

In [48]:
claims = pd.read_csv("./Qualification_Package/Claims_Years_1_to_3.csv")
print(claims['claim_amount'].mean())

print(claims.shape)

1187.758451280604
(14173, 24)


In [49]:
print(claims.dtypes)
print(claims.columns.to_list())


id_policy                  object
year                        int64
pol_no_claims_discount    float64
pol_duration                int64
pol_pay_freq               object
pol_payd                   object
pol_usage                  object
drv_sex1                   object
drv_age1                    int64
drv_age_lic1                int64
drv_drv2                   object
drv_sex2                   object
drv_age2                  float64
drv_age_lic2              float64
vh_make_model              object
vh_age                      int64
vh_fuel                    object
vh_type                    object
vh_speed                  float64
vh_value                  float64
vh_weight                 float64
population                  int64
town_surface_area         float64
claim_amount              float64
dtype: object
['id_policy', 'year', 'pol_no_claims_discount', 'pol_duration', 'pol_pay_freq', 'pol_payd', 'pol_usage', 'drv_sex1', 'drv_age1', 'drv_age_lic1', 'drv_drv2', 'drv_sex2', '

In [50]:
objects = claims.select_dtypes(['object'])
categorical = claims.select_dtypes(['int64'])
continuous = claims.select_dtypes(['float64'])

In [51]:
from sklearn.impute import SimpleImputer

objects_filled = SimpleImputer(missing_values=np.nan, strategy='most_frequent').fit_transform(objects)
categorial_filled = SimpleImputer(missing_values=np.nan, strategy='median').fit_transform(categorical)
continuous_filled = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(continuous)

objects_filled = pd.DataFrame(objects_filled)
categorial_filled = pd.DataFrame(categorial_filled)
continuous_filled = pd.DataFrame(continuous_filled)

In [52]:
df = pd.concat([objects_filled, categorial_filled, continuous_filled], axis = 1)

In [53]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4.1,5.1,0.1,1.1,2.1,3.1,4.2,5.2,6.1,7.1
0,PL042479,Biannual,No,WorkPrivate,M,No,0,zspzyfdefowgwddf,Diesel,Tourism,...,5.0,320.0,0.000,48.031279,26.131633,200.0,25000.0,1300.0,58.3,339.84
1,PL042480,Yearly,No,Retired,F,No,0,hselphnqlvecmmyx,Diesel,Tourism,...,1.0,530.0,0.000,48.031279,26.131633,158.0,13390.0,885.0,385.6,1236.00
2,PL042548,Biannual,No,Professional,F,Yes,M,demgvtbzilochupd,Gasoline,Tourism,...,4.0,960.0,0.000,42.000000,24.000000,160.0,2088.0,0.0,701.5,224.33
3,PL042474,Biannual,No,WorkPrivate,F,Yes,M,aywlgifrijfokyzu,Diesel,Tourism,...,1.0,1290.0,0.000,49.000000,27.000000,182.0,11785.0,1080.0,49.8,1343.41
4,PL042425,Yearly,No,WorkPrivate,F,No,0,ajtardhciglimsdi,Diesel,Tourism,...,3.0,70.0,0.070,48.031279,26.131633,250.0,25250.0,1450.0,30.8,130.23
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14168,PL031958,Biannual,No,WorkPrivate,M,No,0,nyrtstlobluggnkw,Gasoline,Tourism,...,3.0,20.0,0.000,48.031279,26.131633,212.0,38330.0,1218.0,23.2,2383.80
14169,PL012289,Monthly,No,WorkPrivate,M,No,0,kzwthrslljkmbqur,Gasoline,Tourism,...,3.0,780.0,0.087,48.031279,26.131633,158.0,11228.0,992.0,131.3,639.50
14170,PL090328,Monthly,No,Retired,M,No,0,yttvzqeuddvehiqu,Diesel,Tourism,...,6.0,270.0,0.036,48.031279,26.131633,165.0,21535.0,1424.0,281.0,998.86
14171,PL075392,Biannual,No,WorkPrivate,M,Yes,F,fadjogsnmecatcfb,Gasoline,Tourism,...,9.0,340.0,0.000,53.000000,29.000000,164.0,5110.0,828.0,257.1,307.11


In [54]:
# print(objects.columns.to_list(), categorical.columns.to_list(), continuous.columns.to_list())

In [55]:
# mapper = {0: 'id_policy', 
#         10: 'year', 
#         16: 'pol_no_claims_discount', 
#         11: 'pol_duration', 
#         1: 'pol_pay_freq', 
#         2: 'pol_payd', 
#         3: 'pol_usage', 
#         4: 'drv_sex1', 
#         12: 'drv_age1', 
#         13: 'drv_age_lic1', 
#         5: 'drv_drv2', 
#         6: 'drv_sex2', 
#         17: 'drv_age2', 
#         18: 'drv_age_lic2', 
#         7: 'vh_make_model', 
#         14: 'vh_age', 
#         8: 'vh_fuel', 
#         9: 'vh_type', 
#         19: 'vh_speed', 
#         20: 'vh_value', 
#         21: 'vh_weight', 
#         15: 'population', 
#         22: 'town_surface_area', 
#         23: 'claim_amount',
# }
# df = df.rename(columns=mapper)

In [57]:
print(df.shape)
df.head(10)


(14173, 24)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4.1,5.1,0.1,1.1,2.1,3.1,4.2,5.2,6.1,7.1
0,PL042479,Biannual,No,WorkPrivate,M,No,0,zspzyfdefowgwddf,Diesel,Tourism,...,5.0,320.0,0.0,48.031279,26.131633,200.0,25000.0,1300.0,58.3,339.84
1,PL042480,Yearly,No,Retired,F,No,0,hselphnqlvecmmyx,Diesel,Tourism,...,1.0,530.0,0.0,48.031279,26.131633,158.0,13390.0,885.0,385.6,1236.0
2,PL042548,Biannual,No,Professional,F,Yes,M,demgvtbzilochupd,Gasoline,Tourism,...,4.0,960.0,0.0,42.0,24.0,160.0,2088.0,0.0,701.5,224.33
3,PL042474,Biannual,No,WorkPrivate,F,Yes,M,aywlgifrijfokyzu,Diesel,Tourism,...,1.0,1290.0,0.0,49.0,27.0,182.0,11785.0,1080.0,49.8,1343.41
4,PL042425,Yearly,No,WorkPrivate,F,No,0,ajtardhciglimsdi,Diesel,Tourism,...,3.0,70.0,0.07,48.031279,26.131633,250.0,25250.0,1450.0,30.8,130.23
5,PL042414,Monthly,No,WorkPrivate,F,Yes,M,hselphnqlvecmmyx,Diesel,Tourism,...,11.0,10.0,0.0,34.0,14.0,158.0,13390.0,885.0,52.9,390.03
6,PL042436,Biannual,No,WorkPrivate,M,Yes,F,xzdsapxqliboezbc,Diesel,Tourism,...,8.0,240.0,0.0,58.0,39.0,168.0,11876.0,1225.0,103.7,699.54
7,PL042456,Biannual,No,WorkPrivate,M,No,0,xpxsjmglcvcsxwdy,Gasoline,Tourism,...,1.0,1980.0,0.0,48.031279,26.131633,170.0,11129.0,927.0,218.2,381.15
8,PL042449,Biannual,No,Professional,M,No,0,xkzehzohmfrsmolg,Diesel,Commercial,...,1.0,1300.0,0.319,48.031279,26.131633,148.0,16702.0,1350.0,53.0,87.53
9,PL042664,Monthly,No,WorkPrivate,F,Yes,F,zspzyfdefowgwddf,Diesel,Tourism,...,4.0,390.0,0.0,34.0,16.0,200.0,25000.0,1300.0,396.5,670.91
