In [1]:
%%capture
#### Load Libraries (without cell output)

import autograd.numpy as np # import autograd wrapped numpy
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import sklearn
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import neighbors, metrics
from sklearn.preprocessing import scale

In [2]:
# Read the raw claims table from the qualification package into a DataFrame.
claims_csv_path = "./Qualification_Package/Claims_Years_1_to_3.csv"
claims = pd.read_csv(claims_csv_path)

In [3]:
#### Calculate RMSE for predictions
def RMSE(x, y):
    """Return the root-mean-squared error between two sets of values.

    Parameters
    ----------
    x : array-like or scalar
        Predicted values.
    y : array-like or scalar
        Observed values; must be broadcastable against ``x``.

    Returns
    -------
    float
        ``sqrt(mean((y - x) ** 2))``.

    Notes
    -----
    Both inputs are converted with ``np.asarray`` before subtracting, so the
    comparison is always positional. Subtracting two pandas Series directly
    would align them on index labels instead, which silently pairs the wrong
    rows (or yields NaN) when, for example, a shuffled test-set Series is
    compared with predictions carrying a fresh 0..n-1 index. The conversion
    also lets plain lists and scalars be passed.
    """
    errors = np.asarray(y) - np.asarray(x)
    return np.sqrt((errors ** 2).mean())

# This benchmark RMSE is 2193.342.
# We must iterate from here to get a better model, either with GLM, machine learning, or something else.

In [4]:
from sklearn.impute import SimpleImputer

# Integer codes for the string-valued columns that are encoded directly
# (the remaining string columns are one-hot encoded further down).
string_encodings = {
    'pol_pay_freq': {'Biannual': 2, 'Yearly': 1, 'Monthly': 12, 'Quarterly': 4},
    'pol_payd': {'No': 0, 'Yes': 1},
    'drv_sex1': {'M': 1, 'F': 0},
    'vh_type': {'Tourism': 1, 'Commercial': 0},
    'drv_drv2': {'No': 0, 'Yes': 1},
}
for encoded_column, value_map in string_encodings.items():
    # Convert the replaced values to a numeric dtype explicitly instead of
    # relying on the silent downcasting inside `replace`, which pandas has
    # deprecated. The dtype-based column selection below only works if these
    # columns really end up as int64 / float64.
    claims[encoded_column] = claims[encoded_column].replace(value_map).infer_objects()

# Partition the columns by dtype; each group gets its own imputation strategy.
objects = claims.select_dtypes(['object'])
categorical = claims.select_dtypes(['int64'])
continuous = claims.select_dtypes(['float64'])


def impute_frame(frame, strategy):
    """Fill NaNs in ``frame`` with a SimpleImputer and return a new DataFrame.

    Parameters
    ----------
    frame : pandas.DataFrame
        Columns to impute.
    strategy : str
        ``SimpleImputer`` strategy name (e.g. 'mean', 'median', 'most_frequent').

    Returns
    -------
    pandas.DataFrame
        Imputed values under the original column names, with a fresh default index.
    """
    filled = SimpleImputer(missing_values=np.nan, strategy=strategy).fit_transform(frame)
    return pd.DataFrame(filled, columns=frame.columns.to_list())


# NOTE(review): the imputers are fit on the full table before the train/test
# split, so test rows influence the fill values, and the target column (last
# float64 column, see below) is itself passed through the mean imputer.
# Confirm this is acceptable or move imputation after the split.
objects_filled = impute_frame(objects, 'most_frequent')
categorial_filled = impute_frame(categorical, 'median')  # (sic) name kept for any later cells that use it
continuous_filled = impute_frame(continuous, 'mean')

# One-hot encode the remaining string columns; id_policy is dropped as an identifier.
design_matrix = pd.get_dummies(objects_filled, columns=['vh_make_model', 'pol_usage', 'drv_sex2', 'vh_fuel'], dtype=int)
objects_design = design_matrix.drop(columns=['id_policy'])

df = pd.concat([objects_design, categorial_filled, continuous_filled], axis=1)

# The target is taken positionally: the last column of df, i.e. the last
# float64 column of `claims` (presumably the claim amount - confirm against
# the CSV header). Everything before it is used as a feature.
x = df.iloc[:, :-1]
y = df.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    test_size=0.2,  # 80% train / 20% test
    random_state=0,
)

  claims['pol_pay_freq'] = claims['pol_pay_freq'].replace( {'Biannual': 2, 'Yearly': 1, 'Monthly': 12, 'Quarterly': 4} )
  claims['pol_payd'] = claims['pol_payd'].replace( {'No': 0, 'Yes': 1} )
  claims['drv_sex1'] = claims['drv_sex1'].replace( {'M': 1, 'F': 0} )
  claims['vh_type'] = claims['vh_type'].replace( {'Tourism': 1, 'Commercial': 0} )
  claims['drv_drv2'] = claims['drv_drv2'].replace( {'No': 0, 'Yes': 1} )


In [5]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import svm

# Base learners considered for the ensembles.
r1 = DecisionTreeRegressor(
    criterion='squared_error',
    splitter='best',
    random_state=0,
)
r2 = KNeighborsRegressor(n_neighbors=30)
r3 = svm.SVR()
r4 = LinearRegression()

# Average the tree, k-NN and SVR predictions and score on the hold-out set.
er = VotingRegressor(estimators=[('dt', r1), ('kn', r2), ('sv', r3)])
er.fit(x_train, y_train)
y_pred = er.predict(x_test)

print(RMSE(y_pred, y_test))

1457.421859844611


In [6]:
# Every two-model combination of the three base learners.
er1 = VotingRegressor(estimators=[('dt', r1), ('kn', r2)])
er2 = VotingRegressor(estimators=[('dt', r1), ('sv', r3)])
er3 = VotingRegressor(estimators=[('kn', r2), ('sv', r3)])

# Fit all three ensembles on the training split.
for pair_ensemble in (er1, er2, er3):
    pair_ensemble.fit(x_train, y_train)

# Predict on the hold-out split, in the same order.
y_pred1 = er1.predict(x_test)
y_pred2 = er2.predict(x_test)
y_pred3 = er3.predict(x_test)

# One RMSE per ensemble: dt+kn, dt+sv, kn+sv.
for pair_pred in (y_pred1, y_pred2, y_pred3):
    print(RMSE(pair_pred, y_test))

1565.9609630606826
1563.4353823845313
1372.762275666161


The best voting regressor is the one that combines only KNN and SVM (RMSE ≈ 1372.76). Continue the analysis with this ensemble.

In [7]:
# Ordinary least squares on the same split, scored with the same metric.
lin_reg = LinearRegression().fit(x_train, y_train)
y_pred_lin_reg = lin_reg.predict(x_test)

print(RMSE(y_pred_lin_reg, y_test))

44542734.34000827


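Linear regression performs terribly on this design matrix: its RMSE (≈ 4.45 × 10^7) is several orders of magnitude worse than any of the voting regressors.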