In [239]:
%%capture
#### Load Libraries (without cell output)

import autograd.numpy as np # import autograd wrapped numpy
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
import sklearn
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import neighbors, metrics
from sklearn.preprocessing import scale

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import VotingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn import svm

from sklearn.impute import SimpleImputer

In [240]:
def RMSE(x, y):
    """Return the root-mean-squared difference between `x` and `y`.

    The result is sqrt(mean((y - x) ** 2)). Subtraction, squaring and the
    mean are delegated to the operand types, so `y - x` must produce an
    object that exposes `.mean()` (e.g. a NumPy array or a pandas Series).
    """
    squared_error = (y - x) ** 2
    return np.sqrt(squared_error.mean())

# Load the labelled claims table (years 1 to 3) from the local
# Qualification_Package folder; the path is relative to the notebook's
# working directory.
claims = pd.read_csv("./Qualification_Package/Claims_Years_1_to_3.csv")

In [241]:
def preprocess_xy(dataframe):
    claims = dataframe.copy(deep=True)
    claims['pol_pay_freq'] = claims['pol_pay_freq'].replace( {'Biannual': 2, 'Yearly': 1, 'Monthly': 12, 'Quarterly': 4} )
    claims['pol_payd'] = claims['pol_payd'].replace( {'No': 0, 'Yes': 1} )
    claims['drv_sex1'] = claims['drv_sex1'].replace( {'M': 1, 'F': 0} )
    claims['vh_type'] = claims['vh_type'].replace( {'Tourism': 1, 'Commercial': 0} )
    claims['drv_drv2'] = claims['drv_drv2'].replace( {'No': 0, 'Yes': 1} )

    claims['vh_make_model'] = claims['vh_make_model'].apply(hash)

    objects = claims.select_dtypes(['object'])
    categorical = claims.select_dtypes(['int64'])
    continuous = claims.select_dtypes(['float64'])

    objects_filled = SimpleImputer(missing_values=np.nan, strategy='most_frequent').fit_transform(objects)
    categorical_filled = SimpleImputer(missing_values=np.nan, strategy='median').fit_transform(categorical)
    continuous_filled = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(continuous)

    objects_filled = pd.DataFrame(objects_filled, columns=objects.columns.to_list())
    categorical_filled = pd.DataFrame(categorical_filled, columns=categorical.columns.to_list())
    continuous_filled = pd.DataFrame(continuous_filled, columns=continuous.columns.to_list())
    
    design_matrix = pd.get_dummies(objects_filled, columns=['pol_usage', 'drv_sex2', 'vh_fuel'], dtype=int)
    objects_design = design_matrix.drop(columns=['id_policy'])

    df = pd.concat([objects_design, categorical_filled, continuous_filled], axis = 1)

    x = df.iloc[:,:-1]
    y = df.iloc[:,-1]

    x_train, x_test, y_train, y_test = train_test_split(
        x,
        y, 
        train_size = 0.8,
        test_size = 0.2, # train is 75%, test is 25% 
        random_state = 0, # stratify = y,
    )
    return x_train, x_test, y_train, y_test

In [242]:
def preprocess_x(dataframe):
    claims = dataframe.copy(deep=True)
    claims['pol_pay_freq'] = claims['pol_pay_freq'].replace( {'Biannual': 2, 'Yearly': 1, 'Monthly': 12, 'Quarterly': 4} )
    claims['pol_payd'] = claims['pol_payd'].replace( {'No': 0, 'Yes': 1} )
    claims['drv_sex1'] = claims['drv_sex1'].replace( {'M': 1, 'F': 0} )
    claims['vh_type'] = claims['vh_type'].replace( {'Tourism': 1, 'Commercial': 0} )
    claims['drv_drv2'] = claims['drv_drv2'].replace( {'No': 0, 'Yes': 1} )
    
    claims['vh_make_model'] = claims['vh_make_model'].apply(hash)

    objects = claims.select_dtypes(['object'])
    categorical = claims.select_dtypes(['int64'])
    continuous = claims.select_dtypes(['float64'])

    objects_filled = SimpleImputer(missing_values=np.nan, strategy='most_frequent').fit_transform(objects)
    categorical_filled = SimpleImputer(missing_values=np.nan, strategy='median').fit_transform(categorical)
    continuous_filled = SimpleImputer(missing_values=np.nan, strategy='mean').fit_transform(continuous)

    objects_filled = pd.DataFrame(objects_filled, columns=objects.columns.to_list())
    categorical_filled = pd.DataFrame(categorical_filled, columns=categorical.columns.to_list())
    continuous_filled = pd.DataFrame(continuous_filled, columns=continuous.columns.to_list())

    design_matrix = pd.get_dummies(objects_filled, columns=['pol_usage', 'drv_sex2', 'vh_fuel'], dtype=int)
    objects_design = design_matrix.drop(columns=['id_policy'])

    categorical_filled = categorical_filled.drop(columns=['Unnamed: 0'])

    df = pd.concat([objects_design, categorical_filled, continuous_filled], axis = 1)

    return df

In [243]:
# Encode the labelled claims and keep the 80/20 train/test split for the
# modelling cells below.
x_train, x_test, y_train, y_test = preprocess_xy(claims)

  claims['pol_pay_freq'] = claims['pol_pay_freq'].replace( {'Biannual': 2, 'Yearly': 1, 'Monthly': 12, 'Quarterly': 4} )
  claims['pol_payd'] = claims['pol_payd'].replace( {'No': 0, 'Yes': 1} )
  claims['drv_sex1'] = claims['drv_sex1'].replace( {'M': 1, 'F': 0} )
  claims['vh_type'] = claims['vh_type'].replace( {'Tourism': 1, 'Commercial': 0} )
  claims['drv_drv2'] = claims['drv_drv2'].replace( {'No': 0, 'Yes': 1} )


In [244]:
# Inspect the feature columns that preprocessing produced for training.
x_train.columns

Index(['pol_usage_AllTrips', 'pol_usage_Professional', 'pol_usage_Retired',
       'pol_usage_WorkPrivate', 'drv_sex2_0', 'drv_sex2_F', 'drv_sex2_M',
       'vh_fuel_Diesel', 'vh_fuel_Gasoline', 'vh_fuel_Hybrid', 'year',
       'pol_duration', 'pol_pay_freq', 'pol_payd', 'drv_sex1', 'drv_age1',
       'drv_age_lic1', 'drv_drv2', 'vh_make_model', 'vh_age', 'vh_type',
       'population', 'pol_no_claims_discount', 'drv_age2', 'drv_age_lic2',
       'vh_speed', 'vh_value', 'vh_weight', 'town_surface_area'],
      dtype='object')

In [245]:
# Combine a 30-neighbour KNN regressor and a default-parameter SVR in a
# VotingRegressor, fit it on the training split and print the hold-out RMSE.
# NOTE(review): the features are passed in unscaled (`scale` is imported but
# not used in the cells shown) — confirm this is intended for KNN/SVR.
er = VotingRegressor([('kn', KNeighborsRegressor(n_neighbors=30)), ('sv', svm.SVR())])
er.fit(x_train, y_train)
y_pred = er.predict(x_test)
print(RMSE(y_pred, y_test))

1375.8664419158904


In [246]:
def evaluate(claims, model):
    """Fit `model` on the training split of `claims` and score the held-out split.

    Parameters
    ----------
    claims : pandas.DataFrame
        Raw labelled table, handed unchanged to `preprocess_xy`.
    model : estimator
        Object exposing `fit(X, y)` and `predict(X)`; it is fitted in place.

    Returns
    -------
    float
        Square root of the mean squared error on the test split.
    """
    features_fit, features_holdout, target_fit, target_holdout = preprocess_xy(claims)
    model.fit(features_fit, target_fit)
    holdout_mse = metrics.mean_squared_error(
        target_holdout,
        model.predict(features_holdout),
    )
    return np.sqrt(holdout_mse)

In [247]:
# Repeat the split/fit/score cycle through evaluate(); this refits `er` in place.
print(evaluate(claims, er))

  claims['pol_pay_freq'] = claims['pol_pay_freq'].replace( {'Biannual': 2, 'Yearly': 1, 'Monthly': 12, 'Quarterly': 4} )
  claims['pol_payd'] = claims['pol_payd'].replace( {'No': 0, 'Yes': 1} )
  claims['drv_sex1'] = claims['drv_sex1'].replace( {'M': 1, 'F': 0} )
  claims['vh_type'] = claims['vh_type'].replace( {'Tourism': 1, 'Commercial': 0} )
  claims['drv_drv2'] = claims['drv_drv2'].replace( {'No': 0, 'Yes': 1} )


1375.8664419158904


In [248]:
def predict(x, model):
    """Preprocess the raw frame `x` with `preprocess_x` and return `model`'s predictions for it.

    `model` must already be fitted and expose `predict(X)`.
    """
    return model.predict(preprocess_x(x))

In [249]:
# Load the submission table from the local Qualification_Package folder
# (path relative to the notebook's working directory).
data = pd.read_csv("./Qualification_Package/Submission_Data.csv")

In [250]:
# Encode the submission rows with the feature-only preprocessing function.
data_processed = preprocess_x(data)

  claims['pol_pay_freq'] = claims['pol_pay_freq'].replace( {'Biannual': 2, 'Yearly': 1, 'Monthly': 12, 'Quarterly': 4} )
  claims['pol_payd'] = claims['pol_payd'].replace( {'No': 0, 'Yes': 1} )
  claims['drv_sex1'] = claims['drv_sex1'].replace( {'M': 1, 'F': 0} )
  claims['vh_type'] = claims['vh_type'].replace( {'Tourism': 1, 'Commercial': 0} )
  claims['drv_drv2'] = claims['drv_drv2'].replace( {'No': 0, 'Yes': 1} )


In [251]:
# Inspect the encoded submission columns for comparison with x_train.columns.
data_processed.columns

Index(['pol_usage_AllTrips', 'pol_usage_Professional', 'pol_usage_Retired',
       'pol_usage_WorkPrivate', 'drv_sex2_0', 'drv_sex2_F', 'drv_sex2_M',
       'vh_fuel_Diesel', 'vh_fuel_Gasoline', 'vh_fuel_Hybrid', 'year',
       'pol_duration', 'pol_pay_freq', 'pol_payd', 'drv_sex1', 'drv_age1',
       'drv_age_lic1', 'drv_drv2', 'vh_make_model', 'vh_age', 'vh_type',
       'population', 'pol_no_claims_discount', 'drv_age2', 'drv_age_lic2',
       'vh_speed', 'vh_value', 'vh_weight', 'town_surface_area'],
      dtype='object')

In [252]:
# Compare the (rows, columns) shapes of the training features and the
# encoded submission data.
print(x_train.shape)
print(data_processed.shape)

(11338, 29)
(4140, 29)


In [253]:

# Encode the submission rows again and score them with the fitted ensemble.
# NOTE(review): this repeats the preprocess_x(data) call that produced
# `data_processed` in an earlier cell.
x_new = preprocess_x(data)

predictions = er.predict(x_new)

  claims['pol_pay_freq'] = claims['pol_pay_freq'].replace( {'Biannual': 2, 'Yearly': 1, 'Monthly': 12, 'Quarterly': 4} )
  claims['pol_payd'] = claims['pol_payd'].replace( {'No': 0, 'Yes': 1} )
  claims['drv_sex1'] = claims['drv_sex1'].replace( {'M': 1, 'F': 0} )
  claims['vh_type'] = claims['vh_type'].replace( {'Tourism': 1, 'Commercial': 0} )
  claims['drv_drv2'] = claims['drv_drv2'].replace( {'No': 0, 'Yes': 1} )


In [255]:
# Print the predicted values and the shape of the prediction array.
print(predictions)
print(predictions.shape)

[ 922.08324747 1043.15745529 1136.35801878 ... 1068.82161979  879.04482817
 1352.95161341]
(4140,)
