
## Project:  Allstate Claims Severity
#### Author:   Joshep Downs, James Peng, Megan Pera, Diana Rodenberger 
#### Purpose:  Predicting cost and severity of claims for AllState
#### Created:  10/29/2016

In [2]:
import unittest

# General libraries.
import re, os, sys
import numpy as np
import pandas as pd
import logging
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
#from sklearn.feature_extraction import 
from sklearn import preprocessing

from sklearn.utils import shuffle

from sklearn.metrics import mean_absolute_error

from sklearn import linear_model

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer as DV

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import KFold

import xgboost as xgb

In [3]:

df_data = pd.read_csv('./data_in/train.csv')

In [4]:
df_data_encoded = df_data.copy()

# encoding into the categorical value
le = preprocessing.LabelEncoder()
for c in df_data_encoded.columns:
    if c.find('cat') >=0: # -1: substring not found, >=0, starting index
        df_data_encoded[c] = le.fit_transform(df_data_encoded[c])

np.random.seed(100)


col = list(df_data_encoded.columns)
col.remove('loss')
col.remove('id')

X = df_data_encoded[col]
y = np.log10(df_data_encoded.loss)
id = df_data_encoded.id

X_train, X_dev, y_train, y_dev, id_train, id_dev = train_test_split( X, y, id, test_size=0.33, random_state=1)

In [5]:
#id_train, x_train,  y_train = shuffle( df_train.id, df_train[col] , df_train.loss, random_state=0)

pd.options.mode.chained_assignment = None


lr = linear_model.LinearRegression()

# Train the model using the training sets
lr.fit(X_train, y_train)

y_pred = lr.predict(X_dev)

s = cross_val_score(lr, X_train, y_train, scoring='neg_mean_absolute_error')

print('mean_absolute_error on training data: {0}'.format(s))

mae = mean_absolute_error(y_dev, y_pred)
print('mean_absolute_error on test data {0}'.format(mae))

print('end')
#if __name__ == '__main__':
    #unittest.main()


mean_absolute_error on training data: [-0.19872471 -0.19753498 -0.19821047]
mean_absolute_error on test data 0.198086964004104
end


In [6]:
# fitting model on training data
xgbr = xgb.XGBRegressor(max_depth=6, n_estimators=500, learning_rate=0.1, subsample=0.8, colsample_bytree=0.4,
                     min_child_weight = 3,  seed=7)
xgbr.fit(X_train, y_train)

print(xgbr)
#Making predictions
y_pred = xgbr.predict(X_dev) 

s = cross_val_score(xgbr, X_train, y_train, scoring='neg_mean_absolute_error')

print('mean_absolute_error on training data: {0}'.format(s))

mae = mean_absolute_error(y_dev, y_pred)
print('mean_absolute_error on test data {0}'.format(mae))

print('end')

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.4,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=6,
       min_child_weight=3, missing=None, n_estimators=500, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=7, silent=True, subsample=0.8)
mean_absolute_error on training data: [-0.18234071 -0.18135463 -0.18286949]
mean_absolute_error on test data 0.18057031968795062
end


In [50]:
df_test = pd.read_csv('./data_in/test.csv')

In [51]:
df_test_encoded = df_test.copy()

# encoding into the categorical value
le = preprocessing.LabelEncoder()
for c in df_test_encoded.columns:
    if c.find('cat') >=0: # -1: substring not found, >=0, starting index
        df_test_encoded[c] = le.fit_transform(df_test_encoded[c])

np.random.seed(100)


col = list(df_test_encoded.columns)
col.remove('id')

X_test = df_test_encoded[col]
id_test = df_test_encoded.id

In [54]:
#id_train, x_train,  y_train = shuffle( df_train.id, df_train[col] , df_train.loss, random_state=0)

pd.options.mode.chained_assignment = None

#use same linear model previously fit with training data
y_test_log_pred = lr.predict(X_test)

# convert from log10 scale to linear scale 
y_test_pred = np.power(10, y_test_log_pred)

# assamble the final dataset
y_test_predicted_loss=pd.DataFrame(id_test)

y_test_pred_df=pd.DataFrame({"loss": y_test_pred})

y_test_predicted_loss=pd.concat([y_test_predicted_loss,y_test_pred_df], axis=1)


#check final output
print("number of ids: ", len(id_test))
print("number of rows in predicted test set: ",len(y_test_predicted_loss) )

#create csv file to submit
y_test_predicted_loss.to_csv('./data_out/AllState_LossPrediction_v1.csv', encoding='utf-8', index=False)


number of ids:  125546
number of rows in predicted test set:  125546
