## Prepaparation

In [1]:
import pandas as pd
import numpy as np

#model
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

#other
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler



## Loading datasets

In [2]:
print('loading dataset...')
tr = pd.read_csv("./training_data.csv")
#te = pd.read_csv("./testing_data.csv")
print('loading complete')

tr_label = tr['Next_Premium']
tr_feature = tr
drop_feature = ['Next_Premium','Prior_Policy_Number','nequipment9','Vehicle_Make_and_Model1','Distribution_Channel','Accident_Date','Claim_Number']
for fe in drop_feature:
    tr_feature = tr_feature.drop(fe, axis=1)

tr_x, te_x, tr_y, te_y = train_test_split(tr_feature, tr_label, test_size=0.33, random_state=2018)


scaler_x = StandardScaler().fit(tr_x)
scalerY = StandardScaler()
#scaler_y = StandardScaler().fit(tr_y)
tr_x = scaler_x.transform(tr_x)
te_x = scaler_x.transform(te_x)
#tr_y = scaler_y.transform(tr_y)

loading dataset...
loading complete


## SVR

In [3]:
model_SVR = SVR()
#model_SVR_params = {'C':[val*0.01 for val in range(1, 10)]}
model_SVR_params = {'C':[0.1]}

## Random forests regression

In [4]:
model_RF = RandomForestRegressor()
model_RF_params = {'n_jobs' : [6], 'random_state' : [2018], 'n_estimators' : [val for val in range(1, 10)], 'max_depth' : [val for val in range(1, 10)]}

## Elastic net regression

In [5]:
model_EN = linear_model.ElasticNet(random_state=2018, alpha=1.0)
model_EN_params = {'alpha':[val*0.1 for val in range(1, 100)]}

## XGB

In [9]:
model_xgb_params = {'objective':['reg:linear'],
                    'learning_rate': [.03, 0.05, .07],
                    'max_depth': [val for val in range(1, 11)],
                    'min_child_weight': [val for val in range(2, 8)],
                    'silent': [0],
                    'subsample': [1],
                    'colsample_bytree': [0.7],
                    'n_estimators': [500],
                    'random_state': [2018],
                    'booster': ['gblinear'],
                    'gamma': [.1, .01, .001],
                    }
model_xgb = xgb.XGBRegressor(n_jobs=10)

## Training

In [None]:
#model_list = [model_SVR, model_RF, model_EN, model_xgb]
model_list = [model_xgb]
params = {model_SVR:model_SVR_params, model_RF:model_RF_params, model_EN:model_EN_params, model_xgb:model_xgb_params}

print('predicting...')
for model in model_list:
    #sc = cross_val_score(model, tr_feature, tr_label, scoring='neg_mean_absolute_error', cv=2)
    grid = GridSearchCV(model, params[model], verbose=True, n_jobs=1)
    grid.fit(tr_x, tr_y)
    pred = grid.predict(te_x)
    sc = mean_absolute_error(te_y, pred)
    print('')
    print(str(model) + ' / score: ' + str(abs(np.average(sc))))

predicting...
Fitting 3 folds for each of 540 candidates, totalling 1620 fits


## Test data predicting

In [8]:
test_res = pd.read_csv("./testing_data.csv")
drop_feature = ['Next_Premium','Prior_Policy_Number','nequipment9','Vehicle_Make_and_Model1','Distribution_Channel','Accident_Date','Claim_Number']
for fe in drop_feature:
    test_res = test_res.drop(fe, axis=1)
    
#predicting
print('predicting')
test_feature = scaler_x.transform(test_res)
pred =  grid.predict(test_feature)
pred = np.maximum(pred, 0)
    

print('csv writing')
#test file writing
test_data_file = pd.read_csv("./testing-set.csv")
test_data_file['Next_Premium'] = pred

test_data_file

test_data_file.to_csv('Lien_dataset/lien_test_result_0828.csv', index=False)
    

predicting
csv writing


## Model saver

In [None]:
from sklearn.externals import joblib

joblib.dump(grid, 'Lien_dataset/Model_saver/xgb_0827.pkl')

#model = joblib.load('Lien_dataset/Model_saver/xgb_0827.pkl')
