## Preparation

In [1]:
import pandas as pd
import numpy as np

#model
from sklearn import linear_model
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb

#other
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler



## Loading datasets

In [2]:
# Load the training table. The evaluation split used by the cells below is
# carved out of this same file by train_test_split.
print('loading dataset...')
tr = pd.read_csv("./training_data.csv")
print('loading complete')

# 'Next_Premium' is the regression target; it and the other listed columns
# are excluded from the feature matrix.
tr_label = tr['Next_Premium']
drop_feature = ['Next_Premium','Prior_Policy_Number','nequipment9','Vehicle_Make_and_Model1','Distribution_Channel','Accident_Date','Claim_Number']
# One drop call instead of re-assigning the frame once per column.
tr_feature = tr.drop(drop_feature, axis=1)

tr_x, te_x, tr_y, te_y = train_test_split(tr_feature, tr_label, test_size=0.33, random_state=2018)

# Reshape the underlying ndarray rather than handing the pandas Series to
# np.reshape: that call dispatches to a Series method that pandas deprecated
# and later removed. The (n, 1) column shape is kept because the TensorFlow
# label placeholder further down is declared as [None, 1].
tr_y = tr_y.values.reshape(-1, 1)
te_y = te_y.values.reshape(-1, 1)

# The feature scaler is fitted on the training split only and then applied to
# both splits, so no statistics leak from the held-out rows.
scaler_x = StandardScaler().fit(tr_x)
# NOTE(review): scalerY is created but never fitted or used in the visible
# cells; kept so any later cell that references the name still works.
scalerY = StandardScaler()

tr_x = scaler_x.transform(tr_x)
te_x = scaler_x.transform(te_x)


loading dataset...
loading complete


  return getattr(obj, method)(*args, **kwds)


## Standard ML model

## SVR

In [None]:
# Support-vector regressor with library defaults; only C is searched.
# (An earlier, finer grid was C = 0.01 .. 0.09 in steps of 0.01.)
model_SVR_params = dict(C=[0.1, 0.05, 0.01, 0.5])
model_SVR = SVR()

## Random forests regression

In [None]:
# Random-forest regressor. n_jobs and random_state are pinned through
# single-value grid entries; n_estimators and max_depth are each searched
# over 1..9.
model_RF = RandomForestRegressor()
model_RF_params = {
    'n_jobs': [6],
    'random_state': [2018],
    'n_estimators': list(range(1, 10)),
    'max_depth': list(range(1, 10)),
}

## Elastic net regression

In [None]:
# Elastic-net regressor with built-in cross-validation; the outer grid
# searches the L1/L2 mixing ratio.
# NOTE(review): ElasticNetCV can presumably take the l1_ratio candidates
# directly, which would make the outer grid search redundant - confirm
# against the scikit-learn docs before simplifying. An empty grid ({}) was
# the alternative tried here.
model_EN_params = dict(l1_ratio=[.1, .5, .7, .9, .95, .99, 1])
model_EN = linear_model.ElasticNetCV(random_state=2018, n_jobs=10)

## Linear regression

In [None]:
# Ordinary least squares baseline; the "grid" has a single point that only
# pins n_jobs.
model_LR_params = dict(n_jobs=[10])
model_LR = linear_model.LinearRegression()

## XGB

In [None]:
# XGBoost regressor; every grid entry holds exactly one value, so the grid
# search fits a single configuration.
# NOTE(review): the booster is 'gblinear' while several entries
# (colsample_bytree, gamma, max_depth, min_child_weight, subsample) look like
# tree-booster settings - confirm against the xgboost parameter docs whether
# they have any effect here.
model_xgb = xgb.XGBRegressor(n_jobs=10)
model_xgb_params = dict(
    booster=['gblinear'],
    colsample_bytree=[0.7],
    gamma=[0.1],
    learning_rate=[0.07],
    max_depth=[3],
    min_child_weight=[3],
    n_estimators=[500],
    objective=['reg:linear'],
    random_state=[2018],
    silent=[0],
    subsample=[0.7],
)

## Training

In [None]:
# Estimators tuned in this run. The full candidate list is
# [model_SVR, model_RF, model_EN, model_xgb]; only SVR is enabled here.
model_list = [model_SVR]
# Search space per estimator, keyed by the estimator instance itself.
params = {model_SVR:model_SVR_params, model_RF:model_RF_params, model_EN:model_EN_params, model_xgb:model_xgb_params, model_LR:model_LR_params}

print('predicting...')
for model in model_list:
    # Select hyper-parameters on the same metric that is reported below
    # (MAE); without `scoring` the search ranks candidates by the estimator's
    # default score instead.
    grid = GridSearchCV(model, params[model], scoring='neg_mean_absolute_error', verbose=True)
    # scikit-learn regressors expect a 1-D target; the labels are stored as an
    # (n, 1) column for the TensorFlow cell, so flatten them here.
    grid.fit(tr_x, np.ravel(tr_y))
    pred = grid.predict(te_x)
    # mean_absolute_error already returns a single non-negative number, so no
    # averaging or abs() is needed.
    sc = mean_absolute_error(np.ravel(te_y), pred)
    print('')
    print(str(model) + ' / score: ' + str(sc))

## Deep learning

In [None]:
#preparation
import tensorflow as tf
# from keras.backend.tensorflow_backend import set_session
# config = tf.ConfigProto()
# config.gpu_options.per_process_gpu_memory_fraction = 1 # set the percentage of memory to use
# set_session(tf.Session(config=config))

def layer(output_dim, input_dim, input, activation = None):
    """Build one fully connected layer: ``activation(input @ W + b)``.

    Parameters
    ----------
    output_dim : int
        Number of columns of the weight matrix, i.e. the layer's output width.
    input_dim : int
        Number of rows of the weight matrix; must match the width of ``input``.
    input : tensor
        Left operand passed to ``tf.matmul``.
    activation : callable, optional
        Applied to the affine result when truthy; when omitted the affine
        result is returned unchanged.

    Returns
    -------
    tensor
        The (optionally activated) output of the layer.
    """
    weight_shape = [input_dim, output_dim]
    bias_shape = [1, output_dim]

    # Both variables start from tf.random_normal with its default arguments.
    with tf.name_scope('weight'):
        weights = tf.Variable(tf.random_normal(weight_shape))

    with tf.name_scope('bias'):
        biases = tf.Variable(tf.random_normal(bias_shape))

    affine = tf.matmul(input, weights) + biases
    return activation(affine) if activation else affine


# Width of one row of the scaled training matrix; sizes the input placeholder
# and the first layer.
n_features = len(list(tr_x[0]))

with tf.name_scope('input'):
    x = tf.placeholder(tf.float32, [None, n_features], name='feature')
    y = tf.placeholder(tf.float32, [None, 1], name='label')

# NOTE(review): W and b below are created in the graph but are not referenced
# by the network, loss or training op defined in this cell.
with tf.name_scope('Weight'):
    W = tf.Variable(tf.truncated_normal([n_features, 1], stddev=0.1), name='Weight')

with tf.name_scope('bias'):
    b = tf.Variable(tf.constant(0.1, shape=[1]), name='bias')

# Network: one 400-unit ReLU hidden layer followed by a single linear output.
with tf.name_scope('Layer'):
    l1 = layer(400, n_features, x, activation=tf.nn.relu)
    y_pred = layer(1, 400, l1, activation=None)

# Root-mean-squared error between labels and predictions.
with tf.name_scope('loss'):
    residual = tf.subtract(y, y_pred)
    loss = tf.sqrt(tf.reduce_mean(tf.square(residual)))
    tf.summary.scalar('loss', loss)

with tf.name_scope('train'):
    train_step = tf.train.AdamOptimizer(0.0001).minimize(loss)

merge = tf.summary.merge_all()
saver = tf.train.Saver()

# Train the network for 50 epochs of mini-batch updates and report the MAE on
# the held-out split after every epoch.
with tf.Session(config=tf.ConfigProto(log_device_placement = True, allow_soft_placement = True)) as sess:
    sess.run(tf.global_variables_initializer())
    train_writer = tf.summary.FileWriter('Lien_dataset/tensorflow_info/train', sess.graph)
    test_writer = tf.summary.FileWriter('Lien_dataset/tensorflow_info/test', sess.graph)
    try:
        batch_size = 20
        # Ceiling division so the trailing partial batch is trained on too.
        # Floor division silently dropped the last len(tr_y) % batch_size rows
        # every epoch.
        batch_num = (len(tr_y) + batch_size - 1) // batch_size

        for epoch in range(50):
            for batch_i in range(batch_num):
                start = batch_i * batch_size
                stop = start + batch_size
                # Slicing clamps an out-of-range stop to the end of the array
                # and never raises, so no try/except fallback is needed for
                # the short final batch.
                batch_xs = tr_x[start:stop]
                batch_ys = tr_y[start:stop]

                sess.run(train_step, feed_dict={x:batch_xs, y:batch_ys})

            sc = mean_absolute_error(te_y, y_pred.eval(feed_dict = {x:te_x}))
            print('Epoch: ' + str(epoch) + ' score: ' + str(sc))
    finally:
        # Flush and release the event files even if training is interrupted.
        train_writer.close()
        test_writer.close()
            
        
        


  from ._conv import register_converters as _register_converters


Epoch: 0 score: 3240.3708699456515
Epoch: 1 score: 2893.0145688715106
Epoch: 2 score: 2775.5070929762437
Epoch: 3 score: 2703.293452468219
Epoch: 4 score: 2641.9727131496247
Epoch: 5 score: 2591.902076177431
Epoch: 6 score: 2551.544591156414
Epoch: 7 score: 2517.990198362083
Epoch: 8 score: 2489.227830875255
Epoch: 9 score: 2463.9842550995445
Epoch: 10 score: 2441.5544757336847
Epoch: 11 score: 2421.6829022927436
Epoch: 12 score: 2403.9895341803467
Epoch: 13 score: 2388.113948657688
Epoch: 14 score: 2373.6521042229997
Epoch: 15 score: 2360.5081048462957
Epoch: 16 score: 2348.365761669793
Epoch: 17 score: 2337.0576691595693
Epoch: 18 score: 2326.5778322776646
Epoch: 19 score: 2316.92938461709
Epoch: 20 score: 2307.9661165172947
Epoch: 21 score: 2299.51482541183
Epoch: 22 score: 2291.454090273896
Epoch: 23 score: 2283.751171223869
Epoch: 24 score: 2276.4547204423893
Epoch: 25 score: 2269.5093975017767
Epoch: 26 score: 2262.8700595274104
Epoch: 27 score: 2256.526435613166
Epoch: 28 score:

## Predicting on test data

In [None]:
# Columns removed from the test table before scaling (same list the training
# cell uses).
drop_feature = ['Next_Premium','Prior_Policy_Number','nequipment9','Vehicle_Make_and_Model1','Distribution_Channel','Accident_Date','Claim_Number']
test_res = pd.read_csv("./testing_data.csv").drop(drop_feature, axis=1)

#predicting
print('predicting')
# Reuse the scaler fitted on the training split, then clamp negative
# predictions to zero.
test_feature = scaler_x.transform(test_res)
pred = np.maximum(grid.predict(test_feature), 0)

print('csv writing')
# Fill the prediction column of the submission template and write it out.
test_data_file = pd.read_csv("./testing-set.csv")
test_data_file['Next_Premium'] = pred
test_data_file.to_csv('Lien_dataset/lien_test_result_0829.csv', index=False)
    

## Model saver

In [None]:
# `sklearn.externals.joblib` was deprecated and later removed from
# scikit-learn; import joblib directly and fall back to the vendored copy
# only on old installations where the standalone package is missing.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE(review): the file name says "xgb", but what gets persisted is whatever
# `grid` currently holds (the last estimator fitted by the training cell).
model_path = 'Lien_dataset/Model_saver/xgb_0827.pkl'

joblib.dump(grid, model_path)

# joblib.load unpickles arbitrary objects - only load files you produced
# yourself.
model = joblib.load(model_path)
model.best_params_