* This sript is based on the script on 20160603
* The purpose of this script today is to study the combination of different models

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import graphlab
import datetime
import math

from sklearn.ensemble import GradientBoostingRegressor
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

%matplotlib inline

np.random.seed(1)

### Define functions

In [2]:
# datetime.date format: year, month, day
def date_to_day(date):
    date_list = date.strip().split('-')
    return datetime.date(int(date_list[0]),int(date_list[1]),int(date_list[2])).weekday() + 1

In [3]:
# define MAPE evaluation function
def mape(result, result_predicted):
    count = 0
    sum = 0
    for (item_1, item_2) in zip(result, result_predicted):
        if item_1 > 0:
            count = count + 1
            sum = sum + math.fabs((item_1 - item_2)/item_1)
    return (sum, count, sum/count)

In [4]:
# define post_prediction_modification to predicted data
# set all the predicted gap that is less than 1 to 1
def prediction_modification(prediction):
    vect = prediction
    for i in range(len(vect)):
        if vect[i] < 1.0:
            vect[i] = 1.0
    return vect

### Load data

In [5]:
# SFrame.read_csv function to read csv files
data_training = graphlab.SFrame.read_csv("data-all_training.csv", \
                                        column_type_hints=[str,str,long,long,long,long,long,float,long,long,long,long,\
                                                           float,float,float,float,float,str,str,str,str,long,long,\
                                                           long,long,float,long,long,long,long,float,long,long,long,long,\
                                                           long,long,long,long,long,long,long,long,long,long,long,long])
data_test_set_1 = graphlab.SFrame.read_csv("data-all_test_set_1.csv", \
                                          column_type_hints=[str,str,long,long,long,long,long,float,long,long,long,long,\
                                                           float,float,float,float,float,str,str,str,str,long,long,\
                                                           long,long,float,long,long,long,long,float,long,long,long,long,\
                                                           long,long,long,long,long,long,long,long,long,long,long,long])

[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: C:\Users\Marine\AppData\Local\Temp\graphlab_server_1478495358.log.0


This non-commercial license of GraphLab Create for academic use is assigned to lei.mao@duke.edu and will expire on November 21, 2016.


In [6]:
data_training['day_of_week'] = data_training['date'].apply(lambda x: str(date_to_day(x)))
data_test_set_1['day_of_week'] = data_test_set_1['date'].apply(lambda x: str(date_to_day(x)))

data_training['gap_delta'] = data_training['gap_t(j)'] - data_training['gap_t(j-1)']
data_training['gap_delta_1'] = data_training['gap_t(j-1)'] - data_training['gap_t(j-2)']
data_training['gap_delta_2'] = data_training['gap_t(j-2)'] - data_training['gap_t(j-3)']
data_training['gap_delta_3'] = data_training['gap_t(j-1)'] - data_training['gap_t(j-3)']


data_test_set_1['gap_delta_1'] = data_test_set_1['gap_t(j-1)'] - data_test_set_1['gap_t(j-2)']
data_test_set_1['gap_delta_2'] = data_test_set_1['gap_t(j-2)'] - data_test_set_1['gap_t(j-3)']
data_test_set_1['gap_delta_3'] = data_test_set_1['gap_t(j-1)'] - data_test_set_1['gap_t(j-3)']


data_training['log_gap_t(j)'] = data_training['gap_t(j)'].apply(lambda x: math.log10(x+1.0))
data_training['log_gap_t(j-1)'] = data_training['gap_t(j-1)'].apply(lambda x: math.log10(x+1.0))
data_training['log_gap_t(j-2)'] = data_training['gap_t(j-2)'].apply(lambda x: math.log10(x+1.0))
data_training['log_gap_t(j-3)'] = data_training['gap_t(j-3)'].apply(lambda x: math.log10(x+1.0))


data_test_set_1['log_gap_t(j-1)'] = data_test_set_1['gap_t(j-1)'].apply(lambda x: math.log10(x+1.0))
data_test_set_1['log_gap_t(j-2)'] = data_test_set_1['gap_t(j-2)'].apply(lambda x: math.log10(x+1.0))
data_test_set_1['log_gap_t(j-3)'] = data_test_set_1['gap_t(j-3)'].apply(lambda x: math.log10(x+1.0))


# delete data at time_slot_id 1, 2 and 3
data_training = data_training[(data_training['time_slot_id'] > 3)]

In [7]:
data_training_training, data_training_validation = data_training.random_split(.9, seed=1)

In [8]:
features = ['start_district_id', 'time_slot_id', 'gap_t(j-1)', 'gap_t(j-2)', 'gap_t(j-3)', 'gap_averaged',
            'gap_delta_1','gap_delta_2','gap_delta_3',
            'log_gap_t(j-1)','log_gap_t(j-2)','log_gap_t(j-3)',
            'order_t(j-1)', 'order_t(j-2)', 'order_t(j-3)', 'order_averaged', 'price_avg_t(j-1)', 'price_avg_t(j-2)',
            'price_avg_t(j-3)', 'weather_t(j)', 'weather_t(j-1)', 'weather_t(j-2)', 'weather_t(j-3)', 
            'pm_t(j)','pm_t(j-1)','pm_t(j-2)','pm_t(j-3)','pm_avg', 'tj_1_j', 'tj_2_j','tj_3_j','tj_4_j',
            'tj_1_j_1', 'tj_2_j_1','tj_3_j_1','tj_4_j_1', 'tj_1_j_2', 'tj_2_j_2','tj_3_j_2','tj_4_j_2',
            'tj_1_j_3', 'tj_2_j_3','tj_3_j_3','tj_4_j_3','day_of_week']

In [9]:
features = ['start_district_id', 'time_slot_id',
            'log_gap_t(j-1)','log_gap_t(j-2)','log_gap_t(j-3)',
            'order_t(j-1)', 'order_t(j-2)', 'order_t(j-3)', 'order_averaged', 'price_avg_t(j-1)', 'price_avg_t(j-2)',
            'price_avg_t(j-3)', 'weather_t(j)', 'weather_t(j-1)', 'weather_t(j-2)', 'weather_t(j-3)', 
            'pm_t(j)','pm_t(j-1)','pm_t(j-2)','pm_t(j-3)','pm_avg', 'tj_1_j', 'tj_2_j','tj_3_j','tj_4_j',
            'tj_1_j_1', 'tj_2_j_1','tj_3_j_1','tj_4_j_1', 'tj_1_j_2', 'tj_2_j_2','tj_3_j_2','tj_4_j_2',
            'tj_1_j_3', 'tj_2_j_3','tj_3_j_3','tj_4_j_3','day_of_week']

In [10]:
feature_numpy_training = data_training_training[features].to_numpy().astype(float)

In [11]:
#feature_numpy_training

In [12]:
target_numpy_training = data_training_training['log_gap_t(j)'].to_numpy().astype(float)

In [13]:
feature_numpy_validation = data_training_validation[features].to_numpy().astype(float)

In [14]:
target_numpy_validation = data_training_validation['log_gap_t(j)'].to_numpy().astype(float)

### Explore hyperparameters

In [15]:
'''
# define parameters to search

n_estimators = [5,10,20,30,40,50,60,70,80,90,100,150,200,300,400,500]
max_depth = [2,3,4,5,6,7,8]
min_samples_split = [1,2,3,4]

n_estimators = [5,10]
max_depth = [2,3]
min_samples_split = [1,2]
'''

'\n# define parameters to search\n\nn_estimators = [5,10,20,30,40,50,60,70,80,90,100,150,200,300,400,500]\nmax_depth = [2,3,4,5,6,7,8]\nmin_samples_split = [1,2,3,4]\n\nn_estimators = [5,10]\nmax_depth = [2,3]\nmin_samples_split = [1,2]\n'

In [16]:
'''
output_file = open("hyperparamters_test_result",'w')
output_file.write("n_estimators,max_depth,min_samples_split,mae,rmse\n")
output_file.close()

for i in range(len(n_estimators)):
    for j in range(len(max_depth)):
        for k in range(len(min_samples_split)):
            print i, j, k
            params = {'n_estimators':n_estimators[i], 'max_depth': max_depth[j], 'min_samples_split': min_samples_split[k], 
                      'verbose': False, 'learning_rate': 0.01, 'loss': 'quantile', 'alpha': 0.5}
            model = ensemble.GradientBoostingRegressor(**params)
            model.fit(feature_numpy_training, target_numpy_training)
            validation_prediction = model.predict(feature_numpy_validation).tolist()
            validation_target = target_numpy_validation.tolist()
            (sum, count, mae) = mape(validation_target,validation_prediction)
            rmse = np.sqrt(mean_squared_error(validation_target,validation_prediction))
            output_file = open("hyperparamters_test_result",'a')
            output_file.write(str(n_estimators[i]) + ',' + str(max_depth[j]) + ',' + str(min_samples_split[k]) + \
                              ',' + str(mae) + ',' + str(rmse) + '\n')
            output_file.close()
            
 '''           

'\noutput_file = open("hyperparamters_test_result",\'w\')\noutput_file.write("n_estimators,max_depth,min_samples_split,mae,rmse\n")\noutput_file.close()\n\nfor i in range(len(n_estimators)):\n    for j in range(len(max_depth)):\n        for k in range(len(min_samples_split)):\n            print i, j, k\n            params = {\'n_estimators\':n_estimators[i], \'max_depth\': max_depth[j], \'min_samples_split\': min_samples_split[k], \n                      \'verbose\': False, \'learning_rate\': 0.01, \'loss\': \'quantile\', \'alpha\': 0.5}\n            model = ensemble.GradientBoostingRegressor(**params)\n            model.fit(feature_numpy_training, target_numpy_training)\n            validation_prediction = model.predict(feature_numpy_validation).tolist()\n            validation_target = target_numpy_validation.tolist()\n            (sum, count, mae) = mape(validation_target,validation_prediction)\n            rmse = np.sqrt(mean_squared_error(validation_target,validation_prediction))\

### Train the model

In [17]:
# Least Absolute Deviation (LAD) regression

# Fit regression model
params = {'n_estimators':100, 'max_depth': 4, 'min_samples_split': 1, 'verbose': True,
          'learning_rate': 0.01, 'loss': 'quantile', 'alpha': 0.5}
model_lad = ensemble.GradientBoostingRegressor(**params)

model_lad.fit(feature_numpy_training, target_numpy_training)


      Iter       Train Loss   Remaining Time 
         1           0.0329            2.29m
         2           0.0331            2.31m
         3           0.0331            2.33m
         4           0.0331            2.32m
         5           0.0331            2.30m
         6           0.0331            2.27m
         7           0.0331            2.24m
         8           0.0331            2.21m
         9           0.0332            2.18m
        10           0.0332            2.15m
        20           0.0333            1.92m
        30           0.0330            1.67m
        40           0.0324            1.43m
        50           0.0320            1.19m
        60           0.0313           56.74s
        70           0.0307           42.30s
        80           0.0301           28.05s
        90           0.0296           14.01s
       100           0.0290            0.00s


GradientBoostingRegressor(alpha=0.5, init=None, learning_rate=0.01,
             loss='quantile', max_depth=4, max_features=None,
             max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=1,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0,
             verbose=True, warm_start=False)

In [18]:
# Least Squares (LS) regression

# Fit regression model
params = {'n_estimators':1, 'max_depth': 4, 'min_samples_split': 1, 'verbose': True,
          'learning_rate': 0.01, 'loss': 'ls'}
model_ls = ensemble.GradientBoostingRegressor(**params)

model_ls.fit(feature_numpy_training, target_numpy_training)


      Iter       Train Loss   Remaining Time 
         1           0.2414            0.00s


GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.01, loss='ls',
             max_depth=4, max_features=None, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=1,
             min_weight_fraction_leaf=0.0, n_estimators=1, presort='auto',
             random_state=None, subsample=1.0, verbose=True,
             warm_start=False)

In [19]:
# Graphlab model
model_gl = graphlab.boosted_trees_regression.create(data_training_training, features=features, target='log_gap_t(j)', 
                                                    max_iterations = 100,
                                                    max_depth = 9, random_seed = 1)

PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.



In [20]:
# predict gap_delta to predict gap
gap_delta_numpy_training = data_training_training['gap_delta'].to_numpy().astype(float)
gap_delta_numpy_validation = data_training_validation['gap_delta'].to_numpy().astype(float)


# Least Absolute Deviation (LAD) regression

# Fit regression model
params = {'n_estimators':100, 'max_depth': 4, 'min_samples_split': 1, 'verbose': True,
          'learning_rate': 0.01, 'loss': 'quantile', 'alpha': 0.5}
model_delta_lad = ensemble.GradientBoostingRegressor(**params)

model_delta_lad.fit(feature_numpy_training, gap_delta_numpy_training)

      Iter       Train Loss   Remaining Time 
         1           0.0039            2.18m
         2           0.0057            2.12m
         3           0.0076            2.10m
         4           0.0096            2.08m
         5           0.0114            2.06m
         6           0.0131            2.04m
         7           0.0149            2.01m
         8           0.0167            1.99m
         9           0.0185            1.97m
        10           0.0202            2.00m
        20           0.0368            1.78m
        30           0.0518            1.54m
        40           0.0651            1.31m
        50           0.0770            1.09m
        60           0.0878           52.03s
        70           0.0973           38.95s
        80           0.1067           26.00s
        90           0.1160           13.01s
       100           0.1243            0.00s


GradientBoostingRegressor(alpha=0.5, init=None, learning_rate=0.01,
             loss='quantile', max_depth=4, max_features=None,
             max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=1,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0,
             verbose=True, warm_start=False)

### Model validation and evaluation

* LAD model

In [21]:

validation_prediction_lad_temp = model_lad.predict(feature_numpy_validation).tolist()
validation_prediction_lad = [math.pow(10,i) - 1.0 for i in validation_prediction_lad_temp]
#print validation_prediction[0:100]


In [22]:
validation_target = list(data_training_validation['gap_t(j)'])
#print validation_target[0:100]

In [23]:
model_comparison = graphlab.SFrame()
model_comparison['target_value'] = validation_target
model_comparison['model_lad_unmodified'] = validation_prediction_lad

In [24]:
# prediction_modification
validation_prediction_lad_modified = prediction_modification(validation_prediction_lad)
#print validation_prediction_modified[0:100]

In [25]:
(sum, count, mape_lad) = mape(validation_target,validation_prediction_lad_modified)

In [26]:
print "mape_lad = %f" %mape_lad

mape_lad = 0.436144


In [27]:
rmse_lad = np.sqrt(mean_squared_error(validation_target,validation_prediction_lad_modified))

In [28]:
print "rmse_lad = %f" %rmse_lad

rmse_lad = 36.753885


In [29]:
model_comparison['model_lad_modified'] = validation_prediction_lad_modified

* LS model

In [30]:
validation_prediction_ls = model_ls.predict(feature_numpy_validation).tolist()

In [31]:
#model_comparison['model_ls_unmodified'] = validation_prediction_ls

In [32]:
validation_prediction_ls_modified = prediction_modification(validation_prediction_ls)

In [33]:
(sum, count, mape_ls) = mape(validation_target,validation_prediction_ls_modified)

In [34]:
print "mape_ls = %f" %mape_ls

mape_ls = 0.511959


In [35]:
rmse_ls = np.sqrt(mean_squared_error(validation_target,validation_prediction_ls_modified))

In [36]:
print "rmse_ls = %f" %rmse_ls

rmse_ls = 38.614115


In [37]:

#model_comparison['model_ls_modified'] = validation_prediction_ls_modified

* Graphlab model

In [38]:
validation_prediction_gl_temp = list(model_gl.predict(data_training_validation))

validation_prediction_gl = [math.pow(10,i) - 1.0 for i in validation_prediction_gl_temp]


In [39]:
model_comparison['model_gl_unmodified'] = validation_prediction_gl

In [40]:
validation_prediction_gl_modified = prediction_modification(validation_prediction_gl)

In [41]:
(sum, count, mape_gl) = mape(validation_target,validation_prediction_gl_modified)

In [42]:
print "mape_gl = %f" %mape_gl

mape_gl = 0.452113


In [43]:
rmse_gl = np.sqrt(mean_squared_error(validation_target,validation_prediction_gl_modified))

In [44]:
print "rmse_gl = %f" %rmse_gl

rmse_gl = 13.081696


In [45]:

model_comparison['model_gl_modified'] = validation_prediction_gl_modified

* Predict gap_delta

In [46]:
validation_prediction_delta_lad = (model_delta_lad.predict(feature_numpy_validation) + data_training_validation['gap_t(j-1)'].to_numpy().astype(float)).tolist()



In [47]:
model_comparison['model_delta_lad_unmodified'] = validation_prediction_delta_lad

In [48]:
validation_prediction_delta_modified = prediction_modification(validation_prediction_delta_lad)

In [49]:
(sum, count, mape_delta_lad) = mape(validation_target,validation_prediction_delta_modified)

In [50]:
print "mape_delta_lad = %f" %mape_delta_lad

mape_delta_lad = 0.565263


In [51]:
rmse_delta_lad = np.sqrt(mean_squared_error(validation_target,validation_prediction_delta_modified))

In [52]:
print "rmse_delta_lad = %f" %rmse_delta_lad

rmse_delta_lad = 15.590565


In [53]:
model_comparison['model_delta_lad_modified'] = validation_prediction_delta_modified

* Export data

In [54]:
model_comparison.export_csv("model_comparison.csv")

In [55]:
#model_comparison

In [56]:
################################################################################

### New boosting: integrate Graphlab rmse model and Scikit lad model
* Low rmse is used to fit big gap numbers
* Low lad is used to fit most of the numbers (here small gap numbers)
* Ask machine to learn how to choose different models

In [57]:
data_boosting = graphlab.SFrame()
data_boosting = data_training_validation['start_district_id', 'time_slot_id', 'gap_delta_1','gap_delta_2','gap_delta_3',
                                         'gap_t(j)', 'gap_t(j-1)', 'gap_t(j-2)', 'gap_t(j-3)', 'day_of_week']
data_boosting['gl_ls_predicted'] = validation_prediction_gl_modified
data_boosting['scikit_lad_predicted'] = validation_prediction_lad_modified
data_boosting['scikit_lad_predicted_delta_method'] = validation_prediction_delta_modified

data_boosting['S1'] = data_boosting['gl_ls_predicted'] - data_boosting['scikit_lad_predicted']
data_boosting['S2'] = data_boosting['gl_ls_predicted'] - data_boosting['scikit_lad_predicted_delta_method']
data_boosting['S3'] = data_boosting['scikit_lad_predicted'] - data_boosting['scikit_lad_predicted_delta_method']

data_boosting['D1'] = data_boosting['gl_ls_predicted'] / data_boosting['scikit_lad_predicted']
data_boosting['D2'] = data_boosting['gl_ls_predicted'] / data_boosting['scikit_lad_predicted_delta_method']
data_boosting['D3'] = data_boosting['scikit_lad_predicted'] / data_boosting['scikit_lad_predicted_delta_method']

features_boosting = ['time_slot_id','gap_t(j-1)', 'gap_t(j-2)', 'gap_t(j-3)','gap_delta_1','gap_delta_2','gap_delta_3',
                     'S1','S2','S3','D1','D2','D3',
                     'gl_ls_predicted', 'scikit_lad_predicted', 'scikit_lad_predicted_delta_method']

data_boosting_training, data_boosting_validation = data_boosting.random_split(.9, seed=1)
#data_boosting

In [58]:
params = {'n_estimators':100, 'max_depth': 4, 'min_samples_split': 1, 'verbose': True,
          'learning_rate': 0.01, 'loss': 'quantile', 'alpha': 0.5}
model_boosting = ensemble.GradientBoostingRegressor(**params)

feature_numpy_boosting_training = data_boosting_training[features_boosting].to_numpy().astype(float)
target_numpy_boosting_training = data_boosting_training['gap_t(j)'].to_numpy().astype(float)

feature_numpy_boosting_validation = data_boosting_validation[features_boosting].to_numpy().astype(float)
target_numpy_boosting_validation = data_boosting_validation['gap_t(j)'].to_numpy().astype(float)


model_boosting.fit(feature_numpy_boosting_training, target_numpy_boosting_training)

      Iter       Train Loss   Remaining Time 
         1           3.0696            3.27s
         2           3.0629            3.28s
         3           3.0561            3.20s
         4           3.0494            3.14s
         5           3.0427            3.12s
         6           3.0361            3.07s
         7           3.0296            3.27s
         8           3.0231            3.36s
         9           3.0167            3.29s
        10           3.0069            3.22s
        20           2.8893            2.84s
        30           2.7800            2.53s
        40           2.6482            2.21s
        50           2.5039            1.92s
        60           2.3595            1.57s
        70           2.2240            1.17s
        80           2.1010            0.80s
        90           1.9897            0.40s
       100           1.8803            0.00s


GradientBoostingRegressor(alpha=0.5, init=None, learning_rate=0.01,
             loss='quantile', max_depth=4, max_features=None,
             max_leaf_nodes=None, min_samples_leaf=1, min_samples_split=1,
             min_weight_fraction_leaf=0.0, n_estimators=100,
             presort='auto', random_state=None, subsample=1.0,
             verbose=True, warm_start=False)

In [59]:
'''
model_boosting = graphlab.boosted_trees_regression.create(data_boosting, features = features_boosting, target='gap_t(j)', 
                                                          max_iterations = 100,
                                                          max_depth = 9, random_seed = 1)
'''

"\nmodel_boosting = graphlab.boosted_trees_regression.create(data_boosting, features = features_boosting, target='gap_t(j)', \n                                                          max_iterations = 100,\n                                                          max_depth = 9, random_seed = 1)\n"

In [60]:
validation_prediction_boosting = model_boosting.predict(feature_numpy_boosting_validation).tolist()
#validation_prediction_boosting = list(model_boosting.predict(data_training_validation))

In [61]:
validation_prediction_boosting_modified = prediction_modification(validation_prediction_boosting)

In [62]:
(sum, count, mape_boosting) = mape(target_numpy_boosting_validation,validation_prediction_boosting_modified)

In [63]:
rmse_boosting = np.sqrt(mean_squared_error(target_numpy_boosting_validation,validation_prediction_boosting_modified))

In [64]:
print "mape_boosting = %f" %mape_boosting
print "rmse_boosting = %f" %rmse_boosting

mape_boosting = 0.402721
rmse_boosting = 25.972123


In [65]:
data_boosting_validation['prediction_boosting'] = validation_prediction_boosting
data_boosting_validation['gap_t(j)','gap_t(j-1)','gl_ls_predicted','scikit_lad_predicted', 
                         'scikit_lad_predicted_delta_method', 'prediction_boosting']

data_boosting_validation['gap_t(j)','gap_t(j-1)', 'gl_ls_predicted','scikit_lad_predicted', 
                         'scikit_lad_predicted_delta_method','prediction_boosting'][480:550]

gap_t(j),gap_t(j-1),gl_ls_predicted,scikit_lad_predicted,scikit_lad_predicted_delt a_method ...,prediction_boosting
96,73,108.517205889,15.7204397024,72.0109017648,70.1804162718
8,8,7.47599182851,4.27608158363,7.26077668804,5.91576001871
11,12,16.243311135,5.79672590875,11.260776688,10.418550267
32,17,21.1649079332,7.31560482892,16.260776688,12.5512990302
0,0,1.0,1.0,1.0,1.0
1,0,1.0,1.0,1.0,1.0
0,2,1.0,1.0,1.93427309565,1.0
0,1,1.0,1.0,1.0,1.0
1,0,1.0,1.0,1.0,1.0
3,2,1.77083434302,1.00386235486,1.91572667925,1.1390887206


In [66]:
#data_boosting_validation['gap_t(j)', 'scikit_lad_predicted_delta_method','prediction_boosting'].print_rows(num_rows=100)

### Conditional Combination

In [67]:
'''
# search for threhold
for i in range(1, int(max(validation_prediction_lad_modified)) + 1):
    prediction_conditional_combination = []
    for j in range(len(validation_prediction_lad_modified)):
        if validation_prediction_lad_modified[j] <= i:
            prediction_conditional_combination.append(validation_prediction_lad_modified[j])
        else:
            prediction_conditional_combination.append(validation_prediction_gl_modified[j])
    (sum, count, mape_conditional_combination) = mape(validation_target, prediction_conditional_combination)
    print mape_conditional_combination
'''

'\n# search for threhold\nfor i in range(1, int(max(validation_prediction_lad_modified)) + 1):\n    prediction_conditional_combination = []\n    for j in range(len(validation_prediction_lad_modified)):\n        if validation_prediction_lad_modified[j] <= i:\n            prediction_conditional_combination.append(validation_prediction_lad_modified[j])\n        else:\n            prediction_conditional_combination.append(validation_prediction_gl_modified[j])\n    (sum, count, mape_conditional_combination) = mape(validation_target, prediction_conditional_combination)\n    print mape_conditional_combination\n'

In [68]:
'''
# search for threhold
mape_smallest = 1
i_corresponding = 1
for i in range(1, int(max(validation_prediction_delta_modified)) + 1):
    prediction_conditional_combination = []
    for j in range(len(validation_prediction_delta_modified)):
        if validation_prediction_delta_modified[j] >= i:
            prediction_conditional_combination.append(validation_prediction_delta_modified[j])
        else:
            prediction_conditional_combination.append(validation_prediction_lad_modified[j])
    (sum, count, mape_conditional_combination) = mape(validation_target, prediction_conditional_combination)
    if mape_conditional_combination < mape_smallest:
        mape_smallest = mape_conditional_combination
        i_corresponding = i
print mape_smallest
print i_corresponding

prediction_conditional_combination = []
for j in range(len(validation_prediction_delta_modified)):
    if validation_prediction_delta_modified[j] >= i_corresponding:
        prediction_conditional_combination.append(validation_prediction_delta_modified[j])
    else:
        prediction_conditional_combination.append(validation_prediction_lad_modified[j])
    
data_boosting['prediction_conditional_combination'] = prediction_conditional_combination
'''

"\n# search for threhold\nmape_smallest = 1\ni_corresponding = 1\nfor i in range(1, int(max(validation_prediction_delta_modified)) + 1):\n    prediction_conditional_combination = []\n    for j in range(len(validation_prediction_delta_modified)):\n        if validation_prediction_delta_modified[j] >= i:\n            prediction_conditional_combination.append(validation_prediction_delta_modified[j])\n        else:\n            prediction_conditional_combination.append(validation_prediction_lad_modified[j])\n    (sum, count, mape_conditional_combination) = mape(validation_target, prediction_conditional_combination)\n    if mape_conditional_combination < mape_smallest:\n        mape_smallest = mape_conditional_combination\n        i_corresponding = i\nprint mape_smallest\nprint i_corresponding\n\nprediction_conditional_combination = []\nfor j in range(len(validation_prediction_delta_modified)):\n    if validation_prediction_delta_modified[j] >= i_corresponding:\n        prediction_condition

In [69]:
'''
data_boosting['gap_t(j)','gl_ls_predicted','scikit_lad_predicted','prediction_boosting', 'prediction_conditional_combination']\
.export_csv("model_comparison_2.csv")
'''

'\ndata_boosting[\'gap_t(j)\',\'gl_ls_predicted\',\'scikit_lad_predicted\',\'prediction_boosting\', \'prediction_conditional_combination\'].export_csv("model_comparison_2.csv")\n'

### Model prediction

In [70]:
prediction_items = []
fhand = open("read_me_1.txt")
for line in fhand:
    line_splitted = line.strip().split('-')
    prediction_items.append(((line_splitted[0] + '-' + line_splitted[1] + '-' + line_splitted[2]), line_splitted[3]))
fhand.close()
#prediction_items

In [71]:
data_test_set_1_filtered = graphlab.SFrame()
for (date, time_slot_id) in prediction_items:
    data_test_set_1_filtered = data_test_set_1_filtered.append(
    data_test_set_1[(data_test_set_1['date'] == date) & (data_test_set_1['time_slot_id'] == int(time_slot_id))])

In [72]:
feature_numpy_test = data_test_set_1_filtered[features].to_numpy().astype(float)

In [73]:
# lad model prediction
test_prediction_lad_temp_1 = model_lad.predict(feature_numpy_test).tolist()
test_prediction_lad_temp_2 = [math.pow(10,i) - 1.0 for i in test_prediction_lad_temp_1]
test_prediction_lad = prediction_modification(test_prediction_lad_temp_2)


# graphlab model prediction
test_prediction_gl_temp_1 = list(model_gl.predict(data_test_set_1_filtered))
test_prediction_gl_temp_2 = [math.pow(10,i) - 1.0 for i in test_prediction_gl_temp_1]
test_prediction_gl = prediction_modification(test_prediction_gl_temp_2)

# gap_delta model prediction
test_prediction_delta_lad = prediction_modification\
((model_delta_lad.predict(feature_numpy_test) + data_test_set_1_filtered['gap_t(j-1)'].to_numpy().astype(float)).tolist())


test_data_boosting = graphlab.SFrame()
test_data_boosting = data_test_set_1_filtered['start_district_id', 'time_slot_id', 'gap_delta_1','gap_delta_2','gap_delta_3',
                                              'gap_t(j)', 'gap_t(j-1)', 'gap_t(j-2)', 'gap_t(j-3)', 'day_of_week']

test_data_boosting['gl_ls_predicted'] = test_prediction_gl
test_data_boosting['scikit_lad_predicted'] = test_prediction_lad
test_data_boosting['scikit_lad_predicted_delta_method'] = test_prediction_delta_lad

test_data_boosting['S1'] = test_data_boosting['gl_ls_predicted'] - test_data_boosting['scikit_lad_predicted']
test_data_boosting['S2'] = test_data_boosting['gl_ls_predicted'] - test_data_boosting['scikit_lad_predicted_delta_method']
test_data_boosting['S3'] = test_data_boosting['scikit_lad_predicted'] - test_data_boosting['scikit_lad_predicted_delta_method']

test_data_boosting['D1'] = test_data_boosting['gl_ls_predicted'] / test_data_boosting['scikit_lad_predicted']
test_data_boosting['D2'] = test_data_boosting['gl_ls_predicted'] / test_data_boosting['scikit_lad_predicted_delta_method']
test_data_boosting['D3'] = test_data_boosting['scikit_lad_predicted'] / test_data_boosting['scikit_lad_predicted_delta_method']

features_boosting = ['time_slot_id','gap_t(j-1)', 'gap_t(j-2)', 'gap_t(j-3)','gap_delta_1','gap_delta_2','gap_delta_3',
                     'S1','S2','S3','D1','D2','D3',
                     'gl_ls_predicted', 'scikit_lad_predicted', 'scikit_lad_predicted_delta_method']

feature_numpy_boosting_test = test_data_boosting[features_boosting].to_numpy().astype(float)

In [74]:
test_prediction_boosting = model_boosting.predict(feature_numpy_boosting_test).tolist()


In [75]:
test_prediction_modified = prediction_modification(test_prediction_boosting)

# test_prediction_modified[0:100]

In [76]:
data_test_set_1_filtered['prediction'] = test_prediction_modified

In [77]:
#data_test_set_1_filtered

### Make submission file

In [78]:
def make_submission(result, filename='submission.txt'):
    output_file = open(filename,'w')
    for row in data_test_set_1_filtered:
        output_file.write(str(row['start_district_id']) + ',' + row['date'] + '-' \
                          + str(row['time_slot_id']) + ',' + str(row['prediction']) + '\n')
    output_file.close()

In [79]:
make_submission(data_test_set_1_filtered)