* This script is based on the script from 20160603
* The purpose of this script today is to study the combination of different models

In [6]:
import numpy as np
import matplotlib.pyplot as plt

import graphlab
import datetime
import math

from sklearn.ensemble import GradientBoostingRegressor
from sklearn import ensemble
from sklearn import datasets
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error

%matplotlib inline

np.random.seed(1)

### Define functions

In [7]:
# datetime.date format: year, month, day
def date_to_day(date):
    """Convert a dash-separated 'year-month-day' string to a weekday number.

    Monday maps to 1 and Sunday to 7. Only the first three dash-separated
    fields are read; anything after them is ignored.
    """
    fields = date.strip().split('-')
    year, month, day = int(fields[0]), int(fields[1]), int(fields[2])
    # isoweekday() is weekday() + 1, i.e. Monday == 1.
    return datetime.date(year, month, day).isoweekday()

In [8]:
# define MAPE evaluation function
def mape(result, result_predicted):
    """Mean absolute percentage error over the entries whose true value is positive.

    Parameters
    ----------
    result : iterable of numbers
        True values. Entries that are not strictly positive are skipped.
    result_predicted : iterable of numbers
        Predictions, paired with ``result`` by position.

    Returns
    -------
    tuple
        ``(total, count, total / count)`` where ``total`` is the sum of the
        absolute relative errors and ``count`` the number of entries used.
        The third element is NaN when no entry had a positive true value.
    """
    total = 0.0
    count = 0
    for actual, predicted in zip(result, result_predicted):
        if actual > 0:
            count += 1
            # float() forces true division; two ints would floor-divide under Python 2.
            total += math.fabs((actual - predicted) / float(actual))
    if count == 0:
        # Nothing to average: report NaN instead of raising ZeroDivisionError.
        return (total, count, float('nan'))
    return (total, count, total / count)

In [9]:
# define post_prediction_modification to predicted data
# set all the predicted gap that is less than 1 to 1
def prediction_modification(prediction, minimum=1.0):
    """Return a new list in which every value below ``minimum`` is raised to ``minimum``.

    Parameters
    ----------
    prediction : iterable of numbers
        Predicted values. The input is left untouched, so the caller's
        unmodified predictions stay available after the call.
    minimum : float, optional
        Lower bound applied to every value (default 1.0).

    Returns
    -------
    list
        The clamped values, in the same order as ``prediction``.
    """
    return [minimum if value < minimum else value for value in prediction]

### Load data

In [10]:
# SFrame.read_csv function to read csv files
# The training and test CSVs are read with the same per-column type hints,
# so the list is written once and passed to both calls.
csv_column_types = [
    str, str, long, long, long, long, long, float, long, long, long, long,
    float, float, float, float, float, str, str, str, str, long, long,
    long, long, float, long, long, long, long, float, long, long, long, long,
    long, long, long, long, long, long, long, long, long, long, long, long,
]
data_training = graphlab.SFrame.read_csv("data-all_training.csv",
                                         column_type_hints=csv_column_types)
data_test_set_1 = graphlab.SFrame.read_csv("data-all_test_set_1.csv",
                                           column_type_hints=csv_column_types)

GraphLab Create requires a license to use. To get a non-commercial  license for academic use only, visit https://turi.com/register.



InvalidProductKey: Product key not found.

In [None]:
# Add the same derived columns to the training and the test frame.
for frame in (data_training, data_test_set_1):
    # Weekday number stored as a string.
    frame['day_of_week'] = frame['date'].apply(lambda x: str(date_to_day(x)))

    # Differences between the gap at consecutive (and two-apart) time slots.
    frame['gap_delta'] = frame['gap_t(j)'] - frame['gap_t(j-1)']
    frame['gap_delta_1'] = frame['gap_t(j-1)'] - frame['gap_t(j-2)']
    frame['gap_delta_2'] = frame['gap_t(j-2)'] - frame['gap_t(j-3)']
    frame['gap_delta_3'] = frame['gap_t(j-1)'] - frame['gap_t(j-3)']

# delete data at time_slot_id 1, 2 and 3
data_training = data_training[(data_training['time_slot_id'] > 3)]

In [None]:
# Split the filtered training data in two with random_split(.9, seed=8):
# the first part is used for fitting, the second for validation.
data_training_training, data_training_validation = data_training.random_split(.9, seed=8)

In [None]:
# Input columns used by every model below. The target 'gap_delta' and the
# current gap 'gap_t(j)' are not part of the list.
features = ['start_district_id', 'time_slot_id', 'gap_t(j-1)', 'gap_t(j-2)', 'gap_t(j-3)', 'gap_averaged',
            'gap_delta_1','gap_delta_2','gap_delta_3',
            'order_t(j-1)', 'order_t(j-2)', 'order_t(j-3)', 'order_averaged', 'price_avg_t(j-1)', 'price_avg_t(j-2)',
            'price_avg_t(j-3)', 'weather_t(j)', 'weather_t(j-1)', 'weather_t(j-2)', 'weather_t(j-3)', 
            'pm_t(j)','pm_t(j-1)','pm_t(j-2)','pm_t(j-3)','pm_avg', 'tj_1_j', 'tj_2_j','tj_3_j','tj_4_j',
            'tj_1_j_1', 'tj_2_j_1','tj_3_j_1','tj_4_j_1', 'tj_1_j_2', 'tj_2_j_2','tj_3_j_2','tj_4_j_2',
            'tj_1_j_3', 'tj_2_j_3','tj_3_j_3','tj_4_j_3','day_of_week']

In [None]:
# Feature matrix for fitting the scikit-learn models, cast to float.
feature_numpy_training = data_training_training[features].to_numpy().astype(float)

In [None]:
# Display the number of rows in the training feature matrix.
len(feature_numpy_training)

In [None]:
# Regression target for training: the 'gap_delta' column, cast to float.
target_numpy_training = data_training_training['gap_delta'].to_numpy().astype(float)

In [None]:
# Feature matrix for the validation split, built from the same feature columns.
feature_numpy_validation = data_training_validation[features].to_numpy().astype(float)

In [None]:
# Validation target: the 'gap_delta' column of the validation split, cast to float.
target_numpy_validation = data_training_validation['gap_delta'].to_numpy().astype(float)

### Train the model

In [None]:
# Least Absolute Deviation (LAD) regression

# Fit regression model
# Quantile loss with alpha=0.5 fits the median, which is the LAD objective.
# min_samples_split is 2 rather than 1: current scikit-learn rejects integer
# values below 2, and a node holding a single sample cannot be split anyway.
params = {'n_estimators':100, 'max_depth': 4, 'min_samples_split': 2, 'verbose': True,
          'learning_rate': 0.01, 'loss': 'quantile', 'alpha': 0.5}
model_lad = ensemble.GradientBoostingRegressor(**params)

model_lad.fit(feature_numpy_training, target_numpy_training)


In [None]:
# Least Squares (LS) regression

# Fit regression model
# min_samples_split is 2 rather than 1: current scikit-learn rejects integer
# values below 2, and a node holding a single sample cannot be split anyway.
# NOTE(review): n_estimators is 1 here versus 100 for the LAD model; with
# learning_rate 0.01 that is a single small boosting step. This looks like a
# leftover from a quick run -- confirm before relying on the LS numbers.
params = {'n_estimators':1, 'max_depth': 4, 'min_samples_split': 2, 'verbose': True,
          'learning_rate': 0.01, 'loss': 'ls'}
model_ls = ensemble.GradientBoostingRegressor(**params)

model_ls.fit(feature_numpy_training, target_numpy_training)


In [None]:
# Graphlab model
# Boosted-trees regressor fit directly on the training SFrame, using the same
# feature list and the same 'gap_delta' target as the scikit-learn models.
model_gl = graphlab.boosted_trees_regression.create(data_training_training, features=features, target='gap_delta', 
                                                    max_iterations = 100,
                                                    max_depth = 9, random_seed = 1)

### Model validation and evaluation

* LAD model

In [None]:
# Display the row count of the validation split.
len(data_training_validation['gap_t(j-1)'])

In [None]:
# Display the number of LAD predictions produced for the validation features.
len(model_lad.predict(feature_numpy_validation))

In [None]:
# The model was fit on 'gap_delta' (gap_t(j) - gap_t(j-1)), so gap_t(j-1) is
# added back to turn the predicted delta into a predicted gap.
validation_prediction_lad = (model_lad.predict(feature_numpy_validation) + data_training_validation['gap_t(j-1)'].to_numpy().astype(float)).tolist()
#print validation_prediction[0:100]

In [None]:
# Rebuild the true gap from the delta target by adding gap_t(j-1) back.
validation_target = (target_numpy_validation + data_training_validation['gap_t(j-1)'].to_numpy().astype(float)).tolist()
#print validation_target[0:100]

In [None]:
# Table that gathers the true gap and each model's predictions side by side.
model_comparison = graphlab.SFrame()
model_comparison['target_value'] = validation_target
model_comparison['model_lad_unmodified'] = validation_prediction_lad

In [None]:
# prediction_modification
# Raise every LAD prediction below 1 to 1.
validation_prediction_lad_modified = prediction_modification(validation_prediction_lad)
#print validation_prediction_modified[0:100]

In [None]:
# Keep only the ratio (third element); unpacking into `sum` and `count` would
# rebind the builtin `sum` for the rest of the notebook.
mape_lad = mape(validation_target, validation_prediction_lad_modified)[2]

In [None]:
# Report the MAPE of the LAD model.
print("mape_lad = %f" % mape_lad)

In [None]:
# Root-mean-squared error of the clamped LAD predictions against the true gap.
rmse_lad = np.sqrt(mean_squared_error(validation_target,validation_prediction_lad_modified))

In [None]:
# Report the RMSE of the LAD model.
print("rmse_lad = %f" % rmse_lad)

In [None]:
# Store the clamped LAD predictions in the comparison table.
model_comparison['model_lad_modified'] = validation_prediction_lad_modified

* LS model

In [None]:
# model_ls was fit on 'gap_delta', so its output is a change relative to
# gap_t(j-1). Add gap_t(j-1) back so the prediction is an absolute gap that can
# be compared with validation_target, as is done for the LAD and Graphlab models.
validation_prediction_ls = (model_ls.predict(feature_numpy_validation) + data_training_validation['gap_t(j-1)'].to_numpy().astype(float)).tolist()

In [None]:
# Store the LS predictions in the comparison table before clamping.
model_comparison['model_ls_unmodified'] = validation_prediction_ls

In [None]:
# Raise every LS prediction below 1 to 1.
validation_prediction_ls_modified = prediction_modification(validation_prediction_ls)

In [None]:
# Keep only the ratio (third element); unpacking into `sum` and `count` would
# rebind the builtin `sum` for the rest of the notebook.
mape_ls = mape(validation_target, validation_prediction_ls_modified)[2]

In [None]:
# Report the MAPE of the LS model.
print("mape_ls = %f" % mape_ls)

In [None]:
# Root-mean-squared error of the clamped LS predictions against the true gap.
rmse_ls = np.sqrt(mean_squared_error(validation_target,validation_prediction_ls_modified))

In [None]:
# Report the RMSE of the LS model.
print("rmse_ls = %f" % rmse_ls)

In [None]:

# Store the clamped LS predictions in the comparison table.
model_comparison['model_ls_modified'] = validation_prediction_ls_modified

* Graphlab model

In [None]:
# The Graphlab model also predicts 'gap_delta'; adding gap_t(j-1) gives the predicted gap.
validation_prediction_gl = list(model_gl.predict(data_training_validation) + data_training_validation['gap_t(j-1)'])

In [None]:
# Store the Graphlab predictions in the comparison table before clamping.
model_comparison['model_gl_unmodified'] = validation_prediction_gl

In [None]:
# Raise every Graphlab prediction below 1 to 1.
validation_prediction_gl_modified = prediction_modification(validation_prediction_gl)

In [None]:
# Keep only the ratio (third element); unpacking into `sum` and `count` would
# rebind the builtin `sum` for the rest of the notebook.
mape_gl = mape(validation_target, validation_prediction_gl_modified)[2]

In [None]:
# Report the MAPE of the Graphlab model.
print("mape_gl = %f" % mape_gl)

In [None]:
# Root-mean-squared error of the clamped Graphlab predictions against the true gap.
rmse_gl = np.sqrt(mean_squared_error(validation_target,validation_prediction_gl_modified))

In [None]:
# Report the RMSE of the Graphlab model.
print("rmse_gl = %f" % rmse_gl)

In [None]:
# Store the clamped Graphlab predictions in the comparison table.
model_comparison['model_gl_modified'] = validation_prediction_gl_modified

* Export data

In [None]:
# Write the comparison table (target plus every model's predictions) to CSV.
model_comparison.export_csv("model_comparison.csv")

### New boosting: integrate Graphlab rmse model and Scikit lad model
* Low rmse is used to fit big gap numbers
* Low lad is used to fit most of the numbers (here small gap numbers)
* Ask machine to learn how to choose different models

In [None]:
# Data set for the second-stage model: a few context columns from the
# validation split plus the clamped predictions of the two base models.
# (The former empty-SFrame initialisation was overwritten immediately and is dropped.)
data_boosting = data_training_validation['start_district_id', 'time_slot_id',
                                         'gap_t(j)', 'gap_t(j-1)', 'gap_t(j-2)', 'gap_t(j-3)', 'day_of_week']
data_boosting['gl_ls_predicted'] = validation_prediction_gl_modified
data_boosting['scikit_lad_predicted'] = validation_prediction_lad_modified
# Only the two base-model predictions are used as inputs of the second stage.
features_boosting = ['gl_ls_predicted', 'scikit_lad_predicted']

In [None]:
# Second-stage model: learn the true gap from the two base-model predictions.
# min_samples_split is 2 rather than 1: current scikit-learn rejects integer
# values below 2, and a node holding a single sample cannot be split anyway.
params = {'n_estimators':100, 'max_depth': 4, 'min_samples_split': 2, 'verbose': True,
          'learning_rate': 0.01, 'loss': 'quantile', 'alpha': 0.5}
model_boosting = ensemble.GradientBoostingRegressor(**params)

feature_numpy_boosting = data_boosting[features_boosting].to_numpy().astype(float)
target_numpy_boosting = data_boosting['gap_t(j)'].to_numpy().astype(float)

# NOTE(review): data_boosting is built from the validation split, so this fit
# uses the validation rows; any score computed on the same rows is in-sample.
model_boosting.fit(feature_numpy_boosting, target_numpy_boosting)

In [None]:
# NOTE(review): these predictions are made on feature_numpy_boosting, the same
# matrix model_boosting was fit on, so metrics derived from them are training
# error rather than held-out error.
validation_prediction_boosting = model_boosting.predict(feature_numpy_boosting).tolist()
#validation_prediction_boosting = list(model_boosting.predict(data_training_validation))

In [None]:
# Raise every second-stage prediction below 1 to 1.
validation_prediction_boosting_modified = prediction_modification(validation_prediction_boosting)

In [None]:
# Keep only the ratio (third element); unpacking into `sum` and `count` would
# rebind the builtin `sum` for the rest of the notebook.
mape_boosting = mape(validation_target, validation_prediction_boosting_modified)[2]

In [None]:
# Root-mean-squared error of the clamped second-stage predictions against the true gap.
rmse_boosting = np.sqrt(mean_squared_error(validation_target,validation_prediction_boosting_modified))

In [None]:
# Report both error metrics of the second-stage model.
print("mape_boosting = %f" % mape_boosting)
print("rmse_boosting = %f" % rmse_boosting)

In [None]:
# Store the clamped second-stage predictions, matching the two base-model
# columns of data_boosting, which also hold clamped values. Naming the
# `_modified` list explicitly avoids depending on whether
# prediction_modification changes its argument in place.
data_boosting['prediction_boosting'] = validation_prediction_boosting_modified
# Display the true gap next to the three predictions.
data_boosting['gap_t(j)','gl_ls_predicted','scikit_lad_predicted','prediction_boosting']

### Conditional Combination

In [None]:
# Print the first 10 rows of the true gap next to the three predictions.
data_boosting['gap_t(j)','gl_ls_predicted','scikit_lad_predicted','prediction_boosting'].print_rows(num_rows=10)

In [None]:
# search for threshold
def _combine_by_threshold(threshold):
    """Per row, keep the Graphlab prediction when it is >= threshold, else use the LAD prediction."""
    return [gl_value if gl_value >= threshold else lad_value
            for gl_value, lad_value in zip(validation_prediction_gl_modified,
                                           validation_prediction_lad_modified)]

# Start from infinity so the best threshold is still recorded when every
# candidate MAPE is 1 or larger.
mape_smallest = float('inf')
i_corresponding = 1
# Try every integer threshold from 1 up to the largest Graphlab prediction.
for i in range(1, int(max(validation_prediction_gl_modified)) + 1):
    # Index [2] is the ratio; unpacking into `sum`/`count` would rebind the builtin `sum`.
    mape_conditional_combination = mape(validation_target, _combine_by_threshold(i))[2]
    if mape_conditional_combination < mape_smallest:
        mape_smallest = mape_conditional_combination
        i_corresponding = i
print(mape_smallest)
print(i_corresponding)

# Rebuild the combination with the best threshold and keep it for export.
prediction_conditional_combination = _combine_by_threshold(i_corresponding)

data_boosting['prediction_conditional_combination'] = prediction_conditional_combination

In [None]:
# Write the true gap and the four prediction columns to CSV.
data_boosting['gap_t(j)','gl_ls_predicted','scikit_lad_predicted','prediction_boosting', 'prediction_conditional_combination']\
.export_csv("model_comparison_2.csv")

### Model prediction

In [None]:
# Read the (date, time slot) pairs to predict. Each line is split on '-':
# the first three fields are re-joined into the date string and the fourth
# is kept, as a string, as the time slot id.
prediction_items = []
# `with` closes the file even if parsing a line raises.
with open("read_me_1.txt") as fhand:
    for line in fhand:
        line_stripped = line.strip()
        if not line_stripped:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            continue
        line_splitted = line_stripped.split('-')
        prediction_items.append(('-'.join(line_splitted[:3]), line_splitted[3]))
#prediction_items

In [None]:
# Keep only the test rows whose date and time slot appear in prediction_items,
# appended in the order the pairs are listed.
data_test_set_1_filtered = graphlab.SFrame()
for (date, time_slot_id) in prediction_items:
    # time_slot_id was read as a string, hence the int() before comparing.
    data_test_set_1_filtered = data_test_set_1_filtered.append(
    data_test_set_1[(data_test_set_1['date'] == date) & (data_test_set_1['time_slot_id'] == int(time_slot_id))])

In [None]:
# Cast to float like the training and validation matrices, so the model sees
# the same numeric representation (the feature list includes the string
# column 'day_of_week').
feature_numpy_test = data_test_set_1_filtered[features].to_numpy().astype(float)

In [None]:
# NOTE(review): this cell used the name `model`, which is not defined anywhere
# in the notebook and fails on a fresh kernel. model_lad is used on the
# assumption that it is the intended predictor -- confirm.
# The models are fit on 'gap_delta', so gap_t(j-1) is added back to obtain the
# predicted gap, as in the validation cells.
test_prediction = (model_lad.predict(feature_numpy_test)
                   + data_test_set_1_filtered['gap_t(j-1)'].to_numpy().astype(float)).tolist()

In [None]:
# Raise every test prediction below 1 to 1.
test_prediction_modified = prediction_modification(test_prediction)

# test_prediction_modified[0:100]

In [None]:
# Attach the clamped predictions to the filtered test rows.
data_test_set_1_filtered['prediction'] = test_prediction_modified

In [None]:
#data_test_set_1_filtered

### Make submission file

In [None]:
def make_submission(result, filename='submission.txt'):
    """Write one submission line per row of ``result`` to ``filename``.

    Parameters
    ----------
    result : iterable of mappings
        Rows providing 'start_district_id', 'date', 'time_slot_id' and
        'prediction'. 'date' must be a string; the other values are passed
        through ``str()``.
    filename : str, optional
        Output path; an existing file is overwritten.

    Each line has the form ``<start_district_id>,<date>-<time_slot_id>,<prediction>``.
    """
    # Iterate over the argument rather than a notebook-level frame, and let
    # `with` close the file even if a row is malformed.
    with open(filename, 'w') as output_file:
        for row in result:
            output_file.write(str(row['start_district_id']) + ',' + row['date'] + '-'
                              + str(row['time_slot_id']) + ',' + str(row['prediction']) + '\n')

In [None]:
# Write the submission file from the filtered test rows, using the default filename.
make_submission(data_test_set_1_filtered)