# Practicum Project - Modeling

### Import packages

In [144]:
import pandas as pd
import numpy as np
import h2o
import operator
from datetime import datetime, date
import matplotlib.pyplot as plt
from h2o.estimators.glm import H2OGeneralizedLinearEstimator
from h2o.estimators.gbm import H2OGradientBoostingEstimator 
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch
from h2o.estimators.deeplearning import H2ODeepLearningEstimator


### Initiate a H2O cluster

In [2]:
h2o.init(min_mem_size = '16g')

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_232"; OpenJDK Runtime Environment (build 1.8.0_232-8u232-b09-1~deb9u1-b09); OpenJDK 64-Bit Server VM (build 25.232-b09, mixed mode)
  Starting server from /opt/conda/lib/python3.6/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /tmp/tmp_28_7_x1
  JVM stdout: /tmp/tmp_28_7_x1/h2o_unknownUser_started_from_python.out
  JVM stderr: /tmp/tmp_28_7_x1/h2o_unknownUser_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,Etc/UTC
H2O data parsing timezone:,UTC
H2O cluster version:,3.26.0.10
H2O cluster version age:,"28 days, 6 hours and 3 minutes"
H2O cluster name:,H2O_from_python_unknownUser_yv890m
H2O cluster total nodes:,1
H2O cluster free memory:,15.33 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


### Data Processing

In [90]:
mydf = pd.read_csv("../input/final-processed-price-data/Final_Processed_Price_Data.csv")

In [91]:
mydf.start_date = pd.to_datetime(mydf.start_date)
mydf.end_date = pd.to_datetime(mydf.end_date)
mydf['length'] = mydf.end_date - mydf.start_date
mydf['length'] = mydf['length'] / np.timedelta64(1, 'D')
mydf.drop(mydf[mydf.length > 60].index, inplace=True)
mydf = mydf.dropna(subset=['price'])

In [93]:
mydf = mydf[['start_date','end_date','city','product','Explosions/Remote violence',
                 'Battles','Strategic developments','Violence against civilians',
                 'Military Forces of Syria (2000-)','Unidentified Armed Group (Syria)',
                 'Other','QSD: Syrian Democratic Forces','Islamic State (Syria)','price']]

In [197]:
mydf.head()

Unnamed: 0,start_date,end_date,city,product,Explosions/Remote violence,Battles,Strategic developments,Violence against civilians,Military Forces of Syria (2000-),Unidentified Armed Group (Syria),Other,QSD: Syrian Democratic Forces,Islamic State (Syria),price
1,2019-02-05,2019-02-19,Afrin,Baby Formula,347,32,9,12,315,36,24,1,0,2650.0
2,2019-02-19,2019-03-19,Afrin,Baby Formula,804,53,42,12,649,49,86,3,5,3350.0
3,2019-03-19,2019-04-04,Afrin,Baby Formula,373,41,19,22,296,33,35,6,1,3250.0
4,2019-04-04,2019-04-18,Afrin,Baby Formula,423,29,16,10,332,47,29,1,4,3625.0
5,2019-04-18,2019-05-02,Afrin,Baby Formula,478,29,5,9,354,34,24,1,1,2875.0


### 1. Baby Formula

In [137]:
BabyFormula_Result = pd.DataFrame()
t = mydf.loc[(mydf['product'] == 'Baby Formula')]
frame = h2o.H2OFrame(t)
train, test = frame.split_frame([0.8])
y = 'price'
X = [name for name in frame.columns if name not in ['start_date', 'end_date','product', y]]

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [138]:
# GLM
GLM_BabyFormula = H2OGeneralizedLinearEstimator(family='gaussian',
                                         solver='AUTO',
                                         standardize=True,
                                         lambda_search=True,
                                        nfolds = 5)

GLM_BabyFormula.train(X, y, training_frame=train)
BabyFormula_Result['GLM'] = [GLM_BabyFormula.rmse(),GLM_BabyFormula.mae(),GLM_BabyFormula.r2()]

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [140]:
# Random Forest
rf_BabyFormula = H2ORandomForestEstimator(
    ntrees=2000,                      
    max_depth=100,                   
    stopping_rounds=5,               
    score_each_iteration=True,      
    model_id='rf_model',
    nfolds = 5)             # for easy lookup in flow

rf_BabyFormula.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
BabyFormula_Result['Random Forest'] = [rf_BabyFormula.rmse(),rf_BabyFormula.mae(),rf_BabyFormula.r2()]

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [142]:
# Gradient Boosting
hyper_parameters = {'ntrees':list(range(0, 1000, 50)),
                    'max_depth':list(range(1, 100, 2)),
                   'sample_rate':[s/float(10) for s in range(1, 11)],
                    'col_sample_rate':[s/float(10) for s in range(1, 11)]}

search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':100,
                   'max_runtime_secs':600}

gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

gsearch.train(x=X,
              y=y,
              training_frame=train,
             nfolds = 5)

GBM_BabyFormula = gsearch.get_grid()[0]
BabyFormula_Result['Gradient Boosting'] = [GBM_BabyFormula.rmse(),GBM_BabyFormula.mae(),GBM_BabyFormula.r2()]

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [151]:
#Neural Network
nn_BabyFormula = H2ODeepLearningEstimator(
    epochs=100,                   
    hidden=[100],                
    input_dropout_ratio=0.1,    
    hidden_dropout_ratios=[0.05], 
    activation='TanhWithDropout', 
    l1=0.001,                     
    l2=0.01,                      
    adaptive_rate=True,           
    stopping_rounds=5,            
    score_each_iteration=True,    
    model_id='nn_model',
    nfolds = 5)        

# train nn model
nn_BabyFormula.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
BabyFormula_Result['Neural Network'] = [nn_BabyFormula.rmse(),nn_BabyFormula.mae(),nn_BabyFormula.r2()]

deeplearning Model Build progress: |██████████████████████████████████████| 100%


In [152]:
BabyFormula_Result

Unnamed: 0,GLM,Random Forest,Gradient Boosting,Neural Network
0,2523.014611,1167.455115,522.312535,1192.030366
1,1781.288248,761.559694,362.248794,819.093219
2,1.4e-05,0.785891,0.957144,0.776782


In [194]:
GBM_BabyFormula

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  Grid_GBM_py_55_sid_9a3a_model_python_1575570960632_5_model_32


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,550.0,550.0,59805.0,1.0,5.0,2.783636,2.0,6.0,3.887273




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 272810.38470356603
RMSE: 522.3125354647025
MAE: 362.2487943503889
RMSLE: 1.0641957850460708
Mean Residual Deviance: 272810.38470356603

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 1078932.0319115978
RMSE: 1038.716531066873
MAE: 705.4510274382203
RMSLE: 1.2135276180450327
Mean Residual Deviance: 1078932.0319115978

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,703.4181,60.189056,736.2474,757.358,615.79376,741.4304,666.26086
1,mean_residual_deviance,1067401.6,357352.78,753186.4,1263782.2,617103.9,1283085.4,1419850.0
2,mse,1067401.6,357352.78,753186.4,1263782.2,617103.9,1283085.4,1419850.0
3,r2,0.8250138,0.03379821,0.784537,0.85655457,0.857361,0.79576254,0.83085406
4,residual_deviance,1067401.6,357352.78,753186.4,1263782.2,617103.9,1283085.4,1419850.0
5,rmse,1020.3823,181.04387,867.8631,1124.1807,785.5596,1132.7336,1191.5746
6,rmsle,1.163443,0.3461675,0.65946984,1.6252215,1.1595579,1.1122442,1.2607214



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2019-12-05 20:36:51,3 min 44.671 sec,0.0,2523.032161,1781.30013,6365691.0
1,,2019-12-05 20:36:51,3 min 44.673 sec,1.0,2355.045627,1640.87851,5546240.0
2,,2019-12-05 20:36:51,3 min 44.675 sec,2.0,2320.377511,1619.560603,5384152.0
3,,2019-12-05 20:36:51,3 min 44.676 sec,3.0,2266.437941,1592.104289,5136741.0
4,,2019-12-05 20:36:51,3 min 44.678 sec,4.0,2091.709729,1449.451691,4375250.0
5,,2019-12-05 20:36:51,3 min 44.679 sec,5.0,1978.902774,1351.291826,3916056.0
6,,2019-12-05 20:36:51,3 min 44.681 sec,6.0,1876.274742,1266.68232,3520407.0
7,,2019-12-05 20:36:51,3 min 44.682 sec,7.0,1764.17909,1168.515989,3112328.0
8,,2019-12-05 20:36:51,3 min 44.684 sec,8.0,1676.265471,1090.231478,2809866.0
9,,2019-12-05 20:36:51,3 min 44.685 sec,9.0,1588.980105,1004.51411,2524858.0



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,city,5486499000.0,1.0,0.712949
1,Islamic State (Syria),349631800.0,0.063726,0.045433
2,Battles,336357100.0,0.061306,0.043708
3,Explosions/Remote violence,299580700.0,0.054603,0.038929
4,Other,256328100.0,0.04672,0.033309
5,Unidentified Armed Group (Syria),239648400.0,0.04368,0.031141
6,Violence against civilians,222602200.0,0.040573,0.028926
7,QSD: Syrian Democratic Forces,215869500.0,0.039346,0.028051
8,Military Forces of Syria (2000-),190770800.0,0.034771,0.02479
9,Strategic developments,98209260.0,0.0179,0.012762




### 2. Bulgur Wheat

In [156]:
BulgurWheat_Result = pd.DataFrame()
t = mydf.loc[(mydf['product'] == 'Bulgur Wheat')]
frame = h2o.H2OFrame(t)
train, test = frame.split_frame([0.8])
y = 'price'
X = [name for name in frame.columns if name not in ['start_date', 'end_date','product', y]]

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [157]:
# GLM
GLM_BulgurWheat = H2OGeneralizedLinearEstimator(family='gaussian',
                                         solver='AUTO',
                                         standardize=True,
                                         lambda_search=True,
                                        nfolds = 5)

GLM_BulgurWheat.train(X, y, training_frame=train)
BulgurWheat_Result['GLM'] = [GLM_BulgurWheat.rmse(),GLM_BulgurWheat.mae(),GLM_BulgurWheat.r2()]

glm Model Build progress: |███████████████████████████████████████████████| 100%


In [158]:
# Random Forest
rf_BulgurWheat = H2ORandomForestEstimator(
    ntrees=2000,                      
    max_depth=100,                   
    stopping_rounds=5,               
    score_each_iteration=True,      
    model_id='rf_model',
    nfolds = 5)             # for easy lookup in flow

rf_BulgurWheat.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
BulgurWheat_Result['Random Forest'] = [rf_BulgurWheat.rmse(),rf_BulgurWheat.mae(),rf_BulgurWheat.r2()]

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [159]:
# Gradient Boosting
hyper_parameters = {'ntrees':list(range(0, 1000, 50)),
                    'max_depth':list(range(1, 100, 2)),
                   'sample_rate':[s/float(10) for s in range(1, 11)],
                    'col_sample_rate':[s/float(10) for s in range(1, 11)]}

search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':100,
                   'max_runtime_secs':300}

gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

gsearch.train(x=X,
              y=y,
              training_frame=train,
             nfolds = 5)

GBM_BulgurWheat = gsearch.get_grid()[0]
BulgurWheat_Result['Gradient Boosting'] = [GBM_BulgurWheat.rmse(),GBM_BulgurWheat.mae(),GBM_BulgurWheat.r2()]

gbm Grid Build progress: |████████████████████████████████████████████████| 100%


In [160]:
#Neural Network
nn_BulgurWheat = H2ODeepLearningEstimator(
    epochs=50,                   
    hidden=[100],                
    input_dropout_ratio=0.1,    
    hidden_dropout_ratios=[0.05], 
    activation='TanhWithDropout', 
    l1=0.001,                     
    l2=0.01,                      
    adaptive_rate=True,           
    stopping_rounds=5,            
    score_each_iteration=True,    
    model_id='nn_model',
    nfolds = 5)        

# train nn model
nn_BulgurWheat.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
BulgurWheat_Result['Neural Network'] = [nn_BulgurWheat.rmse(),nn_BulgurWheat.mae(),nn_BulgurWheat.r2()]

BulgurWheat_Result

deeplearning Model Build progress: |██████████████████████████████████████| 100%


Unnamed: 0,GLM,Random Forest,Gradient Boosting,Neural Network
0,104.859947,75.373533,15.172966,96.960532
1,78.887949,48.866543,9.87839,73.440372
2,0.000287,0.483472,0.979069,0.145236


In [193]:
GBM_BulgurWheat

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  Grid_GBM_py_59_sid_9a3a_model_python_1575570960632_7_model_31


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,300.0,300.0,53253.0,3.0,10.0,6.106667,7.0,11.0,9.403334




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 230.21891220066593
RMSE: 15.172966493097713
MAE: 9.878390189928886
RMSLE: 0.04833957458518292
Mean Residual Deviance: 230.21891220066593

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 5140.034250676028
RMSE: 71.69403218313242
MAE: 49.544941672104535
RMSLE: 0.2209076339559042
Mean Residual Deviance: 5140.034250676028

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,49.289253,6.235176,58.594345,48.254784,46.70361,41.70283,51.190693
1,mean_residual_deviance,5087.816,1785.4279,8065.7227,4862.096,4758.832,3248.5417,4503.887
2,mse,5087.816,1785.4279,8065.7227,4862.096,4758.832,3248.5417,4503.887
3,r2,0.53356034,0.124120265,0.3278271,0.6442646,0.6019982,0.5785141,0.5151978
4,residual_deviance,5087.816,1785.4279,8065.7227,4862.096,4758.832,3248.5417,4503.887
5,rmse,70.52587,11.932984,89.80937,69.72874,68.98428,56.99598,67.11101
6,rmsle,0.21851486,0.028360117,0.2661762,0.21845159,0.20006067,0.19444722,0.21343863



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2019-12-05 20:54:05,4 min 25.579 sec,0.0,104.875003,78.903104,10998.766158
1,,2019-12-05 20:54:05,4 min 25.582 sec,1.0,100.391114,74.509933,10078.375744
2,,2019-12-05 20:54:05,4 min 25.584 sec,2.0,94.90261,70.278148,9006.505465
3,,2019-12-05 20:54:05,4 min 25.587 sec,3.0,91.378375,67.483903,8350.007377
4,,2019-12-05 20:54:05,4 min 25.589 sec,4.0,87.203786,64.246518,7604.500234
5,,2019-12-05 20:54:05,4 min 25.592 sec,5.0,84.866334,61.991427,7202.294706
6,,2019-12-05 20:54:05,4 min 25.594 sec,6.0,82.759069,60.280919,6849.063466
7,,2019-12-05 20:54:05,4 min 25.597 sec,7.0,80.536072,58.799697,6486.058913
8,,2019-12-05 20:54:05,4 min 25.599 sec,8.0,77.181622,56.288355,5957.002749
9,,2019-12-05 20:54:05,4 min 25.602 sec,9.0,75.824781,54.473124,5749.397429



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,city,3396558.0,1.0,0.250399
1,Explosions/Remote violence,2429207.0,0.715197,0.179084
2,Unidentified Armed Group (Syria),1443661.0,0.425037,0.106429
3,Battles,1367472.0,0.402605,0.100812
4,Islamic State (Syria),1102158.0,0.324493,0.081253
5,Military Forces of Syria (2000-),965368.6,0.28422,0.071168
6,QSD: Syrian Democratic Forces,957442.6,0.281886,0.070584
7,Other,653673.8,0.192452,0.04819
8,Strategic developments,650560.2,0.191535,0.04796
9,Violence against civilians,598493.6,0.176206,0.044122




### 3. Chickpeas

In [161]:
Chickpeas_Result = pd.DataFrame()
t = mydf.loc[(mydf['product'] == 'Chickpeas')]
frame = h2o.H2OFrame(t)
train, test = frame.split_frame([0.8])
y = 'price'
X = [name for name in frame.columns if name not in ['start_date', 'end_date','product', y]]

# GLM
GLM_Chickpeas = H2OGeneralizedLinearEstimator(family='gaussian',
                                         solver='AUTO',
                                         standardize=True,
                                         lambda_search=True,
                                        nfolds = 5)

GLM_Chickpeas.train(X, y, training_frame=train)
Chickpeas_Result['GLM'] = [GLM_Chickpeas.rmse(),GLM_Chickpeas.mae(),GLM_Chickpeas.r2()]

# Random Forest
rf_Chickpeas = H2ORandomForestEstimator(
    ntrees=2000,                      
    max_depth=100,                   
    stopping_rounds=5,               
    score_each_iteration=True,      
    model_id='rf_model',
    nfolds = 5)             # for easy lookup in flow

rf_Chickpeas.train(
    x=X,
    y=y,
    training_frame=train)

Chickpeas_Result['Random Forest'] = [rf_Chickpeas.rmse(),rf_Chickpeas.mae(),rf_Chickpeas.r2()]

# Gradient Boosting
hyper_parameters = {'ntrees':list(range(0, 1000, 50)),
                    'max_depth':list(range(1, 100, 2)),
                   'sample_rate':[s/float(10) for s in range(1, 11)],
                    'col_sample_rate':[s/float(10) for s in range(1, 11)]}

search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':100,
                   'max_runtime_secs':300}

gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

gsearch.train(x=X,
              y=y,
              training_frame=train,
             nfolds = 5)

GBM_Chickpeas = gsearch.get_grid()[0]
Chickpeas_Result['Gradient Boosting'] = [GBM_Chickpeas.rmse(),GBM_Chickpeas.mae(),GBM_Chickpeas.r2()]

#Neural Network
nn_Chickpeas = H2ODeepLearningEstimator(
    epochs=50,                   
    hidden=[100],                
    input_dropout_ratio=0.1,    
    hidden_dropout_ratios=[0.05], 
    activation='TanhWithDropout', 
    l1=0.001,                     
    l2=0.01,                      
    adaptive_rate=True,           
    stopping_rounds=5,            
    score_each_iteration=True,    
    model_id='nn_model',
    nfolds = 5)        

nn_Chickpeas.train(
    x=X,
    y=y,
    training_frame=train)

Chickpeas_Result['Neural Network'] = [nn_Chickpeas.rmse(),nn_Chickpeas.mae(),nn_Chickpeas.r2()]

# print model information
Chickpeas_Result


Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
gbm Grid Build progress: |████████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%


Unnamed: 0,GLM,Random Forest,Gradient Boosting,Neural Network
0,221.267847,142.961662,0.565259,154.34657
1,157.760835,88.866776,0.244154,97.943932
2,0.000109,0.582597,0.999993,0.513469


In [192]:
GBM_Chickpeas

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  Grid_GBM_py_63_sid_9a3a_model_python_1575570960632_9_model_11


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,750.0,750.0,216971.0,6.0,15.0,10.565333,16.0,21.0,18.325333




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 0.3195178034120893
RMSE: 0.5652590586731798
MAE: 0.24415361120345744
RMSLE: 0.0009713976970573164
Mean Residual Deviance: 0.3195178034120893

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 22227.46880652738
RMSE: 149.0887950401618
MAE: 95.11484167578203
RMSLE: 0.24733387674311538
Mean Residual Deviance: 22227.46880652738

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,96.036354,19.699406,110.50289,70.93894,118.85069,82.25071,97.638535
1,mean_residual_deviance,22418.527,8888.949,33217.285,9356.277,27177.158,22408.027,19933.887
2,mse,22418.527,8888.949,33217.285,9356.277,27177.158,22408.027,19933.887
3,r2,0.53791976,0.16725987,0.5976431,0.76709867,0.306186,0.49386063,0.5248105
4,residual_deviance,22418.527,8888.949,33217.285,9356.277,27177.158,22408.027,19933.887
5,rmse,146.9439,32.132942,182.2561,96.72785,164.85497,149.69312,141.18741
6,rmsle,0.24736455,0.024251541,0.2588584,0.20933598,0.27141616,0.23879017,0.25842205



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2019-12-05 20:56:26,1 min 41.577 sec,0.0,221.279868,157.776067,48964.779773
1,,2019-12-05 20:56:26,1 min 41.581 sec,1.0,206.014459,144.407078,42441.957333
2,,2019-12-05 20:56:26,1 min 41.584 sec,2.0,192.774356,133.145087,37161.952473
3,,2019-12-05 20:56:26,1 min 41.587 sec,3.0,180.846367,122.573962,32705.408563
4,,2019-12-05 20:56:26,1 min 41.589 sec,4.0,169.961847,113.862595,28887.029369
5,,2019-12-05 20:56:26,1 min 41.593 sec,5.0,160.828206,106.164199,25865.711963
6,,2019-12-05 20:56:26,1 min 41.595 sec,6.0,153.017232,99.785422,23414.273391
7,,2019-12-05 20:56:27,1 min 41.598 sec,7.0,145.655321,94.198097,21215.4724
8,,2019-12-05 20:56:27,1 min 41.601 sec,8.0,139.311337,89.381328,19407.648551
9,,2019-12-05 20:56:27,1 min 41.604 sec,9.0,133.840975,85.256606,17913.406712



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,city,22630440.0,1.0,0.373679
1,Military Forces of Syria (2000-),13492880.0,0.596227,0.222798
2,Islamic State (Syria),5789058.0,0.255808,0.09559
3,Battles,4482231.0,0.198062,0.074012
4,Explosions/Remote violence,3498229.0,0.154581,0.057764
5,Unidentified Armed Group (Syria),3246984.0,0.143479,0.053615
6,QSD: Syrian Democratic Forces,2463242.0,0.108846,0.040674
7,Violence against civilians,2447853.0,0.108166,0.04042
8,Strategic developments,1697596.0,0.075014,0.028031
9,Other,812620.8,0.035908,0.013418




### 4. Cooking Oil

In [162]:
CookingOil_Result = pd.DataFrame()
t = mydf.loc[(mydf['product'] == 'Cooking Oil')]
frame = h2o.H2OFrame(t)
train, test = frame.split_frame([0.8])
y = 'price'
X = [name for name in frame.columns if name not in ['start_date', 'end_date','product', y]]

# GLM
GLM_CookingOil = H2OGeneralizedLinearEstimator(family='gaussian',
                                         solver='AUTO',
                                         standardize=True,
                                         lambda_search=True,
                                        nfolds = 5)

GLM_CookingOil.train(X, y, training_frame=train)
CookingOil_Result['GLM'] = [GLM_CookingOil.rmse(),GLM_CookingOil.mae(),GLM_CookingOil.r2()]

# Random Forest
rf_CookingOil = H2ORandomForestEstimator(
    ntrees=2000,                      
    max_depth=100,                   
    stopping_rounds=5,               
    score_each_iteration=True,      
    model_id='rf_model',
    nfolds = 5)             # for easy lookup in flow

rf_CookingOil.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
CookingOil_Result['Random Forest'] = [rf_CookingOil.rmse(),rf_CookingOil.mae(),rf_CookingOil.r2()]

# Gradient Boosting
hyper_parameters = {'ntrees':list(range(0, 1000, 50)),
                    'max_depth':list(range(1, 100, 2)),
                   'sample_rate':[s/float(10) for s in range(1, 11)],
                    'col_sample_rate':[s/float(10) for s in range(1, 11)]}

search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':100,
                   'max_runtime_secs':300}

gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

gsearch.train(x=X,
              y=y,
              training_frame=train,
             nfolds = 5)

GBM_CookingOil = gsearch.get_grid()[0]
CookingOil_Result['Gradient Boosting'] = [GBM_CookingOil.rmse(),GBM_CookingOil.mae(),GBM_CookingOil.r2()]

#Neural Network
nn_CookingOil = H2ODeepLearningEstimator(
    epochs=50,                   
    hidden=[100],                
    input_dropout_ratio=0.1,    
    hidden_dropout_ratios=[0.05], 
    activation='TanhWithDropout', 
    l1=0.001,                     
    l2=0.01,                      
    adaptive_rate=True,           
    stopping_rounds=5,            
    score_each_iteration=True,    
    model_id='nn_model',
    nfolds = 5)        

# train nn model
nn_CookingOil.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
CookingOil_Result['Neural Network'] = [nn_CookingOil.rmse(),nn_CookingOil.mae(),nn_CookingOil.r2()]

CookingOil_Result


Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
gbm Grid Build progress: |████████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%


Unnamed: 0,GLM,Random Forest,Gradient Boosting,Neural Network
0,195.298817,152.475227,39.997909,141.761156
1,132.743464,90.707275,21.589744,89.83804
2,0.000146,0.390553,0.958062,0.473193


In [191]:
GBM_CookingOil

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  Grid_GBM_py_67_sid_9a3a_model_python_1575570960632_11_model_36


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,100.0,100.0,27933.0,6.0,13.0,9.27,15.0,20.0,17.55




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 1599.832688690109
RMSE: 39.99790855394953
MAE: 21.589744431631907
RMSLE: 0.04397394710073751
Mean Residual Deviance: 1599.832688690109

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 19972.444491724156
RMSE: 141.32389922346522
MAE: 83.04496953622171
RMSLE: 0.16341178408211038
Mean Residual Deviance: 19972.444491724156

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,83.90135,17.693436,63.670498,80.061134,94.27209,73.048485,108.45454
1,mean_residual_deviance,20456.514,11073.24,10103.065,16058.983,26445.281,12802.648,36872.586
2,mse,20456.514,11073.24,10103.065,16058.983,26445.281,12802.648,36872.586
3,r2,0.46806777,0.1304334,0.6463605,0.54941946,0.31468984,0.4303291,0.3995399
4,residual_deviance,20456.514,11073.24,10103.065,16058.983,26445.281,12802.648,36872.586
5,rmse,139.00584,37.64782,100.51401,126.724045,162.62006,113.14879,192.02237
6,rmsle,0.16378453,0.030925712,0.12904657,0.15325655,0.2037677,0.14526667,0.18758516



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2019-12-05 21:04:49,4 min 59.075 sec,0.0,195.31309,132.758848,38147.203103
1,,2019-12-05 21:04:49,4 min 59.079 sec,1.0,186.192131,125.177295,34667.50959
2,,2019-12-05 21:04:49,4 min 59.082 sec,2.0,177.514635,116.77008,31511.445664
3,,2019-12-05 21:04:49,4 min 59.084 sec,3.0,170.187581,109.897386,28963.812887
4,,2019-12-05 21:04:49,4 min 59.086 sec,4.0,163.46094,103.738017,26719.479054
5,,2019-12-05 21:04:49,4 min 59.089 sec,5.0,157.782978,98.044551,24895.468212
6,,2019-12-05 21:04:49,4 min 59.091 sec,6.0,151.726608,93.61977,23020.963494
7,,2019-12-05 21:04:49,4 min 59.094 sec,7.0,147.203707,89.204054,21668.931297
8,,2019-12-05 21:04:49,4 min 59.096 sec,8.0,142.53487,84.878616,20316.189028
9,,2019-12-05 21:04:49,4 min 59.099 sec,9.0,136.355313,80.391085,18592.771323



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,city,10035160.0,1.0,0.232883
1,Battles,7285156.0,0.725963,0.169064
2,Unidentified Armed Group (Syria),6409415.0,0.638696,0.148741
3,Explosions/Remote violence,5802354.0,0.578202,0.134653
4,Other,3186310.0,0.317514,0.073944
5,Islamic State (Syria),2694166.0,0.268473,0.062523
6,Military Forces of Syria (2000-),2582961.0,0.257391,0.059942
7,QSD: Syrian Democratic Forces,1986089.0,0.197913,0.046091
8,Violence against civilians,1610262.0,0.160462,0.037369
9,Strategic developments,1499129.0,0.149388,0.03479




### 5. Flour

In [163]:
Flour_Result = pd.DataFrame()
t = mydf.loc[(mydf['product'] == 'Flour')]
frame = h2o.H2OFrame(t)
train, test = frame.split_frame([0.8])
y = 'price'
X = [name for name in frame.columns if name not in ['start_date', 'end_date','product', y]]

# GLM
GLM_Flour = H2OGeneralizedLinearEstimator(family='gaussian',
                                         solver='AUTO',
                                         standardize=True,
                                         lambda_search=True,
                                        nfolds = 5)

GLM_Flour.train(X, y, training_frame=train)
Flour_Result['GLM'] = [GLM_Flour.rmse(),GLM_Flour.mae(),GLM_Flour.r2()]

# Random Forest
rf_Flour = H2ORandomForestEstimator(
    ntrees=2000,                      
    max_depth=100,                   
    stopping_rounds=5,               
    score_each_iteration=True,      
    model_id='rf_model',
    nfolds = 5)             # for easy lookup in flow

rf_Flour.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
Flour_Result['Random Forest'] = [rf_Flour.rmse(),rf_Flour.mae(),rf_Flour.r2()]

# Gradient Boosting
hyper_parameters = {'ntrees':list(range(0, 1000, 50)),
                    'max_depth':list(range(1, 100, 2)),
                   'sample_rate':[s/float(10) for s in range(1, 11)],
                    'col_sample_rate':[s/float(10) for s in range(1, 11)]}

search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':100,
                   'max_runtime_secs':300}

gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

gsearch.train(x=X,
              y=y,
              training_frame=train,
             nfolds = 5)

GBM_Flour = gsearch.get_grid()[0]
Flour_Result['Gradient Boosting'] = [GBM_Flour.rmse(),GBM_Flour.mae(),GBM_Flour.r2()]

#Neural Network
nn_Flour = H2ODeepLearningEstimator(
    epochs=50,                   
    hidden=[100],                
    input_dropout_ratio=0.1,    
    hidden_dropout_ratios=[0.05], 
    activation='TanhWithDropout', 
    l1=0.001,                     
    l2=0.01,                      
    adaptive_rate=True,           
    stopping_rounds=5,            
    score_each_iteration=True,    
    model_id='nn_model',
    nfolds = 5)        

# train nn model
nn_Flour.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
Flour_Result['Neural Network'] = [nn_Flour.rmse(),nn_Flour.mae(),nn_Flour.r2()]

Flour_Result


Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
gbm Grid Build progress: |████████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%


Unnamed: 0,GLM,Random Forest,Gradient Boosting,Neural Network
0,137.312695,98.336544,54.728899,96.669613
1,96.300791,56.545631,29.91469,64.518807
2,0.000187,0.487224,0.841171,0.504461


In [189]:
GBM_Flour

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  Grid_GBM_py_71_sid_9a3a_model_python_1575570960632_13_model_4


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,250.0,250.0,32368.0,2.0,6.0,4.104,4.0,8.0,5.556




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 2995.2524216725546
RMSE: 54.728899328166236
MAE: 29.914690135854535
RMSLE: 0.1339703396800367
Mean Residual Deviance: 2995.2524216725546

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 8952.897871895799
RMSE: 94.61975413144867
MAE: 60.671134445607386
RMSLE: 0.2601093090066851
Mean Residual Deviance: 8952.897871895799

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,61.041676,7.0334516,61.02653,52.945858,59.381622,59.53018,72.32418
1,mean_residual_deviance,8968.647,6824.2026,6692.461,4890.0537,6279.2715,5864.0664,21117.385
2,mse,8968.647,6824.2026,6692.461,4890.0537,6279.2715,5864.0664,21117.385
3,r2,0.4556827,0.16903815,0.2858203,0.71922183,0.3611835,0.39668408,0.51550376
4,residual_deviance,8968.647,6824.2026,6692.461,4890.0537,6279.2715,5864.0664,21117.385
5,rmse,90.57473,30.920582,81.807465,69.92892,79.24185,76.577194,145.31822
6,rmsle,0.26104945,0.019446677,0.2737876,0.23027976,0.2720642,0.2532063,0.27590942



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2019-12-05 21:05:29,34.611 sec,0.0,137.325527,96.311781,18858.300399
1,,2019-12-05 21:05:29,34.613 sec,1.0,133.368001,91.375613,17787.023772
2,,2019-12-05 21:05:29,34.615 sec,2.0,127.511547,86.69602,16259.194599
3,,2019-12-05 21:05:29,34.616 sec,3.0,124.013281,81.870021,15379.293927
4,,2019-12-05 21:05:29,34.618 sec,4.0,121.015573,78.706147,14644.768862
5,,2019-12-05 21:05:29,34.620 sec,5.0,118.69627,75.709959,14088.804438
6,,2019-12-05 21:05:29,34.622 sec,6.0,115.035627,73.0665,13233.195582
7,,2019-12-05 21:05:29,34.624 sec,7.0,113.304671,71.108917,12837.948556
8,,2019-12-05 21:05:29,34.626 sec,8.0,110.036064,68.738794,12107.935313
9,,2019-12-05 21:05:29,34.628 sec,9.0,108.325502,67.249031,11734.414307



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,city,3986614.0,1.0,0.207027
1,Unidentified Armed Group (Syria),3004600.0,0.753672,0.156031
2,Battles,2789744.0,0.699778,0.144873
3,Other,1824662.0,0.457697,0.094756
4,Explosions/Remote violence,1756090.0,0.440497,0.091195
5,Violence against civilians,1620548.0,0.406497,0.084156
6,Military Forces of Syria (2000-),1514096.0,0.379795,0.078628
7,Islamic State (Syria),1232953.0,0.309273,0.064028
8,QSD: Syrian Democratic Forces,1167422.0,0.292836,0.060625
9,Strategic developments,359745.3,0.090238,0.018682




### 6. Gasoline

In [164]:
Gasoline_Result = pd.DataFrame()
t = mydf.loc[(mydf['product'] == 'Gasoline')]
frame = h2o.H2OFrame(t)
train, test = frame.split_frame([0.8])
y = 'price'
X = [name for name in frame.columns if name not in ['start_date', 'end_date','product', y]]

# GLM
GLM_Gasoline = H2OGeneralizedLinearEstimator(family='gaussian',
                                         solver='AUTO',
                                         standardize=True,
                                         lambda_search=True,
                                        nfolds = 5)

GLM_Gasoline.train(X, y, training_frame=train)
Gasoline_Result['GLM'] = [GLM_Gasoline.rmse(),GLM_Gasoline.mae(),GLM_Gasoline.r2()]

# Random Forest
rf_Gasoline = H2ORandomForestEstimator(
    ntrees=2000,                      
    max_depth=100,                   
    stopping_rounds=5,               
    score_each_iteration=True,      
    model_id='rf_model',
    nfolds = 5)             # for easy lookup in flow

rf_Gasoline.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
Gasoline_Result['Random Forest'] = [rf_Gasoline.rmse(),rf_Gasoline.mae(),rf_Gasoline.r2()]

# Gradient Boosting
hyper_parameters = {'ntrees':list(range(0, 1000, 50)),
                    'max_depth':list(range(1, 100, 2)),
                   'sample_rate':[s/float(10) for s in range(1, 11)],
                    'col_sample_rate':[s/float(10) for s in range(1, 11)]}

search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':100,
                   'max_runtime_secs':300}

gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

gsearch.train(x=X,
              y=y,
              training_frame=train,
             nfolds = 5)

GBM_Gasoline = gsearch.get_grid()[0]
Gasoline_Result['Gradient Boosting'] = [GBM_Gasoline.rmse(),GBM_Gasoline.mae(),GBM_Gasoline.r2()]

#Neural Network
nn_Gasoline = H2ODeepLearningEstimator(
    epochs=50,                   
    hidden=[100],                
    input_dropout_ratio=0.1,    
    hidden_dropout_ratios=[0.05], 
    activation='TanhWithDropout', 
    l1=0.001,                     
    l2=0.01,                      
    adaptive_rate=True,           
    stopping_rounds=5,            
    score_each_iteration=True,    
    model_id='nn_model',
    nfolds = 5)        

# train nn model
nn_Gasoline.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
Gasoline_Result['Neural Network'] = [nn_Gasoline.rmse(),nn_Gasoline.mae(),nn_Gasoline.r2()]

Gasoline_Result


Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
gbm Grid Build progress: |████████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%


Unnamed: 0,GLM,Random Forest,Gradient Boosting,Neural Network
0,501.141446,217.840999,39.73599,297.777424
1,199.42958,98.755409,27.228939,185.44438
2,3.4e-05,0.811051,0.993713,0.64694


In [188]:
GBM_Gasoline

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  Grid_GBM_py_75_sid_9a3a_model_python_1575570960632_15_model_29


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,900.0,900.0,108691.0,2.0,6.0,3.598889,3.0,7.0,4.872222




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 1578.9488829225024
RMSE: 39.73598976900541
MAE: 27.2289389151114
RMSLE: 0.09136298071130647
Mean Residual Deviance: 1578.9488829225024

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 40884.02488867999
RMSE: 202.1979843833266
MAE: 140.58798643092075
RMSLE: NaN
Mean Residual Deviance: 40884.02488867999

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,139.8346,19.623682,129.78162,161.05989,119.344025,160.70331,128.28412
1,mean_residual_deviance,40189.57,15602.534,51996.27,57360.332,18877.268,41645.27,31068.725
2,mse,40189.57,15602.534,51996.27,57360.332,18877.268,41645.27,31068.725
3,r2,-1.0901072,2.7817693,0.87990725,-5.3039894,0.9760766,-2.5871878,0.5846572
4,residual_deviance,40189.57,15602.534,51996.27,57360.332,18877.268,41645.27,31068.725
5,rmse,197.05132,41.236362,228.0269,239.50017,137.39456,204.07173,176.26323
6,rmsle,0.48211053,0.12595859,0.3723358,0.45436797,,0.6196278,



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2019-12-05 21:14:19,4 min 19.913 sec,0.0,501.150024,199.433177,251151.346384
1,,2019-12-05 21:14:19,4 min 19.916 sec,1.0,490.750451,193.704846,240836.005581
2,,2019-12-05 21:14:19,4 min 19.918 sec,2.0,478.959801,187.787298,229402.491334
3,,2019-12-05 21:14:19,4 min 19.920 sec,3.0,466.716712,184.191478,217824.489571
4,,2019-12-05 21:14:19,4 min 19.922 sec,4.0,463.90249,178.148935,215205.520421
5,,2019-12-05 21:14:19,4 min 19.924 sec,5.0,452.784592,172.825521,205013.887195
6,,2019-12-05 21:14:19,4 min 19.926 sec,6.0,452.409103,169.447492,204673.996758
7,,2019-12-05 21:14:19,4 min 19.928 sec,7.0,452.469796,168.304603,204728.916196
8,,2019-12-05 21:14:19,4 min 19.930 sec,8.0,437.9464,165.1969,191797.049579
9,,2019-12-05 21:14:19,4 min 19.933 sec,9.0,428.911642,163.120128,183965.196912



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,city,88637368.0,1.0,0.299799
1,Unidentified Armed Group (Syria),55879104.0,0.630424,0.189001
2,Strategic developments,44789604.0,0.505313,0.151492
3,Battles,25986510.0,0.293178,0.087894
4,Explosions/Remote violence,25406498.0,0.286634,0.085933
5,Islamic State (Syria),16591234.0,0.187181,0.056117
6,Other,14451146.0,0.163037,0.048878
7,QSD: Syrian Democratic Forces,9272976.0,0.104617,0.031364
8,Military Forces of Syria (2000-),8775136.0,0.099,0.02968
9,Violence against civilians,5866204.0,0.066182,0.019841




### 7. Kerosene

In [165]:
Kerosene_Result = pd.DataFrame()
t = mydf.loc[(mydf['product'] == 'Kerosene')]
frame = h2o.H2OFrame(t)
train, test = frame.split_frame([0.8])
y = 'price'
X = [name for name in frame.columns if name not in ['start_date', 'end_date','product', y]]

# GLM
GLM_Kerosene = H2OGeneralizedLinearEstimator(family='gaussian',
                                         solver='AUTO',
                                         standardize=True,
                                         lambda_search=True,
                                        nfolds = 5)

GLM_Kerosene.train(X, y, training_frame=train)
Kerosene_Result['GLM'] = [GLM_Kerosene.rmse(),GLM_Kerosene.mae(),GLM_Kerosene.r2()]

# Random Forest
rf_Kerosene = H2ORandomForestEstimator(
    ntrees=2000,                      
    max_depth=100,                   
    stopping_rounds=5,               
    score_each_iteration=True,      
    model_id='rf_model',
    nfolds = 5)             # for easy lookup in flow

rf_Kerosene.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
Kerosene_Result['Random Forest'] = [rf_Kerosene.rmse(),rf_Kerosene.mae(),rf_Kerosene.r2()]

# Gradient Boosting
hyper_parameters = {'ntrees':list(range(0, 1000, 50)),
                    'max_depth':list(range(1, 100, 2)),
                   'sample_rate':[s/float(10) for s in range(1, 11)],
                    'col_sample_rate':[s/float(10) for s in range(1, 11)]}

search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':100,
                   'max_runtime_secs':300}

gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

gsearch.train(x=X,
              y=y,
              training_frame=train,
             nfolds = 5)

GBM_Kerosene = gsearch.get_grid()[0]
Kerosene_Result['Gradient Boosting'] = [GBM_Kerosene.rmse(),GBM_Kerosene.mae(),GBM_Kerosene.r2()]

#Neural Network
nn_Kerosene = H2ODeepLearningEstimator(
    epochs=50,                   
    hidden=[100],                
    input_dropout_ratio=0.1,    
    hidden_dropout_ratios=[0.05], 
    activation='TanhWithDropout', 
    l1=0.001,                     
    l2=0.01,                      
    adaptive_rate=True,           
    stopping_rounds=5,            
    score_each_iteration=True,    
    model_id='nn_model',
    nfolds = 5)        

# train nn model
nn_Kerosene.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
Kerosene_Result['Neural Network'] = [nn_Kerosene.rmse(),nn_Kerosene.mae(),nn_Kerosene.r2()]

Kerosene_Result


Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
gbm Grid Build progress: |████████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%


Unnamed: 0,GLM,Random Forest,Gradient Boosting,Neural Network
0,182.269331,100.102733,1.990371,110.760645
1,106.334653,49.42374,0.944729,74.517685
2,0.000143,0.69842,0.999881,0.630783


In [187]:
GBM_Kerosene

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  Grid_GBM_py_79_sid_9a3a_model_python_1575570960632_17_model_29


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,950.0,950.0,172287.0,4.0,10.0,6.848421,7.0,13.0,9.721052




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 3.961576829822165
RMSE: 1.9903710281809683
MAE: 0.9447293281555176
RMSLE: 0.0064238103523563
Mean Residual Deviance: 3.961576829822165

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 8207.704146610764
RMSE: 90.5963804277564
MAE: 56.40922780579365
RMSLE: NaN
Mean Residual Deviance: 8207.704146610764

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,57.754276,21.031235,45.246037,91.07722,65.150215,38.661114,48.63681
1,mean_residual_deviance,8725.494,8228.858,4163.3086,22256.662,10872.935,2642.1953,3692.369
2,mse,8725.494,8228.858,4163.3086,22256.662,10872.935,2642.1953,3692.369
3,r2,0.7239674,0.15865786,0.82376915,0.7083052,0.5331305,0.93283564,0.6217963
4,residual_deviance,8725.494,8228.858,4163.3086,22256.662,10872.935,2642.1953,3692.369
5,rmse,86.03018,40.686337,64.523705,149.18668,104.27336,51.40229,60.764866
6,rmsle,0.34636906,0.055403676,0.42077407,,0.3459316,0.2877427,0.33102784



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2019-12-05 21:19:22,4 min 17.689 sec,0.0,182.282327,106.348485,33226.84677
1,,2019-12-05 21:19:22,4 min 17.701 sec,1.0,172.460352,98.583973,29742.57311
2,,2019-12-05 21:19:22,4 min 17.703 sec,2.0,162.606987,91.055381,26441.032285
3,,2019-12-05 21:19:22,4 min 17.705 sec,3.0,155.00113,85.141352,24025.35029
4,,2019-12-05 21:19:22,4 min 17.707 sec,4.0,149.022318,79.969774,22207.651181
5,,2019-12-05 21:19:22,4 min 17.709 sec,5.0,144.142154,76.563291,20776.960535
6,,2019-12-05 21:19:22,4 min 17.711 sec,6.0,138.019619,71.61351,19049.415299
7,,2019-12-05 21:19:22,4 min 17.713 sec,7.0,132.415195,67.626534,17533.783758
8,,2019-12-05 21:19:22,4 min 17.715 sec,8.0,127.379605,63.592646,16225.563771
9,,2019-12-05 21:19:22,4 min 17.716 sec,9.0,122.949825,61.441005,15116.659467



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,city,16358140.0,1.0,0.526665
1,Unidentified Armed Group (Syria),4872380.0,0.297857,0.156871
2,Battles,2975578.0,0.181902,0.095801
3,Islamic State (Syria),1480320.0,0.090494,0.04766
4,Explosions/Remote violence,1439243.0,0.087983,0.046338
5,QSD: Syrian Democratic Forces,1165560.0,0.071253,0.037526
6,Strategic developments,1157638.0,0.070768,0.037271
7,Military Forces of Syria (2000-),663665.9,0.040571,0.021367
8,Violence against civilians,600365.6,0.036701,0.019329
9,Other,346982.8,0.021212,0.011171




### 8. Lentils

In [166]:
Lentils_Result = pd.DataFrame()
t = mydf.loc[(mydf['product'] == 'Lentils')]
frame = h2o.H2OFrame(t)
train, test = frame.split_frame([0.8])
y = 'price'
X = [name for name in frame.columns if name not in ['start_date', 'end_date','product', y]]

# GLM
GLM_Lentils = H2OGeneralizedLinearEstimator(family='gaussian',
                                         solver='AUTO',
                                         standardize=True,
                                         lambda_search=True,
                                        nfolds = 5)

GLM_Lentils.train(X, y, training_frame=train)
Lentils_Result['GLM'] = [GLM_Lentils.rmse(),GLM_Lentils.mae(),GLM_Lentils.r2()]

# Random Forest
rf_Lentils = H2ORandomForestEstimator(
    ntrees=2000,                      
    max_depth=100,                   
    stopping_rounds=5,               
    score_each_iteration=True,      
    model_id='rf_model',
    nfolds = 5)             # for easy lookup in flow

rf_Lentils.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
Lentils_Result['Random Forest'] = [rf_Lentils.rmse(),rf_Lentils.mae(),rf_Lentils.r2()]

# Gradient Boosting
hyper_parameters = {'ntrees':list(range(0, 1000, 50)),
                    'max_depth':list(range(1, 100, 2)),
                   'sample_rate':[s/float(10) for s in range(1, 11)],
                    'col_sample_rate':[s/float(10) for s in range(1, 11)]}

search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':100,
                   'max_runtime_secs':300}

gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

gsearch.train(x=X,
              y=y,
              training_frame=train,
             nfolds = 5)

GBM_Lentils = gsearch.get_grid()[0]
Lentils_Result['Gradient Boosting'] = [GBM_Lentils.rmse(),GBM_Lentils.mae(),GBM_Lentils.r2()]

#Neural Network
nn_Lentils = H2ODeepLearningEstimator(
    epochs=50,                   
    hidden=[100],                
    input_dropout_ratio=0.1,    
    hidden_dropout_ratios=[0.05], 
    activation='TanhWithDropout', 
    l1=0.001,                     
    l2=0.01,                      
    adaptive_rate=True,           
    stopping_rounds=5,            
    score_each_iteration=True,    
    model_id='nn_model',
    nfolds = 5)        

# train nn model
nn_Lentils.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
Lentils_Result['Neural Network'] = [nn_Lentils.rmse(),nn_Lentils.mae(),nn_Lentils.r2()]

Lentils_Result


Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
gbm Grid Build progress: |████████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%


Unnamed: 0,GLM,Random Forest,Gradient Boosting,Neural Network
0,183.929356,125.578364,22.329361,136.545392
1,124.934818,78.016907,16.729562,100.792209
2,0.000245,0.533962,0.985265,0.449008


In [186]:
GBM_Lentils

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  Grid_GBM_py_83_sid_9a3a_model_python_1575570960632_19_model_22


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,350.0,350.0,47252.0,3.0,3.0,3.0,4.0,8.0,6.014286




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 498.600361244437
RMSE: 22.32936096811633
MAE: 16.72956166948591
RMSLE: 0.055896628043366633
Mean Residual Deviance: 498.600361244437

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 13113.236893196163
RMSE: 114.51304245891016
MAE: 73.46930880021662
RMSLE: 0.21734361931901064
Mean Residual Deviance: 13113.236893196163

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,73.52007,10.41238,70.94754,67.25194,69.95826,91.93127,67.51135
1,mean_residual_deviance,13483.522,4369.7524,16841.883,8754.625,15208.73,17761.445,8850.929
2,mse,13483.522,4369.7524,16841.883,8754.625,15208.73,17761.445,8850.929
3,r2,0.5887985,0.13022831,0.6593497,0.45910603,0.6584979,0.44012094,0.7269181
4,residual_deviance,13483.522,4369.7524,16841.883,8754.625,15208.73,17761.445,8850.929
5,rmse,114.80351,19.483198,129.77628,93.566154,123.32368,133.27208,94.07938
6,rmsle,0.21303733,0.028535284,0.18245628,0.24229963,0.1866629,0.24078536,0.21298246



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2019-12-05 21:22:57,2 min 49.044 sec,0.0,183.95188,124.958679,33838.294277
1,,2019-12-05 21:22:57,2 min 49.046 sec,1.0,175.056009,116.20563,30644.606319
2,,2019-12-05 21:22:57,2 min 49.047 sec,2.0,165.028854,108.360312,27234.522499
3,,2019-12-05 21:22:57,2 min 49.049 sec,3.0,155.364954,101.852958,24138.269067
4,,2019-12-05 21:22:57,2 min 49.050 sec,4.0,146.749731,95.653119,21535.48352
5,,2019-12-05 21:22:57,2 min 49.052 sec,5.0,139.972425,91.05573,19592.279836
6,,2019-12-05 21:22:57,2 min 49.053 sec,6.0,133.67963,86.696827,17870.243563
7,,2019-12-05 21:22:57,2 min 49.055 sec,7.0,128.204863,82.758129,16436.48678
8,,2019-12-05 21:22:57,2 min 49.057 sec,8.0,122.475468,78.836133,15000.240345
9,,2019-12-05 21:22:57,2 min 49.058 sec,9.0,118.310491,77.050871,13997.372237



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,Battles,10425730.0,1.0,0.265247
1,Other,7775914.0,0.745839,0.197831
2,city,4923817.0,0.472276,0.12527
3,Islamic State (Syria),4518224.0,0.433373,0.114951
4,Unidentified Armed Group (Syria),3184920.0,0.305487,0.081029
5,Explosions/Remote violence,2383563.0,0.228623,0.060642
6,QSD: Syrian Democratic Forces,2224991.0,0.213413,0.056607
7,Military Forces of Syria (2000-),1986623.0,0.19055,0.050543
8,Strategic developments,957848.6,0.091874,0.024369
9,Violence against civilians,924122.7,0.088639,0.023511




### 9. Mazout

In [167]:

Mazout_Result = pd.DataFrame()
t = mydf.loc[(mydf['product'] == 'Mazout')]
frame = h2o.H2OFrame(t)
train, test = frame.split_frame([0.8])
y = 'price'
X = [name for name in frame.columns if name not in ['start_date', 'end_date','product', y]]

# GLM
GLM_Mazout = H2OGeneralizedLinearEstimator(family='gaussian',
                                         solver='AUTO',
                                         standardize=True,
                                         lambda_search=True,
                                        nfolds = 5)

GLM_Mazout.train(X, y, training_frame=train)
Mazout_Result['GLM'] = [GLM_Mazout.rmse(),GLM_Mazout.mae(),GLM_Mazout.r2()]

# Random Forest
rf_Mazout = H2ORandomForestEstimator(
    ntrees=2000,                      
    max_depth=100,                   
    stopping_rounds=5,               
    score_each_iteration=True,      
    model_id='rf_model',
    nfolds = 5)             # for easy lookup in flow

rf_Mazout.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
Mazout_Result['Random Forest'] = [rf_Mazout.rmse(),rf_Mazout.mae(),rf_Mazout.r2()]

# Gradient Boosting
hyper_parameters = {'ntrees':list(range(0, 1000, 50)),
                    'max_depth':list(range(1, 100, 2)),
                   'sample_rate':[s/float(10) for s in range(1, 11)],
                    'col_sample_rate':[s/float(10) for s in range(1, 11)]}

search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':100,
                   'max_runtime_secs':300}

gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

gsearch.train(x=X,
              y=y,
              training_frame=train,
             nfolds = 5)

GBM_Mazout = gsearch.get_grid()[0]
Mazout_Result['Gradient Boosting'] = [GBM_Mazout.rmse(),GBM_Mazout.mae(),GBM_Mazout.r2()]

#Neural Network
nn_Mazout = H2ODeepLearningEstimator(
    epochs=50,                   
    hidden=[100],                
    input_dropout_ratio=0.1,    
    hidden_dropout_ratios=[0.05], 
    activation='TanhWithDropout', 
    l1=0.001,                     
    l2=0.01,                      
    adaptive_rate=True,           
    stopping_rounds=5,            
    score_each_iteration=True,    
    model_id='nn_model',
    nfolds = 5)        

# train nn model
nn_Mazout.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
Mazout_Result['Neural Network'] = [nn_Mazout.rmse(),nn_Mazout.mae(),nn_Mazout.r2()]

Mazout_Result


Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
gbm Grid Build progress: |████████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%


Unnamed: 0,GLM,Random Forest,Gradient Boosting,Neural Network
0,197.679847,130.991237,16.853579,136.911935
1,127.39124,70.14779,10.599259,86.808941
2,0.000133,0.560962,0.992732,0.520377


In [185]:
GBM_Mazout

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  Grid_GBM_py_87_sid_9a3a_model_python_1575570960632_21_model_25


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,700.0,700.0,97894.0,3.0,7.0,4.57,4.0,9.0,6.382857




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 284.0431318402472
RMSE: 16.853579199690707
MAE: 10.599259269480802
RMSLE: 0.36468550505148967
Mean Residual Deviance: 284.0431318402472

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 13023.288147331405
RMSE: 114.11962209598929
MAE: 72.26403440854554
RMSLE: 0.7303409406556637
Mean Residual Deviance: 13023.288147331405

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,72.643394,17.733986,89.3806,91.497826,55.28162,72.32502,54.731922
1,mean_residual_deviance,13599.552,9840.119,23365.541,25164.299,5334.595,8571.157,5562.1655
2,mse,13599.552,9840.119,23365.541,25164.299,5334.595,8571.157,5562.1655
3,r2,0.62501305,0.13211888,0.70283264,0.5815798,0.7389933,0.68845505,0.41320443
4,residual_deviance,13599.552,9840.119,23365.541,25164.299,5334.595,8571.157,5562.1655
5,rmse,110.33786,42.20647,152.85791,158.6326,73.038315,92.58054,74.579926
6,rmsle,0.6963309,0.33682746,0.96292245,1.1437731,0.4457425,0.54318964,0.3860269



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2019-12-05 21:28:46,3 min 32.414 sec,0.0,197.692964,127.409543,39082.508038
1,,2019-12-05 21:28:46,3 min 32.420 sec,1.0,191.312246,120.254404,36600.375602
2,,2019-12-05 21:28:46,3 min 32.423 sec,2.0,184.772509,114.897699,34140.880096
3,,2019-12-05 21:28:46,3 min 32.427 sec,3.0,177.923895,109.133514,31656.912538
4,,2019-12-05 21:28:46,3 min 32.429 sec,4.0,174.40045,105.805661,30415.516962
5,,2019-12-05 21:28:46,3 min 32.431 sec,5.0,169.48377,101.189658,28724.74828
6,,2019-12-05 21:28:46,3 min 32.433 sec,6.0,166.695028,98.896439,27787.232439
7,,2019-12-05 21:28:46,3 min 32.436 sec,7.0,162.735238,95.510746,26482.757749
8,,2019-12-05 21:28:46,3 min 32.439 sec,8.0,159.438127,93.162458,25420.516413
9,,2019-12-05 21:28:46,3 min 32.441 sec,9.0,157.504606,90.781636,24807.700975



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,city,13383360.0,1.0,0.320277
1,Battles,7074880.0,0.528632,0.169309
2,Islamic State (Syria),5124820.0,0.382925,0.122642
3,Unidentified Armed Group (Syria),4663072.0,0.348423,0.111592
4,QSD: Syrian Democratic Forces,3206228.0,0.239568,0.076728
5,Explosions/Remote violence,2932850.0,0.219141,0.070186
6,Strategic developments,1878188.0,0.140338,0.044947
7,Military Forces of Syria (2000-),1402312.0,0.10478,0.033559
8,Violence against civilians,1221507.0,0.091271,0.029232
9,Other,899571.5,0.067216,0.021528




### 10. Rice

In [168]:
Rice_Result = pd.DataFrame()
t = mydf.loc[(mydf['product'] == 'Rice')]
frame = h2o.H2OFrame(t)
train, test = frame.split_frame([0.8])
y = 'price'
X = [name for name in frame.columns if name not in ['start_date', 'end_date','product', y]]

# GLM
GLM_Rice = H2OGeneralizedLinearEstimator(family='gaussian',
                                         solver='AUTO',
                                         standardize=True,
                                         lambda_search=True,
                                        nfolds = 5)

GLM_Rice.train(X, y, training_frame=train)
Rice_Result['GLM'] = [GLM_Rice.rmse(),GLM_Rice.mae(),GLM_Rice.r2()]

# Random Forest
rf_Rice = H2ORandomForestEstimator(
    ntrees=2000,                      
    max_depth=100,                   
    stopping_rounds=5,               
    score_each_iteration=True,      
    model_id='rf_model',
    nfolds = 5)             # for easy lookup in flow

rf_Rice.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
Rice_Result['Random Forest'] = [rf_Rice.rmse(),rf_Rice.mae(),rf_Rice.r2()]

# Gradient Boosting
hyper_parameters = {'ntrees':list(range(0, 1000, 50)),
                    'max_depth':list(range(1, 100, 2)),
                   'sample_rate':[s/float(10) for s in range(1, 11)],
                    'col_sample_rate':[s/float(10) for s in range(1, 11)]}

search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':100,
                   'max_runtime_secs':300}

gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

gsearch.train(x=X,
              y=y,
              training_frame=train,
             nfolds = 5)

GBM_Rice = gsearch.get_grid()[0]
Rice_Result['Gradient Boosting'] = [GBM_Rice.rmse(),GBM_Rice.mae(),GBM_Rice.r2()]

#Neural Network
nn_Rice = H2ODeepLearningEstimator(
    epochs=50,                   
    hidden=[100],                
    input_dropout_ratio=0.1,    
    hidden_dropout_ratios=[0.05], 
    activation='TanhWithDropout', 
    l1=0.001,                     
    l2=0.01,                      
    adaptive_rate=True,           
    stopping_rounds=5,            
    score_each_iteration=True,    
    model_id='nn_model',
    nfolds = 5)        

# train nn model
nn_Rice.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
Rice_Result['Neural Network'] = [nn_Rice.rmse(),nn_Rice.mae(),nn_Rice.r2()]

Rice_Result


Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
gbm Grid Build progress: |████████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%


Unnamed: 0,GLM,Random Forest,Gradient Boosting,Neural Network
0,195.589434,144.905648,0.955589,146.470349
1,137.597992,89.540708,0.321822,99.597103
2,0.000151,0.4512,0.999976,0.439284


In [184]:
GBM_Rice

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  Grid_GBM_py_91_sid_9a3a_model_python_1575570960632_23_model_30


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,750.0,750.0,213292.0,6.0,16.0,10.498667,15.0,20.0,17.872




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 0.9131502515669352
RMSE: 0.9555889553395515
MAE: 0.32182230991599836
RMSLE: 0.0014302851016574074
Mean Residual Deviance: 0.9131502515669352

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 19792.874284297526
RMSE: 140.6871503880064
MAE: 90.15286402802842
RMSLE: 0.19850422284552563
Mean Residual Deviance: 19792.874284297526

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,90.23105,15.083771,76.302055,74.14525,99.70676,91.65199,109.349174
1,mean_residual_deviance,19399.896,5924.9614,13485.1045,17235.273,21412.775,16176.278,28690.049
2,mse,19399.896,5924.9614,13485.1045,17235.273,21412.775,16176.278,28690.049
3,r2,0.44107494,0.17817304,0.61103827,0.49258918,0.15064538,0.40954047,0.54156137
4,residual_deviance,19399.896,5924.9614,13485.1045,17235.273,21412.775,16176.278,28690.049
5,rmse,138.0614,20.58358,116.12538,131.28319,146.33104,127.186,169.38136
6,rmsle,0.19521402,0.026548924,0.16546646,0.19002016,0.21485691,0.17636211,0.22936442



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2019-12-05 21:34:55,4 min 37.338 sec,0.0,195.604217,137.605929,38261.009659
1,,2019-12-05 21:34:55,4 min 37.341 sec,1.0,183.215245,127.518966,33567.826143
2,,2019-12-05 21:34:55,4 min 37.344 sec,2.0,172.518446,119.113059,29762.614233
3,,2019-12-05 21:34:55,4 min 37.347 sec,3.0,163.262793,111.616366,26654.739528
4,,2019-12-05 21:34:55,4 min 37.350 sec,4.0,155.272262,105.509455,24109.475307
5,,2019-12-05 21:34:56,4 min 37.354 sec,5.0,148.417947,99.988863,22027.887117
6,,2019-12-05 21:34:56,4 min 37.357 sec,6.0,142.281949,95.017697,20244.152948
7,,2019-12-05 21:34:56,4 min 37.360 sec,7.0,135.852254,89.159646,18455.834989
8,,2019-12-05 21:34:56,4 min 37.363 sec,8.0,130.908682,85.077316,17137.083137
9,,2019-12-05 21:34:56,4 min 37.367 sec,9.0,125.542931,80.797518,15761.027617



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,city,10196270.0,1.0,0.224048
1,Battles,8978913.0,0.880608,0.197298
2,Other,7840804.0,0.768988,0.17229
3,Military Forces of Syria (2000-),5475078.0,0.536969,0.120307
4,Explosions/Remote violence,4077130.0,0.399865,0.089589
5,Unidentified Armed Group (Syria),2902452.0,0.284658,0.063777
6,QSD: Syrian Democratic Forces,1884566.0,0.184829,0.041411
7,Islamic State (Syria),1602573.0,0.157173,0.035214
8,Strategic developments,1431618.0,0.140406,0.031458
9,Violence against civilians,1119908.0,0.109835,0.024608




### 11. Sugar

In [169]:
Sugar_Result = pd.DataFrame()
t = mydf.loc[(mydf['product'] == 'Sugar')]
frame = h2o.H2OFrame(t)
train, test = frame.split_frame([0.8])
y = 'price'
X = [name for name in frame.columns if name not in ['start_date', 'end_date','product', y]]

# GLM
GLM_Sugar = H2OGeneralizedLinearEstimator(family='gaussian',
                                         solver='AUTO',
                                         standardize=True,
                                         lambda_search=True,
                                        nfolds = 5)

GLM_Sugar.train(X, y, training_frame=train)
Sugar_Result['GLM'] = [GLM_Sugar.rmse(),GLM_Sugar.mae(),GLM_Sugar.r2()]

# Random Forest
rf_Sugar = H2ORandomForestEstimator(
    ntrees=2000,                      
    max_depth=100,                   
    stopping_rounds=5,               
    score_each_iteration=True,      
    model_id='rf_model',
    nfolds = 5)             # for easy lookup in flow

rf_Sugar.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
Sugar_Result['Random Forest'] = [rf_Sugar.rmse(),rf_Sugar.mae(),rf_Sugar.r2()]

# Gradient Boosting
hyper_parameters = {'ntrees':list(range(0, 1000, 50)),
                    'max_depth':list(range(1, 100, 2)),
                   'sample_rate':[s/float(10) for s in range(1, 11)],
                    'col_sample_rate':[s/float(10) for s in range(1, 11)]}

search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':100,
                   'max_runtime_secs':300}

gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

gsearch.train(x=X,
              y=y,
              training_frame=train,
             nfolds = 5)

GBM_Sugar = gsearch.get_grid()[0]
Sugar_Result['Gradient Boosting'] = [GBM_Sugar.rmse(),GBM_Sugar.mae(),GBM_Sugar.r2()]

#Neural Network
nn_Sugar = H2ODeepLearningEstimator(
    epochs=50,                   
    hidden=[100],                
    input_dropout_ratio=0.1,    
    hidden_dropout_ratios=[0.05], 
    activation='TanhWithDropout', 
    l1=0.001,                     
    l2=0.01,                      
    adaptive_rate=True,           
    stopping_rounds=5,            
    score_each_iteration=True,    
    model_id='nn_model',
    nfolds = 5)        

# train nn model
nn_Sugar.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
Sugar_Result['Neural Network'] = [nn_Sugar.rmse(),nn_Sugar.mae(),nn_Sugar.r2()]

Sugar_Result


Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
gbm Grid Build progress: |████████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%


Unnamed: 0,GLM,Random Forest,Gradient Boosting,Neural Network
0,129.928788,100.775737,25.102042,97.045369
1,95.263306,62.250722,16.791795,71.424016
2,0.000437,0.400637,0.962691,0.442366


In [183]:
GBM_Sugar

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  Grid_GBM_py_95_sid_9a3a_model_python_1575570960632_25_model_4


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,200.0,200.0,46665.0,5.0,13.0,7.63,9.0,17.0,13.84




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 630.1125185015394
RMSE: 25.102042118153243
MAE: 16.791794772677953
RMSLE: 0.05915307918085026
Mean Residual Deviance: 630.1125185015394

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 8038.533346307199
RMSE: 89.65786829000118
MAE: 57.04287831425945
RMSLE: 0.19471079159930738
Mean Residual Deviance: 8038.533346307199

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,57.062492,6.57097,55.722893,50.075195,67.78894,54.420815,57.304623
1,mean_residual_deviance,8070.861,2978.4233,7091.965,4686.1196,12754.773,7176.22,8645.227
2,mse,8070.861,2978.4233,7091.965,4686.1196,12754.773,7176.22,8645.227
3,r2,0.4730062,0.2457046,0.60635257,0.63219655,0.5375641,0.5498352,0.039082497
4,residual_deviance,8070.861,2978.4233,7091.965,4686.1196,12754.773,7176.22,8645.227
5,rmse,88.65967,16.214325,84.213806,68.45524,112.937035,84.71258,92.97971
6,rmsle,0.1941219,0.021446394,0.194579,0.15797544,0.20241717,0.20114861,0.21448927



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2019-12-05 21:35:47,23.934 sec,0.0,129.957213,95.293259,16888.877085
1,,2019-12-05 21:35:47,23.937 sec,1.0,123.690239,89.215855,15299.275116
2,,2019-12-05 21:35:47,23.939 sec,2.0,118.176251,84.597891,13965.626298
3,,2019-12-05 21:35:47,23.941 sec,3.0,115.469339,81.880106,13333.16821
4,,2019-12-05 21:35:47,23.943 sec,4.0,110.910052,78.162889,12301.039664
5,,2019-12-05 21:35:47,23.945 sec,5.0,107.300825,74.825713,11513.467015
6,,2019-12-05 21:35:47,23.947 sec,6.0,104.212621,71.63442,10860.270377
7,,2019-12-05 21:35:47,23.949 sec,7.0,100.948759,68.5344,10190.651853
8,,2019-12-05 21:35:47,23.952 sec,8.0,99.932711,67.668547,9986.546722
9,,2019-12-05 21:35:47,23.954 sec,9.0,97.235854,64.913437,9454.811223



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,Islamic State (Syria),3061900.0,1.0,0.161364
1,Battles,2969759.0,0.969907,0.156508
2,QSD: Syrian Democratic Forces,2166477.0,0.70756,0.114175
3,Unidentified Armed Group (Syria),2102493.0,0.686663,0.110803
4,Military Forces of Syria (2000-),2015382.0,0.658213,0.106212
5,Explosions/Remote violence,1778369.0,0.580806,0.093721
6,city,1677982.0,0.54802,0.088431
7,Strategic developments,1175586.0,0.38394,0.061954
8,Other,1145346.0,0.374064,0.060361
9,Violence against civilians,881781.7,0.287985,0.046471




### 12. Tea

In [170]:
Tea_Result = pd.DataFrame()
t = mydf.loc[(mydf['product'] == 'Tea')]
frame = h2o.H2OFrame(t)
train, test = frame.split_frame([0.8])
y = 'price'
X = [name for name in frame.columns if name not in ['start_date', 'end_date','product', y]]

# GLM
GLM_Tea = H2OGeneralizedLinearEstimator(family='gaussian',
                                         solver='AUTO',
                                         standardize=True,
                                         lambda_search=True,
                                        nfolds = 5)

GLM_Tea.train(X, y, training_frame=train)
Tea_Result['GLM'] = [GLM_Tea.rmse(),GLM_Tea.mae(),GLM_Tea.r2()]

# Random Forest
rf_Tea = H2ORandomForestEstimator(
    ntrees=2000,                      
    max_depth=100,                   
    stopping_rounds=5,               
    score_each_iteration=True,      
    model_id='rf_model',
    nfolds = 5)             # for easy lookup in flow

rf_Tea.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
Tea_Result['Random Forest'] = [rf_Tea.rmse(),rf_Tea.mae(),rf_Tea.r2()]

# Gradient Boosting
hyper_parameters = {'ntrees':list(range(0, 1000, 50)),
                    'max_depth':list(range(1, 100, 2)),
                   'sample_rate':[s/float(10) for s in range(1, 11)],
                    'col_sample_rate':[s/float(10) for s in range(1, 11)]}

search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':100,
                   'max_runtime_secs':300}

gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

gsearch.train(x=X,
              y=y,
              training_frame=train,
             nfolds = 5)

GBM_Tea = gsearch.get_grid()[0]
Tea_Result['Gradient Boosting'] = [GBM_Tea.rmse(),GBM_Tea.mae(),GBM_Tea.r2()]

#Neural Network
nn_Tea = H2ODeepLearningEstimator(
    epochs=50,                   
    hidden=[100],                
    input_dropout_ratio=0.1,    
    hidden_dropout_ratios=[0.05], 
    activation='TanhWithDropout', 
    l1=0.001,                     
    l2=0.01,                      
    adaptive_rate=True,           
    stopping_rounds=5,            
    score_each_iteration=True,    
    model_id='nn_model',
    nfolds = 5)        

# train nn model
nn_Tea.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
Tea_Result['Neural Network'] = [nn_Tea.rmse(),nn_Tea.mae(),nn_Tea.r2()]

Tea_Result


Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
gbm Grid Build progress: |████████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%


Unnamed: 0,GLM,Random Forest,Gradient Boosting,Neural Network
0,1645.113311,1529.534376,80.862217,1468.749073
1,840.499256,664.103234,34.455399,957.434447
2,1.3e-05,0.135587,0.997584,0.202928


In [182]:
GBM_Tea

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  Grid_GBM_py_99_sid_9a3a_model_python_1575570960632_27_model_12


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,500.0,500.0,110487.0,5.0,12.0,8.354,11.0,16.0,12.806




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 6538.698151646879
RMSE: 80.86221708342455
MAE: 34.455398692255436
RMSLE: 0.013581620085880427
Mean Residual Deviance: 6538.698151646879

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 1652898.5247850558
RMSE: 1285.6510120499481
MAE: 648.1814835098677
RMSLE: 0.22400826805470772
Mean Residual Deviance: 1652898.5247850558

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,646.06226,121.82648,703.528,483.61023,769.86475,720.8256,552.48254
1,mean_residual_deviance,1639291.9,1321032.0,3817309.5,419929.66,1829311.9,1229366.2,900542.0
2,mse,1639291.9,1321032.0,3817309.5,419929.66,1829311.9,1229366.2,900542.0
3,r2,0.32407242,0.13450654,0.36509275,0.19717516,0.46294647,0.4277508,0.16739684
4,residual_deviance,1639291.9,1321032.0,3817309.5,419929.66,1829311.9,1229366.2,900542.0
5,rmse,1202.4142,491.7978,1953.7936,648.0198,1352.5205,1108.768,948.96893
6,rmsle,0.22384658,0.010882717,0.22985545,0.20930812,0.23825675,0.22171782,0.22009481



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2019-12-05 21:42:32,2 min 4.672 sec,0.0,1645.124227,840.500787,2706434.0
1,,2019-12-05 21:42:32,2 min 4.676 sec,1.0,1571.895152,789.668947,2470854.0
2,,2019-12-05 21:42:32,2 min 4.679 sec,2.0,1535.926471,741.371236,2359070.0
3,,2019-12-05 21:42:32,2 min 4.682 sec,3.0,1470.444085,704.392494,2162206.0
4,,2019-12-05 21:42:32,2 min 4.685 sec,4.0,1412.407425,665.596943,1994895.0
5,,2019-12-05 21:42:32,2 min 4.687 sec,5.0,1367.149366,636.306936,1869097.0
6,,2019-12-05 21:42:32,2 min 4.689 sec,6.0,1318.384783,605.982255,1738138.0
7,,2019-12-05 21:42:32,2 min 4.692 sec,7.0,1303.098405,584.425778,1698065.0
8,,2019-12-05 21:42:32,2 min 4.694 sec,8.0,1281.550677,566.399643,1642372.0
9,,2019-12-05 21:42:32,2 min 4.697 sec,9.0,1276.576967,557.792509,1629649.0



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,Battles,1163095000.0,1.0,0.332174
1,city,618274800.0,0.531577,0.176576
2,Other,401220400.0,0.344959,0.114586
3,Explosions/Remote violence,295207800.0,0.253812,0.08431
4,QSD: Syrian Democratic Forces,255298400.0,0.219499,0.072912
5,Unidentified Armed Group (Syria),236354200.0,0.203211,0.067502
6,Military Forces of Syria (2000-),204679200.0,0.175978,0.058455
7,Islamic State (Syria),141278700.0,0.121468,0.040348
8,Violence against civilians,138401900.0,0.118994,0.039527
9,Strategic developments,47653210.0,0.040971,0.01361




### 13. Tomato Paste

In [171]:
TomatoPaste_Result = pd.DataFrame()
t = mydf.loc[(mydf['product'] == 'Tomato Paste')]
frame = h2o.H2OFrame(t)
train, test = frame.split_frame([0.8])
y = 'price'
X = [name for name in frame.columns if name not in ['start_date', 'end_date','product', y]]

# GLM
GLM_TomatoPaste = H2OGeneralizedLinearEstimator(family='gaussian',
                                         solver='AUTO',
                                         standardize=True,
                                         lambda_search=True,
                                        nfolds = 5)

GLM_TomatoPaste.train(X, y, training_frame=train)
TomatoPaste_Result['GLM'] = [GLM_TomatoPaste.rmse(),GLM_TomatoPaste.mae(),GLM_TomatoPaste.r2()]

# Random Forest
rf_TomatoPaste = H2ORandomForestEstimator(
    ntrees=2000,                      
    max_depth=100,                   
    stopping_rounds=5,               
    score_each_iteration=True,      
    model_id='rf_model',
    nfolds = 5)             # for easy lookup in flow

rf_TomatoPaste.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
TomatoPaste_Result['Random Forest'] = [rf_TomatoPaste.rmse(),rf_TomatoPaste.mae(),rf_TomatoPaste.r2()]

# Gradient Boosting
hyper_parameters = {'ntrees':list(range(0, 1000, 50)),
                    'max_depth':list(range(1, 100, 2)),
                   'sample_rate':[s/float(10) for s in range(1, 11)],
                    'col_sample_rate':[s/float(10) for s in range(1, 11)]}

search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':100,
                   'max_runtime_secs':300}

gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

gsearch.train(x=X,
              y=y,
              training_frame=train,
             nfolds = 5)

GBM_TomatoPaste = gsearch.get_grid()[0]
TomatoPaste_Result['Gradient Boosting'] = [GBM_TomatoPaste.rmse(),GBM_TomatoPaste.mae(),GBM_TomatoPaste.r2()]

#Neural Network
nn_TomatoPaste = H2ODeepLearningEstimator(
    epochs=50,                   
    hidden=[100],                
    input_dropout_ratio=0.1,    
    hidden_dropout_ratios=[0.05], 
    activation='TanhWithDropout', 
    l1=0.001,                     
    l2=0.01,                      
    adaptive_rate=True,           
    stopping_rounds=5,            
    score_each_iteration=True,    
    model_id='nn_model',
    nfolds = 5)        

# train nn model
nn_TomatoPaste.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
TomatoPaste_Result['Neural Network'] = [nn_TomatoPaste.rmse(),nn_TomatoPaste.mae(),nn_TomatoPaste.r2()]

TomatoPaste_Result


Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
gbm Grid Build progress: |████████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%


Unnamed: 0,GLM,Random Forest,Gradient Boosting,Neural Network
0,320.996171,247.905922,43.63408,291.448908
1,203.983616,133.686648,15.164777,165.042992
2,0.000119,0.403621,0.981524,0.175723


In [181]:
GBM_TomatoPaste

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  Grid_GBM_py_103_sid_9a3a_model_python_1575570960632_29_model_7


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,250.0,250.0,73228.0,7.0,15.0,10.62,16.0,21.0,18.536




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 1903.932955332386
RMSE: 43.634080204954316
MAE: 15.164776585683102
RMSLE: 0.029504026656009243
Mean Residual Deviance: 1903.932955332386

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 47849.27064347408
RMSE: 218.7447614080714
MAE: 118.60053249911304
RMSLE: 0.20263185618582663
Mean Residual Deviance: 47849.27064347408

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,117.56263,15.896217,112.81335,99.49325,108.46575,139.42152,127.61929
1,mean_residual_deviance,46309.86,38085.633,36405.48,19680.842,25714.617,113207.3,36541.066
2,mse,46309.86,38085.633,36405.48,19680.842,25714.617,113207.3,36541.066
3,r2,0.56775403,0.123211734,0.5816733,0.748674,0.48221466,0.4279321,0.59827614
4,residual_deviance,46309.86,38085.633,36405.48,19680.842,25714.617,113207.3,36541.066
5,rmse,203.8137,77.2159,190.8022,140.28842,160.35777,336.46292,191.15718
6,rmsle,0.19785133,0.029060595,0.2073538,0.1529722,0.18578176,0.22266866,0.2204802



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2019-12-05 21:46:28,56.417 sec,0.0,321.015306,203.997893,103050.826497
1,,2019-12-05 21:46:28,56.421 sec,1.0,302.961166,189.510401,91785.46788
2,,2019-12-05 21:46:28,56.425 sec,2.0,287.50076,177.688138,82656.687158
3,,2019-12-05 21:46:28,56.428 sec,3.0,274.310442,167.202877,75246.21861
4,,2019-12-05 21:46:28,56.433 sec,4.0,263.101053,157.74299,69222.163898
5,,2019-12-05 21:46:28,56.436 sec,5.0,252.944563,149.300658,63980.951797
6,,2019-12-05 21:46:28,56.440 sec,6.0,244.66387,142.310724,59860.409505
7,,2019-12-05 21:46:28,56.443 sec,7.0,236.555215,135.629434,55958.369973
8,,2019-12-05 21:46:28,56.446 sec,8.0,229.860734,129.701602,52835.95688
9,,2019-12-05 21:46:28,56.450 sec,9.0,220.911907,122.161135,48802.070546



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,Battles,39816164.0,1.0,0.314256
1,city,37592968.0,0.944163,0.296709
2,Military Forces of Syria (2000-),12000583.0,0.3014,0.094717
3,Islamic State (Syria),10290638.0,0.258454,0.081221
4,Unidentified Armed Group (Syria),7402672.0,0.185921,0.058427
5,Other,5110995.5,0.128365,0.040339
6,Explosions/Remote violence,4527760.5,0.113717,0.035736
7,QSD: Syrian Democratic Forces,3992163.5,0.100265,0.031509
8,Violence against civilians,3590554.5,0.090178,0.028339
9,Strategic developments,2375238.75,0.059655,0.018747




### 14. White Beans

In [172]:
WhiteBeans_Result = pd.DataFrame()
t = mydf.loc[(mydf['product'] == 'White Beans')]
frame = h2o.H2OFrame(t)
train, test = frame.split_frame([0.8])
y = 'price'
X = [name for name in frame.columns if name not in ['start_date', 'end_date','product', y]]

# GLM
GLM_WhiteBeans = H2OGeneralizedLinearEstimator(family='gaussian',
                                         solver='AUTO',
                                         standardize=True,
                                         lambda_search=True,
                                        nfolds = 5)

GLM_WhiteBeans.train(X, y, training_frame=train)
WhiteBeans_Result['GLM'] = [GLM_WhiteBeans.rmse(),GLM_WhiteBeans.mae(),GLM_WhiteBeans.r2()]

# Random Forest
rf_WhiteBeans = H2ORandomForestEstimator(
    ntrees=2000,                      
    max_depth=100,                   
    stopping_rounds=5,               
    score_each_iteration=True,      
    model_id='rf_model',
    nfolds = 5)             # for easy lookup in flow

rf_WhiteBeans.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
WhiteBeans_Result['Random Forest'] = [rf_WhiteBeans.rmse(),rf_WhiteBeans.mae(),rf_WhiteBeans.r2()]

# Gradient Boosting
hyper_parameters = {'ntrees':list(range(0, 1000, 50)),
                    'max_depth':list(range(1, 100, 2)),
                   'sample_rate':[s/float(10) for s in range(1, 11)],
                    'col_sample_rate':[s/float(10) for s in range(1, 11)]}

search_criteria = {'strategy':'RandomDiscrete',
                   'max_models':100,
                   'max_runtime_secs':300}

gsearch = H2OGridSearch(H2OGradientBoostingEstimator,
                        hyper_params=hyper_parameters,
                        search_criteria=search_criteria)

gsearch.train(x=X,
              y=y,
              training_frame=train,
             nfolds = 5)

GBM_WhiteBeans = gsearch.get_grid()[0]
WhiteBeans_Result['Gradient Boosting'] = [GBM_WhiteBeans.rmse(),GBM_WhiteBeans.mae(),GBM_WhiteBeans.r2()]

#Neural Network
nn_WhiteBeans = H2ODeepLearningEstimator(
    epochs=50,                   
    hidden=[100],                
    input_dropout_ratio=0.1,    
    hidden_dropout_ratios=[0.05], 
    activation='TanhWithDropout', 
    l1=0.001,                     
    l2=0.01,                      
    adaptive_rate=True,           
    stopping_rounds=5,            
    score_each_iteration=True,    
    model_id='nn_model',
    nfolds = 5)        

# train nn model
nn_WhiteBeans.train(
    x=X,
    y=y,
    training_frame=train)

# print model information
WhiteBeans_Result['Neural Network'] = [nn_WhiteBeans.rmse(),nn_WhiteBeans.mae(),nn_WhiteBeans.r2()]

WhiteBeans_Result


Parse progress: |█████████████████████████████████████████████████████████| 100%
glm Model Build progress: |███████████████████████████████████████████████| 100%
drf Model Build progress: |███████████████████████████████████████████████| 100%
gbm Grid Build progress: |████████████████████████████████████████████████| 100%
deeplearning Model Build progress: |██████████████████████████████████████| 100%


Unnamed: 0,GLM,Random Forest,Gradient Boosting,Neural Network
0,226.445958,145.590559,9.628011,171.453656
1,165.790248,91.788813,3.858131,127.593221
2,0.000133,0.586687,0.998192,0.4268


In [180]:
GBM_WhiteBeans

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  Grid_GBM_py_107_sid_9a3a_model_python_1575570960632_31_model_18


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,800.0,800.0,162792.0,4.0,11.0,7.12125,8.0,14.0,11.485




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 92.69859366336983
RMSE: 9.628010888203743
MAE: 3.8581313145013505
RMSLE: 0.011438490917856634
Mean Residual Deviance: 92.69859366336983

ModelMetricsRegression: gbm
** Reported on cross-validation data. **

MSE: 20508.152197682553
RMSE: 143.2066765122442
MAE: 96.28767907133197
RMSLE: 0.22535977873506643
Mean Residual Deviance: 20508.152197682553

Cross-Validation Metrics Summary: 


Unnamed: 0,Unnamed: 1,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
0,mae,95.7237,12.461166,88.676094,117.29007,95.7858,88.577095,88.289474
1,mean_residual_deviance,20121.596,11021.659,14323.717,39616.7,16516.787,17113.803,13036.97
2,mse,20121.596,11021.659,14323.717,39616.7,16516.787,17113.803,13036.97
3,r2,0.59055406,0.106101066,0.63591003,0.48913175,0.7399463,0.4890587,0.5987235
4,residual_deviance,20121.596,11021.659,14323.717,39616.7,16516.787,17113.803,13036.97
5,rmse,138.44762,34.5299,119.68173,199.03944,128.51765,130.81973,114.17955
6,rmsle,0.22275428,0.021996561,0.19875914,0.24968901,0.22420397,0.23820071,0.20291859



Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2019-12-05 21:53:30,2 min 53.012 sec,0.0,226.461051,165.80724,51284.607808
1,,2019-12-05 21:53:30,2 min 53.016 sec,1.0,214.037386,155.673039,45812.00269
2,,2019-12-05 21:53:30,2 min 53.020 sec,2.0,202.75167,145.908149,41108.239799
3,,2019-12-05 21:53:30,2 min 53.023 sec,3.0,194.445967,138.777244,37809.234242
4,,2019-12-05 21:53:30,2 min 53.026 sec,4.0,183.535287,129.858793,33685.201721
5,,2019-12-05 21:53:30,2 min 53.029 sec,5.0,175.783323,123.396995,30899.776763
6,,2019-12-05 21:53:30,2 min 53.032 sec,6.0,169.057602,117.091021,28580.4729
7,,2019-12-05 21:53:30,2 min 53.035 sec,7.0,160.050692,109.906419,25616.224132
8,,2019-12-05 21:53:30,2 min 53.038 sec,8.0,154.386996,105.118169,23835.34444
9,,2019-12-05 21:53:30,2 min 53.041 sec,9.0,148.276095,100.25714,21985.800336



See the whole table with table.as_data_frame()

Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,city,20535530.0,1.0,0.329991
1,Explosions/Remote violence,10566416.0,0.514543,0.169795
2,Battles,7206499.0,0.350928,0.115803
3,Islamic State (Syria),4919779.5,0.239574,0.079057
4,Unidentified Armed Group (Syria),4584490.5,0.223247,0.073669
5,QSD: Syrian Democratic Forces,3428643.75,0.166962,0.055096
6,Military Forces of Syria (2000-),3125679.0,0.152208,0.050227
7,Violence against civilians,2951872.5,0.143745,0.047434
8,Strategic developments,2566188.0,0.124963,0.041237
9,Other,2345489.5,0.114216,0.03769




In [None]:
h2o.cluster().shutdown(prompt=False)
