# H2O RANDOM FOREST NOTEBOOK

In [1]:
## Importing Libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import h2o
import os

In [2]:
# Attach to the H2O instance, requesting a 4 GB memory ceiling.
h2o.init(max_mem_size="4G")
# Remove every object currently stored on the cluster.
h2o.remove_all()


Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O cluster uptime:,2 days 15 hours 53 mins
H2O cluster timezone:,Asia/Kolkata
H2O data parsing timezone:,UTC
H2O cluster version:,3.24.0.3
H2O cluster version age:,12 days
H2O cluster name:,H2O_from_python_jaskirat_nh6b6k
H2O cluster total nodes:,1
H2O cluster free memory:,3.254 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [3]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator

# Load the training CSV into a pandas DataFrame.
dataset = pd.read_csv(filepath_or_buffer="/Football/train.csv")

## Handling Missing Values

In [4]:
## Drop the rows, identified by index label, that contain null values

dataset = dataset.drop(
    index=[7082, 9363, 11644, 11645, 11646, 12027, 12408]
)

In [5]:
## Drop two further rows by index label
dataset = dataset.drop(index=[7408, 11298])

In [6]:
## Fill the missing values left in the HTAG and HTHG columns
## NOTE(review): HTAG is filled with 1.0 while HTHG is filled with 0.0 -- confirm the asymmetry is intended

dataset = dataset.assign(
    HTAG=dataset["HTAG"].fillna(1.0),
    HTHG=dataset["HTHG"].fillna(0.0),
)

In [7]:
## Renumber the rows from 0 after the drops, discarding the old index
dataset = dataset.reset_index(drop=True)

## Feature Selection and Feature Extraction

In [8]:
## Feature extraction
## GoalDifference is the absolute difference between the HTHG and HTAG columns.
## It is computed column-wise (vectorised) rather than row by row with apply,
## which gives the same values without a Python-level loop over every row.

dataset["GoalDifference"] = (dataset["HTHG"] - dataset["HTAG"]).abs()

In [9]:
## Feature selection: keep only the model inputs plus the FTR column

dataset = dataset[
    [
        "HomeTeam", "AwayTeam",
        "HTHG", "HTAG",
        "HST", "AST",
        "AC", "HC",
        "HR", "AR",
        "league",
        "GoalDifference",
        "FTR",
    ]
]

In [10]:
# Preview the first five rows of the selected columns.
dataset.head(n=5)


Unnamed: 0,HomeTeam,AwayTeam,HTHG,HTAG,HST,AST,AC,HC,HR,AR,league,GoalDifference,FTR
0,Wolfsburg,Stuttgart,0.0,0.0,7.0,4.0,3.0,6.0,0.0,0.0,bundesliga,0.0,H
1,Dortmund,FC Koln,0.0,0.0,11.0,0.0,1.0,16.0,0.0,0.0,bundesliga,0.0,H
2,Hertha,Hannover,0.0,0.0,4.0,3.0,3.0,5.0,0.0,0.0,bundesliga,0.0,H
3,Hoffenheim,Bayern Munich,1.0,1.0,1.0,3.0,10.0,3.0,0.0,0.0,bundesliga,0.0,D
4,Mainz,Leverkusen,1.0,2.0,4.0,7.0,5.0,3.0,0.0,0.0,bundesliga,1.0,D


## Training and Response columns

In [11]:
# Predictor columns, passed to every model as `x`.
training_columns = [
    "HomeTeam",
    "AwayTeam",
    "HTHG",
    "HTAG",
    "HST",
    "AST",
    "AC",
    "HC",
    "HR",
    "AR",
    "league",
    "GoalDifference",
]
# Target column, passed to every model as `y`.
response_column = "FTR"


## Inserting the DataFrame into the H2O cluster

In [12]:
## Upload the pandas DataFrame to the H2O cluster as an H2OFrame

df_train = h2o.H2OFrame(python_obj=dataset)

Parse progress: |█████████████████████████████████████████████████████████| 100%


## Splitting the dataset into training, validation and test sets
## Ratio is 80:10:10

In [13]:
## Split the H2OFrame into three parts:
## 80% training, 10% validation and the remaining 10% testing.
## H2O assigns rows probabilistically, so the proportions are approximate;
## the fixed seed keeps the split reproducible.
train, valid, test = df_train.split_frame(ratios=[0.8, 0.1], seed=1234)

In [14]:
# First gradient boosting model: up to 400 trees of depth 30 at learning rate 0.2.
# The model is scored after every tree, with early stopping enabled
# (stopping_rounds=2, stopping_tolerance=0.01).
# Original note on the tolerance: a 10-fold increase over the threshold used for
# rf_v1 (rf_v1 is not visible in this notebook).
# NOTE(review): the variable is gbm_v1 but the model_id says "v2" -- confirm the naming.
gbm_v1 = H2OGradientBoostingEstimator(
    model_id="gbm_covType_v2",
    seed=2000000,
    ntrees=400,
    max_depth=30,
    learn_rate=0.2,
    score_each_iteration=True,
    stopping_rounds=2,
    stopping_tolerance=0.01,
)
gbm_v1.train(
    x=training_columns,
    y=response_column,
    training_frame=train,
    validation_frame=valid,
)



gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [15]:
# Second gradient boosting model: up to 600 trees of depth 60 at learning rate 0.5,
# with 70% row sampling and 70% column sampling.
# The model is scored after every tree, with early stopping enabled
# (stopping_rounds=2, stopping_tolerance=0.01).
# Original note on the tolerance: a 10-fold increase over the threshold used for
# rf_v1 (rf_v1 is not visible in this notebook).
# NOTE(review): the variable is gbm_v2 but the model_id says "v3" -- confirm the naming.
gbm_v2 = H2OGradientBoostingEstimator(
    model_id="gbm_covType_v3",
    seed=3000000,
    ntrees=600,
    max_depth=60,
    learn_rate=0.5,
    sample_rate=0.7,
    col_sample_rate=0.7,
    score_each_iteration=True,
    stopping_rounds=2,
    stopping_tolerance=0.01,
)
gbm_v2.train(
    x=training_columns,
    y=response_column,
    training_frame=train,
    validation_frame=valid,
)



gbm Model Build progress: |███████████████████████████████████████████████| 100%


## Distributed Random Forest Model

In [16]:
## Distributed Random Forest model
## Up to 400 trees of depth 70, 2-fold cross-validation (hold-out predictions kept),
## class balancing switched on, scoring after every tree and early stopping
## (stopping_rounds=1, stopping_tolerance=0.01).

rf_train = H2ORandomForestEstimator(
    model_id="rf_covType_v2",
    seed=3000000,
    ntrees=400,
    max_depth=70,
    nfolds=2,
    keep_cross_validation_predictions=True,
    balance_classes=True,
    score_each_iteration=True,
    stopping_rounds=1,
    stopping_tolerance=0.01,
)
rf_train.train(
    x=training_columns,
    y=response_column,
    training_frame=train,
    validation_frame=valid,
)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [17]:
## Performance metrics of the random forest on the test frame

performance = rf_train.model_performance(test)
print(performance)


ModelMetricsMultinomial: drf
** Reported on test data. **

MSE: 0.2815495753424978
RMSE: 0.5306124530601386
LogLoss: 1.1434968794808973
Mean Per-Class Error: 0.40938982724666495
Confusion Matrix: Row labels: Actual class; Column labels: Predicted class



0,1,2,3,4
A,D,H,Error,Rate
229.0,63.0,70.0,0.3674033,133 / 362
84.0,98.0,170.0,0.7215909,254 / 352
32.0,49.0,501.0,0.1391753,81 / 582
345.0,210.0,741.0,0.3611111,"468 / 1,296"


Top-3 Hit Ratios: 


0,1
k,hit_ratio
1,0.6388889
2,0.8672839
3,1.0





## Prediction on Test data

In [18]:
## Prediction on test data

# Score the predictor columns selected by name. The previous positional slice
# (test[:-1]) only worked because the response happened to be the last column.
final_rf_predictions = rf_train.predict(test[training_columns])
print(final_rf_predictions)

# Test-set accuracy: share of rows whose predicted class equals the true label.
acc = (
    (final_rf_predictions["predict"] == test[response_column])
    .as_data_frame(use_pandas=True)
    .mean()
)
print(acc)

drf prediction progress: |████████████████████████████████████████████████| 100%


predict,A,D,H
A,0.529242,0.35387,0.116888
H,0.381806,0.158237,0.459957
D,0.0426254,0.816161,0.141213
H,0.0780157,0.189688,0.732297
A,0.680439,0.265889,0.0536719
A,0.694841,0.190062,0.115097
H,0.127947,0.0999937,0.772059
A,0.435154,0.264509,0.300337
D,0.0,0.925289,0.0747109
H,0.0880841,0.0,0.911916



predict    0.638889
dtype: float64


## Saving and Loading H2O Model

In [20]:
## Saving and loading the H2O model, then scoring an external test file

# Persist the trained forest; force=True overwrites an existing file of the same name.
model_path = h2o.save_model(model=rf_train, path="/H2OModel/FootballPredictionModel", force=True)

# Reload from the path save_model reported, rather than from a second hard-coded
# copy of it that could drift out of sync with the directory or model_id.
saved_model = h2o.load_model(model_path)

test_dataset = pd.read_csv("/Football/test-3.csv")

# Same engineered feature as in training, computed column-wise instead of per row.
test_dataset["GoalDifference"] = (test_dataset["HTHG"] - test_dataset["HTAG"]).abs()

# NOTE(review): the model was trained on "AC" and "HC", but this frame supplies
# "HS" and "AS" instead -- confirm whether the scoring columns should match
# training_columns.
test_dataset = test_dataset[["HomeTeam" , "AwayTeam" , "HTHG" , "HTAG" , "HS" , "AS" , "HST" , "AST" , 
                "HR" , "AR" , "league" , "GoalDifference"]]

# Explicit H2O column types for the upload.
col_dtypes = {
    'HomeTeam': 'string',
    'AwayTeam': 'string',
    'HTHG': 'numeric',
    'HTAG': 'numeric',
    'HS': 'numeric',
    'AS': 'numeric',
    'HST': 'numeric',
    'AST': 'numeric',
    'HR': 'numeric',
    'AR': 'numeric',
    'league': 'string',
    'GoalDifference': 'numeric',
}

covtype_df_TEST2 = h2o.H2OFrame(test_dataset, column_types=col_dtypes)

final_rf_predictions = saved_model.predict(covtype_df_TEST2)
print(final_rf_predictions)


Parse progress: |█████████████████████████████████████████████████████████| 100%
drf prediction progress: |████████████████████████████████████████████████| 100%




predict,A,D,H
H,0.0214427,0.019551,0.959006
H,0.0,0.0498117,0.950188
D,0.301205,0.366175,0.33262
D,0.0400809,0.694352,0.265567
H,0.0876447,0.186462,0.725893
H,0.0178825,0.211963,0.770155
A,0.938855,0.0611447,0.0
D,0.0,0.683133,0.316867
H,0.0297246,0.37943,0.590845
A,0.720403,0.138284,0.141313





## Converting predictions to dataframe and concatenating with Test set

In [21]:
## Pull the predicted class labels out of the H2OFrame into a pandas DataFrame

h2oFinalPredictions = final_rf_predictions["predict"].as_data_frame(use_pandas=True)

In [22]:
## Attach the prediction column to the test-set DataFrame, side by side

grps2 = pd.concat(
    objs=[test_dataset, h2oFinalPredictions],
    axis="columns",
    join="inner",
)
print(grps2.head())

        HomeTeam       AwayTeam  HTHG  HTAG  HS  AS  HST  AST  HR  AR  \
0  Bayern Munich     Leverkusen     2     0  13  19    8    4   0   0   
1        Hamburg       Augsburg     1     0  11  13    5    1   0   0   
2         Hertha      Stuttgart     0     0  10   9    3    2   0   0   
3     Hoffenheim  Werder Bremen     0     0  14  11    3    2   0   0   
4          Mainz       Hannover     0     0  14   6    6    2   0   0   

       league  GoalDifference predict  
0  bundesliga               2       H  
1  bundesliga               1       H  
2  bundesliga               0       D  
3  bundesliga               0       D  
4  bundesliga               0       H  
