Imports and inits
---

In [66]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator 
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch 
from h2o.estimators.xgboost import H2OXGBoostEstimator
from h2o.estimators.stackedensemble import H2OStackedEnsembleEstimator
# connect to a running H2O instance, or start a local server if none is found
# NOTE(review): no memory argument is passed, so H2O presumably runs with its
# default heap size - pass a memory limit to init() if more memory is intended
h2o.init()
h2o.no_progress() # turn off h2o progress bars

import numpy as np
import pandas as pd

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: java version "1.8.0_131"; Java(TM) SE Runtime Environment (build 1.8.0_131-b11); Java HotSpot(TM) 64-Bit Server VM (build 25.131-b11, mixed mode)
  Starting server from /Users/niki/anaconda/lib/python3.5/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/z2/4db2w2591kb96lzzxlxwnljm0000gp/T/tmpww0kuf6m
  JVM stdout: /var/folders/z2/4db2w2591kb96lzzxlxwnljm0000gp/T/tmpww0kuf6m/h2o_niki_started_from_python.out
  JVM stderr: /var/folders/z2/4db2w2591kb96lzzxlxwnljm0000gp/T/tmpww0kuf6m/h2o_niki_started_from_python.err


H2OServerError: Server wasn't able to start in 10.115989 seconds.

Import data
---

In [53]:

# load the raw train and test CSV files into H2OFrames
# NOTE(review): absolute, user-specific paths - adjust before running elsewhere
train = h2o.import_file('/Users/niki/Documents/Machine Learning/train.csv')
test = h2o.import_file('/Users/niki/Documents/Machine Learning/test.csv')

# bug fix - from Keston
# append a column of random numbers to test and name it 'y', so that test
# carries a placeholder 'y' column (the values are random, not real targets)
dummy_col = np.random.rand(test.shape[0])
test = test.cbind(h2o.H2OFrame(dummy_col))
cols = test.columns
cols[-1] = 'y'
test.columns = cols
print(train.shape)
print(test.shape)

(4209, 378)
(4209, 378)


Determine data types
--

In [54]:

def get_type_lists(frame=None, rejects=None):

    """Creates lists of numeric and categorical variables.

    A column is treated as categorical when its H2O type is 'enum'; every
    other column type is treated as numeric.

    :param frame: The frame from which to determine types. When omitted, the
                  notebook-level ``train`` frame is looked up at call time
                  (not at definition time), so a re-assigned ``train`` is
                  never silently replaced by a stale copy.
    :param rejects: Variable names not to be included in returned lists.
                    Defaults to ['ID', 'y'].
    :return: Tuple of lists for numeric and categorical variables in the frame.

    """

    # resolve defaults lazily: avoids a default frame frozen at definition
    # time and a shared mutable default list
    if frame is None:
        frame = train
    if rejects is None:
        rejects = ['ID', 'y']

    nums, cats = [], []
    for key, val in frame.types.items():
        if key in rejects:
            continue
        if val == 'enum':
            cats.append(key)
        else:
            nums.append(key)

    print('Numeric =', nums)
    print()
    print('Categorical =', cats)

    return nums, cats

In [55]:
# classify the columns of the default frame as numeric or categorical,
# leaving out the function's default rejects ('ID' and 'y')
original_nums, cats = get_type_lists()

Numeric = ['X282', 'X309', 'X354', 'X53', 'X125', 'X64', 'X340', 'X257', 'X44', 'X51', 'X30', 'X215', 'X278', 'X159', 'X54', 'X225', 'X363', 'X382', 'X375', 'X190', 'X385', 'X145', 'X367', 'X252', 'X383', 'X113', 'X243', 'X129', 'X287', 'X348', 'X50', 'X65', 'X89', 'X192', 'X176', 'X10', 'X11', 'X236', 'X93', 'X244', 'X67', 'X319', 'X31', 'X196', 'X352', 'X263', 'X164', 'X261', 'X302', 'X172', 'X339', 'X146', 'X63', 'X377', 'X245', 'X171', 'X182', 'X62', 'X280', 'X345', 'X324', 'X298', 'X364', 'X61', 'X189', 'X249', 'X266', 'X79', 'X177', 'X207', 'X71', 'X140', 'X365', 'X308', 'X237', 'X73', 'X133', 'X284', 'X267', 'X291', 'X358', 'X213', 'X161', 'X95', 'X233', 'X183', 'X380', 'X344', 'X66', 'X226', 'X162', 'X353', 'X219', 'X325', 'X346', 'X217', 'X123', 'X134', 'X356', 'X264', 'X314', 'X150', 'X242', 'X60', 'X57', 'X180', 'X87', 'X378', 'X32', 'X18', 'X126', 'X272', 'X155', 'X311', 'X260', 'X151', 'X323', 'X293', 'X212', 'X275', 'X351', 'X34', 'X256', 'X36', 'X271', 'X341', 'X370', 'X

Split into train and validation (before doing data prep!!!)
---

In [56]:
# split the training data into train and validation frames using a 0.7 ratio;
# the seed keeps the split reproducible
# NOTE: this overwrites `train`, so re-running the cell splits the already
# reduced frame again
train, valid = train.split_frame([0.7], seed=12345)
print(train.shape)
print(valid.shape)

(2936, 378)
(1273, 378)


Encode categorical vars using shrunken averages
---
http://helios.mm.di.uoa.gr/~rouvas/ssi/sigkdd/sigkdd.vol3.1/barreca.ps
    

In [58]:
def target_encoder(training_frame, test_frame, x, y, lambda_=0.15, threshold=150, test=False):

    """ Applies simple target encoding to categorical variables.

    Each level of x is replaced by the mean of y over the training rows with
    that level. Levels with fewer than `threshold` training rows are shrunken
    toward the overall training mean of y:
    (1 - lambda_) * level_mean + lambda_ * overall_mean.
    Missing values, levels that only occur in the test frame, and levels whose
    mean cannot be computed are encoded as the overall training mean.

    :param training_frame: Training frame which to create target means and to be encoded.
    :param test_frame: Test frame to be encoded using information from training frame.
    :param x: Name of input variable to be encoded.
    :param y: Name of target variable to use for encoding.
    :param lambda_: Balance between level mean and overall mean for small groups.
    :param threshold: Number below which a level is considered small enough to be shrunken.
    :param test: Whether or not to print the row_val_dict for testing purposes.
    :return: Tuple of encoded variable from train and test set as H2OFrames,
             each with a single column named x + '_Tencode'.

    """

    # convert to pandas
    trdf = training_frame.as_data_frame().loc[:, [x, y]] # df
    tss = test_frame.as_data_frame().loc[:, x]           # series

    encode_name = x + '_Tencode'
    overall_mean = trdf[y].mean()

    # create dictionary of level:encode val
    # groupby skips missing levels; 'size' counts every row of a level (the
    # quantity compared against threshold), 'mean' ignores missing targets
    level_stats = trdf.groupby(x)[y].agg(['mean', 'size'])
    shrunken = ((1 - lambda_) * level_stats['mean']) + (lambda_ * overall_mean)
    encoded_levels = level_stats['mean'].where(level_stats['size'] >= threshold, shrunken)
    row_val_dict = encoded_levels.to_dict()

    if test:
        print(row_val_dict)

    # apply the transform: levels absent from row_val_dict (missing values,
    # levels unseen in training) map to NaN and then fall back to the overall
    # mean, instead of relying on NaN dictionary keys or a bare except
    tr_encoded = trdf[x].map(row_val_dict).fillna(overall_mean)
    ts_encoded = tss.map(row_val_dict).fillna(overall_mean)

    # convert back to H2O (.values replaces the removed pandas .as_matrix())
    trdf = h2o.H2OFrame(tr_encoded.values)
    trdf.columns = [encode_name]

    tsdf = h2o.H2OFrame(ts_encoded.values)
    tsdf.columns = [encode_name]

    return (trdf, tsdf)

Execute encoding
---

In [59]:
total = len(cats)
for i, var in enumerate(cats):

    # Level means are always learned from the training split only.
    # Encoding valid from its own target would leak the target into the
    # validation features, and the test encoding must come from the same
    # statistics the models are trained on.
    tr_enc, v_enc = target_encoder(train, valid, var, 'y')
    _, ts_enc = target_encoder(train, test, var, 'y')

    print('Encoding: ' + var + ' (' + str(i+1) + '/' + str(total) + ') ...')

    # append the new <var>_Tencode column to each frame
    train = train.cbind(tr_enc)
    valid = valid.cbind(v_enc)
    test = test.cbind(ts_enc)

print('Done.')

Encoding: X2 (1/8) ...
Encoding: X1 (2/8) ...
Encoding: X3 (3/8) ...
Encoding: X6 (4/8) ...
Encoding: X4 (5/8) ...
Encoding: X5 (6/8) ...
Encoding: X8 (7/8) ...
Encoding: X0 (8/8) ...
Done.


Redefine numerics and explore
---

In [60]:
# recompute the numeric/categorical lists from the current train frame, which
# now also holds the appended *_Tencode columns
encoded_nums, cats = get_type_lists(frame=train)

Numeric = ['X282', 'X309', 'X354', 'X53', 'X125', 'X64', 'X340', 'X257', 'X44', 'X51', 'X30', 'X215', 'X278', 'X159', 'X54', 'X225', 'X363', 'X382', 'X375', 'X190', 'X385', 'X145', 'X367', 'X252', 'X383', 'X113', 'X243', 'X328', 'X129', 'X287', 'X348', 'X50', 'X65', 'X89', 'X192', 'X176', 'X10', 'X11', 'X236', 'X93', 'X244', 'X67', 'X319', 'X31', 'X171', 'X352', 'X263', 'X261', 'X302', 'X172', 'X339', 'X146', 'X63', 'X377', 'X245', 'X182', 'X62', 'X280', 'X345', 'X324', 'X298', 'X364', 'X61', 'X189', 'X249', 'X266', 'X79', 'X177', 'X207', 'X71', 'X140', 'X365', 'X308', 'X237', 'X133', 'X284', 'X267', 'X291', 'X358', 'X213', 'X161', 'X95', 'X233', 'X183', 'X380', 'X344', 'X66', 'X226', 'X162', 'X353', 'X219', 'X325', 'X346', 'X217', 'X4_Tencode', 'X123', 'X134', 'X356', 'X264', 'X314', 'X150', 'X242', 'X60', 'X57', 'X180', 'X87', 'X378', 'X32', 'X18', 'X126', 'X272', 'X155', 'X311', 'X260', 'X151', 'X323', 'X293', 'X6_Tencode', 'X212', 'X275', 'X351', 'X34', 'X256', 'X36', 'X271', 'X341

Create combination features
---

In [61]:
def _append_columns(df, new_columns):

    """ Returns df with new_columns (a list of named Series) added in a single concat.

    Columns whose name already exists in df are overwritten in place, so the
    result matches what column-by-column assignment would produce.

    """

    if not new_columns:
        return df

    block = pd.concat(new_columns, axis=1)
    clashes = [name for name in block.columns if name in df.columns]
    for name in clashes:
        df[name] = block[name]

    return pd.concat([df, block.drop(clashes, axis=1)], axis=1)


def feature_combiner(training_frame, test_frame, nums):

    """ Combines numeric features using simple arithmetic operations.

    For every unordered pair of columns in nums, the element-wise product is
    added to both frames as a new column named '<col_i>|<col_j>', where col_i
    precedes col_j in nums.

    :param training_frame: Training frame from which to generate features and onto which generated
                           features will be cbound.
    :param test_frame: Test frame from which to generate features and onto which generated
                       features will be cbound.
    :param nums: List of original numeric features from which to generate combined features.
    :return: Tuple of (training_frame, test_frame) as new H2OFrames holding the
             original columns followed by the combined columns.

    """

    nums = list(nums)
    total = len(nums)

    # convert to pandas
    train_df = training_frame.as_data_frame()
    test_df = test_frame.as_data_frame()

    # Collect the products first and attach them with one concat per frame;
    # inserting thousands of columns one by one fragments the DataFrame and
    # gets slower with every insert.
    train_products, test_products = [], []

    for i, col_i in enumerate(nums):

        print('Combining: ' + col_i + ' (' + str(i+1) + '/' + str(total) + ') ...')

        # don't repeat (i*j = j*i): only pair col_i with later columns
        for col_j in nums[i + 1:]:

            name = str(col_i + '|' + col_j)
            train_products.append(pd.Series(train_df[col_i].values*train_df[col_j].values,
                                            index=train_df.index, name=name))
            test_products.append(pd.Series(test_df[col_i].values*test_df[col_j].values,
                                           index=test_df.index, name=name))

    train_df = _append_columns(train_df, train_products)
    test_df = _append_columns(test_df, test_products)

    print('Done.')

    # convert back to h2o

    print('Converting to H2OFrame ...')

    training_frame = h2o.H2OFrame(train_df)
    training_frame.columns = list(train_df)
    test_frame = h2o.H2OFrame(test_df)
    test_frame.columns = list(test_df)

    print('Done.')
    print()

    return training_frame, test_frame

In [64]:
# add the pairwise product features to train, valid and test
# NOTE: the combined test frame returned by the first call is discarded; test
# is only replaced by the result of the second call
train, _ = feature_combiner(train, test, encoded_nums)
valid, test = feature_combiner(valid, test, encoded_nums)

KeyboardInterrupt: 

Redefine numerics and explore
---

In [63]:
# recompute the numeric/categorical lists from the current train frame; the
# numeric list is used below as the model input columns
encoded_combined_nums, cats = get_type_lists(frame=train)

Numeric = ['X282', 'X309', 'X354', 'X53', 'X125', 'X64', 'X340', 'X257', 'X44', 'X51', 'X30', 'X215', 'X278', 'X159', 'X54', 'X225', 'X363', 'X382', 'X375', 'X190', 'X385', 'X145', 'X367', 'X252', 'X383', 'X113', 'X243', 'X328', 'X129', 'X287', 'X348', 'X50', 'X65', 'X89', 'X192', 'X176', 'X10', 'X11', 'X236', 'X93', 'X244', 'X67', 'X319', 'X31', 'X171', 'X352', 'X263', 'X261', 'X302', 'X172', 'X339', 'X146', 'X63', 'X377', 'X245', 'X182', 'X62', 'X280', 'X345', 'X324', 'X298', 'X364', 'X61', 'X189', 'X249', 'X266', 'X79', 'X177', 'X207', 'X71', 'X140', 'X365', 'X308', 'X237', 'X133', 'X284', 'X267', 'X291', 'X358', 'X213', 'X161', 'X95', 'X233', 'X183', 'X380', 'X344', 'X66', 'X226', 'X162', 'X353', 'X219', 'X325', 'X346', 'X217', 'X4_Tencode', 'X123', 'X134', 'X356', 'X264', 'X314', 'X150', 'X242', 'X60', 'X57', 'X180', 'X87', 'X378', 'X32', 'X18', 'X126', 'X272', 'X155', 'X311', 'X260', 'X151', 'X323', 'X293', 'X6_Tencode', 'X212', 'X275', 'X351', 'X34', 'X256', 'X36', 'X271', 'X341

In [None]:
# check number of created variables is correct
# expected columns: 1 id column + 1 target column + the numeric columns that
# were combined (encoded_nums) + the categorical columns (cats) + one product
# column per unordered pair of combined numerics, i.e. n * (n - 1) / 2
# (derived from the lists instead of a hard-coded count, so the check follows
# the data actually loaded)
n_nums = len(encoded_nums)
expected_ncols = 1 + 1 + n_nums + len(cats) + (n_nums * (n_nums - 1)) // 2
print(train.shape == (2936, expected_ncols))
print(test.shape == (4209, expected_ncols))

In [None]:
# check multiplication for a random column
# names present after combining but not before are the generated columns
original_names = set(encoded_nums)
combined_only = [name for name in encoded_combined_nums if name not in original_names]

# sample over every generated column rather than a hard-coded count
ridx = np.random.choice(len(combined_only))

# generated names have the form '<col_i>|<col_j>': recover both factors and
# keep the product column itself as the third entry
combined_check_vars = combined_only[ridx].split('|')
combined_check_vars.append(combined_only[ridx])

# stored values for one fixed row of each frame ...
print(train[736, combined_check_vars])
print(test[637, combined_check_vars])

# ... and the product recomputed from the two factors, for comparison by eye
print(train[736, combined_check_vars[0]]*train[736, combined_check_vars[1]])
print(test[637, combined_check_vars[0]]*test[637, combined_check_vars[1]])

Train models
---

In [None]:
h2o.show_progress() # turn on progress bars (they were turned off right after h2o.init())

In [None]:
# Check log transform - looks good
%matplotlib inline
train['y'].log().as_data_frame().hist()

# Execute log transform
train['y'] = train['y'].log()
valid['y'] = valid['y'].log()
print(train[0:3, 'y'])

Define ranked predictions plot function
---

In [None]:
def ranked_preds_plot(y, valid, preds):

    """ Prints a sample of actual vs. predicted values and plots both, ordered by prediction.

    :param y: Name of target variable.
    :param valid: Validation H2OFrame containing the target column.
    :param preds: Column vector of predictions; its column is renamed to
                  'predict' as a side effect.

    """

    shown_cols = [y, 'predict']

    # put the predictions next to the validation data
    preds.columns = ['predict']
    scored = valid.cbind(preds)

    # first ten rows of actual vs. predicted
    print(scored[0:10, shown_cols])

    # order rows by ascending prediction, renumber them, and plot both columns
    ranked = (scored[shown_cols]
              .as_data_frame()
              .sort_values(by='predict')
              .reset_index(drop=True))
    _ = ranked.plot(title='Ranked Predictions Plot')

Function to generate submission file 
---

In [None]:
import re
import time

def gen_submission(preds, test=test):

    """ Writes a time-stamped submission CSV with the columns ID and y.

    :param preds: Column vector of predictions; exp() is applied to it before
                  it is written.
    :param test: Test data; supplies the ID column.

    """

    # time.asctime() with every ':' and ' ' replaced by '_' goes into the file name
    stamp = re.sub('[: ]', '_', time.asctime())
    out_path = '../data/submission_' + str(stamp) + '.csv'

    # pair each ID with its exponentiated prediction
    submission = test['ID'].cbind(preds.exp())
    submission.columns = ['ID', 'y']

    # save file for submission
    h2o.download_csv(submission, out_path)

Simple prediction blending function 
---

In [None]:
import os

def pred_blender(dir_, files):

    """ Performs simple blending of prediction files.

    Every file is expected to be a CSV with an 'ID' column and one prediction
    column. The blended prediction is the row-wise mean of the prediction
    columns, written with the columns ID and y to a time-stamped
    '../data/submission_<time stamp>.csv'.

    :param dir_: Directory in which files to be read are stored.
    :param files: List of prediction files to be blended.
    :raises ValueError: If files is empty.

    """

    files = list(files)
    if not files:
        raise ValueError('files must contain at least one prediction file name')

    # read each prediction file exactly once
    frames = [pd.read_csv(os.path.join(dir_, fname)) for fname in files]

    # the first file supplies the ID column; every file supplies one
    # prediction column, renamed y0, y1, ... in list order
    pred_cols = []
    for i, frame in enumerate(frames):
        col = frame.drop('ID', axis=1)
        col.columns = ['y' + str(i)]
        pred_cols.append(col)
    df = pd.concat([frames[0].drop('y', axis=1)] + pred_cols, axis=1)

    # create mean prediction
    df['mean'] = df.iloc[:, 1:].mean(axis=1)
    print(df.head())

    # create time stamp
    time_stamp = re.sub('[: ]', '_', time.asctime())

    # write new submission file
    df = df[['ID', 'mean']]
    df.columns = ['ID', 'y']

    # save file for submission
    sub_fname = '../data/submission_' + str(time_stamp) + '.csv'
    df.to_csv(sub_fname, index=False)

Random forest model - typically not tuned as much as GBM
--

In [None]:
# initialize rf model
# 3-fold cross-validation with the cross-validation predictions kept; the
# stacked ensemble cell lists this model as a base model
# NOTE(review): stopping_rounds presumably stops training before ntrees is
# reached - confirm against the H2O docs
rf_model1 = H2ORandomForestEstimator(
    ntrees=10000,                    
    max_depth=10, 
    col_sample_rate_per_tree=0.1,
    sample_rate=0.8,
    stopping_rounds=50,
    score_each_iteration=True,
    nfolds=3,
    keep_cross_validation_predictions=True,
    seed=12345)           

# train rf model
# inputs are the numeric columns listed in encoded_combined_nums
rf_model1.train(
    x=encoded_combined_nums,
    y='y',
    training_frame=train,
    validation_frame=valid)

# print model information
print(rf_model1)

# score the validation frame, plot, then score test and write a submission
# file (gen_submission applies exp() to the predictions)
rf_preds1_val = rf_model1.predict(valid)
ranked_preds_plot('y', valid, rf_preds1_val) # valid RMSE not so hot ...
rf_preds1_test = rf_model1.predict(test)
gen_submission(rf_preds1_test) # public leaderboard

Extremely random trees model - typically not tuned as much as GBM
--

In [None]:
# initialize extra trees model
# same settings as the random forest cell, plus histogram_type='random'
ert_model1 = H2ORandomForestEstimator(
    ntrees=10000,                    
    max_depth=10, 
    col_sample_rate_per_tree=0.1,
    sample_rate=0.8,
    stopping_rounds=50,
    score_each_iteration=True,
    nfolds=3,
    keep_cross_validation_predictions=True,
    seed=12345,
    histogram_type='random') # <- this is what makes it ERT instead of RF

# train ert model
# inputs are the numeric columns listed in encoded_combined_nums
ert_model1.train(
    x=encoded_combined_nums,
    y='y',
    training_frame=train,
    validation_frame=valid)

# print model information/create submission
# (gen_submission applies exp() to the predictions)
print(ert_model1)
ert_preds1_val = ert_model1.predict(valid)
ranked_preds_plot('y', valid, ert_preds1_val) # valid RMSE not so hot ...
ert_preds1_test = ert_model1.predict(test)
gen_submission(ert_preds1_test) #  public leaderboard

H2O GBM model
--

In [None]:
# initialize H2O GBM
# 3-fold cross-validation with the cross-validation predictions kept; the
# stacked ensemble cell lists this model as a base model
h2o_gbm_model = H2OGradientBoostingEstimator(
    ntrees = 10000,
    learn_rate = 0.005,
    sample_rate = 0.1, 
    col_sample_rate = 0.8,
    max_depth = 5,
    nfolds = 3,
    keep_cross_validation_predictions=True,
    stopping_rounds = 10,
    seed = 12345)

# execute training
# inputs are the numeric columns listed in encoded_combined_nums
h2o_gbm_model.train(x=encoded_combined_nums,
                    y='y',
                    training_frame=train,
                    validation_frame=valid)

# print model information/create submission
# (gen_submission applies exp() to the predictions)
print(h2o_gbm_model)
h2o_gbm_preds1_val = h2o_gbm_model.predict(valid)
ranked_preds_plot('y', valid, h2o_gbm_preds1_val) # better validation error
h2o_gbm_preds1_test = h2o_gbm_model.predict(test)
gen_submission(h2o_gbm_preds1_test) #  public leaderboard

Train H2O XGBoost - very new!!
---

In [None]:
# initialize XGB GBM
# same hyperparameter values as the H2O GBM cell
# NOTE: this model is not among the base_models of the stacked ensemble cell
h2o_xgb_model = H2OXGBoostEstimator(
    ntrees = 10000,
    learn_rate = 0.005,
    sample_rate = 0.1, 
    col_sample_rate = 0.8,
    max_depth = 5,
    nfolds = 3,
    keep_cross_validation_predictions=True,
    stopping_rounds = 10,
    seed = 12345)

# execute training 
# inputs are the numeric columns listed in encoded_combined_nums
h2o_xgb_model.train(x=encoded_combined_nums,
                    y='y',
                    training_frame=train,
                    validation_frame=valid)

# print model information/create submission
# (gen_submission applies exp() to the predictions)
print(h2o_xgb_model)
h2o_xgb_preds1_val = h2o_xgb_model.predict(valid)
ranked_preds_plot('y', valid, h2o_xgb_preds1_val) 
h2o_xgb_preds1_test = h2o_xgb_model.predict(test)
gen_submission(h2o_xgb_preds1_test) #  on public leaderboard

Create blend
---

In [None]:
# create XGBoost blend
# NOTE(review): the file list holds only an empty-string placeholder - fill in
# the names of the submission files in '../data' to blend before running
pred_blender('../data',
            [''])
#  on public leaderboard

Train H2O stacked model 
---

In [None]:
# combine the random forest, extra trees and GBM models in a stacked ensemble
# (the XGBoost model is not included in base_models)
stack = H2OStackedEnsembleEstimator(training_frame=train, 
                                    validation_frame=valid, 
                                    base_models=[rf_model1, ert_model1, 
                                                 h2o_gbm_model])

# same inputs, target and frames as the base models
stack.train(x=encoded_combined_nums,
            y='y',
            training_frame=train,
            validation_frame=valid)

# print model information/create submission
# (gen_submission applies exp() to the predictions)
print(stack)
stack_preds1_val = stack.predict(valid)
ranked_preds_plot('y', valid, stack_preds1_val) 
stack_preds1_test = stack.predict(test)
gen_submission(stack_preds1_test)
#  on public leaderboard

Shutdown H2O
---

In [None]:
# Shutdown H2O - this will erase all your unsaved frames and models in H2O
# NOTE(review): prompt=True presumably asks for confirmation before shutting
# down, which would block an unattended "Run All" - confirm
h2o.cluster().shutdown(prompt=True)