In [41]:
import pickle
import h2o
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.cross_validation import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from fancyimpute import MICE as MICE
from copy import deepcopy
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score

from h2o.estimators.gbm import H2OGradientBoostingEstimator
from h2o.estimators.random_forest import H2ORandomForestEstimator
from h2o.grid.grid_search import H2OGridSearch


%matplotlib inline

In [2]:
# Start (or attach to) a local H2O cluster. nthreads=-1 lets H2O use all
# available cores; min_mem_size / max_mem_size request between 2G and 6G of
# memory for the cluster.
h2o.init(nthreads=-1, min_mem_size="2G", max_mem_size = "6G")
# Clean slate: drop any objects left over if a cluster was already running.
h2o.remove_all()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "1.8.0_121"; OpenJDK Runtime Environment (Zulu 8.20.0.5-macosx) (build 1.8.0_121-b15); OpenJDK 64-Bit Server VM (Zulu 8.20.0.5-macosx) (build 25.121-b15, mixed mode)
  Starting server from /Users/songlin/anaconda2/envs/Python35/lib/python3.5/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/zs/j5bf_gzd48g571wnp7g6hdy00000gn/T/tmpxwvfl5on
  JVM stdout: /var/folders/zs/j5bf_gzd48g571wnp7g6hdy00000gn/T/tmpxwvfl5on/h2o_songlin_started_from_python.out
  JVM stderr: /var/folders/zs/j5bf_gzd48g571wnp7g6hdy00000gn/T/tmpxwvfl5on/h2o_songlin_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,America/Los_Angeles
H2O data parsing timezone:,UTC
H2O cluster version:,3.20.0.3
H2O cluster version age:,"21 days, 5 hours and 17 minutes"
H2O cluster name:,H2O_from_python_songlin_b3n9iz
H2O cluster total nodes:,1
H2O cluster free memory:,5.333 Gb
H2O cluster total cores:,4
H2O cluster allowed cores:,4


In [3]:
# Cosmetic, notebook-wide display defaults; nothing here affects the modelling.

# Bigger figures and fonts by default. This has to run in a separate cell,
# after the one containing `%matplotlib inline` (known issue).
plt.rcParams['figure.figsize'] = (16.0, 10.0)
plt.rcParams['font.size'] = 22

# Render numpy floats in a fixed-width, three-decimal format.
np.set_printoptions(
    precision=3,
    formatter={'float': '{: 8.3f}'.format},
)

### Import imputed data

In [20]:
# Read the pickled training features and target (judging by the file names,
# the age column has already been imputed/completed upstream).
# NOTE: pickle.load can execute arbitrary code; only load files this project produced.
with open('./data/processed/X_train_age_imputed.pkl', 'rb') as features_file:
    X_train = pickle.load(features_file)

with open('./data/processed/y_train_age_completed.pkl', 'rb') as target_file:
    y_train = pickle.load(target_file)

In [30]:
def custom_dummify(df, threshold, scaler=None):
    """Scale non-object columns and one-hot encode object columns, pooling rare levels.

    Non-object columns are passed through ``scaler`` (a new ``StandardScaler``
    is created and fitted when ``scaler`` is None; otherwise the given scaler
    is applied with ``transform`` only). Every object column is expanded with
    ``pd.get_dummies``; levels whose share of the column's non-null values is
    at least ``threshold`` keep their own ``<column>_<level>`` indicator, and
    all remaining levels are summed into a single ``<column>_catch_all``
    column (always emitted, even when no level is pooled).

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame mixing object and non-object columns.
    threshold : float
        Minimum relative frequency (0..1) for a level to keep its own column.
    scaler : object, optional
        Already-fitted scaler exposing ``transform``. When None, a
        ``StandardScaler`` is fitted on ``df``'s non-object columns.

    Returns
    -------
    (pandas.DataFrame, scaler)
        The transformed frame (scaled columns first, then the encoded columns
        in original column order), indexed like ``df``, and the scaler used.

    Notes
    -----
    The set of kept levels is computed from the frame passed in, so two
    different frames (e.g. train and test) can yield different columns.
    """
    numeric_part = df.select_dtypes(exclude='object')
    if scaler is None:
        scaler = StandardScaler()
        scaled_values = scaler.fit_transform(numeric_part)
    else:
        scaled_values = scaler.transform(numeric_part)
    # Re-attach the caller's index. Without it the scaled frame gets a fresh
    # RangeIndex and the axis=1 concat below misaligns rows (introducing NaNs
    # and extra rows) whenever df's index is not 0..n-1.
    df_noncat = pd.DataFrame(scaled_values,
                             columns=numeric_part.columns,
                             index=numeric_part.index)

    categorical_part = df.select_dtypes(include='object')
    encoded_frames = []
    for col_name in categorical_part.columns:
        values = categorical_part[col_name]
        levels = values.value_counts()
        count_all = levels.sum()
        significant_levels = [level for level, count in levels.items()
                              if count / count_all >= threshold]

        df_dummy = pd.get_dummies(values, prefix=col_name)
        # Format (rather than concatenate) so non-string levels work too.
        cols_to_keep = ['{}_{}'.format(col_name, level)
                        for level in significant_levels]
        df_dummy_kept = df_dummy[cols_to_keep]
        # Everything below the threshold collapses into one indicator column.
        df_dummy_agg = (df_dummy.drop(cols_to_keep, axis=1)
                        .sum(axis=1)
                        .to_frame('{}_catch_all'.format(col_name)))
        encoded_frames.extend([df_dummy_kept, df_dummy_agg])

    # Single concat at the end instead of growing a frame inside the loop.
    return pd.concat([df_noncat] + encoded_frames, axis=1), scaler

## Build a baseline model with the H2O framework

In [33]:
# Put features and target side by side so the target ends up as the last column.
df_train = pd.concat([X_train, y_train], axis=1)
h2o_train = h2o.H2OFrame(df_train)

# 60% / 20% / remainder split into train, validation and test frames.
train, valid, test = h2o_train.split_frame(ratios=[0.6, 0.2], seed=1234)

# Predictor names are every column but the last; the last one is the response.
all_columns = h2o_train.col_names
X_train_h2o, y_train_h2o = all_columns[:-1], all_columns[-1]

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [37]:
# Random-forest template handed to the grid search below. Scoring happens on
# every iteration and training stops early once the (automatically chosen)
# metric has not improved by the tolerance for 10 scoring rounds.
rf_v2 = H2ORandomForestEstimator(
    model_id="rf_country_v2",
    seed=7,
    balance_classes=True,
    score_each_iteration=True,
    stopping_metric='auto',
    stopping_rounds=10,
    stopping_tolerance=0.00001,
)

In [38]:
# Candidate values for the grid search: 5 tree counts x 6 maximum depths.
hyper_parameters = {
    'ntrees': [10, 20, 40, 60, 90],
    'max_depth': [5, 10, 30, 50, 80, 90],
}

In [39]:
# Random search over the hyper-parameter grid. The search itself stops once
# misclassification has not improved by the tolerance over 10 models.
# A fixed seed makes the order in which combinations are sampled reproducible
# across kernel restarts (without it the sampling order can differ per run).
criteria = {
    "strategy": "RandomDiscrete",
    "stopping_rounds": 10,
    "stopping_tolerance": 0.00001,
    "stopping_metric": "misclassification",
    "seed": 7,
}

In [42]:
# Combine the RF template, the candidate values and the search criteria.
grid_search = H2OGridSearch(
    model=rf_v2,
    hyper_params=hyper_parameters,
    search_criteria=criteria,
)

In [None]:
# Fit every sampled hyper-parameter combination on the training split and
# score it on the validation split.
grid_search.train(
    x=X_train_h2o,
    y=y_train_h2o,
    training_frame=train,
    validation_frame=valid,
)

In [None]:
# Rank the grid models by AUC, best first. Higher AUC is better, so the sort
# must be descending; an ascending sort would put the *worst* model in row 0
# and the "best" parameters below would come from it.
# NOTE(review): AUC is a binary-classification metric, while the search
# criteria stop on misclassification -- confirm 'auc' is the intended metric.
sorted_grid = grid_search.get_grid(sort_by='auc', decreasing=True)

# Fetch the metric table once and read the winning parameters from row 0.
metric_table = sorted_grid.sorted_metric_table()
best_max_depth = metric_table['max_depth'][0]
best_ntrees = metric_table['ntrees'][0]
# TODO: report the best model's validation score here.

## Build sklearn model based on dummified variables

In [None]:
# TODO: build the scikit-learn train/test inputs (e.g. from custom_dummify).
# Explicit None placeholders are used because `= ?` is a SyntaxError that
# aborts "Restart & Run All"; any accidental use fails loudly downstream.
X_train_sk = None
y_train_sk = None
X_test_sk = None
y_test_sk = None

In [1]:
# Scratch calculation: share of (2860 + 3845) in the total of all four counts.
# NOTE(review): presumably an accuracy computed from a 2x2 confusion matrix --
# confirm where these numbers come from and derive them in code instead.
(2860+3845)/((2860+3845)+(7413+1890))

0.4188530734632684