In [1]:
import h2o
from h2o.automl import H2OAutoML
import random, os, sys
from datetime import datetime
import pandas as pd
import logging
import csv
import optparse
import time
import json
from distutils.util import strtobool
import psutil

In [2]:
data_path=None
all_variables=None
test_path=None
# target='search_term'
target=None
nthreads=1 
min_mem_size=6 
run_time=1500
classification=True
scale=True
max_models=20    
model_path=None
balance_y=False 
balance_threshold=0.2
name=None 
server_path=None  
analysis=0

In [3]:
#Funtion for meta data
def set_meta_data(analysis,run_id,server,data,test,model_path,target,run_time,classification,scale,model,balance,
                  balance_threshold,name,path,nthreads,min_mem_size):
  m_data={}
  m_data['start_time'] = time.time()
  m_data['target']=target
  m_data['server_path']=server
  m_data['data_path']=data 
  m_data['test_path']=test
  m_data['max_models']=model
  m_data['run_time']=run_time
  m_data['run_id'] =run_id
  m_data['scale']=scale
  m_data['classification']=classification
  m_data['scale']=False
  m_data['model_path']=model_path
  m_data['balance']=balance
  m_data['balance_threshold']=balance_threshold
  m_data['project'] =name
  m_data['end_time'] = time.time()
  m_data['execution_time'] = 0.0
  m_data['run_path'] =path
  m_data['nthreads'] = nthreads
  m_data['min_mem_size'] = min_mem_size
  m_data['analysis'] = analysis
  return m_data




In [4]:
#Function to convert dictionary into json file
def dict_to_json(dct,n):
  j = json.dumps(dct, indent=4)
  f = open(n, 'w')
  print(j, file=f)
  f.close()
    
#Function for random alphabet   
def alphabet(n):
    alpha='0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'    
    str=''
    r=len(alpha)-1   
    while len(str)<n:
        i=random.randint(0,r)
        str+=alpha[i]   
        return str

def impute_missing_values(df, x, scal=False):
    # determine column types
    ints, reals, enums = [], [], []
    for key, val in df.types.items():
        if key in x:
            if val == 'enum':
                enums.append(key)
            elif val == 'int':
                ints.append(key)            
            else: 
                reals.append(key)    
    _ = df[reals].impute(method='mean')
    _ = df[ints].impute(method='median')
    if scal:
        df[reals] = df[reals].scale()
        df[ints] = df[ints].scale()    
    return


    

def predictions(mod,data,run_id):
    test = h2o.import_file(data)
    mod_perf=mod_best.model_performance(test)
              
    stats_test={}
    stats_test=model_performance_stats(mod_perf)

    n=run_id+'_test_stats.json'
    dict_to_json(stats_test,n) 

    try:    
      cf=mod_perf.confusion_matrix(metrics=["f1","f2","f0point5","accuracy","precision","recall","specificity","absolute_mcc","min_per_class_accuracy","mean_per_class_accuracy"])
      cf_df=cf[0].table.as_data_frame()
      cf_df.to_csv(run_id+'_test_confusion_matrix.csv')
    except:
      pass

    predictions = mod_best.predict(test)
    predictions_df=test.cbind(predictions).as_data_frame() 
    predictions_df.to_csv(run_id+'_predictions.csv')
    return
def predictions_test(mod,test,run_id):
    mod_perf=mod_best.model_performance(test)          
    stats_test={}
    stats_test=model_performance_stats(mod_perf)
    n=run_id+'_test_stats.json'
    dict_to_json(stats_test,n) 
    try:
      cf=mod_perf.confusion_matrix()
#      cf=mod_perf.confusion_matrix(metrics=["f1","f2","f0point5","accuracy","precision","recall","specificity","absolute_mcc","min_per_class_accuracy","mean_per_class_accuracy"])
      cf_df=cf.table.as_data_frame()
      cf_df.to_csv(run_id+'_test_confusion_matrix.csv')
    except:
      pass
    predictions = mod_best.predict(test)    
    predictions_df=test.cbind(predictions).as_data_frame() 
    predictions_df.to_csv(run_id+'_predictions.csv')
    return predictions

def check_X(x,df):
    for name in x:
        if name not in df.columns:
          x.remove(name)  
    return x    
    
    
def get_stacked_ensemble(lst):
    se=None
    for model in model_set:
      if 'BestOfFamily' in model:
        se=model
    if se is None:     
      for model in model_set:
        if 'AllModels'in model:
          se=model           
    return se       
    
def get_variables_types(df):
    d={}
    for key, val in df.types.items():
        d[key]=val
    return d    

In [5]:
def se_stats(modl):
    d={}
    d['algo']=modl.algo
    d['model_id']=modl.model_id  
    d['auc']=modl.auc()   
    d['roc']=modl.roc()
    d['mse']=modl.mse()   
    d['null_degrees_of_freedom']=modl.null_degrees_of_freedom()
    d['null_deviance']=modl.null_deviance()
    d['residual_degrees_of_freedom']=modl.residual_degrees_of_freedom()   
    d['residual_deviance']=modl.residual_deviance()
    d['rmse']=modl.rmse()
    return d

def get_model_by_algo(algo,models_dict):
    mod=None
    mod_id=None    
    for m in list(models_dict.keys()):
        if m[0:3]==algo:
            mod_id=m
            mod=h2o.get_model(m)      
    return mod,mod_id     
    
    

In [6]:
def stackedensemble(mod):
    coef_norm=None
    try:
      metalearner = h2o.get_model(mod.metalearner()['name'])
      coef_norm=metalearner.coef_norm()
    except:
      pass        
    return coef_norm

def stackedensemble_df(df):
    bm_algo={ 'GBM': None,'GLM': None,'DRF': None,'XRT': None,'Dee': None}
    for index, row in df.iterrows():
      if len(row['model_id'])>3:
        key=row['model_id'][0:3]
        if key in bm_algo:
          if bm_algo[key] is None:
                bm_algo[key]=row['model_id']
    bm=list(bm_algo.values()) 
    bm=list(filter(None.__ne__, bm))             
    return bm


def gbm_stats(modl):
    d={}
    d['algo']=modl.algo
    d['model_id']=modl.model_id   
    d['varimp']=modl.varimp()  
    return d
    
    
def dl_stats(modl):
    d={}
    d['algo']=modl.algo
    d['model_id']=modl.model_id   
    d['varimp']=modl.varimp()  
    return d
    

In [7]:
def drf_stats(modl):
    d={}
    d['algo']=modl.algo
    d['model_id']=modl.model_id   
    d['varimp']=modl.varimp()  
    d['roc']=modl.roc()     
    return d
    
def xrt_stats(modl):
    d={}
    d['algo']=modl.algo
    d['model_id']=modl.model_id   
    d['varimp']=modl.varimp()  
    d['roc']=modl.roc()      
    return d
    
    
def glm_stats(modl):
    d={}
    d['algo']=modl.algo
    d['model_id']=modl.model_id   
    d['coef']=modl.coef()  
    d['coef_norm']=modl.coef_norm()      
    return d

In [8]:

    
def model_performance_stats(perf):
    d={}
    try:    
      d['mse']=perf.mse()
    except:
      pass      
    try:    
      d['rmse']=perf.rmse() 
    except:
      pass      
    try:    
      d['null_degrees_of_freedom']=perf.null_degrees_of_freedom()
    except:
      pass      
    try:    
      d['residual_degrees_of_freedom']=perf.residual_degrees_of_freedom()
    except:
      pass      
    try:    
      d['residual_deviance']=perf.residual_deviance() 
    except:      pass      
    try:    
      d['null_deviance']=perf.null_deviance() 
    except:
      pass      
    try:    
      d['aic']=perf.aic() 
    except:
      pass      
    try:
      d['logloss']=perf.logloss() 
    except:
      pass    
    try:
      d['auc']=perf.auc()
    except:
      pass  
    try:
      d['gini']=perf.gini()
    except:
      pass    
    return d

In [9]:
all_variables=None

In [10]:
data_path='C:/Users/pkash/Downloads/adult.csv'

In [11]:
run_id=alphabet(9)
if server_path==None:
    server_path=os.path.abspath(os.curdir)
os.chdir(server_path) 
run_dir = os.path.join(server_path,run_id)
os.mkdir(run_dir)
os.chdir(run_dir)    

# run_id to std out
print (run_id)

h


In [12]:
##For logs
logfile=run_id+'_autoh2o_log.zip'
logs_path=os.path.join(run_dir,'logs')
print(logs_path,' ',logfile)

C:\Users\pkash\h\logs   h_autoh2o_log.zip


In [13]:
h2o.init(ip="localhost", port=54323)

Checking whether there is an H2O instance running at http://localhost:54323 ..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.201-b09, mixed mode)
  Starting server from C:\Users\pkash\Anaconda\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\pkash\AppData\Local\Temp\tmpfjnte2e2
  JVM stdout: C:\Users\pkash\AppData\Local\Temp\tmpfjnte2e2\h2o_pkash_started_from_python.out
  JVM stderr: C:\Users\pkash\AppData\Local\Temp\tmpfjnte2e2\h2o_pkash_started_from_python.err
  Server is running at http://127.0.0.1:54323
Connecting to H2O server at http://127.0.0.1:54323 ... successful.


0,1
H2O cluster uptime:,02 secs
H2O cluster timezone:,America/New_York
H2O data parsing timezone:,UTC
H2O cluster version:,3.24.0.1
H2O cluster version age:,13 days
H2O cluster name:,H2O_from_python_pkash_ipfxdy
H2O cluster total nodes:,1
H2O cluster free memory:,1.743 Gb
H2O cluster total cores:,8
H2O cluster allowed cores:,8


In [16]:
# meta data
meta_data = set_meta_data(analysis, run_id,server_path,data_path,test_path,model_path,target,run_time,classification,scale,max_models,balance_y,balance_threshold,name,run_dir,nthreads,min_mem_size)
print(meta_data)

{'start_time': 1555264074.8204637, 'target': None, 'server_path': 'C:\\Users\\pkash', 'data_path': 'C:/Users/pkash/Downloads/adult.csv', 'test_path': None, 'max_models': 9, 'run_time': 1500, 'run_id': 'h', 'scale': False, 'classification': False, 'model_path': None, 'balance': False, 'balance_threshold': 0.2, 'project': None, 'end_time': 1555264074.8204637, 'execution_time': 0.0, 'run_path': 'C:\\Users\\pkash\\h', 'nthreads': 1, 'min_mem_size': 6, 'analysis': 0}


In [17]:
df = h2o.import_file(data_path)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [18]:
df.head()

age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K
63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K
24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K
55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K




In [17]:

df.describe()

Rows:48842
Cols:15




Unnamed: 0,age,workclass,fnlwgt,education,educational-num,marital-status,occupation,relationship,race,gender,capital-gain,capital-loss,hours-per-week,native-country,income
type,int,enum,int,enum,int,enum,enum,enum,enum,enum,int,int,int,enum,enum
mins,17.0,,12285.0,,1.0,,,,,,0.0,0.0,1.0,,
mean,38.643585438761704,,189664.13459727284,,10.078088530363212,,,,,,1079.0676262233324,87.50231358257237,40.42238237582409,,
maxs,90.0,,1490400.0,,16.0,,,,,,99999.0,4356.0,99.0,,
sigma,13.710509934443555,,105604.02542315728,,2.570972755592256,,,,,,7452.019057655393,403.00455212435907,12.391444024252307,,
zeros,0,,0,,0,,,,,,44807,46560,0,,
missing,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K


In [19]:
# dependent variable
# assign target and inputs for classification or regression
if target==None:
    target=df.columns[14]   
y = target
y

'income'

In [20]:
x = list(df.columns)
x.remove(y)
x
# Add independent variables

meta_data['x']=x 
# impute missing values

_=impute_missing_values(df,x, scale)


In [21]:
# # impute missing values

# _ = df[ints].impute(method='median')

# if scale:
    
#     df[ints] = df[ints].scale()

In [22]:
if analysis == 3:
  classification=False
elif analysis == 2:
  classification=True
elif analysis == 1:
  classification=True

In [23]:
def check_y(y,df):
  ok=False
  C = [name for name in df.columns if name == y]
  for key, val in df.types.items():
    if key in C:
      if val in ['real','int','enum']:        
        ok=True         
  return ok

In [24]:
ok=check_y(y,df)
if not ok:
    print(ok)

In [25]:
classification=True
if classification:
    print(df[y].levels())

[['<=50K', '>50K']]


In [26]:
def get_variables_types(df):
    d={}
    for key, val in df.types.items():
        d[key]=val           
    return d    

In [27]:
allV=get_variables_types(df)
allV

{'age': 'int',
 'workclass': 'enum',
 'fnlwgt': 'int',
 'education': 'enum',
 'educational-num': 'int',
 'marital-status': 'enum',
 'occupation': 'enum',
 'relationship': 'enum',
 'race': 'enum',
 'gender': 'enum',
 'capital-gain': 'int',
 'capital-loss': 'int',
 'hours-per-week': 'int',
 'native-country': 'enum',
 'income': 'enum'}

In [28]:
meta_data['variables']=allV

In [29]:
# Set up AutoML

aml = H2OAutoML(max_runtime_secs=run_time,project_name = name)

In [30]:
model_start_time = time.time()

In [31]:
aml.train(x=x,y=y,training_frame=df)

AutoML progress: |████████████████████████████████████████████████████████| 100%


In [32]:
meta_data['model_execution_time'] = time.time() - model_start_time

In [33]:
# get leaderboard
aml_leaderboard_df=aml.leaderboard.as_data_frame()

In [34]:
aml_leaderboard_df

Unnamed: 0,model_id,auc,logloss,mean_per_class_error,rmse,mse
0,GBM_2_AutoML_20190414_134833,0.928782,0.277182,0.171568,0.296435,0.087873
1,GBM_1_AutoML_20190414_134833,0.928271,0.278211,0.171073,0.2968,0.08809
2,GBM_3_AutoML_20190414_134833,0.928138,0.278434,0.165076,0.297099,0.088268
3,StackedEnsemble_AllModels_AutoML_20190414_134833,0.927902,0.292566,0.176637,0.300268,0.090161
4,GBM_4_AutoML_20190414_134833,0.927433,0.279514,0.172404,0.297857,0.088719
5,StackedEnsemble_BestOfFamily_AutoML_20190414_1...,0.927421,0.294523,0.173164,0.301016,0.090611
6,GBM_grid_1_AutoML_20190414_134833_model_11,0.92683,0.28122,0.170049,0.298375,0.089028
7,GBM_grid_1_AutoML_20190414_134833_model_9,0.926748,0.287142,0.171274,0.299349,0.08961
8,GBM_5_AutoML_20190414_134833,0.926247,0.281368,0.173704,0.298931,0.08936
9,GBM_grid_1_AutoML_20190414_134833_model_1,0.924765,0.284611,0.167595,0.299978,0.089987


In [35]:
# STart best model as first model

model_set=aml_leaderboard_df['model_id']
mod_best=h2o.get_model(model_set[0])

In [74]:
mod_best.params

{'model_id': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'ModelKeyV3',
    'schema_type': 'Key<Model>'},
   'name': 'GBM_2_AutoML_20190414_134833',
   'type': 'Key<Model>',
   'URL': '/3/Models/GBM_2_AutoML_20190414_134833'}},
 'training_frame': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'FrameKeyV3',
    'schema_type': 'Key<Frame>'},
   'name': 'automl_training_adult1.hex',
   'type': 'Key<Frame>',
   'URL': '/3/Frames/automl_training_adult1.hex'}},
 'validation_frame': {'default': None, 'actual': None},
 'nfolds': {'default': 0, 'actual': 5},
 'keep_cross_validation_models': {'default': True, 'actual': False},
 'keep_cross_validation_predictions': {'default': False, 'actual': True},
 'keep_cross_validation_fold_assignment': {'default': False, 'actual': False},
 'score_each_iteration': {'default': False, 'actual': False},
 'score_tree_interval': {'default': 0, 'actual': 5},
 'fold_assignment': {'default': 'AU

In [36]:
mod_best._id

'GBM_2_AutoML_20190414_134833'

In [37]:
# Get stacked ensemble  
def get_stacked_ensemble(lst):
    se=None
    for model in model_set:
      if 'BestOfFamily' in model:
        se=model
    if se is None:     
      for model in model_set:
        if 'AllModels'in model:
          se=model           
    return se       
    
se=get_stacked_ensemble(model_set)

In [38]:
print(se)

StackedEnsemble_BestOfFamily_AutoML_20190414_134833


In [39]:
if se is not None:
  mod_best=h2o.get_model(se)

In [40]:

dir(mod_best)

['F0point5',
 'F1',
 'F2',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_bc',
 '_bcin',
 '_check_and_save_parm',
 '_check_targets',
 '_compute_algo',
 '_end_time',
 '_estimator_type',
 '_future',
 '_get_metrics',
 '_have_mojo',
 '_have_pojo',
 '_id',
 '_is_xvalidated',
 '_job',
 '_keyify_if_h2oframe',
 '_metrics_class',
 '_model_json',
 '_parms',
 '_plot',
 '_requires_training_frame',
 '_resolve_model',
 '_run_time',
 '_start_time',
 '_train',
 '_verify_training_frame_params',
 '_xval_keys',
 'accuracy',
 'actual_params',
 'aic',
 'algo',
 'auc',
 'base_models',
 'biases',
 'blending_frame',
 'catoffsets',
 'coef',
 'coef_norm',
 'confusion_matrix',

In [41]:
mod_best._id

'StackedEnsemble_BestOfFamily_AutoML_20190414_134833'

In [42]:
mod_best._get_metrics

<function h2o.model.model_base.ModelBase._get_metrics(o, train, valid, xval)>

In [43]:
type(mod_best)

h2o.estimators.stackedensemble.H2OStackedEnsembleEstimator

In [44]:
mods=mod_best.coef_norm
print(mods)

Model Details
H2OStackedEnsembleEstimator :  Stacked Ensemble
Model Key:  StackedEnsemble_BestOfFamily_AutoML_20190414_134833
No model summary for this model


ModelMetricsBinomialGLM: stackedensemble
** Reported on train data. **

MSE: 0.07599351485717933
RMSE: 0.2756692127481401
LogLoss: 0.2514565276229559
Null degrees of freedom: 48841
Residual degrees of freedom: 48838
Null deviance: 53750.68147229072
Residual deviance: 24563.279444320826
AIC: 24571.279444320826
AUC: 0.951149017093998
pr_auc: 0.8489867163167949
Gini: 0.902298034187996
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.30481402048312295: 


0,1,2,3,4
,<=50K,>50K,Error,Rate
<=50K,33969.0,3186.0,0.0857,(3186.0/37155.0)
>50K,2333.0,9354.0,0.1996,(2333.0/11687.0)
Total,36302.0,12540.0,0.113,(5519.0/48842.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3048140,0.7721963,237.0
max f2,0.1196860,0.8422139,323.0
max f0point5,0.6885012,0.8085062,109.0
max accuracy,0.4623069,0.8938414,182.0
max precision,0.9690007,1.0,0.0
max recall,0.0351034,1.0,397.0
max specificity,0.9690007,1.0,0.0
max absolute_mcc,0.4273261,0.7005614,195.0
max min_per_class_accuracy,0.2047659,0.8703688,280.0


Gains/Lift Table: Avg response rate: 23.93 %, avg score: 24.04 %



0,1,2,3,4,5,6,7,8,9,10,11,12,13
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100119,0.9680759,4.1791734,4.1791734,1.0,0.9687473,1.0,0.9687473,0.0418414,0.0418414,317.9173441,317.9173441
,2,0.0200033,0.9666753,4.1791734,4.1791734,1.0,0.9674017,1.0,0.9680752,0.0417558,0.0835972,317.9173441,317.9173441
,3,0.0300152,0.9647313,4.1791734,4.1791734,1.0,0.9657429,1.0,0.9672973,0.0418414,0.1254385,317.9173441,317.9173441
,4,0.0400066,0.9625829,4.1791734,4.1791734,1.0,0.9637369,1.0,0.9664081,0.0417558,0.1671943,317.9173441,317.9173441
,5,0.0500184,0.9568406,4.1791734,4.1791734,1.0,0.9603056,1.0,0.9651866,0.0418414,0.2090357,317.9173441,317.9173441
,6,0.1000164,0.8662928,3.9053537,4.0422916,0.9344799,0.9137236,0.9672467,0.9394604,0.1952597,0.4042954,290.5353723,304.2291608
,7,0.1500143,0.7243035,3.2139589,3.7662184,0.7690418,0.8035006,0.9011874,0.8941466,0.1606914,0.5649867,221.3958936,276.6218402
,8,0.2000123,0.5022464,2.5636371,3.4656039,0.6134316,0.6157076,0.8292558,0.8245440,0.1281766,0.6931633,156.3637106,246.5603853
,9,0.3000082,0.2146453,1.6934038,2.8749108,0.4052007,0.3379140,0.6879137,0.6623451,0.1693334,0.8624968,69.3403816,187.4910822




ModelMetricsBinomialGLM: stackedensemble
** Reported on cross-validation data. **

MSE: 0.0906107731356874
RMSE: 0.30101623400688443
LogLoss: 0.29452325604229096
Null degrees of freedom: 48841
Residual degrees of freedom: 48838
Null deviance: 53753.71062321005
Residual deviance: 28770.209743235147
AIC: 28778.209743235147
AUC: 0.9274205195427494
pr_auc: 0.812908356342766
Gini: 0.8548410390854988
Confusion Matrix (Act/Pred) for max f1 @ threshold = 0.315544098824348: 


0,1,2,3,4
,<=50K,>50K,Error,Rate
<=50K,33564.0,3591.0,0.0966,(3591.0/37155.0)
>50K,2918.0,8769.0,0.2497,(2918.0/11687.0)
Total,36482.0,12360.0,0.1333,(6509.0/48842.0)


Maximum Metrics: Maximum metrics at their respective thresholds



0,1,2,3
metric,threshold,value,idx
max f1,0.3155441,0.7293217,237.0
max f2,0.0977479,0.8064613,337.0
max f0point5,0.7273335,0.7646574,96.0
max accuracy,0.4598512,0.8735924,184.0
max precision,0.9698131,1.0,0.0
max recall,0.0336333,1.0,399.0
max specificity,0.9698131,1.0,0.0
max absolute_mcc,0.3960892,0.6433255,207.0
max min_per_class_accuracy,0.1846698,0.8430628,291.0


Gains/Lift Table: Avg response rate: 23.93 %, avg score: 23.93 %



0,1,2,3,4,5,6,7,8,9,10,11,12,13
,group,cumulative_data_fraction,lower_threshold,lift,cumulative_lift,response_rate,score,cumulative_response_rate,cumulative_score,capture_rate,cumulative_capture_rate,gain,cumulative_gain
,1,0.0100119,0.9680440,4.1791734,4.1791734,1.0,0.9690454,1.0,0.9690454,0.0418414,0.0418414,317.9173441,317.9173441
,2,0.0200033,0.9663018,4.1791734,4.1791734,1.0,0.9671927,1.0,0.9681200,0.0417558,0.0835972,317.9173441,317.9173441
,3,0.0300152,0.9643216,4.1620807,4.1734720,0.9959100,0.9653805,0.9986357,0.9672062,0.0416702,0.1252674,316.2080707,317.3471976
,4,0.0400066,0.9611820,4.1449179,4.1663408,0.9918033,0.9628990,0.9969294,0.9661305,0.0414135,0.1666809,314.4917921,316.6340769
,5,0.0500184,0.9535033,4.1449880,4.1620667,0.9918200,0.9578852,0.9959067,0.9644801,0.0414991,0.2081800,314.4987973,316.2066713
,6,0.1000164,0.8587364,3.6366681,3.8994212,0.8701884,0.9076926,0.9330604,0.9360921,0.1818260,0.3900060,263.6668125,289.9421196
,7,0.1500143,0.7076419,2.9110459,3.5700077,0.6965602,0.7900685,0.8542378,0.8874242,0.1455463,0.5355523,191.1045873,257.0007720
,8,0.2000123,0.4945970,2.3411586,3.2628269,0.5601966,0.6012446,0.7807350,0.8158867,0.1170531,0.6526055,134.1158586,226.2826884
,9,0.3000082,0.2185164,1.6181034,2.7146231,0.3871826,0.3348614,0.6495598,0.6555559,0.1618037,0.8144092,61.8103394,171.4623136



<bound method ModelBase.coef_norm of >


In [45]:
bm=stackedensemble_df(aml_leaderboard_df)

In [46]:
bm

['GBM_2_AutoML_20190414_134833',
 'GLM_grid_1_AutoML_20190414_134833_model_1',
 'DRF_1_AutoML_20190414_134833',
 'XRT_1_AutoML_20190414_134833',
 'DeepLearning_1_AutoML_20190414_134833']

In [47]:

aml_leaderboard_df

Unnamed: 0,model_id,auc,logloss,mean_per_class_error,rmse,mse
0,GBM_2_AutoML_20190414_134833,0.928782,0.277182,0.171568,0.296435,0.087873
1,GBM_1_AutoML_20190414_134833,0.928271,0.278211,0.171073,0.2968,0.08809
2,GBM_3_AutoML_20190414_134833,0.928138,0.278434,0.165076,0.297099,0.088268
3,StackedEnsemble_AllModels_AutoML_20190414_134833,0.927902,0.292566,0.176637,0.300268,0.090161
4,GBM_4_AutoML_20190414_134833,0.927433,0.279514,0.172404,0.297857,0.088719
5,StackedEnsemble_BestOfFamily_AutoML_20190414_1...,0.927421,0.294523,0.173164,0.301016,0.090611
6,GBM_grid_1_AutoML_20190414_134833_model_11,0.92683,0.28122,0.170049,0.298375,0.089028
7,GBM_grid_1_AutoML_20190414_134833_model_9,0.926748,0.287142,0.171274,0.299349,0.08961
8,GBM_5_AutoML_20190414_134833,0.926247,0.281368,0.173704,0.298931,0.08936
9,GBM_grid_1_AutoML_20190414_134833_model_1,0.924765,0.284611,0.167595,0.299978,0.089987


In [48]:
#  Get best_models and coef_norm()
best_models={}
best_models=stackedensemble(mod_best)
bm=[]
if best_models is not None: 
  if 'Intercept' in best_models.keys():
    del best_models['Intercept']
  bm=list(best_models.keys())
else:
  best_models={}
  bm=stackedensemble_df(aml_leaderboard_df)   
  for b in bm:   
    best_models[b]=None

if mod_best.model_id not in bm:
    bm.append(mod_best.model_id)

In [49]:

bm

['GBM_2_AutoML_20190414_134833',
 'XRT_1_AutoML_20190414_134833',
 'DRF_1_AutoML_20190414_134833',
 'DeepLearning_1_AutoML_20190414_134833',
 'GLM_grid_1_AutoML_20190414_134833_model_1',
 'StackedEnsemble_BestOfFamily_AutoML_20190414_134833']

In [50]:
# Best of Family leaderboard

aml_leaderboard_df=aml_leaderboard_df.loc[aml_leaderboard_df['model_id'].isin(bm)]

In [51]:
aml_leaderboard_df

Unnamed: 0,model_id,auc,logloss,mean_per_class_error,rmse,mse
0,GBM_2_AutoML_20190414_134833,0.928782,0.277182,0.171568,0.296435,0.087873
5,StackedEnsemble_BestOfFamily_AutoML_20190414_1...,0.927421,0.294523,0.173164,0.301016,0.090611
18,XRT_1_AutoML_20190414_134833,0.917033,0.299058,0.183626,0.307478,0.094543
19,DRF_1_AutoML_20190414_134833,0.91697,0.303577,0.17777,0.306905,0.094191
22,DeepLearning_1_AutoML_20190414_134833,0.909855,0.312267,0.183988,0.315831,0.099749
27,GLM_grid_1_AutoML_20190414_134833_model_1,0.906792,0.317975,0.197513,0.318892,0.101692


In [52]:
# save leaderboard
leaderboard_stats=run_id+'_leaderboard.csv'
aml_leaderboard_df.to_csv(leaderboard_stats)

In [75]:
mod_best.params

{'model_id': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'ModelKeyV3',
    'schema_type': 'Key<Model>'},
   'name': 'GBM_2_AutoML_20190414_134833',
   'type': 'Key<Model>',
   'URL': '/3/Models/GBM_2_AutoML_20190414_134833'}},
 'training_frame': {'default': None,
  'actual': {'__meta': {'schema_version': 3,
    'schema_name': 'FrameKeyV3',
    'schema_type': 'Key<Frame>'},
   'name': 'automl_training_adult1.hex',
   'type': 'Key<Frame>',
   'URL': '/3/Frames/automl_training_adult1.hex'}},
 'validation_frame': {'default': None, 'actual': None},
 'nfolds': {'default': 0, 'actual': 5},
 'keep_cross_validation_models': {'default': True, 'actual': False},
 'keep_cross_validation_predictions': {'default': False, 'actual': True},
 'keep_cross_validation_fold_assignment': {'default': False, 'actual': False},
 'score_each_iteration': {'default': False, 'actual': False},
 'score_tree_interval': {'default': 0, 'actual': 5},
 'fold_assignment': {'default': 'AU

In [53]:
top=aml_leaderboard_df.iloc[0]['model_id']
print(top)

GBM_2_AutoML_20190414_134833


In [54]:
mod_best=h2o.get_model(top)
print(mod_best._id)
print(mod_best.algo)

GBM_2_AutoML_20190414_134833
gbm


In [55]:
meta_data['mod_best']=mod_best._id
meta_data['mod_best_algo']=mod_best.algo

In [56]:
meta_data['models']=bm

In [57]:
models_path=os.path.join(run_dir,'models')
for mod in bm:
  try:   
    m=h2o.get_model(mod) 
    h2o.save_model(m, path = models_path)
  except:    
    pass

In [58]:
print(models_path)

C:\Users\pkash\h\models


In [59]:
# GBM
 
mod,mod_id=get_model_by_algo("GBM",best_models)
if mod is not None:
    try:     
        sh_df=mod.scoring_history()
        sh_df.to_csv(run_id+'_gbm_scoring_history.csv') 
    except:
        pass   
    try:     
        stats_gbm={}
        stats_gbm=gbm_stats(mod)
        n=run_id+'_gbm_stats.json'
        dict_to_json(stats_gbm,n)
        print(stats_gbm)
    except:
        pass

{'algo': 'gbm', 'model_id': 'GBM_2_AutoML_20190414_134833', 'varimp': [('marital-status', 4982.685546875, 1.0, 0.21046852555396994), ('relationship', 4489.21044921875, 0.9009620227859364, 0.1896241485158783), ('capital-gain', 4253.80859375, 0.8537180509851497, 0.1796807794296534), ('education', 2188.07421875, 0.4391355220323503, 0.09242420584052165), ('occupation', 2005.0487060546875, 0.40240321954737795, 0.08469321229630708), ('educational-num', 1368.537841796875, 0.2746586813320346, 0.057807007690551), ('capital-loss', 1321.5484619140625, 0.26522814845157155, 0.05582217734001172), ('age', 1097.81005859375, 0.22032497300221274, 0.046371472210493946), ('hours-per-week', 674.6400146484375, 0.13539686747271312, 0.028496779061608205), ('native-country', 556.0067138671875, 0.11158775897786671, 0.02348571110194338), ('workclass', 399.66064453125, 0.08020988697187723, 0.016881656645821582), ('fnlwgt', 205.52452087402344, 0.04124774058899274, 0.008681351144197942), ('gender', 78.3515319824218

In [60]:
if mod is not None:
    try:    
        sh_df=mod.scoring_history()
        sh_df.to_csv(run_id+'_dl_scoring_history.csv') 
    except:
        pass 
    try:
        stats_dl={}
        stats_dl=dl_stats(mod)
        n=run_id+'_dl_stats.json'
        dict_to_json(stats_dl,n)
        print(stats_dl)
    except:
        pass    
    try:
        cf=mod.confusion_matrix()    
        cf_df.to_csv(run_id+'_dl_confusion_matrix.csv')
    except:
        pass

{'algo': 'gbm', 'model_id': 'GBM_2_AutoML_20190414_134833', 'varimp': [('marital-status', 4982.685546875, 1.0, 0.21046852555396994), ('relationship', 4489.21044921875, 0.9009620227859364, 0.1896241485158783), ('capital-gain', 4253.80859375, 0.8537180509851497, 0.1796807794296534), ('education', 2188.07421875, 0.4391355220323503, 0.09242420584052165), ('occupation', 2005.0487060546875, 0.40240321954737795, 0.08469321229630708), ('educational-num', 1368.537841796875, 0.2746586813320346, 0.057807007690551), ('capital-loss', 1321.5484619140625, 0.26522814845157155, 0.05582217734001172), ('age', 1097.81005859375, 0.22032497300221274, 0.046371472210493946), ('hours-per-week', 674.6400146484375, 0.13539686747271312, 0.028496779061608205), ('native-country', 556.0067138671875, 0.11158775897786671, 0.02348571110194338), ('workclass', 399.66064453125, 0.08020988697187723, 0.016881656645821582), ('fnlwgt', 205.52452087402344, 0.04124774058899274, 0.008681351144197942), ('gender', 78.3515319824218

In [61]:
# DRF

mod,mod_id=get_model_by_algo("DRF",best_models)
if mod is not None:
    try:     
         sh_df=mod.scoring_history()
         sh_df.to_csv(run_id+'_drf_scoring_history.csv') 
    except:
         pass  
    try: 
         stats_drf={}
         stats_drf=drf_stats(mod)
         n=run_id+'_drf_stats.json'
         dict_to_json(stats_drf,n)
         print(stats_drf)
    except:
         pass

{'algo': 'drf', 'model_id': 'DRF_1_AutoML_20190414_134833', 'varimp': [('relationship', 38074.50390625, 1.0, 0.17143447305800447), ('capital-gain', 35791.21875, 0.940031125241393, 0.16115374061388119), ('marital-status', 27549.97265625, 0.7235806072243431, 0.12404666011449617), ('occupation', 19606.16015625, 0.5149419728363581, 0.0882788057686503), ('age', 19287.576171875, 0.506574589109982, 0.0868443477486449), ('educational-num', 15617.2763671875, 0.41017675254920116, 0.07031843543391579), ('education', 15397.677734375, 0.4044091492902536, 0.06932966940841048), ('hours-per-week', 13397.8212890625, 0.3518843297880297, 0.06032510465457993), ('fnlwgt', 11093.0556640625, 0.29135128566288565, 0.049947654132388934), ('capital-loss', 8600.2822265625, 0.2258803489006392, 0.03872367858793928), ('workclass', 7074.70263671875, 0.18581207661007565, 0.03185459544146187), ('native-country', 6015.7763671875, 0.15800012475540093, 0.027086668130541124), ('race', 2415.246337890625, 0.06343474215284937




In [62]:
# XRT

mod,mod_id=get_model_by_algo("XRT",best_models)
if mod is not None:
    try:     
         sh_df=mod.scoring_history()
         sh_df.to_csv(run_id+'_xrt_scoring_history.csv')
    except:
         pass     
    try:        
         stats_xrt={}
         stats_xrt=xrt_stats(mod)
         n=run_id+'_xrt_stats.json'
         dict_to_json(stats_xrt,n)
         print(stats_xrt)
    except:
         pass

{'algo': 'drf', 'model_id': 'XRT_1_AutoML_20190414_134833', 'varimp': [('marital-status', 42355.29296875, 1.0, 0.17544589551883344), ('relationship', 39223.8359375, 0.9260669254829519, 0.1624746410517293), ('capital-gain', 32067.669921875, 0.7571112764002064, 0.13283206589544125), ('educational-num', 27336.921875, 0.6454192607089119, 0.11323616018017849), ('occupation', 22754.20703125, 0.5372222793509703, 0.09425344389339989), ('age', 16493.537109375, 0.38940911402841755, 0.06832023073391125), ('education', 16280.908203125, 0.38438898805722244, 0.06743947023727756), ('hours-per-week', 11570.66015625, 0.2731809732678961, 0.04792848049369253), ('capital-loss', 8209.34765625, 0.1938210570826864, 0.03400510893027885), ('native-country', 6846.90673828125, 0.1616540993668239, 0.028361548227702905), ('fnlwgt', 6453.9111328125, 0.15237555168309747, 0.026733665120217322), ('workclass', 6225.85595703125, 0.1469912145720424, 0.02578900527399298), ('gender', 3771.224365234375, 0.08903785338037463,




In [63]:
# GLM

mod,mod_id=get_model_by_algo("GLM",best_models)
if mod is not None:
    try:     
         stats_glm={}
         stats_glm=glm_stats(mod)
         n=run_id+'_glm_stats.json'
         dict_to_json(stats_glm,n)
         print(stats_glm)
    except:
         pass

{'algo': 'glm', 'model_id': 'GLM_grid_1_AutoML_20190414_134833_model_1', 'coef': {'Intercept': -8.130286272637353, 'native-country.?': 0.065729266621754, 'native-country.Cambodia': 0.5572126666010029, 'native-country.Canada': 0.5901026055722867, 'native-country.China': -0.38607416793358634, 'native-country.Columbia': -1.0542652388600449, 'native-country.Cuba': 0.23552384656444758, 'native-country.Dominican-Republic': -0.525465533873249, 'native-country.Ecuador': -0.13221664295236543, 'native-country.El-Salvador': -0.29393625283317865, 'native-country.England': 0.49511135261503186, 'native-country.France': 0.5537852277774868, 'native-country.Germany': 0.21203688770410686, 'native-country.Greece': -0.004900891142730397, 'native-country.Guatemala': -0.18203537829312894, 'native-country.Haiti': 0.21250593926755865, 'native-country.Holand-Netherlands': -0.005788764323843744, 'native-country.Honduras': 0.021601287801425277, 'native-country.Hong': -0.1714622937455509, 'native-country.Hungary'




In [64]:
# split into training and test for showing how to predict
train, test = df.split_frame([0.8])

In [65]:
def predictions_test(mod,test,run_id):
    mod_perf=mod_best.model_performance(test)          
    stats_test={}
    stats_test=model_performance_stats(mod_perf)
    n=run_id+'_test_stats.json'
    dict_to_json(stats_test,n) 
    try:
      cf=mod_perf.confusion_matrix()
#      cf=mod_perf.confusion_matrix(metrics=["f1","f2","f0point5","accuracy","precision","recall","specificity","absolute_mcc","min_per_class_accuracy","mean_per_class_accuracy"])
      cf_df=cf.table.as_data_frame()
      cf_df.to_csv(run_id+'_test_confusion_matrix.csv')
    except:
      pass
    predictions = mod_best.predict(test)    
    predictions_df=test.cbind(predictions).as_data_frame() 
    predictions_df.to_csv(run_id+'_predictions.csv')
    return predictions
predictions_df=predictions_test(mod_best,test,run_id)

gbm prediction progress: |████████████████████████████████████████████████| 100%


In [66]:

predictions_df.describe()

Rows:9671
Cols:3




Unnamed: 0,predict,<=50K,>50K
type,enum,real,real
mins,,0.0025501664982057193,0.0008823267062313822
mean,,0.7578658951048259,0.24213410489517412
maxs,,0.9991176732937687,0.9974498335017943
sigma,,0.31058609488580313,0.31058609488580313
zeros,,0,0
missing,0,0,0
0,<=50K,0.8299518453155703,0.17004815468442974
1,<=50K,0.9955580090784486,0.004441990921551417
2,<=50K,0.8339106381201675,0.16608936187983253


In [67]:
# Update and save meta data

meta_data['end_time'] = time.time()
meta_data['execution_time'] = meta_data['end_time'] - meta_data['start_time']
  
n=run_id+'_meta_data.json'
dict_to_json(meta_data,n)

In [68]:
meta_data

{'start_time': 1555264074.8204637,
 'target': None,
 'server_path': 'C:\\Users\\pkash',
 'data_path': 'C:/Users/pkash/Downloads/adult.csv',
 'test_path': None,
 'max_models': 9,
 'run_time': 1500,
 'run_id': 'h',
 'scale': False,
 'classification': False,
 'model_path': None,
 'balance': False,
 'balance_threshold': 0.2,
 'project': None,
 'end_time': 1555265503.9605212,
 'execution_time': 1429.1400575637817,
 'run_path': 'C:\\Users\\pkash\\h',
 'nthreads': 1,
 'min_mem_size': 6,
 'analysis': 0,
 'x': ['age',
  'workclass',
  'fnlwgt',
  'education',
  'educational-num',
  'marital-status',
  'occupation',
  'relationship',
  'race',
  'gender',
  'capital-gain',
  'capital-loss',
  'hours-per-week',
  'native-country'],
 'variables': {'age': 'int',
  'workclass': 'enum',
  'fnlwgt': 'int',
  'education': 'enum',
  'educational-num': 'int',
  'marital-status': 'enum',
  'occupation': 'enum',
  'relationship': 'enum',
  'race': 'enum',
  'gender': 'enum',
  'capital-gain': 'int',
  'cap

In [69]:
# Save logs
h2o.download_all_logs(dirname=logs_path, filename=logfile)

Writing H2O logs to C:\Users\pkash\h\logs\h_autoh2o_log.zip


'C:\\Users\\pkash\\h\\logs\\h_autoh2o_log.zip'

In [70]:
# Clean up
os.chdir(server_path)

In [71]:
def predictions_test(mod,test,run_id):
    mod_perf=mod_best.model_performance(test)          
    stats_test={}
    stats_test=model_performance_stats(mod_perf)
    print(stats_test)   
    n=run_id+'_test_stats.json'
    dict_to_json(stats_test,n) 
    try:
      cf=mod_perf.confusion_matrix()
      print(cf)   
#      cf=mod_perf.confusion_matrix(metrics=["f1","f2","f0point5","accuracy","precision","recall","specificity","absolute_mcc","min_per_class_accuracy","mean_per_class_accuracy"])
      cf_df=cf.table.as_data_frame()
      print(cf_df)    
      cf_df.to_csv(run_id+'_test_confusion_matrix.csv')
    except:
      pass
    predictions = mod_best.predict(test)    
    predictions_df=test.cbind(predictions).as_data_frame() 
    predictions_df.to_csv(run_id+'_predictions.csv')
    return predictions

In [72]:
h2o.cluster().shutdown()

H2O session _sid_b770 closed.
