# Research: using H2O's RESTful API from Python to read data, train models, and store them

**0. Import libraries**


In [286]:
import requests
from requests.compat import urljoin, quote_plus
from requests.utils import requote_uri
from time import sleep
import shutil
import os

**1. Define variables**

In [54]:
# Member short name -> bucket name used when building the s3n:// path.
bucket_lookup = dict(
    cottage='datalake.cottage-health.e1.a3bc.phi',
    salinas='datalake.salinas-valley-memorial-healthcare-sy.e1.0803.phi',
)

In [210]:
# Root of the H2O REST API. The trailing slash is required: without it
# urljoin would replace the final "/3" segment instead of appending to it.
base_url = 'http://10.115.118.48:54321/3/'

# One URL per endpoint used in this notebook, in the order of the names on the left.
import_url, parse_setup_url, parse_url, jobs_url, gbm_url, xgboost_url = (
    urljoin(base_url, endpoint)
    for endpoint in ('ImportFiles', 'ParseSetup', 'Parse', 'Jobs',
                     'ModelBuilders/gbm', 'ModelBuilders/xgboost')
)

In [43]:
# Which member, environment and file to load.
member_name, mode = 'cottage', 'dev'
filepath = 'data-science/201808/training-data/training_kbm_only.csv'
bucket_name = bucket_lookup[member_name]

# Layout: s3n://<bucket>/<mode>/cdp/<filepath>
root_path = 's3n://' + '/'.join((bucket_name, mode, 'cdp', filepath))
print(root_path)

s3n://datalake.cottage-health.e1.a3bc.phi/dev/cdp/data-science/201808/training-data/training_kbm_only.csv


**2. Make a sample request for importing data using H2O's import API**

In [4]:
# curl: curl -X GET http://10.115.118.48:54321/3/ImportFiles?path=s3n://datalake.cottage-health.e1.a3bc.phi/dev/cdp/data-science/201808/training-data/training_kbm_only.csv

In [189]:
# Same request as the curl example: the S3 path is hard-coded in the query string.
r = requests.get('http://10.115.118.48:54321/3/ImportFiles?path=s3n://datalake.cottage-health.e1.a3bc.phi/dev/cdp/data-science/201808/training-data/training_kbm_only.csv')

In [190]:
# Show the raw (undecoded) response body of the ImportFiles call.
r.content

b'{"__meta":{"schema_version":3,"schema_name":"ImportFilesV3","schema_type":"ImportFiles"},"_exclude_fields":"","path":"s3n://datalake.cottage-health.e1.a3bc.phi/dev/cdp/data-science/201808/training-data/training_kbm_only.csv","pattern":null,"files":["s3n://datalake.cottage-health.e1.a3bc.phi/dev/cdp/data-science/201808/training-data/training_kbm_only.csv/part-00000-066747cf-5598-4c17-b159-400555d4f040-c000.csv"],"destination_frames":["s3n://datalake.cottage-health.e1.a3bc.phi/dev/cdp/data-science/201808/training-data/training_kbm_only.csv/part-00000-066747cf-5598-4c17-b159-400555d4f040-c000.csv"],"fails":[],"dels":[]}'

In [11]:
# Pass as parameters

In [192]:
# Query-string parameters for ImportFiles; the path is built in the config cell.
import_params = dict(path=root_path)

In [193]:
# Let requests build and URL-encode the query string from import_params.
r = requests.get(import_url, params=import_params)
# Stop here on an HTTP error status instead of treating an error payload
# as a successful import in the cells below.
r.raise_for_status()

In [194]:
# Raw response body of the parameterised ImportFiles request.
r.content

b'{"__meta":{"schema_version":3,"schema_name":"ImportFilesV3","schema_type":"ImportFiles"},"_exclude_fields":"","path":"s3n://datalake.cottage-health.e1.a3bc.phi/dev/cdp/data-science/201808/training-data/training_kbm_only.csv","pattern":null,"files":["s3n://datalake.cottage-health.e1.a3bc.phi/dev/cdp/data-science/201808/training-data/training_kbm_only.csv/part-00000-066747cf-5598-4c17-b159-400555d4f040-c000.csv"],"destination_frames":["s3n://datalake.cottage-health.e1.a3bc.phi/dev/cdp/data-science/201808/training-data/training_kbm_only.csv/part-00000-066747cf-5598-4c17-b159-400555d4f040-c000.csv"],"fails":[],"dels":[]}'

In [195]:
# Retrieve content
# Decode the JSON body of the ImportFiles response.
import_result = r.json()

In [196]:
# Keys of the frames created by the import; these are passed to ParseSetup
# as its source_frames.
destination_frames = import_result['destination_frames']
print(destination_frames)

['s3n://datalake.cottage-health.e1.a3bc.phi/dev/cdp/data-science/201808/training-data/training_kbm_only.csv/part-00000-066747cf-5598-4c17-b159-400555d4f040-c000.csv']


**3. Parse setup**

In [197]:
# parse setup
# curl -X POST http://10.115.118.48:54321/3/ParseSetup --data 'source_frames=["s3n://datalake.cottage-health.e1.a3bc.phi/dev/cdp/data-science/201808/training-data/training_kbm_only.csv/part-00000-066747cf-5598-4c17-b159-400555d4f040-c000.csv"]'


In [198]:
# Form payload for ParseSetup: the frames produced by the import step.
data_params = dict(source_frames=destination_frames)

In [199]:
# Ask H2O to guess the parse configuration for the imported file(s).
r = requests.post(parse_setup_url, data=data_params)
# Stop here on an HTTP error status; otherwise the lookups on the parsed
# result further down fail with a less helpful KeyError.
r.raise_for_status()

In [200]:
# Decoded ParseSetup response; its fields are reused to build the Parse request.
parse_setup_result = r.json()

In [201]:
# Inspect the guessed parse configuration.
parse_setup_result

{'__meta': {'schema_version': 3,
  'schema_name': 'ParseSetupV3',
  'schema_type': 'ParseSetup'},
 '_exclude_fields': '',
 'source_frames': [{'__meta': {'schema_version': 3,
    'schema_name': 'FrameKeyV3',
    'schema_type': 'Key<Frame>'},
   'name': 's3n://datalake.cottage-health.e1.a3bc.phi/dev/cdp/data-science/201808/training-data/training_kbm_only.csv/part-00000-066747cf-5598-4c17-b159-400555d4f040-c000.csv',
   'type': 'Key<Frame>',
   'URL': '/3/Frames/s3n://datalake.cottage-health.e1.a3bc.phi/dev/cdp/data-science/201808/training-data/training_kbm_only.csv/part-00000-066747cf-5598-4c17-b159-400555d4f040-c000.csv'}],
 'parse_type': 'CSV',
 'separator': 44,
 'single_quotes': False,
 'check_header': 1,
 'column_names': ['person_key',
  'city',
  'state',
  'zipcode',
  'zipplus4',
  'unit_number_present_flag',
  'carrier_route',
  'numeric_state_code',
  'numeric_county_code',
  'designated_market_area',
  'core_based_statistical_area',
  'nielsen_county_size',
  'latitude',
  'lon

**4. Parse**

In [146]:
#curl -X POST http://127.0.0.1:54321/3/Parse --data 'destination_frame=arrhythmia.hex&source_frames=["http://s3.amazonaws.com/h2o-public-test-data/smalldata/flow_examples/arrhythmia.csv.gz"]&parse_type=CSV&separator=44&number_columns=280&single_quotes=false&column_names=&column_types=["Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Num
eric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric","Numeric"]&check_header=-1&delete_on_done=true&chunk_size=4194304'


In [202]:
# Payload for the Parse endpoint. The destination frame name is chosen here;
# the remaining settings are taken unchanged from the ParseSetup result.
parse_params = dict(
    destination_frame='kbm.hex',
    source_frames=[parse_setup_result['source_frames'][0]['name']],
)
parse_params.update(
    (field, parse_setup_result[field])
    for field in ('parse_type', 'separator', 'number_columns', 'single_quotes',
                  'column_names', 'column_types', 'check_header')
)
parse_params.update(delete_on_done='true',
                    chunk_size=parse_setup_result['chunk_size'])

In [203]:
# Start the parse job.
# NOTE(review): requests form-encodes list values (column_names, column_types,
# source_frames) as repeated keys rather than one bracketed array string --
# confirm that H2O interprets this as intended.
r = requests.post(parse_url, data=parse_params)
# Stop here on an HTTP error status; the next cells expect a 'job' entry
# that an error payload would not contain.
r.raise_for_status()

In [204]:
# Decoded Parse response; the job key is read from it when polling.
parse_result = r.json()

In [259]:
def poll(job_key, interval=1, request_timeout=30):
    """Block until the H2O job identified by ``job_key`` is no longer in flight.

    Repeatedly GETs ``jobs_url + '/' + <encoded key>`` and prints the job's
    status and progress while it has not finished.

    Parameters
    ----------
    job_key : str
        Raw job key, e.g. ``result['job']['key']['name']``. The key is
        percent-encoded here, so callers must NOT encode it beforehand.
    interval : float, optional
        Seconds to wait between two status requests.
    request_timeout : float, optional
        Timeout in seconds applied to each HTTP request.

    Returns
    -------
    dict
        The job entry of the last response. Any status other than
        CREATED/RUNNING ends the wait (e.g. ``DONE``); a job that did not
        succeed is returned as well, so check ``status`` / ``exception``
        before relying on ``dest``.

    Raises
    ------
    requests.RequestException
        If a request fails or the server answers with an HTTP error status.
    RuntimeError
        If the response does not contain exactly one job.
    """
    url = jobs_url + '/' + quote_plus(job_key)
    # H2O jobs can report CREATED before they switch to RUNNING (the official
    # h2o Python client waits on both), so neither counts as finished.
    in_flight = ('CREATED', 'RUNNING')
    while True:
        try:
            r = requests.get(url, timeout=request_timeout)
            r.raise_for_status()
        except requests.RequestException:
            print('Request for job {0} failed'.format(job_key))
            raise
        jobs = r.json()['jobs']
        if len(jobs) != 1:
            raise RuntimeError('Could not find the job')
        job = jobs[0]
        status = job['status']
        print(status)
        if status not in in_flight:
            return job
        print('progress: {0}'.format(job['progress']))
        sleep(interval)

In [206]:
# Pass the raw job key: poll() percent-encodes it itself, so encoding it here
# as well would double-encode the key and the job lookup would fail.
# parse_result is rebound from the Parse response to the finished job entry.
parse_result = poll(parse_result['job']['key']['name'])

DONE


In [207]:
# Inspect the job entry returned by poll(); 'dest' names the parsed frame.
parse_result

{'__meta': {'schema_version': 3, 'schema_name': 'JobV3', 'schema_type': 'Job'},
 'key': {'__meta': {'schema_version': 3,
   'schema_name': 'JobKeyV3',
   'schema_type': 'Key<Job>'},
  'name': '$03010a73763032d4ffffffff$_bbea25da3100d990ca519888ab364db7',
  'type': 'Key<Job>',
  'URL': '/3/Jobs/$03010a73763032d4ffffffff$_bbea25da3100d990ca519888ab364db7'},
 'description': 'Parse',
 'status': 'DONE',
 'progress': 1.0,
 'progress_msg': 'Done.',
 'start_time': 1535739865749,
 'msec': 77492,
 'dest': {'__meta': {'schema_version': 3,
   'schema_name': 'FrameKeyV3',
   'schema_type': 'Key<Frame>'},
  'name': 'kbm.hex',
  'type': 'Key<Frame>',
  'URL': '/3/Frames/kbm.hex'},
 'exception': None,
 'stacktrace': None,
 'ready_for_view': True}

**5. Training**

In [None]:
# Train
# curl -X POST http://127.0.0.1:54321/3/ModelBuilders/gbm --data 'model_id=gbm-51b9780b-70d0-40d0-9b5a-c723a3f358c1&training_frame=arrhythmia.hex&score_each_iteration=false&response_column=C1&ntrees=20&max_depth=5&min_rows=25&nbins=20&learn_rate=0.3&distribution=AUTO&balance_classes=false&max_confusion_matrix_size=20&max_hit_ratio_k=10&class_sampling_factors=&max_after_balance_size=5&seed=0'


In [227]:
# Columns kept for training: the predictors plus the response column
# ("ed_count_response").
train_columns = [
    "d_age_group_kbm",
    "d_generations_imputed",
    "ethnic_group",
    "insurance_responder_index_auto",
    "gender",
    "channel_preference_index_phone",
    "student_loan_index",
    "uninsured_index",
    "insurance_index_prearranged_funeral",
    "channel_preference_index_web",
    "health_insurance_direct_pay_index",
    "channel_preference_index_text",
    "medicare_supplement_insurance_buyer_index",
    "health_index_use_wearable_deviceto_manage_health",
    "d_individual_occupation_grouped",
    "ed_count_response",
]
# Every other column of the parsed frame is excluded from training.
ignored_columns = [
    name for name in parse_params['column_names']
    if name not in train_columns
]

In [223]:
# Total number of columns in the parse request.
len(parse_params['column_names'])

846

In [231]:
# Form payload for the GBM model builder: trains on the parsed frame with
# 5-fold cross-validation and a fixed seed.
gbm1_params = dict(
    model_id='gbm_test',
    response_column='ed_count_response',
    ignored_columns=ignored_columns,
    training_frame=parse_result['dest']['name'],
    distribution='gaussian',
    ntrees=50,
    max_depth=3,
    min_rows=2,
    learn_rate=0.4,
    nfolds=5,
    fold_assignment='Stratified',
    keep_cross_validation_predictions='true',
    seed=1,
)

In [260]:
# Submit the GBM training job.
r = requests.post(gbm_url, data=gbm1_params)
# Stop here on an HTTP error status (e.g. rejected parameters); the next
# cells expect a 'job' entry that an error payload would not contain.
r.raise_for_status()

In [261]:
# Decoded model-builder response; the job key is read from it when polling.
train_result = r.json()

In [262]:
# Wait for the training job; the raw job key is passed because poll() encodes it.
poll(train_result['job']['key']['name'])

RUNNING
progress: 0.02
RUNNING
progress: 0.15333334
RUNNING
progress: 0.32333332
RUNNING
progress: 0.45
RUNNING
progress: 0.5933333
RUNNING
progress: 0.75
RUNNING
progress: 0.8333333
RUNNING
progress: 0.8333333
DONE


{'__meta': {'schema_version': 3, 'schema_name': 'JobV3', 'schema_type': 'Job'},
 'key': {'__meta': {'schema_version': 3,
   'schema_name': 'JobKeyV3',
   'schema_type': 'Key<Job>'},
  'name': '$03010a73763032d4ffffffff$_b7d91679cf7c28417f3c921cb09f45f4',
  'type': 'Key<Job>',
  'URL': '/3/Jobs/$03010a73763032d4ffffffff$_b7d91679cf7c28417f3c921cb09f45f4'},
 'description': 'GBM',
 'status': 'DONE',
 'progress': 1.0,
 'progress_msg': 'Done.',
 'start_time': 1535745164072,
 'msec': 8312,
 'dest': {'__meta': {'schema_version': 3,
   'schema_name': 'ModelKeyV3',
   'schema_type': 'Key<Model>'},
  'name': 'gbm_test',
  'type': 'Key<Model>',
  'URL': '/3/Models/gbm_test'},
 'exception': None,
 'stacktrace': None,
 'ready_for_view': True}

In [320]:
# Fetch the description of the trained model ("gbm_test") from H2O.
r = requests.get('{0}Models/gbm_test'.format(base_url))


**6. Save the model to MOJO**

In [321]:
# Request the model's MOJO export; stream=True defers downloading the body.
r = requests.get('{0}Models/gbm_test/mojo'.format(base_url), stream=True)


In [324]:
# Note: accessing .content reads the whole response body into memory, even
# though the request was made with stream=True.
type(r.content)

bytes

In [323]:
# Abort on an HTTP error status so an error payload is never saved as the
# model archive.
r.raise_for_status()
# Write the MOJO archive to disk in 8 KiB chunks.
with open('gbm_test.zip', 'wb') as f:
    for chunk in r.iter_content(8192):
        f.write(chunk)