## Loading the libraries and dataset

In [28]:
#Load the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 


#Load the data
df = pd.read_csv("/Users/christine/Desktop/group_project/pharmacy_tx.csv")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13910244 entries, 0 to 13910243
Data columns (total 9 columns):
 #   Column       Dtype  
---  ------       -----  
 0   tx_date      object 
 1   pharmacy     object 
 2   diagnosis    object 
 3   drug         object 
 4   bin          int64  
 5   pcn          object 
 6   group        object 
 7   rejected     bool   
 8   patient_pay  float64
dtypes: bool(1), float64(1), int64(1), object(6)
memory usage: 862.3+ MB


In [42]:
df['bin'] = df['bin'].astype(str)
df.select_dtypes(include='object').columns

df.drop(['tx_date', 'pcn', 'group'],axis=1)


Unnamed: 0,pharmacy,diagnosis,drug,bin,rejected,patient_pay
0,Pharmacy #6,G99.93,branded tanoclolol,725700,False,13.39
1,Pharmacy #42,U60.52,branded oxasoted,664344,False,7.02
2,Pharmacy #37,Q85.91,branded cupitelol,725700,False,13.39
3,Pharmacy #30,U60.52,generic oxasoted,571569,False,10.84
4,Pharmacy #18,N55.01,branded mamate,664344,False,47.00
...,...,...,...,...,...,...
13910239,Pharmacy #42,U27.71,branded colifunene,322463,True,0.00
13910240,Pharmacy #45,N59.44,generic tafistitrisin,664344,False,6.28
13910241,Pharmacy #54,W50.87,generic tanoclolol,691847,False,6.94
13910242,Pharmacy #0,I68.27,branded prazinib,96934,False,13.93


## Testing out H20 (classifier)

In [43]:
import os

java_path = "/Library/Java/JavaVirtualMachines/jdk-13.0.2.jdk/Contents/Home"
java_home = os.environ.get('JAVA_HOME', None)

if (not java_home) or (java_path not in java_home):
    os.environ['JAVA_HOME'] = java_path

print("Updated Java Home: ",os.environ.get('JAVA_HOME', None))

Updated Java Home:  /Library/Java/JavaVirtualMachines/jdk-13.0.2.jdk/Contents/Home


In [44]:
import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,34 mins 05 secs
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.4
H2O_cluster_version_age:,"2 years, 5 months and 19 days !!!"
H2O_cluster_name:,H2O_from_python_christine_cusijr
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.488 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [45]:
covermymeds = h2o.H2OFrame(df)

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [48]:
training_columns = ["pharmacy", "diagnosis", "drug", "bin", "rejected"]
response_column = "patient_pay"

train, test = covermymeds.split_frame(ratios=[0.8])

In [49]:
from h2o.estimators import H2ORandomForestEstimator
model = H2ORandomForestEstimator(ntrees=50, max_depth=20, nfolds=10)
model.train(x=training_columns, y=response_column, training_frame=train)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [51]:
performance = model.model_performance(test_data=test)
print(performance)


ModelMetricsRegression: drf
** Reported on test data. **

MSE: 393.94293526322207
RMSE: 19.847995749274588
MAE: 9.905121687888428
RMSLE: 0.6593340761834802
Mean Residual Deviance: 393.94293526322207



In [57]:
from h2o.estimators.gbm import H2OGradientBoostingEstimator
gbmtrain, gbmvalid, gbmtest = covermymeds.split_frame(ratios=[0.6,0.2], seed=100)
training_columns = ["pharmacy", "diagnosis", "drug", "bin", "rejected"]
response_column = "patient_pay"

gbm = H2OGradientBoostingEstimator()


In [58]:
gbm.train(x=training_columns, y=response_column, training_frame=gbmtrain)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [59]:
print(gbm)

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_model_python_1669009420170_3


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,50.0,50.0,28636.0,5.0,5.0,5.0,21.0,32.0,26.16




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 229.53813861611474
RMSE: 15.150516117153064
MAE: 6.081173017139265
RMSLE: NaN
Mean Residual Deviance: 229.53813861611474

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2022-11-21 00:26:44,0.001 sec,0.0,39.545644,20.793193,1563.857992
1,,2022-11-21 00:26:45,1.640 sec,1.0,36.373624,19.109769,1323.040493
2,,2022-11-21 00:26:47,2.934 sec,2.0,33.559755,17.600768,1126.257185
3,,2022-11-21 00:26:51,6.936 sec,6.0,25.34129,13.07039,642.180985
4,,2022-11-21 00:26:55,11.568 sec,11.0,19.818957,9.895332,392.791056
5,,2022-11-21 00:27:00,16.353 sec,16.0,17.310196,8.256889,299.64287
6,,2022-11-21 00:27:04,20.446 sec,20.0,16.402787,7.495479,269.051422
7,,2022-11-21 00:27:08,24.687 sec,24.0,15.907915,6.991141,253.061771
8,,2022-11-21 00:27:13,28.865 sec,28.0,15.643187,6.672821,244.70931
9,,2022-11-21 00:27:17,33.002 sec,32.0,15.486011,6.456046,239.816531



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,drug,35391710000.0,1.0,0.603783
1,rejected,19051710000.0,0.53831,0.325023
2,bin,4070591000.0,0.115015,0.069444
3,pharmacy,64986390.0,0.001836,0.001109
4,diagnosis,37577470.0,0.001062,0.000641





In [60]:
perf = gbm.model_performance(gbmvalid)
print(perf)


ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 227.1602705507389
RMSE: 15.071836999872938
MAE: 6.069839826655207
RMSLE: NaN
Mean Residual Deviance: 227.1602705507389



In [62]:
from h2o.estimators import H2OXGBoostEstimator

xgb = H2OXGBoostEstimator(
    ntrees = 3000,
    learn_rate = 0.05,
    stopping_rounds = 20,
    stopping_metric = "RMSE",
    nfolds=4, 
    seed=100)

xgb.train(x=training_columns, y=response_column, training_frame = gbmtrain, validation_frame=gbmvalid)

xgboost Model Build progress: |████████████████████████

H2OConnectionError: Unexpected HTTP error: ('Connection aborted.', BadStatusLine('GET /3/Jobs/$03017f00000132d4ffffffff$_b08aa1326108d0fae98fd940ce414489 HTTP/1.1\r\n'))

In [65]:
from h2o.automl import H2OAutoML
aml = H2OAutoML(max_models = 10, max_runtime_secs=100, seed = 1)
aml.train(x=training_columns, y=response_column, training_frame=gbmtrain, validation_frame=gbmvalid)

AutoML progress: |
09:54:20.694: User specified a validation frame with cross-validation still enabled. Please note that the models will still be validated using cross-validation only, the validation frame will be used to provide purely informative validation metrics on the trained models.

████████████████████████████████████████████████████████| 100%


In [66]:
lb = aml.leaderboard
lb

model_id,mean_residual_deviance,rmse,mse,mae,rmsle
XGBoost_1_AutoML_20221121_095420,1374.55,37.0749,1374.55,17.0338,0.983309




AttributeError: type object 'ModelBase' has no attribute 'metalearner'