## Loading the libraries and dataset

In [28]:
#Load the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 


#Load the data
df = pd.read_csv("/Users/christine/Desktop/group_project/pharmacy_tx.csv")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13910244 entries, 0 to 13910243
Data columns (total 9 columns):
 #   Column       Dtype  
---  ------       -----  
 0   tx_date      object 
 1   pharmacy     object 
 2   diagnosis    object 
 3   drug         object 
 4   bin          int64  
 5   pcn          object 
 6   group        object 
 7   rejected     bool   
 8   patient_pay  float64
dtypes: bool(1), float64(1), int64(1), object(6)
memory usage: 862.3+ MB


In [42]:
#Change bin into string (originally it is a number)
#Dropping date and pcn/group (only keeping the bin insurance information)

df['bin'] = df['bin'].astype(str)
df.select_dtypes(include='object').columns

df.drop(['tx_date', 'pcn', 'group'],axis=1)


Unnamed: 0,pharmacy,diagnosis,drug,bin,rejected,patient_pay
0,Pharmacy #6,G99.93,branded tanoclolol,725700,False,13.39
1,Pharmacy #42,U60.52,branded oxasoted,664344,False,7.02
2,Pharmacy #37,Q85.91,branded cupitelol,725700,False,13.39
3,Pharmacy #30,U60.52,generic oxasoted,571569,False,10.84
4,Pharmacy #18,N55.01,branded mamate,664344,False,47.00
...,...,...,...,...,...,...
13910239,Pharmacy #42,U27.71,branded colifunene,322463,True,0.00
13910240,Pharmacy #45,N59.44,generic tafistitrisin,664344,False,6.28
13910241,Pharmacy #54,W50.87,generic tanoclolol,691847,False,6.94
13910242,Pharmacy #0,I68.27,branded prazinib,96934,False,13.93


## Testing out H20 (Random Forest)

The reason we are testing out h2o is because you can enter in categorical data (without one-hot encoding). We ended up not using h2o, but this is our prelminary exploration. This original test is using 

In [43]:
#I don't have the correct version of Java so I need to load a specific version

import os

java_path = "/Library/Java/JavaVirtualMachines/jdk-13.0.2.jdk/Contents/Home"
java_home = os.environ.get('JAVA_HOME', None)

if (not java_home) or (java_path not in java_home):
    os.environ['JAVA_HOME'] = java_path

print("Updated Java Home: ",os.environ.get('JAVA_HOME', None))


Updated Java Home:  /Library/Java/JavaVirtualMachines/jdk-13.0.2.jdk/Contents/Home


In [44]:
#I couldn't get the updated H2O version to run -- it keeps detecting this older version.
#Since we ended up not using h2o, I never followed up on this issue.

import h2o
from h2o.estimators.gbm import H2OGradientBoostingEstimator

h2o.init()


Checking whether there is an H2O instance running at http://localhost:54321 . connected.


0,1
H2O_cluster_uptime:,34 mins 05 secs
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.4
H2O_cluster_version_age:,"2 years, 5 months and 19 days !!!"
H2O_cluster_name:,H2O_from_python_christine_cusijr
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.488 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [45]:
#Loading in the edited dataframe as an h2o object

covermymeds = h2o.H2OFrame(df)



Parse progress: |█████████████████████████████████████████████████████████| 100%


In [48]:
#setting the training and response columsn
#splitting the training and testing data

training_columns = ["pharmacy", "diagnosis", "drug", "bin", "rejected"]
response_column = "patient_pay"

train, test = covermymeds.split_frame(ratios=[0.8])

In [49]:
#Building and testing the model

from h2o.estimators import H2ORandomForestEstimator
model = H2ORandomForestEstimator(ntrees=50, max_depth=20, nfolds=10)
model.train(x=training_columns, y=response_column, training_frame=train)



drf Model Build progress: |███████████████████████████████████████████████| 100%


In [51]:
#Assessing the performance

performance = model.model_performance(test_data=test)
print(performance)




ModelMetricsRegression: drf
** Reported on test data. **

MSE: 393.94293526322207
RMSE: 19.847995749274588
MAE: 9.905121687888428
RMSLE: 0.6593340761834802
Mean Residual Deviance: 393.94293526322207



In [57]:
#Trying out H2O gradient boost (GBM)


from h2o.estimators.gbm import H2OGradientBoostingEstimator
gbmtrain, gbmvalid, gbmtest = covermymeds.split_frame(ratios=[0.6,0.2], seed=100)
training_columns = ["pharmacy", "diagnosis", "drug", "bin", "rejected"]
response_column = "patient_pay"

gbm = H2OGradientBoostingEstimator()


In [58]:
#Training the GBM

gbm.train(x=training_columns, y=response_column, training_frame=gbmtrain)

gbm Model Build progress: |███████████████████████████████████████████████| 100%


In [59]:
#Print out the model results/metrics
#Interesting to evaluate the variable importance

print(gbm)

Model Details
H2OGradientBoostingEstimator :  Gradient Boosting Machine
Model Key:  GBM_model_python_1669009420170_3


Model Summary: 


Unnamed: 0,Unnamed: 1,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
0,,50.0,50.0,28636.0,5.0,5.0,5.0,21.0,32.0,26.16




ModelMetricsRegression: gbm
** Reported on train data. **

MSE: 229.53813861611474
RMSE: 15.150516117153064
MAE: 6.081173017139265
RMSLE: NaN
Mean Residual Deviance: 229.53813861611474

Scoring History: 


Unnamed: 0,Unnamed: 1,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
0,,2022-11-21 00:26:44,0.001 sec,0.0,39.545644,20.793193,1563.857992
1,,2022-11-21 00:26:45,1.640 sec,1.0,36.373624,19.109769,1323.040493
2,,2022-11-21 00:26:47,2.934 sec,2.0,33.559755,17.600768,1126.257185
3,,2022-11-21 00:26:51,6.936 sec,6.0,25.34129,13.07039,642.180985
4,,2022-11-21 00:26:55,11.568 sec,11.0,19.818957,9.895332,392.791056
5,,2022-11-21 00:27:00,16.353 sec,16.0,17.310196,8.256889,299.64287
6,,2022-11-21 00:27:04,20.446 sec,20.0,16.402787,7.495479,269.051422
7,,2022-11-21 00:27:08,24.687 sec,24.0,15.907915,6.991141,253.061771
8,,2022-11-21 00:27:13,28.865 sec,28.0,15.643187,6.672821,244.70931
9,,2022-11-21 00:27:17,33.002 sec,32.0,15.486011,6.456046,239.816531



Variable Importances: 


Unnamed: 0,variable,relative_importance,scaled_importance,percentage
0,drug,35391710000.0,1.0,0.603783
1,rejected,19051710000.0,0.53831,0.325023
2,bin,4070591000.0,0.115015,0.069444
3,pharmacy,64986390.0,0.001836,0.001109
4,diagnosis,37577470.0,0.001062,0.000641





In [60]:
perf = gbm.model_performance(gbmvalid)
print(perf)


ModelMetricsRegression: gbm
** Reported on test data. **

MSE: 227.1602705507389
RMSE: 15.071836999872938
MAE: 6.069839826655207
RMSLE: NaN
Mean Residual Deviance: 227.1602705507389



In [69]:
h2o.shutdown()


  h2o.shutdown()


H2O session _sid_b10f closed.


We have decided to change how we process the data -- instead of using the dates, we are converting them into day of year. We are also dropping pharmacy and splitting the drug column into 2 (branded vs generic) and drug name. Again, ultimately, we did not use h2o to run this random forest, but this is our preliminary data that we used for comparisons. Some of the parsing code is borrowed from Will's notebook.

In [1]:
#Again, opening the right version of Java that is needed.

import os

java_path = "/Library/Java/JavaVirtualMachines/jdk-13.0.2.jdk/Contents/Home"
java_home = os.environ.get('JAVA_HOME', None)

if (not java_home) or (java_path not in java_home):
    os.environ['JAVA_HOME'] = java_path


    print("Updated Java Home: ",os.environ.get('JAVA_HOME', None))

Updated Java Home:  /Library/Java/JavaVirtualMachines/jdk-13.0.2.jdk/Contents/Home


In [2]:
import h2o
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321 ..... not found.
Attempting to start a local H2O server...
  Java Version: java version "13.0.2" 2020-01-14; Java(TM) SE Runtime Environment (build 13.0.2+8); Java HotSpot(TM) 64-Bit Server VM (build 13.0.2+8, mixed mode, sharing)
  Starting server from /opt/anaconda3/lib/python3.9/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/n3/j1h1x5x96wd16py26gsf_yf80000gn/T/tmptvo2e_lf
  JVM stdout: /var/folders/n3/j1h1x5x96wd16py26gsf_yf80000gn/T/tmptvo2e_lf/h2o_christine_started_from_python.out
  JVM stderr: /var/folders/n3/j1h1x5x96wd16py26gsf_yf80000gn/T/tmptvo2e_lf/h2o_christine_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,America/Los_Angeles
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.30.0.4
H2O_cluster_version_age:,2 years and 6 months !!!
H2O_cluster_name:,H2O_from_python_christine_znmkks
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [26]:
#Load the required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns 
from datetime import datetime
from datetime import date
from sklearn.model_selection import train_test_split
#Load the data
df = pd.read_csv("/Users/christine/Desktop/group_project/pharmacy_tx.csv")

df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13910244 entries, 0 to 13910243
Data columns (total 9 columns):
 #   Column       Dtype  
---  ------       -----  
 0   tx_date      object 
 1   pharmacy     object 
 2   diagnosis    object 
 3   drug         object 
 4   bin          int64  
 5   pcn          object 
 6   group        object 
 7   rejected     bool   
 8   patient_pay  float64
dtypes: bool(1), float64(1), int64(1), object(6)
memory usage: 862.3+ MB


In [27]:
#Converting the date to the day of the year

def get_doy(d):
    return d.dayofyear

df['day'] = pd.to_datetime(df.tx_date).apply(get_doy)
df = df.drop(columns = ['tx_date'])
df.head()

Unnamed: 0,pharmacy,diagnosis,drug,bin,pcn,group,rejected,patient_pay,day
0,Pharmacy #6,G99.93,branded tanoclolol,725700,1UQC,,False,13.39,2
1,Pharmacy #42,U60.52,branded oxasoted,664344,,52H8KH0F83K,False,7.02,2
2,Pharmacy #37,Q85.91,branded cupitelol,725700,1UQC,,False,13.39,2
3,Pharmacy #30,U60.52,generic oxasoted,571569,KB38N,6BYJBW,False,10.84,2
4,Pharmacy #18,N55.01,branded mamate,664344,,ZX2QUWR,False,47.0,2


In [28]:
#Remove rejected column -- and remove rows in which rejected is TRUE (which is when copay is 0)
#Dropping pharmacy, group, pcn.

df = df[df['rejected']==False]
df = df.drop(columns = ['pharmacy', 'group', 'pcn', 'rejected'])
df.head()

Unnamed: 0,diagnosis,drug,bin,patient_pay,day
0,G99.93,branded tanoclolol,725700,13.39,2
1,U60.52,branded oxasoted,664344,7.02,2
2,Q85.91,branded cupitelol,725700,13.39,2
3,U60.52,generic oxasoted,571569,10.84,2
4,N55.01,branded mamate,664344,47.0,2


In [29]:
#Split drug into drug and drug type (so we can look at branded vs generic as well as the type of drug)

df['drug_type'] = df['drug'].apply(lambda x: x.split(' ')[0])
df['drug'] = df['drug'].apply(lambda x: x.split(' ')[1])
df.head()

Unnamed: 0,diagnosis,drug,bin,patient_pay,day,drug_type
0,G99.93,tanoclolol,725700,13.39,2,branded
1,U60.52,oxasoted,664344,7.02,2,branded
2,Q85.91,cupitelol,725700,13.39,2,branded
3,U60.52,oxasoted,571569,10.84,2,generic
4,N55.01,mamate,664344,47.0,2,branded


In [30]:
#Loading in df and splitting training and testing

covermymeds = h2o.H2OFrame(df)
train, test = covermymeds.split_frame(ratios=[0.8])

Parse progress: |█████████████████████████████████████████████████████████| 100%


In [35]:
#Setting the training and testing sets

training_columns = ["diagnosis", "drug", "bin", "day", "drug_type"]
response_column = "patient_pay"

train, test = covermymeds.split_frame(ratios=[0.8])

In [37]:
#building and testing h2o random forest

from h2o.estimators import H2ORandomForestEstimator

model = H2ORandomForestEstimator(ntrees=50, max_depth=20, nfolds=10)
model.train(x=training_columns, y=response_column, training_frame=train)


drf Model Build progress: |███████████████████████████████████████████████| 100%


In [38]:
#assessing the performance

performance = model.model_performance(test_data=test)
print(performance)


ModelMetricsRegression: drf
** Reported on test data. **

MSE: 362.4776154060449
RMSE: 19.038844907347844
MAE: 8.976525293288937
RMSLE: 0.44291229534701765
Mean Residual Deviance: 362.4776154060449



In [39]:
#same as above, but dropping day as a feature.

training_columns2 = ["diagnosis", "drug", "bin", "drug_type"]
response_column = "patient_pay"

model2 = H2ORandomForestEstimator(ntrees=50, max_depth=20, nfolds=10)
model2.train(x=training_columns2, y=response_column, training_frame=train)

drf Model Build progress: |███████████████████████████████████████████████| 100%


In [40]:
#assessing the performance (without day), results ended up slightly worse (RMSE)

performance2 = model2.model_performance(test_data=test)
print(performance2)


ModelMetricsRegression: drf
** Reported on test data. **

MSE: 402.1061856219034
RMSE: 20.05258550965195
MAE: 9.961808288213282
RMSLE: 0.4704073534086275
Mean Residual Deviance: 402.1061856219034

