## Import Data

In [13]:
import os
import csv
import numpy as np
import pandas as pd
import utils.baseline_functions as base

In [2]:
# Import data
# The project root defaults to the original checkout location but can be
# overridden with the INTERPRETABLE_OPIOID_ROOT environment variable, so the
# notebook is not tied to one machine's absolute path.
PROJECT_ROOT = os.environ.get(
    'INTERPRETABLE_OPIOID_ROOT',
    '/Users/jingyuanhu/Desktop/Research/Interpretable_Opioid',
)
# The working directory is switched because the data file below is addressed
# relative to the project root.
os.chdir(PROJECT_ROOT)
SAMPLE = pd.read_csv('Data/SAMPLE_LABEL_FEATURE.csv', delimiter=",")

# Encode categorical variables
# - patient_gender is converted to a pandas categorical and its integer codes
#   are stored in patient_gender_cat (pandas uses code -1 for missing values).
# - drug and payment are expanded into one indicator column per level
#   (e.g. drug_Codeine, payment_Medicaid); the original columns are dropped.
SAMPLE['patient_gender'] = SAMPLE['patient_gender'].astype('category')
SAMPLE["patient_gender_cat"] = SAMPLE["patient_gender"].cat.codes
SAMPLE = pd.get_dummies(SAMPLE, columns=["drug", "payment"])

## Baseline Models
- Decision Tree (CART), Explainable Boosting Machine (EBM)
- L1/L2 Logistic Regression, Linear SVM, Random Forest, XGBoost

In [3]:
# Design matrix: every model below is fit on these columns of SAMPLE.
x = SAMPLE.loc[:, [
    # patient and current prescription
    'Age', 'patient_gender_cat', 'quantity', 'days_supply',
    # prescription history and prescriber statistics
    'past_prescription',
    'prescriber_num_presc', 'prescriber_num_pat', 'prescriber_num_pat_long',
    'AvgMME', 'Avg_days_supply',
    'concurrent_opioid', 'concurrent_benzo',
    'count_prior_presc', 'count_prior_days_supply', 'count_prior_quantity',
    # indicator columns for the prescribed drug
    'drug_Codeine', 'drug_Fentanyl', 'drug_Hydrocodone', 'drug_Hydromorphone',
    'drug_Methadone', 'drug_Morphine', 'drug_Oxycodone', 'drug_Oxymorphone',
    # indicator columns for the payment type
    'payment_CashCredit', 'payment_CommercialIns', 'payment_IndianNation',
    'payment_Medicaid', 'payment_Medicare', 'payment_MilitaryIns',
    'payment_Other', 'payment_WorkersComp',
]]

# Target: the long_term_presc column as a plain array.
y = SAMPLE.loc[:, 'long_term_presc'].values

In [4]:
# Hyperparameter grids shared by the baselines below.
depth = [1, 2, 3, 4, 5]                          # tree depths (DT, EBM, RF, XGB)
n_estimators = [50, 100]                         # ensemble sizes (EBM, RF, XGB)
c = np.linspace(1e-4, 1e-1, 5).tolist()          # C values (L2/L1 logistic, SVM)

# Tree-based interpretable baselines: Decision Tree, then EBM
dt_summary = base.DecisionTree(X=x, Y=y, depth=depth, seed=42)
ebm_summary = base.EBM(
    X=x, Y=y, depth=depth, estimators=n_estimators, seed=42
)

# Linear baselines: L2 logistic, L1 logistic, LinearSVM
logistic_summary = base.Logistic(X=x, Y=y, C=c, seed=42)
lasso_summary = base.Lasso(X=x, Y=y, C=c, seed=42)
svm_summary = base.LinearSVM(X=x, Y=y, C=c, seed=42)

# Ensemble baselines: Random Forest, then XGBoost
rf_summary = base.RF(
    X=x, Y=y, depth=depth, estimators=n_estimators, seed=42
)
xgb_summary = base.XGB(
    X=x, Y=y, depth=depth, estimators=n_estimators, seed=42
)

In [31]:
# Save results
def _mean_sd(summary):
    """Format a summary's 'holdout_test_auc' values as "mean (sd)", rounded to 3 decimals."""
    auc = summary['holdout_test_auc']
    return str(round(np.mean(auc), 3)) + " (" + str(round(np.std(auc), 3)) + ")"


# (row label, summary) pairs in the order they appear in the table.
labelled_summaries = [
    ("Decision Tree", dt_summary),
    ("Explainable Boosting Machine", ebm_summary),
    ("Logistic (L2)", logistic_summary),
    ("Logistic (L1)", lasso_summary),
    ("Linear SVM", svm_summary),
    ("Random Forest", rf_summary),
    ("XG Boost", xgb_summary),
]

# One row per model, one column holding the formatted "mean (sd)" string.
results = pd.DataFrame.from_dict(
    {label: _mean_sd(summary) for label, summary in labelled_summaries},
    orient='index',
    columns=['Holdout Test AUC (mean and sd)'],
)
results

In [30]:
# Export results
# Write the baseline AUC table to ./Results/baselines/baseline_results.csv.
path = './Results/baselines/'
# Create the output directory first: to_csv does not create missing parent
# directories and would otherwise fail on a fresh checkout.
os.makedirs(path, exist_ok=True)
results.to_csv(path + "baseline_results.csv")

## Interpretable Models