In [3]:
import pandas as pd
import numpy as np
import math
from scipy.sparse import save_npz, load_npz
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split
import os
import sys

# Root of the project checkout, read from the CMS_ROOT environment variable
# (raises KeyError if the variable is not set).
proj_dir = os.environ['CMS_ROOT']

# Make project-local packages importable. The membership check keeps the cell
# idempotent: re-running it no longer appends a duplicate entry to sys.path.
if proj_dir not in sys.path:
  sys.path.append(proj_dir)
from utils.data import load_data, get_minority_size, df_to_csr

# Show up to 500 columns when a DataFrame is displayed.
pd.set_option('display.max_columns', 500)

In [4]:
# When True, the loaded sample is written to <proj_dir>/data/<sample_filename>
# by the load cell below.
write_sample = False
# File name (gzip-compressed CSV) used for the optional sample dump.
sample_filename = '2012-2015-subset.csv.gz'

## Load Data

In [5]:
# Load the working sample through the project loader and report class imbalance.
# NOTE(review): the argument is presumably a row count -- confirm against utils.data.load_data.
sample = load_data(1_000_000)
print(f'Minority class size: {get_minority_size(sample)} %')

# Optionally persist the sample as a gzip-compressed CSV without the index column.
if write_sample:
  sample.to_csv(
    os.path.join(proj_dir, 'data', sample_filename),
    index=False,
    compression='gzip',
  )

KeyboardInterrupt: 

In [None]:
# Preview the first rows of the loaded sample.
sample.head()

## Separate Class Labels and Drop Columns Not Needed

In [None]:
# Keep the 'exclusion' column as the target vector.
y = sample.loc[:, 'exclusion']
# Remove the target and the identifier/bookkeeping columns from the features.
# Note: this rebinds `sample`, so the cell cannot be re-run without reloading the data.
sample = sample.drop(['index', 'npi', 'year', 'exclusion'], axis=1)

## Create Train/Test Indices

We want to use the same indices for all data sets so that we can compare results.

In [None]:
# Split row positions (not the rows themselves) 80/20 with a fixed seed, so the
# same partition can be applied to every derived feature matrix.
train_ind, test_ind = train_test_split(
  np.arange(sample.shape[0]),
  test_size=0.2,
  random_state=42,
)
(train_ind.shape, test_ind.shape)

## Create One-Hot HCPCS Data Set

In [None]:
%%time

one_hot_hcpcs = pd.get_dummies(sample, sparse=True)

# convert to sparse matrix
one_hot_hcpcs = df_to_csr(one_hot_hcpcs)

# create train test splits
train_x = one_hot_hcpcs[train_ind]
test_x = one_hot_hcpcs[test_ind]
train_y = y[train_ind]
test_y = y[test_ind]

# normalize
scaler = MaxAbsScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

# save results
np.save(os.path.join(proj_dir, 'data', 'y-train.npy'), train_y)
np.save(os.path.join(proj_dir, 'data', 'y-test.npy'), test_y)

save_npz(os.path.join(proj_dir, 'data', 'x-train-onehot-hcpcs.npz'), train_x)
save_npz(os.path.join(proj_dir, 'data', 'x-test-onehot-hcpcs.npz'), test_x)

## Create No-HCPCS Data Set

We won't save the targets again because they are the same across all data sets.

In [None]:
%%time

no_hcpcs = pd.get_dummies(sample.drop(columns=['hcpcs_code']), sparse=True)

# convert to sparse matrix
no_hcpcs = df_to_csr(no_hcpcs)

# create train test splits
train_x = no_hcpcs[train_ind]
test_x = no_hcpcs[test_ind]

# normalize
scaler = MaxAbsScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

# save results
save_npz(os.path.join(proj_dir, 'data', 'x-train-no-hcpcs.npz'), train_x)
save_npz(os.path.join(proj_dir, 'data', 'x-test-no-hcpcs.npz'), test_x)

## Classify with XGBoost Learner

In [6]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [7]:
# Reload the one-hot HCPCS feature matrices written by the preprocessing cell.
train_x, test_x = (
  load_npz(os.path.join(proj_dir, 'data', file_name))
  for file_name in ('x-train-onehot-hcpcs.npz', 'x-test-onehot-hcpcs.npz')
)

# Reload the matching target vectors.
train_y, test_y = (
  np.load(os.path.join(proj_dir, 'data', file_name))
  for file_name in ('y-train.npy', 'y-test.npy')
)

In [8]:
# Sanity check: (rows, features) of the reloaded training matrix.
train_x.shape

(4000000, 5891)

In [9]:
%%time

# XGBoost classifier with trees up to depth 8; n_jobs=-1 requests parallel
# training. All other hyperparameters are left at the library defaults.
xgb = XGBClassifier(max_depth=8, n_jobs=-1)

# Train on the one-hot HCPCS training split. As the last expression of the
# cell, the fitted estimator is also displayed.
xgb.fit(train_x, train_y)

CPU times: user 59min 20s, sys: 59.4 s, total: 1h 20s
Wall time: 12min 27s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [10]:
# Column 1 of predict_proba is taken as the positive-class score for each split.
train_probs, test_probs = (
  xgb.predict_proba(features)[:, 1] for features in (train_x, test_x)
)

# Report ROC AUC on both splits to gauge the train/test gap.
print(f'Train AUC: {roc_auc_score(train_y, train_probs)}')
print(f'TEST AUC: {roc_auc_score(test_y, test_probs)}')

Train AUC: 0.9030022416365732
TEST AUC: 0.867473684384475


In [11]:
# Hard class predictions for the training split, using the model's built-in decision rule.
preds = xgb.predict(train_x)

In [12]:
# Peek at the first five predicted labels.
preds[:5]

array([0, 0, 0, 0, 0])

In [None]:
def getBestThreshold(model, train_x, train_y, interval=0.1):
  """Return the decision threshold that scores best on the training data.

  Candidate thresholds are 0, interval, 2 * interval, ... (all strictly below
  1.0, each rounded to 4 decimals). For every candidate, a sample is predicted
  positive when its positive-class probability is >= the candidate. The
  candidate with the highest geometric mean of true-positive rate and
  true-negative rate wins; ties keep the lowest threshold.

  NOTE(review): the original cell was an unfinished stub that computed each
  threshold and discarded it. The G-mean criterion was chosen during review
  as a common choice for imbalanced data -- confirm it is the intended one.

  Args:
    model: fitted classifier exposing predict_proba(X) with the positive
      class in column 1 (the same convention the AUC cells use).
    train_x: feature matrix passed to model.predict_proba.
    train_y: binary labels aligned with train_x; non-zero is treated as the
      positive class (assumes 0/1 labels -- TODO confirm).
    interval: spacing between candidate thresholds.

  Returns:
    The best threshold as a float, or None when interval > 1 (no candidates).
  """
  is_positive = np.asarray(train_y).astype(bool)
  positive_probs = model.predict_proba(train_x)[:, 1]

  n_positive = np.count_nonzero(is_positive)
  n_negative = is_positive.size - n_positive

  best_thresh = None
  best_score = -1.0
  for i in range(math.floor(1.0 / interval)):
    thresh = round(interval * i, 4)
    predicted_positive = positive_probs >= thresh

    true_pos = np.count_nonzero(predicted_positive & is_positive)
    true_neg = np.count_nonzero(~predicted_positive & ~is_positive)
    # A class with no samples contributes a rate of 0 instead of dividing by zero.
    tpr = true_pos / n_positive if n_positive else 0.0
    tnr = true_neg / n_negative if n_negative else 0.0

    score = math.sqrt(tpr * tnr)
    # Strict comparison so that ties keep the earliest (lowest) threshold.
    if score > best_score:
      best_score = score
      best_thresh = thresh

  return best_thresh
    

In [None]:
# NOTE(review): this cell cannot run as written. Every line after the first is
# indented one level deeper, and results_dir, interval, y_true, y_prob,
# with_graph, write_threshold_metrics and plot_thresholds are not defined or
# imported anywhere in the visible notebook. It looks like a fragment pasted
# from the body of another function, presumably kept as a reference -- confirm.
#
# What the fragment does: for each threshold 0, interval, 2 * interval, ...
# (below 1.0) it calls write_threshold_metrics with the CSV path, then reads
# the CSV back and, if with_graph is set, passes it to plot_thresholds.
csv_path = os.path.join(results_dir, 'thresholds.csv')
    plot_path = os.path.join(results_dir, 'thresholds.png')
    for i in range(math.floor(1.0 / interval)):
        thresh = round(interval * i, 5)
        write_threshold_metrics(y_true, y_prob, thresh, csv_path)
    df = pd.read_csv(csv_path)
    if with_graph:
        plot_thresholds(df, plot_path)

## Classify with GBT Learner

In [None]:
# GradientBoostingClassifier was never imported in this notebook, so this cell
# raised NameError on a fresh kernel. Import it here so the cell stands alone.
from sklearn.ensemble import GradientBoostingClassifier

# Same tree depth as the XGBoost model, with exponential loss. random_state is
# fixed (same seed as the train/test split) so repeated runs are reproducible.
gbt = GradientBoostingClassifier(max_depth=8, loss='exponential', random_state=42)

In [None]:
%%time

# Train the gradient-boosting model on the same training split used for XGBoost.
gbt.fit(train_x, train_y)

In [None]:
# Column 1 of predict_proba is taken as the positive-class score for each split.
# This overwrites the probabilities computed for the XGBoost model.
train_probs, test_probs = (
  gbt.predict_proba(features)[:, 1] for features in (train_x, test_x)
)

In [None]:
# Report ROC AUC for the gradient-boosting model on both splits.
print(f'Train AUC: {roc_auc_score(train_y, train_probs)}')
print(f'TEST AUC: {roc_auc_score(test_y, test_probs)}')