In [7]:
import pandas as pd
import numpy as np
import math
from scipy.sparse import save_npz, load_npz
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split
import os
import sys

# Project root is read from the CMS_ROOT environment variable; a KeyError here
# means the variable is not set in the kernel's environment.
proj_dir = os.environ['CMS_ROOT']
# Put the project root on the import path before importing from `utils`
# (presumably `utils` lives under CMS_ROOT -- confirm).
sys.path.append(proj_dir)
from utils import load_data, get_minority_size, df_to_csr

# Raise the pandas display limit to 500 columns.
pd.set_option('display.max_columns', 500)

In [2]:
# When True, the cell that loads the sample also writes it out as a
# gzip-compressed CSV.
write_sample = False
# File name used for that CSV inside <proj_dir>/data.
sample_filename = '2012-2015-subset.csv.gz'

## Load Data

In [3]:
# Load the working sample and report the size of its minority class.
sample_size = 5_000_000
sample = load_data(sample_size)
print(f'Minority class size: {get_minority_size(sample)} %')

# Optionally write the sample to <proj_dir>/data as a gzip-compressed CSV,
# leaving the DataFrame index out of the file.
if write_sample:
    sample.to_csv(
        os.path.join(proj_dir, 'data', sample_filename),
        index=False,
        compression='gzip',
    )

Minority class size: 0.08476 %


In [4]:
# Preview the first rows to check the columns before any are dropped.
sample.head()

Unnamed: 0,index,npi,provider_type,nppes_provider_state,nppes_provider_gender,hcpcs_code,line_srvc_cnt,bene_unique_cnt,bene_day_srvc_cnt,average_submitted_chrg_amt,average_medicare_payment_amt,year,exclusion
0,28637487,1760642011,Diagnostic Radiology,RI,M,74176,100.0,97,100,271.0,70.8383,2015,0
1,7149344,1194722108,Nephrology,IL,F,90960,473.0,62,473,648.0,237.272093,2014,0
2,34589983,1922253566,Physical Therapist,NY,F,97140,939.0,75,939,31.1,18.531182,2014,0
3,3375954,1093703605,Gastroenterology,AZ,M,99213,91.0,67,91,163.0,46.747033,2015,0
4,2960235,1073829990,Nephrology,IL,M,90962,19.0,14,19,430.0,155.37,2013,0


## Separate Class Labels and Drop Columns Not Needed

In [5]:
# Keep the 'exclusion' column as the target vector.
y = sample['exclusion']
# Remove the target together with the 'index', 'npi' and 'year' columns so
# only feature columns remain.
# NOTE: `sample` is rebound here, so this cell cannot be re-run without
# reloading the data first -- 'exclusion' will already be gone (KeyError).
sample = sample.drop(columns=['index', 'npi', 'year', 'exclusion'])

## Create Train/Test Indices

We want to use the same indices for all data sets so that we can compare results.

In [6]:
# Split row positions (not the rows themselves) 80/20 with a fixed seed, so
# the same partition can be applied to every feature matrix built later.
train_ind, test_ind = train_test_split(
    np.arange(len(sample)),
    test_size=0.2,
    random_state=42,
)
train_ind.shape, test_ind.shape

((4000000,), (1000000,))

## Create One-Hot HCPCS Data Set

In [None]:
%%time

# One-hot encode the feature frame (hcpcs_code included) into sparse columns.
# NOTE(review): get_dummies only expands object/string/category columns --
# confirm load_data returns hcpcs_code with such a dtype, otherwise it is
# passed through as a plain numeric feature.
one_hot_hcpcs = pd.get_dummies(sample, sparse=True)

# convert to sparse matrix
one_hot_hcpcs = df_to_csr(one_hot_hcpcs)

# create train test splits
train_x = one_hot_hcpcs[train_ind]
test_x = one_hot_hcpcs[test_ind]
# train_ind / test_ind hold row *positions*, so select the labels with .iloc.
# Plain y[...] is label-based on a Series and only lines up with the feature
# rows when the index happens to be exactly 0..n-1.
train_y = y.iloc[train_ind]
test_y = y.iloc[test_ind]

# normalize: fit the scaler on the training rows only, then reuse it for test
scaler = MaxAbsScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

# save results (targets are written once here and shared by all feature sets)
np.save(os.path.join(proj_dir, 'data', 'y-train.npy'), train_y)
np.save(os.path.join(proj_dir, 'data', 'y-test.npy'), test_y)

save_npz(os.path.join(proj_dir, 'data', 'x-train-onehot-hcpcs.npz'), train_x)
save_npz(os.path.join(proj_dir, 'data', 'x-test-onehot-hcpcs.npz'), test_x)

## Create No-HCPCS Data Set

We won't save the targets again because they are the same across all data sets.

In [None]:
%%time

# Drop hcpcs_code, one-hot encode what is left, and convert the result with
# df_to_csr in a single step.
no_hcpcs = df_to_csr(
    pd.get_dummies(sample.drop(columns=['hcpcs_code']), sparse=True)
)

# Fit the scaler on the training rows only, then apply it to the test rows.
scaler = MaxAbsScaler()
train_x = scaler.fit_transform(no_hcpcs[train_ind])
test_x = scaler.transform(no_hcpcs[test_ind])

# Write both scaled matrices to <proj_dir>/data.
data_dir = os.path.join(proj_dir, 'data')
save_npz(os.path.join(data_dir, 'x-train-no-hcpcs.npz'), train_x)
save_npz(os.path.join(data_dir, 'x-test-no-hcpcs.npz'), test_x)

## Classify with XGBoost Learner

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [None]:
# Read the one-hot HCPCS feature matrices and the targets back from
# <proj_dir>/data.
data_dir = os.path.join(proj_dir, 'data')

# Sparse feature matrices.
train_x = load_npz(os.path.join(data_dir, 'x-train-onehot-hcpcs.npz'))
test_x = load_npz(os.path.join(data_dir, 'x-test-onehot-hcpcs.npz'))

# Matching target arrays.
train_y = np.load(os.path.join(data_dir, 'y-train.npy'))
test_y = np.load(os.path.join(data_dir, 'y-test.npy'))

In [None]:
%%time

# XGBoost classifier with max_depth=8 and n_jobs=-1
# (n_jobs=-1 conventionally means "all cores" -- confirm for the installed
# xgboost version).
xgb = XGBClassifier(
    n_jobs=-1,
    max_depth=8,
)

# Kept as the final expression so the cell displays the value fit() returns.
xgb.fit(train_x, train_y)

In [None]:
# Column 1 of predict_proba is used as the score for each row.
train_probs = xgb.predict_proba(train_x)[:, 1]
test_probs = xgb.predict_proba(test_x)[:, 1]

# ROC AUC on the rows the model was fit on, then on the held-out rows.
train_auc = roc_auc_score(train_y, train_probs)
print(f'Train AUC: {train_auc}')
test_auc = roc_auc_score(test_y, test_probs)
print(f'TEST AUC: {test_auc}')

## Classify with GBT Learner

In [None]:
# GradientBoostingClassifier is not imported by any earlier cell, so this cell
# raised NameError on a fresh kernel. Import it here, next to its only use.
from sklearn.ensemble import GradientBoostingClassifier

# Depth-8 gradient-boosted trees with the exponential loss. The seed matches
# the one used for the train/test split so repeated runs give the same model.
gbt = GradientBoostingClassifier(max_depth=8, loss='exponential', random_state=42)

In [None]:
%%time

# Fit the GBT model on the currently loaded train_x / train_y; as the last
# expression in the cell, the value returned by fit() is displayed.
gbt.fit(train_x, train_y)

In [None]:
# Score both splits with the fitted GBT model, keeping column 1 of
# predict_proba for each.
train_probs, test_probs = (
    gbt.predict_proba(features)[:, 1] for features in (train_x, test_x)
)

In [None]:
# ROC AUC of the GBT scores on the training rows, then on the held-out rows.
train_auc = roc_auc_score(train_y, train_probs)
print(f'Train AUC: {train_auc}')
test_auc = roc_auc_score(test_y, test_probs)
print(f'TEST AUC: {test_auc}')