In [23]:
import pandas as pd
import numpy as np
import math
from scipy.sparse import save_npz, load_npz
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import train_test_split
import os
import utils

# Let wide frames display up to 500 columns before pandas truncates them.
pd.options.display.max_columns = 500

ModuleNotFoundError: No module named 'utils'

## Load Data

In [7]:
# Number of rows requested from the loader.
sample_size = 5_000_000
# NOTE(review): `load_data` and `proj_dir` are not defined in any visible cell —
# presumably they come from the project's `utils` module; confirm they are in
# scope on a fresh kernel.
sample = load_data(sample_size)

# value_counts() orders by descending frequency, so the first count is the
# majority class. The two-name unpacking assumes exactly two distinct labels.
maj, mino = sample['exclusion'].value_counts()
# The original f-string had an empty `{}` placeholder, which is a SyntaxError;
# report the minority share of all rows as a percentage instead.
print(f'Minority class size: {100 * mino / (maj + mino):.4f} %')

# Persist the sampled subset (gzip-compressed, without the index column).
sample.to_csv(os.path.join(proj_dir, 'data', '2012-2015-subset.csv.gz'), compression='gzip', index=False)

## Separate Class Labels and Drop Columns Not Needed

In [10]:
# Keep the target column before it is removed from the feature frame.
y = sample['exclusion']

# Remove the columns that are not used as features, including the label itself.
sample = sample.drop(['index', 'npi', 'year', 'exclusion'], axis='columns')

## Create Train/Test Indices

We want to use the same indices for all data sets so that we can compare results.

In [11]:
# Split row positions rather than the rows themselves, so the same 80/20
# partition (fixed seed) can be applied to every feature matrix built below.
train_ind, test_ind = train_test_split(
    np.arange(len(sample)),
    test_size=0.2,
    random_state=42,
)
train_ind.shape, test_ind.shape

((4000000,), (1000000,))

## Create One-Hot HCPCS Data Set

In [12]:
%%time

one_hot_hcpcs = pd.get_dummies(sample, sparse=True)

# convert to sparse matrix
one_hot_hcpcs = one_hot_hcpcs.to_sparse()
one_hot_hcpcs = one_hot_hcpcs.to_coo()
one_hot_hcpcs = one_hot_hcpcs.astype('float32')
one_hot_hcpcs = one_hot_hcpcs.tocsr()

# create train test splits
train_x = one_hot_hcpcs[train_ind]
test_x = one_hot_hcpcs[test_ind]
train_y = y[train_ind]
test_y = y[test_ind]

# normalize
scaler = MaxAbsScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

# save results
np.save(os.path.join(proj_dir, 'data', 'y-train.npy'), train_y)
np.save(os.path.join(proj_dir, 'data', 'y-test.npy'), test_y)

save_npz(os.path.join(proj_dir, 'data', 'x-train-onehot-hcpcs.npz'), train_x)
save_npz(os.path.join(proj_dir, 'data', 'x-test-onehot-hcpcs.npz'), test_x)

  after removing the cwd from sys.path.
Use a Series with sparse values instead.

    >>> series = pd.Series(pd.SparseArray(...))

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  return klass(values, index=self.index, name=items, fastpath=True)


CPU times: user 58.6 s, sys: 4.89 s, total: 1min 3s
Wall time: 41.7 s


## Create No-HCPCS Data Set

We won't save the targets again because they are the same across all data sets.

In [13]:
%%time

no_hcpcs = pd.get_dummies(sample.drop(columns=['hcpcs_code']), sparse=True)

# convert to sparse matrix
no_hcpcs = no_hcpcs.to_sparse()
no_hcpcs = no_hcpcs.to_coo()
no_hcpcs = no_hcpcs.astype('float32')
no_hcpcs = no_hcpcs.tocsr()

# create train test splits
train_x = no_hcpcs[train_ind]
test_x = no_hcpcs[test_ind]

# normalize
scaler = MaxAbsScaler()
train_x = scaler.fit_transform(train_x)
test_x = scaler.transform(test_x)

# save results
save_npz(os.path.join(proj_dir, 'data', 'x-train-no-hcpcs.npz'), train_x)
save_npz(os.path.join(proj_dir, 'data', 'x-test-no-hcpcs.npz'), test_x)

  after removing the cwd from sys.path.
Use a Series with sparse values instead.

    >>> series = pd.Series(pd.SparseArray(...))

See http://pandas.pydata.org/pandas-docs/stable/user_guide/sparse.html#migrating for more.

  return klass(values, index=self.index, name=items, fastpath=True)


CPU times: user 44.9 s, sys: 4 s, total: 48.9 s
Wall time: 30.1 s


## Classify with XGBoost Learner

In [14]:
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

In [17]:
# Reload the one-hot HCPCS feature matrices written earlier; train_x / test_x
# were last bound to the no-HCPCS matrices.
train_x, test_x = (
    load_npz(os.path.join(proj_dir, 'data', file_name))
    for file_name in ('x-train-onehot-hcpcs.npz', 'x-test-onehot-hcpcs.npz')
)

# Reload the matching label arrays.
train_y, test_y = (
    np.load(os.path.join(proj_dir, 'data', file_name))
    for file_name in ('y-train.npy', 'y-test.npy')
)

In [18]:
%%time

xgb = XGBClassifier(max_depth=8, n_jobs=-1)

xgb.fit(train_x, train_y)

CPU times: user 1h 1min 23s, sys: 26.6 s, total: 1h 1min 50s
Wall time: 12min 3s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=8,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=-1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [19]:
# Column 1 of predict_proba holds the positive-class probability.
train_probs, test_probs = (
    xgb.predict_proba(features)[:, 1] for features in (train_x, test_x)
)

# Report ROC AUC on both splits.
print(
    f'Train AUC: {roc_auc_score(train_y, train_probs)}',
    f'TEST AUC: {roc_auc_score(test_y, test_probs)}',
    sep='\n',
)

Train AUC: 0.9030022416365732
TEST AUC: 0.867473684384475


## Classify with GBT Learner

In [None]:
# GradientBoostingClassifier is not imported anywhere else in the notebook, so
# this cell would raise NameError on a fresh kernel; import it here.
from sklearn.ensemble import GradientBoostingClassifier

# Same tree depth as the XGBoost model, with the exponential loss. The seed
# matches the one used for the train/test split so repeated runs give the same model.
gbt = GradientBoostingClassifier(max_depth=8, loss='exponential', random_state=42)

In [None]:
%%time

# Fit the gradient-boosting model on whatever train_x / train_y are currently
# bound to; the cell is timed with %%time.
gbt.fit(train_x, train_y)

In [None]:
# Column 1 of predict_proba holds the positive-class probability.
train_probs, test_probs = (
    gbt.predict_proba(features)[:, 1] for features in (train_x, test_x)
)

In [None]:
# Report ROC AUC on both splits from the current train_probs / test_probs.
print(
    f'Train AUC: {roc_auc_score(train_y, train_probs)}',
    f'TEST AUC: {roc_auc_score(test_y, test_probs)}',
    sep='\n',
)