In [1]:
import pandas as pd
import numpy as np
import os
import timeit
import matplotlib.pyplot as plt

import sys
sys.path.append(os.environ['CMS_ROOT'])

from cms_modules.utils import (
    apply_ros_rus,
    get_binary_imbalance_ratio,
    split_on_binary_attribute)

pd.set_option('display.max_rows',500)
pd.set_option('display.max_columns',500)

### Load Data

In [2]:
data_path = os.environ['CMS_PARTB_PATH']

In [3]:
# take random sample from the training data
train_data = pd.read_hdf(data_path, 'partB_train_normalized')
test_data = pd.read_hdf(data_path, 'partB_test_normalized')
pos_train, neg_train = split_on_binary_attribute(train_data, attribute='exclusion', pos_label=1, neg_label=0)
train_data = apply_ros_rus(pos_train, neg_train, ros_rate=1, rus_rate=0.05)
del pos_train
del neg_train

In [4]:
# separate features from labels
train_y = train_data['exclusion']
train_x = train_data.drop(columns=['exclusion'])
test_y = test_data['exclusion']
test_x = test_data.drop(columns=['exclusion'])

In [5]:
print(f'Training data shape {train_x.shape}')
print(f'Test data shape {test_x.shape}')

Training data shape (188840, 125)
Test data shape (938474, 125)


# Chi-Squared Feature Selection

In [6]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [7]:
lr_model = LogisticRegression(n_jobs=6)
rfe = RFE(estimator=lr_model, n_features_to_select=100, step=5)
rfe.fit(train_x, train_y)

  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))
  " = {}.".format(effective_n_jobs(self.n_jobs)))


RFE(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                 fit_intercept=True, intercept_scaling=1,
                                 l1_ratio=None, max_iter=100,
                                 multi_class='warn', n_jobs=6, penalty='l2',
                                 random_state=None, solver='warn', tol=0.0001,
                                 verbose=0, warm_start=False),
    n_features_to_select=100, step=5, verbose=0)

In [8]:
print(f'Using features {train_x.columns[rfe.support_].values}')

Using features ['line_srvc_cnt_sum' 'bene_day_srvc_cnt_sum'
 'average_submitted_chrg_amt_sum' 'line_srvc_cnt_mean'
 'average_submitted_chrg_amt_mean' 'average_submitted_chrg_amt_median'
 'average_medicare_payment_amt_median' 'line_srvc_cnt_sd'
 'bene_day_srvc_cnt_sd' 'average_submitted_chrg_amt_sd'
 'average_medicare_payment_amt_sd' 'line_srvc_cnt_min'
 'bene_day_srvc_cnt_min' 'average_submitted_chrg_amt_min'
 'average_medicare_payment_amt_min' 'line_srvc_cnt_max'
 'bene_day_srvc_cnt_max' 'average_submitted_chrg_amt_max' 'F' 'M'
 'Addiction Medicine' 'All Other Suppliers' 'Ambulance Service Supplier'
 'Ambulatory Surgical Center' 'Anesthesiologist Assistants'
 'Audiologist (billing independently)' 'Cardiac Electrophysiology'
 'Cardiac Surgery' 'Cardiology' 'Centralized Flu'
 'Certified Clinical Nurse Specialist' 'Certified Nurse Midwife'
 'Chiropractic' 'Clinical Laboratory' 'Clinical Psychologist'
 'Colorectal Surgery (formerly proctology)' 'Critical Care (Intensivists)'
 'CRNA' 'Derm

In [9]:
posteriors = rfe.predict_proba(test_x)

In [56]:
# feature_selector = SelectKBest(chi2, k=50).fit(train_x, train_y)
# train_x_new = feature_selector.transform(train_x)
# test_x_new = feature_selector.transform(test_x)

In [57]:
lr_model = LogisticRegression(n_jobs=6)

In [58]:
start = timeit.default_timer()

lr_model.fit(train_x_new, train_y)

end = timeit.default_timer()
print(f'Training completed in {end - start} seconds')

  " = {}.".format(effective_n_jobs(self.n_jobs)))


Training completed in 0.5847295409912476 seconds


In [59]:
posteriors = lr_model.predict_proba(test_x_new)

In [10]:
roc_auc_score(test_y, posteriors[:, 1])

0.7984453132515584