In [17]:
import pandas as pd
import numpy as np
import os
import timeit
import matplotlib.pyplot as plt

import sys
sys.path.append(os.environ['CMS_ROOT'])

from cms_modules.utils import (
    apply_ros_rus,
    get_binary_imbalance_ratio,
    split_on_binary_attribute)

pd.set_option('display.max_rows',500)
pd.set_option('display.max_columns',500)

### Load Data

In [18]:
data_path = os.environ['CMS_PARTB_PATH']

In [20]:
# take random sample from the training data
train_data = pd.read_hdf(data_path, 'partB_train_normalized')
test_data = pd.read_hdf(data_path, 'partB_test_normalized')
pos_train, neg_train = split_on_binary_attribute(train_data, attribute='exclusion', pos_label=1, neg_label=0)
train_data = apply_ros_rus(pos_train, neg_train, ros_rate=1, rus_rate=0.05)
del pos_train
del neg_train

In [21]:
# separate features from labels
train_y = train_data['exclusion']
train_x = train_data.drop(columns=['exclusion'])
test_y = test_data['exclusion']
test_x = test_data.drop(columns=['exclusion'])

In [22]:
print(f'Training data shape {train_x.shape}')
print(f'Test data shape {test_x.shape}')

Training data shape (188840, 125)
Test data shape (938474, 125)


# Chi-Squared Feature Selection

In [25]:
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

In [56]:
feature_selector = SelectKBest(chi2, k=50).fit(train_x, train_y)
train_x_new = feature_selector.transform(train_x)
test_x_new = feature_selector.transform(test_x)

In [57]:
lr_model = LogisticRegression(n_jobs=6)

In [58]:
start = timeit.default_timer()

lr_model.fit(train_x_new, train_y)

end = timeit.default_timer()
print(f'Training completed in {end - start} seconds')

  " = {}.".format(effective_n_jobs(self.n_jobs)))


Training completed in 0.5847295409912476 seconds


In [59]:
posteriors = lr_model.predict_proba(test_x_new)

In [60]:
roc_auc_score(test_y, posteriors[:, 1])

0.8028849159333186