In [1]:
import numpy as np
import cudf as pd
from cuml.cluster import KMeans
from cuml.ensemble import RandomForestClassifier
from cuml.linear_model import LogisticRegression, LinearRegression
from cuml.metrics import accuracy_score
from cuml.metrics import confusion_matrix
from cuml.model_selection import train_test_split
from cuml.preprocessing import LabelEncoder
from xgboost import XGBClassifier

# Input filename

In [2]:
# CSV read by the load benchmark, and the CSV written back by the write benchmark.
in_filename, out_filename = 'sample_data_20m.csv', 'write_speed_test_gpu.csv'

# Define variables

In [3]:
# Column holding the target labels (0 for negative, 1 for positive).
label_col_name = 'Medwatch or 3911 drug'

# Columns used as model inputs.
predictor_col_names = [
    'Size of U.S. market',
    'Price per unit',
    'RX/OTC',
    'Indication',
    'Drug Class',
    'Shortage',
    'Twitter Mentions',
]

# Predictor columns that are not numeric values; these get label-encoded.
# NOTE: the name is misspelled ("cetegoric") but later cells reference it as-is.
cetegoric_predictor_col_names = [
    'RX/OTC',
    'Indication',
    'Drug Class',
    'Shortage',
]

# Models perform better when values lie between 0 and 1, so each column listed
# here is multiplied by its factor to bring it roughly into that range.
scaling_dict = {
    'Size of U.S. market': 1 / 100_000_000_000,
    'Price per unit': 1 / 10_000,
    'Twitter Mentions': 1 / 200_000,
}

# Fractions of the data reserved for validation and for testing.
val_set_percent = 0.1
test_set_percent = 0.1

In [4]:
# Single seed shared by NumPy, the train/test splits and the models below,
# to make results reproducible.
random_seed = 42 # to make results reproducible

# Load & write data

In [5]:
# Seed NumPy's global random generator with the notebook-wide seed.
np.random.seed(random_seed)

In [6]:
%%time
# Timed CSV load. Note that `pd` is the cuDF alias from the import cell, not pandas.
df = pd.read_csv(in_filename)

CPU times: user 1.36 s, sys: 1.17 s, total: 2.52 s
Wall time: 2.54 s


In [7]:
%%time
# Timed CSV write of the frame just loaded; the index is not written out.
df.to_csv(out_filename, index=False)

CPU times: user 522 ms, sys: 2.91 s, total: 3.43 s
Wall time: 3.75 s


# Common DataFrame Operations

### describe the dataframe

In [8]:
%%time
# Timed summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

CPU times: user 164 ms, sys: 381 ms, total: 545 ms
Wall time: 547 ms


Unnamed: 0,Medwatch or 3911 drug,Size of U.S. market,Price per unit,SNOMED,Twitter Mentions
count,20000000.0,20000000.0,20000000.0,20000000.0,20000000.0
mean,0.38,2320627000.0,207.5574,258689500.0,9100.454
std,0.485386,3183251000.0,307.4475,275913100.0,5705.274
min,0.0,10006.0,0.1,73211010.0,0.0
25%,0.0,33922320.0,1.75,80659010.0,4237.0
50%,0.0,67769550.0,3.4,119292000.0,8473.0
75%,1.0,4636946000.0,394.27,373621000.0,13907.0
max,1.0,9999989000.0,1000.0,840358000.0,20000.0


### Set Index
for each categorical variable, set the DataFrame index to that variable

In [9]:
%%time
for a_cat in cetegoric_predictor_col_names:
    _ = df.set_index(a_cat)

CPU times: user 9.1 ms, sys: 121 ms, total: 130 ms
Wall time: 130 ms


### Concat multiple DataFrames
split data frame into 3 parts, and concatenate them

In [10]:
%%time
one_third_n_rows = round(df.shape[0] / 3)
_ = pd.concat([df.iloc[0:one_third_n_rows], df.iloc[one_third_n_rows:2*one_third_n_rows], df.iloc[2*one_third_n_rows::]])

CPU times: user 15.5 ms, sys: 43.3 ms, total: 58.8 ms
Wall time: 57.8 ms


### Groupby function
mean for each categorical variable

In [11]:
%%time
for a_cat in cetegoric_predictor_col_names:
    _ = df.groupby(a_cat).mean()

CPU times: user 11.2 ms, sys: 276 ms, total: 287 ms
Wall time: 289 ms


# Preprocess data

In [12]:
# Work on a copy so the preprocessing below does not modify the loaded frame `df`.
df_input = df.copy()

### fit label encoder
first we create a label encoder for each column specified in the variable 'cetegoric_predictor_col_names' 
defined at the top of the notebook

In [13]:
%%time
le_dict = {}
for col in df_input.columns:
    if col in cetegoric_predictor_col_names:
        le_dict[col] = LabelEncoder()
        le_dict[col].fit(df_input[col].unique())

CPU times: user 206 ms, sys: 1.62 s, total: 1.83 s
Wall time: 1.83 s


### encode categoric columns
Then we apply the label encoder to the values of those columns

In [14]:
%%time
for col in df_input.columns:
    if col in cetegoric_predictor_col_names:
        df_input[col] = le_dict[col].transform(df_input[col])

CPU times: user 233 ms, sys: 1.94 s, total: 2.18 s
Wall time: 2.18 s


### scale value variable columns
Next we apply the scaling by the variable 'scaling_dict' defined at the top of the notebook

In [15]:
%%time
for col in scaling_dict.keys():
    df_input[col] = df_input[col] * scaling_dict[col]

CPU times: user 3.52 ms, sys: 11.4 ms, total: 14.9 ms
Wall time: 12.3 ms


We split the dataframe into inputs (X) and outputs/targets (y)

In [16]:
# Target: the label column on its own.
y = df_input[label_col_name].copy()
# Inputs: only the predictor columns, in the order listed in `predictor_col_names`.
X = df_input[predictor_col_names].copy()

Then we split the data into training, validation and test sets

In [17]:
# Share of all rows held out from training (validation + test combined).
not_train_prct = val_set_percent + test_set_percent
# Share of the held-out rows that goes to the test set; the rest is validation.
test_prct = test_set_percent / not_train_prct
val_prct = 1.0 - test_prct
# Share of all rows left for training.
train_prct = 1.0 - not_train_prct

In [18]:
%%time
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=not_train_prct, random_state=random_seed)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=test_prct, random_state=random_seed)

CPU times: user 66.7 ms, sys: 81.2 ms, total: 148 ms
Wall time: 146 ms


In [19]:
# The full frames are not used after the split; unbind them.
del df
del df_input

# Goal

Our goal is to **maximize** the number of **correct positive labels**

# OLS Regression

As a baseline, we train a model using a simple OLS regression. 

It's possible that only a few of our predictor variables account for high accuracy, and that not all of them are needed. 
To make sure all are needed, we will first run regressions using EACH INDIVIDUAL predictor variable by itself, 
then we will run a regression using ALL predictor variables. 
We will use the accuracy metric to make this determination.

In [20]:
%%time
reg = LinearRegression()
reg.fit(X_train, y_train)

CPU times: user 560 ms, sys: 524 ms, total: 1.08 s
Wall time: 1.08 s


LinearRegression()

In [21]:
# Unbind the fitted model (presumably to release its memory before the next benchmark).
del reg

We can see that using all the columns gives us a better accuracy than each individual column.

Now let's view some **metrics on** the **test data**,  
and **determine** a **baseline for** our **goal**

# Logistic Regression

we will now train a Logistic regression model, see our scores for the test dataset, and finally,  
see if the logistic regression performs better than our 2226 baseline

In [22]:
%%time
logit_reg = LogisticRegression(penalty='none')#, class_weight=class_weight)
logit_reg.fit(X_train, y_train)

CPU times: user 1.06 s, sys: 7.04 s, total: 8.09 s
Wall time: 8.08 s


LogisticRegression()

In [23]:
# Unbind the fitted model (presumably to release its memory before the next benchmark).
del logit_reg

# K-Means

We now perform the same analysis using K-Means clustering

In [24]:
%%time
kmeans = KMeans(n_clusters=len(y_train.unique()), random_state=random_seed)
kmeans.fit(X_train)

CPU times: user 320 ms, sys: 2.24 s, total: 2.56 s
Wall time: 2.56 s


KMeans()

In [25]:
# Unbind the fitted model (presumably to release its memory before the next benchmark).
del kmeans

# Random Forest Classifier
Next, we train a random forest classifier

In [26]:
%%time
rando_forest = RandomForestClassifier(max_depth=1, random_state=random_seed, n_streams=16)
rando_forest.fit(X_train.astype(np.float32), y_train.astype(np.float32))

  return func(**kwargs)


CPU times: user 13.8 s, sys: 20.8 s, total: 34.6 s
Wall time: 15.3 s


RandomForestClassifier()

In [27]:
# Unbind the fitted model (presumably to release its memory before the next benchmark).
del rando_forest

# Gradient Boosting

In [28]:
%%time
from xgboost import XGBClassifier
xgb = XGBClassifier(tree_method='gpu_hist', use_label_encoder=False, random_state=random_seed)
xgb.fit(X_train, y_train)

CPU times: user 839 ms, sys: 3.09 s, total: 3.93 s
Wall time: 3.94 s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=0, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=16,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='gpu_hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)