In [1]:
import numpy as np
import cudf as pd
from cuml.preprocessing import LabelEncoder
from cuml.linear_model import LogisticRegression, LinearRegression
from cuml.ensemble import RandomForestClassifier
from cuml.model_selection import train_test_split
from cuml.cluster import KMeans
from cuml.metrics import accuracy_score
from cuml.metrics import confusion_matrix
from xgboost import XGBClassifier

# Input filename

In [58]:
# Bare file names only -- the directory prefixes get prepended in a later cell.
in_filename, out_filename = (
    'sample_data_500k.csv',      # dataset that the load step reads
    'write_speed_test_gpu.csv',  # file that the write step produces
)

# Define variables

In [59]:
# Name of the column that contains the labels (0 for negative, 1 for positive).
label_col_name = 'is_counterfeit'

# Names of the columns that are used to predict the label.
predictor_col_names = [
    'Size of U.S. market',
    'Price per unit',
    'RX/OTC',
    'Indication',
    'Drug Class',
    'Shortage',
    'Twitter Mentions',
]

# Predictor columns that hold categories rather than numeric values; a label
# encoder is applied to these during preprocessing.
cetegoric_predictor_col_names = [
    'RX/OTC',
    'Indication',
    'Drug Class',
    'Shortage',
]

# Models perform better when values lie between 0 and 1, so every column
# listed here is multiplied by its factor during preprocessing.
scaling_dict = {
    'Size of U.S. market': 1 / 100_000_000_000,
    'Price per unit': 1 / 10_000,
    'Twitter Mentions': 1 / 200_000,
}

# Both values are fractions of the full data set (0.1 means 10%).
val_set_percent = 0.1   # share of the data reserved for validation
test_set_percent = 0.1  # share of the data reserved for testing

chunksize = 1_000_000  # number of rows to write at a time

random_seed = 42  # to make results reproducible

In [60]:
# Prepend the data directories to the bare file names. The prefix checks keep
# this cell idempotent: running it again does not stack the directory a second
# time onto a name that has already been prefixed.
input_dir = './sample_data/'
output_dir = './output_data/'
if not in_filename.startswith(input_dir):
    in_filename = input_dir + in_filename
if not out_filename.startswith(output_dir):
    out_filename = output_dir + out_filename

# ETL

## Load & write data

In [61]:
%%time
# Read the whole input CSV into a DataFrame; `pd` is the cuDF module here
# (it is imported as `cudf as pd` in the imports cell).
df = pd.read_csv(in_filename)

CPU times: user 23.1 ms, sys: 20.2 ms, total: 43.4 ms
Wall time: 42.7 ms


In [62]:
%%time
# Write the frame back out as CSV without the index column, passing the
# configured `chunksize` (rows per write) through to `to_csv`.
df.to_csv(out_filename, index=False, chunksize=chunksize)

CPU times: user 30.7 ms, sys: 67.2 ms, total: 97.9 ms
Wall time: 98.1 ms


## Common DataFrame Operations

### describe the dataframe

In [8]:
%%time
# Summary statistics of the frame; as the cell's last expression the result
# is displayed below the cell.
df.describe()

CPU times: user 176 ms, sys: 4.05 ms, total: 180 ms
Wall time: 178 ms


Unnamed: 0,is_counterfeit,Size of U.S. market,Price per unit,Twitter Mentions,SNOMED
count,500000.0,500000.0,500000.0,500000.0,500000.0
mean,0.38,2316747000.0,207.262052,9097.0476,259068200.0
std,0.485387,3179636000.0,307.345847,5701.091269,276243700.0
min,0.0,10225.0,0.1,0.0,73211010.0
25%,0.0,33821260.0,1.75,4230.0,74732010.0
50%,0.0,67790620.0,3.41,8482.0,119292000.0
75%,1.0,4633315000.0,393.81,13894.0,373621000.0
max,1.0,9999840000.0,999.98,20000.0,840358000.0


### Set Index
for each categorical variable, set the DataFrame index to that variable

In [9]:
%%time
# Index the frame by each categorical column in turn. Only the operation is
# being timed, so the re-indexed result is assigned to `_` and not used.
for cat_col in cetegoric_predictor_col_names:
    _ = df.set_index(cat_col)

CPU times: user 0 ns, sys: 22.3 ms, total: 22.3 ms
Wall time: 20.8 ms


### Concat multiple DataFrames
split data frame into 3 parts, and concatenate them

In [10]:
%%time
# Cut the frame into three consecutive row ranges and concatenate them again.
# The third range runs to the end of the frame, so it picks up any remainder.
one_third_n_rows = round(df.shape[0] / 3)
two_thirds_n_rows = 2 * one_third_n_rows
_ = pd.concat([
    df.iloc[:one_third_n_rows],
    df.iloc[one_third_n_rows:two_thirds_n_rows],
    df.iloc[two_thirds_n_rows:],
])

CPU times: user 10.6 ms, sys: 4.9 ms, total: 15.5 ms
Wall time: 14.1 ms


### Groupby function
mean for each categorical variable

In [11]:
%%time
# Group by each categorical column in turn and take the per-group mean. Only
# the operation is being timed, so the result is assigned to `_` and not used.
for cat_col in cetegoric_predictor_col_names:
    _ = df.groupby(cat_col).mean()

CPU times: user 18.6 ms, sys: 7.91 ms, total: 26.5 ms
Wall time: 25.9 ms


## Preprocess data

In [12]:
# Work on a copy of the loaded frame; the preprocessing cells below assign
# their encoded and scaled columns into `df_input`.
df_input = df.copy()

### fit label encoder
first we create a label encoder for each column specified in the variable 'cetegoric_predictor_col_names' 
defined at the top of the notebook

In [13]:
%%time
# Build one fitted LabelEncoder per categorical column, keyed by column name.
# Columns are visited in the frame's own order, and a configured name that is
# not present in the frame is simply skipped.
le_dict = {}
categoric_cols_present = [c for c in df_input.columns if c in cetegoric_predictor_col_names]
for col in categoric_cols_present:
    encoder = LabelEncoder()
    encoder.fit(df_input[col].unique())  # fit on the distinct values only
    le_dict[col] = encoder

CPU times: user 27.4 ms, sys: 14.4 ms, total: 41.7 ms
Wall time: 41 ms


### encode categoric columns
Then we apply the label encoder to the values of those columns

In [14]:
%%time
# Replace every categorical column with its label-encoded version, using the
# encoder that was fitted for that column.
categoric_cols_present = [c for c in df_input.columns if c in cetegoric_predictor_col_names]
for col in categoric_cols_present:
    encoder = le_dict[col]
    df_input[col] = encoder.transform(df_input[col])

CPU times: user 61.6 ms, sys: 24.5 ms, total: 86.1 ms
Wall time: 85.6 ms


### scale value variable columns
Next we multiply each column listed in the variable 'scaling_dict' (defined at the top of the notebook) by its scaling factor

In [15]:
%%time
# Multiply each configured column by its scaling factor.
for col, factor in scaling_dict.items():
    df_input[col] = df_input[col] * factor

CPU times: user 3.64 ms, sys: 0 ns, total: 3.64 ms
Wall time: 2.53 ms


We split the dataframe into inputs (X) and outputs/targets (y)

In [16]:
# X holds the predictor columns and y the label column, each taken as a copy
# of the corresponding part of the preprocessed frame.
X = df_input[predictor_col_names].copy()
y = df_input[label_col_name].copy()

Then we split the data into training, validation, and test sets

In [17]:
# Share of the full data set that is held out of training (validation + test).
not_train_prct = val_set_percent + test_set_percent
# Share of that held-out part that goes to the test set; the rest of the
# held-out part is the validation set.
test_prct = test_set_percent / not_train_prct
val_prct = 1.0 - test_prct
# Share of the full data set that is left for training.
train_prct = 1.0 - not_train_prct

In [18]:
%%time
# Stage 1: separate the training rows from the held-out rows. At this point
# X_test / y_test temporarily hold the whole held-out part.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=not_train_prct,
    random_state=random_seed,
)
# Stage 2: divide the held-out part into validation and test rows, which
# overwrites X_test / y_test with the final test set.
X_val, X_test, y_val, y_test = train_test_split(
    X_test,
    y_test,
    test_size=test_prct,
    random_state=random_seed,
)

CPU times: user 21.8 ms, sys: 8.24 ms, total: 30 ms
Wall time: 29.2 ms


In [19]:
# Drop the names of the full frames; only the split sets are used from here
# on. Presumably done to release memory before the models run -- TODO confirm.
del df, df_input

# Models

## OLS Regression

In [20]:
%%time
# Fit a linear regression on the training split. The `fit` call is the cell's
# last expression, so its return value is what the notebook displays.
reg = LinearRegression()
reg.fit(X_train, y_train)

CPU times: user 646 ms, sys: 298 ms, total: 944 ms
Wall time: 940 ms


LinearRegression()

In [21]:
# Drop the fitted model; presumably to release memory before the next
# benchmark -- TODO confirm.
del reg

## Logistic Regression

In [22]:
%%time
# Fit a logistic regression with penalty='none' on the training split.
# NOTE(review): this line used to carry a commented-out
# `class_weight=class_weight` argument; no `class_weight` variable is defined
# in the visible cells, so define one before re-enabling it.
logit_reg = LogisticRegression(penalty='none')
logit_reg.fit(X_train, y_train)

CPU times: user 449 ms, sys: 325 µs, total: 449 ms
Wall time: 448 ms


LogisticRegression()

In [23]:
# Drop the fitted model; presumably to release memory before the next
# benchmark -- TODO confirm.
del logit_reg

## K-Means

In [24]:
%%time
# Use one cluster per distinct label value found in y_train; the labels
# themselves are not passed to `fit`, only the features are.
kmeans = KMeans(n_clusters=len(y_train.unique()), random_state=random_seed)
kmeans.fit(X_train)

CPU times: user 47.9 ms, sys: 6.66 ms, total: 54.6 ms
Wall time: 54.2 ms


KMeans()

In [25]:
# Drop the fitted model; presumably to release memory before the next
# benchmark -- TODO confirm.
del kmeans

## Random Forest Classification

In [26]:
%%time
# Shallow (max_depth=1) random forest classifier. The features are cast to
# float32 and the class labels to int32, which is the label dtype the cuML
# classifier documents for `fit`; the labels were previously cast to float32,
# which leaves the estimator to convert them back to integers itself.
rando_forest = RandomForestClassifier(max_depth=1, random_state=random_seed)
rando_forest.fit(X_train.astype(np.float32), y_train.astype(np.int32))

  return func(**kwargs)


CPU times: user 495 ms, sys: 221 ms, total: 715 ms
Wall time: 394 ms


RandomForestClassifier()

In [27]:
# Drop the fitted model; presumably to release memory before the next
# benchmark -- TODO confirm.
del rando_forest

## Gradient Boosting

In [28]:
%%time
# XGBClassifier is imported in the notebook's imports cell so that this timed
# cell measures model construction and fitting only, not the package import.
# Settings shared by the GPU and the CPU run.
xgb_params = {'use_label_encoder': False, 'random_state': random_seed}
# Train on the GPU when the training data is a cuDF object (its type lives in
# the `cudf` package); otherwise keep XGBoost's default tree method.
if type(X_train).__module__.split('.')[0] == 'cudf':
    print('using GPU...')
    xgb_params['tree_method'] = 'gpu_hist'
else:
    print('using CPU...')
xgb = XGBClassifier(**xgb_params)
# Last expression of the cell, so the fitted estimator is displayed.
xgb.fit(X_train, y_train)

using GPU...
CPU times: user 522 ms, sys: 69.3 ms, total: 592 ms
Wall time: 593 ms


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=0, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=12,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='gpu_hist', use_label_encoder=False,
              validate_parameters=1, verbosity=None)