In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

# Input and output filenames

In [2]:
# CSV file that is loaded into the DataFrame the rest of the notebook works on.
in_filename = "sample_data_20m.csv"
# CSV file the DataFrame is written back to in the write-speed cell.
out_filename = "write_speed_test_cpu.csv"

# Define variables

In [3]:
# Name of the column that contains the labels (0 for negative, 1 for positive).
label_col_name = 'Medwatch or 3911 drug'

# Names of the columns that will be used to predict the label.
predictor_col_names = [
    'Size of U.S. market',
    'Price per unit',
    'RX/OTC',
    'Indication',
    'Drug Class',
    'Shortage',
    'Twitter Mentions',
]

# Predictor columns that are not numeric values; a label encoder is used on these.
# NOTE: the variable name is misspelled ("cetegoric"), but other cells refer to
# it by this exact spelling, so it is kept unchanged.
cetegoric_predictor_col_names = [
    'RX/OTC',
    'Indication',
    'Drug Class',
    'Shortage',
]

# Models perform better when values are between 0 and 1, so each column listed
# here is multiplied by its factor in an attempt to bring it into that range.
scaling_dict = {
    'Size of U.S. market': 1 / 100_000_000_000,
    'Price per unit': 1 / 10_000,
    'Twitter Mentions': 1 / 200_000,
}

# Fraction of the data reserved for validation and for testing, respectively.
val_set_percent = 0.1
test_set_percent = 0.1

In [4]:
# Seed passed to NumPy's global RNG and to the random_state arguments used
# further down, to make results reproducible.
random_seed = 42

# Load & write data

In [5]:
# Seed NumPy's global random number generator with the notebook-wide seed.
np.random.seed(random_seed)

In [6]:
%%time
# Read the whole input CSV into a DataFrame; this cell is timed with %%time.
df = pd.read_csv(in_filename)

CPU times: user 16.2 s, sys: 1.82 s, total: 18 s
Wall time: 18.1 s


In [None]:
%%time
# Write the DataFrame back out as CSV, without the index column, to time the write.
df.to_csv(out_filename, index=False)

# Common DataFrame Operations

### describe the dataframe

In [None]:
%%time
# Summary statistics of the DataFrame, displayed as the cell output.
df.describe()

### Set Index
for each categorical variable, set the DataFrame index to that variable

In [None]:
%%time
# Timing benchmark only: index the frame by each categorical column in turn.
# set_index returns a new DataFrame, which is discarded, so df is left unchanged.
for cat_col in cetegoric_predictor_col_names:
    _ = df.set_index(keys=cat_col)

### Concat multiple DataFrames
split data frame into 3 parts, and concatenate them

In [None]:
%%time
# Timing benchmark only: cut the frame into three consecutive row ranges and
# concatenate them again; the concatenated result is discarded.
one_third_n_rows = round(len(df) / 3)
first_cut, second_cut = one_third_n_rows, 2 * one_third_n_rows
_ = pd.concat([
    df.iloc[:first_cut],
    df.iloc[first_cut:second_cut],
    df.iloc[second_cut:],  # runs to the end of the frame
])

### Groupby function
mean for each categorical variable

In [None]:
%%time
# Timing benchmark only: per-group mean for each categorical column; the result
# is discarded.
# numeric_only=True is passed explicitly because pandas >= 2.0 no longer drops
# non-numeric columns automatically: mean() raises a TypeError as soon as the
# frame holds a non-numeric column. Presumably the categorical columns here are
# strings (they are label-encoded later) — confirm against the data. On older
# pandas this matches the previous default, so the result is unchanged there.
for a_cat in cetegoric_predictor_col_names:
    _ = df.groupby(a_cat).mean(numeric_only=True)

# Preprocess data

In [None]:
# Preprocess a copy so that the encoding and scaling below do not modify df.
df_input = df.copy()

### fit label encoder
first we create a label encoder for each column specified in the variable 'cetegoric_predictor_col_names' 
defined at the top of the notebook

In [None]:
%%time
# One fitted LabelEncoder per categorical column present in df_input, keyed by
# column name. Each encoder is fitted on the column's distinct values only
# (fit() returns the encoder itself, so it can be stored directly).
le_dict = {
    col: LabelEncoder().fit(df_input[col].unique())
    for col in df_input.columns
    if col in cetegoric_predictor_col_names
}

### encode categoric columns
Then we apply the label encoder to the values of those columns

In [None]:
%%time
# Replace every categorical column with the codes produced by its fitted
# encoder from le_dict.
categoric_cols = [name for name in df_input.columns if name in cetegoric_predictor_col_names]
for col in categoric_cols:
    encoder = le_dict[col]
    df_input[col] = encoder.transform(df_input[col])

### scale value variable columns
Next we apply the scaling by the variable 'scaling_dict' defined at the top of the notebook

In [None]:
%%time
# Multiply each column named in scaling_dict by its scaling factor.
for col, factor in scaling_dict.items():
    df_input[col] = df_input[col].mul(factor)

We split the dataframe into inputs (X) and outputs/targets (y)

In [None]:
# Model inputs: the predictor columns, copied so X is independent of df_input.
X = df_input.loc[:, predictor_col_names].copy()
# Model target: the label column, likewise copied.
y = df_input.loc[:, label_col_name].copy()

Then we split the data into training, validation, and test sets

In [None]:
# Share of all rows held out of training (validation + test combined).
not_train_prct = val_set_percent + test_set_percent
# Share of the held-out rows that goes to the test set in the second split.
test_prct = test_set_percent / not_train_prct
# Complementary shares: training rows overall, validation rows within the hold-out.
train_prct = 1.0 - not_train_prct
val_prct = 1.0 - test_prct

In [None]:
%%time
# First split: separate the held-out share (validation + test) from the training rows.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=not_train_prct,
    random_state=random_seed,
)
# Second split: divide the held-out rows into validation and test sets.
# X_test / y_test are rebound here to the final test portion.
X_val, X_test, y_val, y_test = train_test_split(
    X_test,
    y_test,
    test_size=test_prct,
    random_state=random_seed,
)

In [None]:
# Remove the names of the full frames now that X / y and their splits exist
# (presumably to free memory before model fitting).
del df, df_input

# Goal

Our goal is to **maximize** the number of **correct positive labels**

# OLS Regression

As a baseline, we train a model using a simple OLS regression. 

It's possible that only a few of our predictor variables account for high accuracy, and that not all of them are needed. 
To make sure all are needed, we will first run regressions using EACH INDIVIDUAL predictor variable by itself, 
then we will run a regression using ALL predictor variables. 
We will use the accuracy metric to make this determination.

In [None]:
%%time
# Fit an ordinary least squares model on the training split using all predictors.
# The fit call stays the last expression so the cell still displays the estimator.
reg = LinearRegression()
reg.fit(X=X_train, y=y_train)

In [None]:
# The fitted OLS model is not used after the timed fit, so remove it.
del reg

We can see that using all the columns gives us a better accuracy than each individual column.

Now let's view some **metrics on** the **test data**,  
and **determine** a **baseline for** our **goal**

# Logistic Regression

we will now train a Logistic regression model, see our scores for the test dataset, and finally,  
see if the logistic regression performs better than our 2226 baseline

In [None]:
%%time
# Unregularized logistic regression fitted on the training split.
# C=np.inf switches the penalty off and replaces penalty='none': that string was
# deprecated in scikit-learn 1.2 and is rejected from 1.4 on, whereas an infinite
# C is accepted by old and new releases alike and is what scikit-learn itself
# substitutes internally when no penalty is requested.
logit_reg = LogisticRegression(C=np.inf)
logit_reg.fit(X_train, y_train)

In [None]:
# The fitted logistic regression model is not used after the timed fit, so remove it.
del logit_reg

# K-Means

We now perform the same analysis using K-Means clustering

In [None]:
%%time
# Use one cluster per distinct value found in the training labels
# (dropna=False keeps the count identical to len(y_train.unique())).
n_label_values = y_train.nunique(dropna=False)
kmeans = KMeans(n_clusters=n_label_values, random_state=random_seed)
# Clustering is unsupervised: only the features are passed to fit.
kmeans.fit(X=X_train)

In [None]:
# The fitted K-Means model is not used after the timed fit, so remove it.
del kmeans

# Random Forest Classification
Next, we fit a random forest classifier

In [None]:
%%time
# Random forest classifier whose trees are limited to a depth of 1.
rando_forest = RandomForestClassifier(random_state=random_seed, max_depth=1)
# Features and labels are both cast to float32 before fitting.
# NOTE(review): the reason for the cast is not evident from this notebook —
# possibly to mirror a GPU variant of the same benchmark; confirm.
rando_forest.fit(
    X=X_train.astype(np.float32),
    y=y_train.astype(np.float32),
)

In [None]:
# The fitted random forest is not used after the timed fit, so remove it.
del rando_forest

# Gradient Boosting

In [None]:
%%time
# xgboost is imported in this cell rather than with the other imports at the
# top, so the import runs inside the timed cell.
from xgboost import XGBClassifier

# Gradient-boosted classifier fitted on the training split.
# NOTE(review): use_label_encoder has been deprecated by newer XGBoost
# releases — confirm it is still wanted for the installed version.
xgb = XGBClassifier(random_state=random_seed, use_label_encoder=False)
xgb.fit(X=X_train, y=y_train)