# Project Setup

In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import sys
import math
import tqdm
import html
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import sklearn

from bs4 import BeautifulSoup

Setting up Kaggle's API to directly download the required datasets.

In [51]:
!pip install -q kaggle
from google.colab import files
print("If you have not already, please follow the instructions to create a Kggle API token file. We shall use it to download the dataset for this Colab Notebook.")
print("Here is a link for your convenience: https://www.kaggle.com/docs/api#authentication")
print("Please upload your Kaggle API token file")
files.upload()

!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

If you have not already, please follow the instructions to create a Kggle API token file. We shall use it to download the dataset for this Colab Notebook.
Here is a link for your convenience: https://www.kaggle.com/docs/api#authentication
Please upload your Kaggle API token file


Saving kaggle.json to kaggle (1).json
mkdir: cannot create directory ‘/root/.kaggle’: File exists


This section downloads the dataset. This section must be run once at the beginning every time a new session is created.

In [3]:
!kaggle datasets download -d jessicali9530/kuc-hackathon-winter-2018
!unzip -o kuc-hackathon-winter-2018.zip && rm kuc-hackathon-winter-2018.zip

Downloading kuc-hackathon-winter-2018.zip to /content
 81% 33.0M/40.7M [00:02<00:00, 9.93MB/s]
100% 40.7M/40.7M [00:02<00:00, 18.2MB/s]
Archive:  kuc-hackathon-winter-2018.zip
  inflating: drugsComTest_raw.csv    
  inflating: drugsComTrain_raw.csv   


In [4]:
# Load data
train_drug_df = pd.read_csv('drugsComTrain_raw.csv')
test_drug_df = pd.read_csv('drugsComTest_raw.csv')

In [5]:
# See dataframe
train_drug_df

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount
0,206461,Valsartan,Left Ventricular Dysfunction,"""It has no side effect, I take it in combinati...",9,20-May-12,27
1,95260,Guanfacine,ADHD,"""My son is halfway through his fourth week of ...",8,27-Apr-10,192
2,92703,Lybrel,Birth Control,"""I used to take another oral contraceptive, wh...",5,14-Dec-09,17
3,138000,Ortho Evra,Birth Control,"""This is my first time using any form of birth...",8,3-Nov-15,10
4,35696,Buprenorphine / naloxone,Opiate Dependence,"""Suboxone has completely turned my life around...",9,27-Nov-16,37
...,...,...,...,...,...,...,...
161292,191035,Campral,Alcohol Dependence,"""I wrote my first report in Mid-October of 201...",10,31-May-15,125
161293,127085,Metoclopramide,Nausea/Vomiting,"""I was given this in IV before surgey. I immed...",1,1-Nov-11,34
161294,187382,Orencia,Rheumatoid Arthritis,"""Limited improvement after 4 months, developed...",2,15-Mar-14,35
161295,47128,Thyroid desiccated,Underactive Thyroid,"""I&#039;ve been on thyroid medication 49 years...",10,19-Sep-15,79


# Data Pre-processing

This section mostly contains the extra data that needed to be extracted out of the dataset before baselines could be run or any kind of pre-processing that needs to be done on them.

In [6]:
def clean_reviews(review):
    clean_review = BeautifulSoup(review, features='lxml')
    clean_review = clean_review.get_text().replace('"', "")
    clean_review = clean_review.replace('\n', ' ').replace('\r', '').replace('\t', '')
    clean_review = clean_review.replace('\s{2,}', ' ')
    return clean_review


def cap_col_val(val, usefulCount_range):
    if val > usefulCount_range[-1]:
        val = usefulCount_range[-1]
    return val


def assign_bucket(val, buckets):
    for i, bucket in enumerate(buckets):
        if bucket[0] <= val < bucket[1]:
            new_val = i
    return new_val


def get_buckets(df, quantiles):
    cutoffs = []
    buckets = []
    for i in quantiles:
        cutoffs.append(df['usefulCount'].quantile(q=i))
    for i in range(1, len(cutoffs)):
        if i == 1:
            buckets.append([0, cutoffs[i-1]])
        buckets.append([cutoffs[i-1], cutoffs[i]])
        if i == len(cutoffs) - 1:
            buckets.append([cutoffs[i], np.inf])
    if len(cutoffs)==1:
        buckets.append([0, cutoffs[0]])
        buckets.append([cutoffs[0], np.inf])
    return buckets


def load_data(df, year_range=[2008, 2017], usefulCount_range=[0, 10000], usefulCount_quantile=None,
              quantiles_for_class=[0.25, 0.5, 0.75]):
    # Remove duplicate reviews
    df = df.drop_duplicates(subset=['review', 'condition', 'date', 'rating', 'usefulCount'])

    # Get most common conditions (by review count)
    top_conditions = list(df.groupby('condition').count().reset_index().sort_values(by='uniqueID', ascending=False)[:10]['condition'])
    df = df.loc[df['condition'].isin(top_conditions)]

    # Filter the reviews by the input year_range
    df['date'] = pd.to_datetime(df['date'])
    df = df.loc[(df.date.dt.year >= year_range[0]) & (df.date.dt.year <= year_range[1]), :]

    # Create onehot encoding for the condition so we can use it as a feature as well
    df = pd.concat([df, pd.get_dummies(df['condition'])], axis=1)

    # Clean review text
    df['cleanReview'] = df['review'].apply(clean_reviews)

    # Create standardized usefulScoreLog column (log of usefulCount normalized to be between 0 and 1)
    with np.errstate(divide='ignore', invalid='ignore'):
        df['usefulScoreLog'] = np.log(df['usefulCount']) / np.max(np.log(df['usefulCount']))
    df['usefulScoreLog'] = df['usefulScoreLog'].replace(-np.Inf, 0)

    # Cap the usefulCount to create a new target variable column
    if usefulCount_quantile is not None:
        usefulCount_range = [0, int(df['usefulCount'].quantile(q=usefulCount_quantile))]
    df['usefulCountCapped'] = df['usefulCount'].apply(lambda row : cap_col_val(row, usefulCount_range))

    # Normalize usefulCountCapped
    df['usefulCountCappedNormalized'] = df['usefulCountCapped'] / max(df['usefulCountCapped'])

    # Create normalized rating (0 to 1) to be used as a metadata feature
    df['ratingNormalized'] = df['rating'] / np.max(df['rating'])

    # Cast the date column to be a date datatype and compute the review age (with 0 corresponding to the most recent review)
    df['daysOld'] = (max(df['date']) - df['date']).astype('timedelta64[s]') / (60*60*24)

    # Compute an age score as daysOld normalized to be between 0 and 1
    with np.errstate(divide='ignore', invalid='ignore'):
        df['ageScore'] = df['daysOld'] / np.max(df['daysOld'])
    df['ageScore'] = df['ageScore'].replace(-np.Inf, 0)

    # Create a usefulCountClass column to treat usefulness prediction as a classification problem
    if quantiles_for_class is not None:
        buckets = get_buckets(df=df, quantiles=quantiles_for_class)
        df['usefulCountClass'] = df['usefulCount'].apply(lambda row : assign_bucket(row, buckets))

    # Split data into train and val
    train = df.sample(frac=0.75, random_state=8)
    val = df.loc[~df['uniqueID'].isin(train['uniqueID'])]

    return train, val

In [7]:
def preprocess_drug_df(drug_df):
    useless_mask = (drug_df['usefulCount'] == 0)
    drug_df['tempCount'] = drug_df['usefulCount'] + useless_mask  # makes sure that log of the value is defined and sets it to 1 when taking the log
    drug_df['usefulScore'] = np.log(drug_df['tempCount'])
    max_score = np.max(drug_df['usefulScore'])
    drug_df['usefulScore'] = drug_df['usefulScore'] / max_score  # normalizing useful score to be in between 0 and 1
    drug_df['date'] = pd.to_datetime(drug_df['date'])
    recording_date = pd.Timestamp('2018-11-12')
    drug_df['age'] = (recording_date - drug_df['date']).dt.days
    all_drug_names = list(set(drug_df['drugName']))
    drug_name_to_idx = {drug_name: idx for idx, drug_name in enumerate(all_drug_names)}
    drug_df['drugId'] = drug_df['drugName'].replace(drug_name_to_idx)
    all_condition_names = list(set(drug_df['condition']))
    condition_name_to_idx = {condition_name: idx for idx, condition_name in enumerate(all_condition_names)}
    drug_df['conditionId'] = drug_df['condition'].replace(condition_name_to_idx)
    return drug_df


In [10]:
train_drug_df, train_drug_val = load_data(train_drug_df, year_range=[2013, 2017], usefulCount_quantile=0.99)

In [11]:
train_drug_df

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,ADHD,Acne,Anxiety,Bipolar Disorde,Birth Control,Depression,Insomnia,Obesity,Pain,Weight Loss,cleanReview,usefulScoreLog,usefulCountCapped,usefulCountCappedNormalized,ratingNormalized,daysOld,ageScore,usefulCountClass,ADHD.1,Acne.1,Anxiety.1,Bipolar Disorde.1,Birth Control.1,Depression.1,Insomnia.1,Obesity.1,Pain.1,Weight Loss.1
82273,148765,Mirena,Birth Control,"""About a year ago the gynacology doctors recom...",1,2016-06-05,2,0,0,0,0,1,0,0,0,0,0,About a year ago the gynacology doctors recomm...,0.109662,2,0.012346,0.1,555.0,0.307309,0,0,0,0,0,1,0,0,0,0,0
56165,185423,Saxenda,Obesity,"""I started saxenda on May 21, by June 23 I hav...",10,2017-07-01,21,0,0,0,0,0,0,0,1,0,0,"I started saxenda on May 21, by June 23 I have...",0.481670,21,0.129630,1.0,164.0,0.090808,2,0,0,0,0,0,0,0,1,0,0
20981,78348,Lyza,Birth Control,"""I&#039;m not gonna go into detail because it ...",10,2016-03-22,22,0,0,0,0,1,0,0,0,0,0,I'm not gonna go into detail because it would ...,0.489030,22,0.135802,1.0,630.0,0.348837,2,0,0,0,0,1,0,0,0,0,0
66822,63373,Epiduo,Acne,"""I started this medicine when I was fairly you...",2,2017-04-12,0,0,1,0,0,0,0,0,0,0,0,I started this medicine when I was fairly youn...,0.000000,0,0.000000,0.2,244.0,0.135105,0,0,1,0,0,0,0,0,0,0,0
48773,131912,Effexor XR,Depression,"""I have been on this medication for 12 mths no...",9,2013-05-13,31,0,0,0,0,0,1,0,0,0,0,I have been on this medication for 12 mths now...,0.543286,31,0.191358,0.9,1674.0,0.926910,3,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67418,73344,Ethinyl estradiol / norethindrone,Birth Control,"""I am 20 years old and here&#039;s my experien...",5,2014-02-12,17,0,0,0,0,1,0,0,0,0,0,I am 20 years old and here's my experience wit...,0.448239,17,0.104938,0.5,1399.0,0.774640,2,0,0,0,0,1,0,0,0,0,0
89823,47781,Adapalene,Acne,"""I used differen when it first arrived on the ...",8,2017-02-18,23,0,1,0,0,0,0,0,0,0,0,I used differen when it first arrived on the m...,0.496062,23,0.141975,0.8,297.0,0.164452,2,0,1,0,0,0,0,0,0,0,0
55175,15567,Ethinyl estradiol / norethindrone,Birth Control,"""I had mild water retention (about 5lbs), mode...",5,2017-01-30,6,0,0,0,0,1,0,0,0,0,0,"I had mild water retention (about 5lbs), moder...",0.283472,6,0.037037,0.5,316.0,0.174972,1,0,0,0,0,1,0,0,0,0,0
18022,177842,Duloxetine,Depression,"""Cymbalta is so far the only medication that h...",9,2013-05-08,55,0,0,0,0,0,1,0,0,0,0,Cymbalta is so far the only medication that ha...,0.633995,55,0.339506,0.9,1679.0,0.929679,3,0,0,0,0,0,1,0,0,0,0


In [12]:
train_drug_val

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,ADHD,Acne,Anxiety,Bipolar Disorde,Birth Control,Depression,Insomnia,Obesity,Pain,Weight Loss,cleanReview,usefulScoreLog,usefulCountCapped,usefulCountCappedNormalized,ratingNormalized,daysOld,ageScore,usefulCountClass,ADHD.1,Acne.1,Anxiety.1,Bipolar Disorde.1,Birth Control.1,Depression.1,Insomnia.1,Obesity.1,Pain.1,Weight Loss.1
115547,132932,Loestrin 24 Fe,Acne,"""This birth control pill has negatively impact...",1,2015-05-08,7,0,1,0,0,0,0,0,0,0,0,This birth control pill has negatively impacte...,0.307860,7,0.043210,0.1,949.0,0.525471,1,0,1,0,0,0,0,0,0,0,0
13074,73731,Ethinyl estradiol / norethindrone,Birth Control,"""I was on this birth control for 8 months. The...",7,2013-04-10,4,0,0,0,0,1,0,0,0,0,0,I was on this birth control for 8 months. The ...,0.219324,4,0.024691,0.7,1707.0,0.945183,1,0,0,0,0,1,0,0,0,0,0
88841,219764,Amitriptyline,Pain,"""I was diagnosed with severe anterior bridging...",10,2017-02-06,77,0,0,0,0,0,0,0,0,1,0,I was diagnosed with severe anterior bridging ...,0.687227,77,0.475309,1.0,309.0,0.171096,3,0,0,0,0,0,0,0,0,1,0
112893,24076,Tretinoin,Acne,"""I&#039;m 13 years old with lots of acne i do ...",10,2016-05-12,5,0,1,0,0,0,0,0,0,0,0,I'm 13 years old with lots of acne i do lots o...,0.254627,5,0.030864,1.0,579.0,0.320598,1,0,1,0,0,0,0,0,0,0,0
21172,12224,Loryna,Birth Control,"""I&#039;ve been on this BC for a month now, th...",10,2017-02-17,7,0,0,0,0,1,0,0,0,0,0,"I've been on this BC for a month now, the firs...",0.307860,7,0.043210,1.0,298.0,0.165006,1,0,0,0,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98244,223562,Sonata,Insomnia,"""My doctor prescribed me 10mg sonata for my in...",7,2016-06-19,30,0,0,0,0,0,0,1,0,0,0,My doctor prescribed me 10mg sonata for my ins...,0.538099,30,0.185185,0.7,541.0,0.299557,2,0,0,0,0,0,0,1,0,0,0
27442,103841,Ethinyl estradiol / levonorgestrel,Birth Control,"""I started taking Jolessa Lo October 16, 2014....",6,2015-01-22,0,0,0,0,0,1,0,0,0,0,0,"I started taking Jolessa Lo October 16, 2014. ...",0.000000,0,0.000000,0.6,1055.0,0.584164,0,0,0,0,0,1,0,0,0,0,0
104866,216054,Copper,Birth Control,"""Just had mine put in today and it wasn&#039;t...",9,2015-12-16,6,0,0,0,0,1,0,0,0,0,0,Just had mine put in today and it wasn't as ba...,0.283472,6,0.037037,0.9,727.0,0.402547,1,0,0,0,0,1,0,0,0,0,0
32387,157808,Accutane,Acne,"""I&#039;ve been on accutane 3 months now. I&#0...",7,2015-03-14,9,0,1,0,0,0,0,0,0,0,0,I've been on accutane 3 months now. I've had p...,0.347620,9,0.055556,0.7,1004.0,0.555925,1,0,1,0,0,0,0,0,0,0,0


In [13]:
def get_target_data(drug_df):
    return torch.tensor(drug_df['usefulCountCappedNormalized'].to_numpy(dtype=np.float), dtype=torch.float).unsqueeze(1)

In [14]:
def get_non_textual_data(drug_df):
    # non_textual_cols = ['rating', 'tempCount', 'age', 'drugId', 'conditionId']
    non_textual_cols = ['ADHD', 'Acne', 'Anxiety', 'Bipolar Disorde', 'Birth Control', 'Depression', 'Insomnia', 'Obesity', 'Pain', 'Weight Loss', 'ratingNormalized']
    return torch.tensor(drug_df[non_textual_cols].to_numpy(dtype=np.float), dtype=torch.float), get_target_data(drug_df)

In [15]:
def get_unescaped_word_to_idx_mapping(drug_df):
    textual_data = drug_df['review'].to_list()
    word_split_by_sentence = [html.unescape(curr_sentence).split(' ') for curr_sentence in textual_data]
    all_text = ' '.join(textual_data)
    all_unescaped_text = html.unescape(all_text)
    all_unescaped_words = list(set(all_unescaped_text.split(' ')))
    # bow_matrix = torch.sparse_coo_tensor(size=(len(word_split_by_sentence), len(all_unescaped_words)))
    # bow_matrix = torch.zeros((len(word_split_by_sentence), len(all_unescaped_words)))
    unescaped_word_to_idx_mapping = {word: idx for idx, word in enumerate(all_unescaped_words)}
    return unescaped_word_to_idx_mapping

In [16]:
def flip_mapping(original_mapping):
    return {val: key for key, val in original_mapping.items()}

In [17]:
def get_bow_rep_data(drug_df, unescaped_word_to_idx_mapping):
    textual_data = drug_df['review'].to_list()
    idx_val_mapping = {}
    word_split_by_sentence = [html.unescape(curr_sentence).split(' ') for curr_sentence in textual_data]
    curr_sentence_idx = 0
    for curr_sentence in tqdm.tqdm(word_split_by_sentence):
        for curr_word in curr_sentence:
            idx_val_mapping[(curr_sentence_idx, unescaped_word_to_idx_mapping[curr_word])] = idx_val_mapping.get((curr_sentence_idx, unescaped_word_to_idx_mapping[curr_word]), 0) + 1
            # if idx_val_mapping.get((curr_sentence_idx, unescaped_word_to_idx_mapping[curr_word]), 0) > 500:
            #     print(curr_word, curr_sentence)
            # print(bow_matrix[curr_sentence_idx, unescaped_word_to_idx_mapping[curr_word]])
            # bow_matrix[curr_sentence_idx, unescaped_word_to_idx_mapping[curr_word]] = 1
            # bow_matrix[curr_sentence_idx, unescaped_word_to_idx_mapping[curr_word]] += 1
        curr_sentence_idx += 1
    # transpose of BoW representation
    # num_occurrences_per_sentence = torch.tensor([[curr_sentence.count(curr_word) for curr_sentence in word_split_by_sentence] for curr_word in tqdm.tqdm(all_unescaped_words)])
    idx_list = [[], []]
    val_list = []
    for (curr_idx_0, curr_idx_1), curr_val in idx_val_mapping.items():
        idx_list[0].append(curr_idx_0)
        idx_list[1].append(curr_idx_1)
        val_list.append(curr_val)
    return torch.sparse_coo_tensor(idx_list, val_list, (len(word_split_by_sentence), len(unescaped_word_to_idx_mapping)), dtype=torch.float), get_target_data(drug_df)

In [18]:
unescaped_word_to_idx_mapping = get_unescaped_word_to_idx_mapping(train_drug_df)
get_bow_rep_data(train_drug_df, unescaped_word_to_idx_mapping)

100%|██████████| 20277/20277 [00:01<00:00, 13972.15it/s]


(tensor(indices=tensor([[    0,     0,     0,  ..., 20276, 20276, 20276],
                        [25018, 12926, 23797,  ..., 21761, 55011,  1156]]),
        values=tensor([1., 2., 2.,  ..., 1., 1., 1.]),
        size=(20277, 67800), nnz=1462089, layout=torch.sparse_coo),
 tensor([[0.0123],
         [0.1296],
         [0.1358],
         ...,
         [0.0370],
         [0.3395],
         [0.4753]]))

## Util Function

In [32]:
def r2_loss(output, target):
    target_mean = torch.mean(target)
    ss_tot = torch.sum((target - target_mean) ** 2)
    ss_res = torch.sum((target - output) ** 2)
    r2 = 1 - ss_res / ss_tot
    return r2

# Baseline Models

In [40]:
class DrugLinearRegression(nn.Module):
    def __init__(self, HIDDEN_DIM=5, INPUT_LAYER_DIM=11, single_layer=False):
        super().__init__()
        self.HIDDEN_DIM = HIDDEN_DIM
        if single_layer:
            self.input_layer = nn.Linear(INPUT_LAYER_DIM, 1, device='cuda')
            self.layers = nn.Sequential(
                self.input_layer
            )
        else:
            self.input_layer = nn.Linear(INPUT_LAYER_DIM, self.HIDDEN_DIM, device='cuda')
            self.activation = nn.LeakyReLU().cuda()
            self.output_layer = nn.Linear(self.HIDDEN_DIM, 1, device='cuda')
            self.layers = nn.Sequential(
                self.input_layer,
                self.activation,
                self.output_layer
            )
    
    def forward(self, X):
        return self.layers(X)


In [24]:
def train(model, optimizer, criterion, X, Y, num_epochs=10000):
    with tqdm.trange(num_epochs) as progress_bar:
        for curr_epoch in progress_bar:
            optimizer.zero_grad()
            pred = model.forward(X)
            loss = criterion(pred, Y)
            loss.backward()
            optimizer.step()
            progress_bar.set_postfix(loss=loss)


## Linear Regression without Textual Data

In [33]:
baseline_model_1 = DrugLinearRegression(INPUT_LAYER_DIM=21)
train_X, train_Y = get_non_textual_data(train_drug_df)
test_X, test_Y = get_non_textual_data(train_drug_val)
optim_fn = torch.optim.Adam(baseline_model_1.parameters())
loss_fn = nn.MSELoss().cuda()
train(baseline_model_1, optim_fn, loss_fn, train_X.cuda(), train_Y.cuda(), num_epochs=2000)
print(f"Test RMSE Loss: {math.sqrt(loss_fn(baseline_model_1.forward(test_X.cuda()), test_Y.cuda()))}")
print(f"Test MAE Loss: {nn.L1Loss().cuda()(baseline_model_1.forward(test_X.cuda()), test_Y.cuda())}")
print(f"Test R2 Loss: {sklearn.metrics.r2_score(baseline_model_1.forward(test_X.cuda()).cpu().detach().numpy(), test_Y.numpy())}")

100%|██████████| 2000/2000 [00:09<00:00, 200.75it/s, loss=tensor(0.0231, device='cuda:0', grad_fn=<MseLossBackward0>)]


Test RMSE Loss: 0.1569445907072588
Test MAE Loss: 0.09748319536447525
Test R2 Loss: -1.072597384873732


In [34]:
weights = baseline_model_1.input_layer.weight[0]
total_weight = abs(sum(weights))
weights / total_weight
baseline_model_1.input_layer.bias / total_weight

tensor([-0.0312, -0.2231, -0.0569,  0.2069, -0.1833], device='cuda:0',
       grad_fn=<DivBackward0>)

In [35]:
torch.save(baseline_model_1.state_dict(), 'baseline_model_1.pt')

## Linear Regression with Bag of Words (BoW)

In [36]:
train_idx, valid_idx = train_test_split(np.arange(train_drug_df.shape[0]), test_size=0.2, random_state=42, shuffle=True)
split_train_drug_df, split_val_drug_df = train_drug_df.iloc[train_idx], train_drug_df.iloc[valid_idx]
word_to_idx_mapping = get_unescaped_word_to_idx_mapping(train_drug_df)
idx_to_word_mapping = flip_mapping(word_to_idx_mapping)

train_X, train_Y = get_bow_rep_data(split_train_drug_df, word_to_idx_mapping)
val_X, val_Y = get_bow_rep_data(split_val_drug_df, word_to_idx_mapping)

100%|██████████| 16221/16221 [00:01<00:00, 15015.74it/s]
100%|██████████| 4056/4056 [00:00<00:00, 16057.89it/s]


In [37]:
baseline_model_2 = DrugLinearRegression(INPUT_LAYER_DIM=len(word_to_idx_mapping))

optim_fn = torch.optim.Adam(baseline_model_2.parameters())
loss_fn = nn.MSELoss().cuda()
train(baseline_model_2, optim_fn, loss_fn, train_X.cuda(), train_Y.cuda(), num_epochs=60)
print(f"Test RMSE Loss: {math.sqrt(loss_fn(baseline_model_2.forward(val_X.cuda()), val_Y.cuda()))}")
print(f"Test MAE Loss: {nn.L1Loss().cuda()(baseline_model_2.forward(val_X.cuda()), val_Y.cuda())}")
print(f"Test R2 Loss: {sklearn.metrics.r2_score(baseline_model_2.forward(val_X.cuda()).cpu().detach().numpy(), val_Y.numpy())}")

100%|██████████| 60/60 [00:00<00:00, 88.18it/s, loss=tensor(0.0068, device='cuda:0', grad_fn=<MseLossBackward0>)]


Test RMSE Loss: 0.15927820141138635
Test MAE Loss: 0.10204453766345978
Test R2 Loss: -0.8003863661053547


### Linear Regression with Bag of Words (Analysis of words with highest and lowest impact on usefulness)

In [41]:
baseline_model_2 = DrugLinearRegression(INPUT_LAYER_DIM=len(word_to_idx_mapping), single_layer=True)

optim_fn = torch.optim.Adam(baseline_model_2.parameters())
loss_fn = nn.MSELoss().cuda()
train(baseline_model_2, optim_fn, loss_fn, train_X.cuda(), train_Y.cuda(), num_epochs=60)
print(f"Test RMSE Loss: {math.sqrt(loss_fn(baseline_model_2.forward(val_X.cuda()), val_Y.cuda()))}")
print(f"Test MAE Loss: {nn.L1Loss().cuda()(baseline_model_2.forward(val_X.cuda()), val_Y.cuda())}")
print(f"Test R2 Loss: {sklearn.metrics.r2_score(baseline_model_2.forward(val_X.cuda()).cpu().detach().numpy(), val_Y.numpy())}")

100%|██████████| 60/60 [00:00<00:00, 100.65it/s, loss=tensor(0.0052, device='cuda:0', grad_fn=<MseLossBackward0>)]


Test RMSE Loss: 0.16777247227667552
Test MAE Loss: 0.11633790284395218
Test R2 Loss: -0.592664936343662


In [42]:
torch.save(baseline_model_2.state_dict(), 'baseline_model_2.pt')

In [45]:
top_10_words = [idx_to_word_mapping[curr_idx] for curr_idx in torch.topk((baseline_model_2.input_layer.weight), k=10).indices[0].tolist()]
bottom_10_words = [idx_to_word_mapping[curr_idx] for curr_idx in torch.topk((baseline_model_2.input_layer.weight), k=10, largest=False).indices[0].tolist()]
print(f"Top 10 most impactful words on usefulness: {', '.join(top_10_words)}")
print(f"Top 10 least impactful words on usefulness: {', '.join(bottom_10_words)}")

Top 10 most impactful words on usefulness: "225lbs, Literally.", cravings!", 180lbs, Levels, rate/blood, accutane.", lifestyle.", Xanax/alprazolam, talking."
Top 10 least impactful words on usefulness: breeze!", tolerable", blackout, pounds;, side", bulimic, Contrace, depersonalize., tag, obsessed,


## Linear Regression with TF-IDF

In [46]:
def convert_scipy_sparse_to_torch_sparse(original_matrix):
    original_matrix = original_matrix.tocoo()
    original_matrix = torch.sparse_coo_tensor(np.vstack((original_matrix.row, original_matrix.col)), original_matrix.data, torch.Size(original_matrix.shape), dtype=torch.float)
    return original_matrix

In [47]:
train_drug_df_vectors = TfidfVectorizer().fit_transform(train_drug_df['review'])
train_drug_df_vectors

<20277x20883 sparse matrix of type '<class 'numpy.float64'>'
	with 1324741 stored elements in Compressed Sparse Row format>

In [48]:
train_idx, valid_idx = train_test_split(np.arange(train_drug_df.shape[0]), test_size=0.2, random_state=42, shuffle=True)
train_X, val_X = convert_scipy_sparse_to_torch_sparse(train_drug_df_vectors[train_idx]), convert_scipy_sparse_to_torch_sparse(train_drug_df_vectors[valid_idx])

train_Y, val_Y = get_target_data(train_drug_df.iloc[train_idx]), get_target_data(train_drug_df.iloc[valid_idx])

In [49]:
baseline_model_3 = DrugLinearRegression(INPUT_LAYER_DIM=train_drug_df_vectors.shape[1])

optim_fn = torch.optim.Adam(baseline_model_3.parameters())
loss_fn = nn.MSELoss().cuda()
train(baseline_model_3, optim_fn, loss_fn, train_X.cuda(), train_Y.cuda(), num_epochs=60)
print(f"Test RMSE Loss: {math.sqrt(loss_fn(baseline_model_3.forward(val_X.cuda()), val_Y.cuda()))}")
print(f"Test MAE Loss: {nn.L1Loss().cuda()(baseline_model_3.forward(val_X.cuda()), val_Y.cuda())}")
print(f"Test R2 Loss: {sklearn.metrics.r2_score(baseline_model_3.forward(val_X.cuda()).cpu().detach().numpy(), val_Y.numpy())}")

100%|██████████| 60/60 [00:00<00:00, 96.63it/s, loss=tensor(0.0283, device='cuda:0', grad_fn=<MseLossBackward0>)]


Test RMSE Loss: 0.17968485664413653
Test MAE Loss: 0.1193474754691124
Test R2 Loss: -11.532565238250571
