In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import transformers
from transformers import AutoModel
import sys
import math
sys.path.insert(0,'..')

from utils.preprocessing import load_data
from utils.transformer_dataset import ReviewDataset
from utils.training import train_text_model, train_text_meta_model
from models.transformer_models import UsefulScoreRegressorTextOnly, UsefulScoreRegressorAllFeat

In [2]:
# Load data
# load_data is a project helper imported from utils.preprocessing; here it is unpacked into
# a (train, val) pair read from the raw Drugs.com training CSV.
# NOTE(review): presumably it also derives the cleaned-text, score, and one-hot condition
# columns that show up when the frame is displayed — confirm in utils/preprocessing.py.
train, val = load_data('../data/drugsComTrain_raw.csv')

In [3]:
# See dataframe
# A bare trailing expression makes the notebook render `train` (the first value returned by
# load_data). The recorded output is abbreviated with '...' rows and columns, so not every
# column is visible there.
train

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,cleanReview,usefulScore,ratingNormalized,...,ADHD,Acne,Anxiety,Bipolar Disorde,Birth Control,Depression,Insomnia,Obesity,Pain,Weight Loss
126080,110122,Nexplanon,Birth Control,"""Hello, Ive had Nexplanon for four years (just...",8,2016-04-19,9,"Hello, Ive had Nexplanon for four years (just ...",0.306739,0.8,...,0,0,0,0,1,0,0,0,0,0
123803,6499,Wellbutrin XL,Depression,"""I started taking Wellbutrin XL August of 2016...",10,2017-09-15,16,I started taking Wellbutrin XL August of 2016....,0.387062,1.0,...,0,0,0,0,0,1,0,0,0,0
35971,39194,Contrave,Weight Loss,"""I tried Contrave for 5 days. I was just takin...",5,2017-04-19,8,I tried Contrave for 5 days. I was just taking...,0.290296,0.5,...,0,0,0,0,0,0,0,0,0,1
38384,137414,Isotretinoin,Acne,"""Accutane is an isotretinoin- the most powerfu...",10,2010-12-13,13,Accutane is an isotretinoin- the most powerful...,0.358074,1.0,...,0,1,0,0,0,0,0,0,0,0
89258,211662,Lamotrigine,Bipolar Disorde,"""I started on Lamictal after having manic epis...",10,2017-07-10,39,I started on Lamictal after having manic episo...,0.511444,1.0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25083,104070,Ethinyl estradiol / levonorgestrel,Birth Control,"""My favorite birth control by far! I have had ...",10,2013-11-13,22,My favorite birth control by far! I have had n...,0.431519,1.0,...,0,0,0,0,1,0,0,0,0,0
129693,210966,Mirtazapine,Anxiety,"""At 19 I went on Sertraline as I&#039;d strugg...",9,2015-11-29,65,At 19 I went on Sertraline as I'd struggled wi...,0.582757,0.9,...,0,0,1,0,0,0,0,0,0,0
62647,220149,Amitriptyline,Pain,"""I&#039;m 20 years old and have both fibromyal...",10,2015-08-17,100,I'm 20 years old and have both fibromyalgia an...,0.642895,1.0,...,0,0,0,0,0,0,0,0,1,0
147595,45327,Fluoxetine,Depression,"""I was prescribed prozac for depression about ...",8,2015-08-10,13,I was prescribed prozac for depression about 3...,0.358074,0.8,...,0,0,0,0,0,1,0,0,0,0


#### Develop RoBERTa Model

In [4]:
##### Build the PyTorch datasets and batch loaders
# Non-text model inputs. The condition names are 0/1 indicator columns of the frame;
# 'Bipolar Disorde' is spelled exactly as that column is named, so do not "fix" it here.
# NOTE(review): 'ageScore' is not visible in the abbreviated frame output — confirm that
# load_data actually produces it.
nonTextCols = [
    'ratingNormalized', 'ageScore',
    'ADHD', 'Acne', 'Anxiety', 'Bipolar Disorde', 'Birth Control',
    'Depression', 'Insomnia', 'Obesity', 'Pain', 'Weight Loss',
]
targetCol = 'usefulScore'

# One ReviewDataset per split, both configured with the same 'roberta-base' identifier,
# feature columns, and target column (train is built first, then val).
trainset, valset = (
    ReviewDataset(split, 'roberta-base', nonTextCols, targetCol)
    for split in (train, val)
)

# Only the training batches are shuffled; validation is iterated in a fixed order.
train_loader = DataLoader(trainset, batch_size=8, shuffle=True)
val_loader = DataLoader(valset, batch_size=8, shuffle=False)

#### Train Models (Frozen Transformer Weights)

#### Text-only Model

In [9]:
##### Text-only Transformer Model
# Pretrained 'roberta-base' backbone.
encoder = AutoModel.from_pretrained('roberta-base', return_dict=True)

# Freeze encoder parameters to avoid CUDA out of memory: gradient tracking is switched
# off for every encoder weight in a single call.
encoder.requires_grad_(False)

# Wrap the frozen encoder in the text-only regressor and move it to the GPU.
model = UsefulScoreRegressorTextOnly(encoder).cuda()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

# Ten epochs of training; the model is saved to save_path.
train_text_model(
    num_epochs=10,
    model=model,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    save_path='../models/RoBERTa_Frozen_TextOnly.pt',
)

Epoch 0, val loss: inf -> 0.01180, train loss: 0.01441
Epoch 1, val loss: 0.01180 -> 0.00757, train loss: 0.00808
Epoch 2, val loss: 0.00932, train loss: 0.00785
Epoch 3, val loss: 0.00757 -> 0.00580, train loss: 0.00399
Epoch 4, val loss: 0.00739, train loss: 0.00597
Epoch 5, val loss: 0.00608, train loss: 0.00459
Epoch 6, val loss: 0.00598, train loss: 0.00496
Epoch 7, val loss: 0.00580 -> 0.00551, train loss: 0.00491
Epoch 8, val loss: 0.00563, train loss: 0.00391
Epoch 9, val loss: 0.00551 -> 0.00495, train loss: 0.00455


#### Text + Metadata model

In [5]:
##### Text + metadata Transformer Model
# Fresh pretrained 'roberta-base' backbone for this run.
encoder = AutoModel.from_pretrained('roberta-base', return_dict=True)

# Freeze encoder parameters to avoid CUDA out of memory: gradient tracking is switched
# off for every encoder weight in a single call.
encoder.requires_grad_(False)

# The regressor is told how many metadata features it receives (one per nonTextCols entry)
# and is then moved to the GPU.
model = UsefulScoreRegressorAllFeat(encoder, num_meta_feats=len(nonTextCols)).cuda()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

# Ten epochs of training; the model is saved to save_path.
train_text_meta_model(
    num_epochs=10,
    model=model,
    optimizer=optimizer,
    train_loader=train_loader,
    val_loader=val_loader,
    criterion=criterion,
    save_path='../models/RoBERTa_Frozen_TextMeta.pt',
)

Epoch 0, val loss: inf -> 0.00498, train loss: 0.01852
Epoch 1, val loss: 0.00498 -> 0.00449, train loss: 0.00508
Epoch 2, val loss: 0.00603, train loss: 0.00491
Epoch 3, val loss: 0.00449 -> 0.00402, train loss: 0.00335
Epoch 4, val loss: 0.00402 -> 0.00369, train loss: 0.00325
Epoch 5, val loss: 0.00426, train loss: 0.00329
Epoch 6, val loss: 0.00421, train loss: 0.00307
Epoch 7, val loss: 0.00369 -> 0.00348, train loss: 0.00279
Epoch 8, val loss: 0.00543, train loss: 0.00286
Epoch 9, val loss: 0.00400, train loss: 0.00379


In [None]:
#### TODO: Check how the model performs across each metadata feature group.
#### Performance by review age is especially important: verify that the model does well on recently posted reviews,
#### because in practice newly posted reviews are the ones the model would help prioritize.

#### Train Models (Unfrozen Transformer Weights)

#### Text-only Model

In [None]:
##### Text-only Transformer Model
# Nothing is frozen in this cell, so the pretrained encoder is fine-tuned together with
# the regressor built on top of it.
encoder = AutoModel.from_pretrained('roberta-base', return_dict=True)

model = UsefulScoreRegressorTextOnly(encoder)
model = model.cuda()
criterion = nn.MSELoss()

# Adam's default learning rate (1e-3) is far above the range normally used when updating
# pretrained transformer weights and tends to wipe out the pretrained representation.
# Use a small explicit rate instead.
# NOTE(review): 2e-5 is a conventional starting point, not a tuned value.
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# NOTE(review): the frozen-weight cells give CUDA out-of-memory as the reason for freezing,
# so this run may need a smaller batch size or more GPU memory — confirm before running.
train_text_model(num_epochs=10, model=model, optimizer=optimizer,
                 train_loader=train_loader, val_loader=val_loader,
                 criterion=criterion, save_path='../models/RoBERTa_Free_TextOnly.pt')

#### Text + Metadata Model

In [None]:
##### Text + metadata Transformer Model
# Nothing is frozen in this cell, so the pretrained encoder is fine-tuned together with
# the text + metadata regressor built on top of it.
encoder = AutoModel.from_pretrained('roberta-base', return_dict=True)

# One metadata input per entry in nonTextCols.
model = UsefulScoreRegressorAllFeat(encoder, num_meta_feats=len(nonTextCols))
model = model.cuda()
criterion = nn.MSELoss()

# Adam's default learning rate (1e-3) is far above the range normally used when updating
# pretrained transformer weights and tends to wipe out the pretrained representation.
# Use a small explicit rate instead.
# NOTE(review): 2e-5 is a conventional starting point, not a tuned value.
optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)

# NOTE(review): the frozen-weight cells give CUDA out-of-memory as the reason for freezing,
# so this run may need a smaller batch size or more GPU memory — confirm before running.
train_text_meta_model(num_epochs=10, model=model, optimizer=optimizer,
                      train_loader=train_loader, val_loader=val_loader,
                      criterion=criterion, save_path='../models/RoBERTa_Free_TextMeta.pt')