In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import transformers
from transformers import AutoModel
import sys
import math
sys.path.insert(0,'..')

from utils.preprocessing import load_data
from utils.transformer_dataset import ReviewDataset
from models.transformer_models import UsefulScoreRegressorTextOnly

In [2]:
# Load data
train, val = load_data('../data/drugsComTrain_raw.csv')

In [3]:
# See dataframe
train

Unnamed: 0,uniqueID,drugName,condition,review,rating,date,usefulCount,cleanReview,usefulScore,ratingNormalized,...,ADHD,Acne,Anxiety,Bipolar Disorde,Birth Control,Depression,Insomnia,Obesity,Pain,Weight Loss
126080,110122,Nexplanon,Birth Control,"""Hello, Ive had Nexplanon for four years (just...",8,2016-04-19,9,"Hello, Ive had Nexplanon for four years (just ...",0.306739,0.8,...,0,0,0,0,1,0,0,0,0,0
123803,6499,Wellbutrin XL,Depression,"""I started taking Wellbutrin XL August of 2016...",10,2017-09-15,16,I started taking Wellbutrin XL August of 2016....,0.387062,1.0,...,0,0,0,0,0,1,0,0,0,0
35971,39194,Contrave,Weight Loss,"""I tried Contrave for 5 days. I was just takin...",5,2017-04-19,8,I tried Contrave for 5 days. I was just taking...,0.290296,0.5,...,0,0,0,0,0,0,0,0,0,1
38384,137414,Isotretinoin,Acne,"""Accutane is an isotretinoin- the most powerfu...",10,2010-12-13,13,Accutane is an isotretinoin- the most powerful...,0.358074,1.0,...,0,1,0,0,0,0,0,0,0,0
89258,211662,Lamotrigine,Bipolar Disorde,"""I started on Lamictal after having manic epis...",10,2017-07-10,39,I started on Lamictal after having manic episo...,0.511444,1.0,...,0,0,0,1,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25083,104070,Ethinyl estradiol / levonorgestrel,Birth Control,"""My favorite birth control by far! I have had ...",10,2013-11-13,22,My favorite birth control by far! I have had n...,0.431519,1.0,...,0,0,0,0,1,0,0,0,0,0
129693,210966,Mirtazapine,Anxiety,"""At 19 I went on Sertraline as I&#039;d strugg...",9,2015-11-29,65,At 19 I went on Sertraline as I'd struggled wi...,0.582757,0.9,...,0,0,1,0,0,0,0,0,0,0
62647,220149,Amitriptyline,Pain,"""I&#039;m 20 years old and have both fibromyal...",10,2015-08-17,100,I'm 20 years old and have both fibromyalgia an...,0.642895,1.0,...,0,0,0,0,0,0,0,0,1,0
147595,45327,Fluoxetine,Depression,"""I was prescribed prozac for depression about ...",8,2015-08-10,13,I was prescribed prozac for depression about 3...,0.358074,0.8,...,0,0,0,0,0,1,0,0,0,0


#### Develop BERT Model

In [4]:
##### Create pytorch dataset
nonTextCols = ['ratingNormalized', 'ageScore', 'ADHD', 'Acne', 'Anxiety', 'Bipolar Disorde',
                'Birth Control', 'Depression', 'Insomnia', 'Obesity', 'Pain', 'Weight Loss']
targetCol = 'usefulScore'

trainset = ReviewDataset(train, 'roberta-base', nonTextCols, targetCol)
valset = ReviewDataset(val, 'roberta-base', nonTextCols, targetCol)
train_loader = DataLoader(dataset=trainset, batch_size=16, shuffle=True)
val_loader = DataLoader(dataset=valset, batch_size=16, shuffle=False)

In [5]:
##### Develop training loop

# Next steps: load model and optimizer. Develop training loop.

def train_model(num_epochs=1, model=None, optimizer=None,
                train_loader=None, val_loader=None,
                criterion=None, save_path=None):
    
    best_val_loss = float(math.inf)
    for epoch in range(num_epochs):
        avg_train_loss = 0
        tot_train_loss = 0
        tot_train_samples = 0
        
        model.train()
        for i, batch in enumerate(train_loader):
            batch = [el.cuda() for el in batch]
            tokens, attention_mask, nonText, target = batch
            optimizer.zero_grad()
            output = model(tokens, attention_mask).squeeze(1)
            train_loss = criterion(output, target)
            tot_train_loss += train_loss.item()
            tot_train_samples += tokens.shape[0]
            
            train_loss.backward()  # get gradients
            
            optimizer.step()  # update weights
            
        avg_train_loss = tot_train_loss / tot_train_samples
        
        # Val loss
        avg_val_loss = 0
        tot_val_loss = 0
        tot_val_samples = 0
        
        model.eval()
        for i, batch in enumerate(val_loader):
            batch = [el.cuda() for el in batch]
            tokens, attention_mask, nonText, target = batch
            optimizer.zero_grad()
            output = model(tokens, attention_mask).squeeze(1)  # get outputs
            val_loss = criterion(output, target)
            tot_val_loss += val_loss.item()
            tot_val_samples += tokens.shape[0]
            
        avg_val_loss = tot_val_loss / tot_val_samples
        
        if (avg_val_loss < best_val_loss):
            torch.save(model, save_path)
            print (f'Epoch {epoch}, val loss: {best_val_loss:.5f} -> {avg_val_loss:.5f}, train loss: {avg_train_loss:.5f}')
            best_val_loss = avg_val_loss
        else:
            print (f'Epoch {epoch}, val loss: {avg_val_loss:.5f}, train loss: {avg_train_loss:.5f}')
            
    
encoder = AutoModel.from_pretrained('roberta-base', return_dict=True)

# Freeze encoder parameters to avoid CUDA out of memory.
for param in encoder.parameters():
    param.requires_grad = False

model = UsefulScoreRegressorTextOnly(encoder)
model = model.cuda()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters())

train_model(num_epochs=10, model=model, optimizer=optimizer,
            train_loader=train_loader, val_loader=val_loader,
            criterion=criterion, save_path='../models/test_model.pt')

Epoch 0, val loss: inf -> 0.00135, train loss: 0.00168
Epoch 1, val loss: 0.00141, train loss: 0.00152
Epoch 2, val loss: 0.00135 -> 0.00133, train loss: 0.00150


KeyboardInterrupt: 