In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import transformers
from transformers import AutoModel
import sys
import math
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn.metrics as perf
from scipy.stats import pearsonr
sys.path.insert(0,'..')

from utils.preprocessing import load_data
from utils.transformer_dataset import ReviewDataset
from utils.training import train_text_model, train_text_meta_model, train_meta_model
from utils.evaluation import get_cls_perf, get_reg_perf, get_predictions
from models.transformer_models import UsefulScoreRegressorTextOnly, UsefulScoreRegressorAllFeat, UsefulScoreRegressorMetaOnly
from models.transformer_models import UsefulScoreRegressorLinearBaseline, DrugLinearRegression

In [2]:
# Seed NumPy and PyTorch once, before any data loading or model construction,
# so that Restart & Run All repeats the same weight initialisation and
# DataLoader shuffling (torch.manual_seed seeds the default torch generator).
# NOTE(review): this also pins the train/val split only if load_data draws
# from one of these global RNGs — confirm in utils.preprocessing.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Metadata (non-text) feature columns handed to ReviewDataset and used to size
# the meta models via len(nonTextCols).
# NOTE(review): 'Bipolar Disorde' looks truncated but presumably has to match
# the column name produced by load_data — confirm before "fixing" the spelling.
nonTextCols = ['ADHD', 'Acne', 'Anxiety', 'Bipolar Disorde', 'Birth Control',
               'Depression', 'Insomnia', 'Obesity', 'Pain', 'Weight Loss', 'ratingNormalized']

# Name of the regression target column passed to ReviewDataset.
targetCol = 'usefulCountCappedNormalized'

# No usefulCount cap and no year filter

In [3]:
# Training file loaded with load_data's defaults: no year_range or
# usefulCount_quantile override is passed for this section.
train, val = load_data('../data/drugsComTrain_raw.csv')

# One ReviewDataset per split, built with identical arguments
# (the 'distilbert-base-uncased' string is presumably the tokenizer name — confirm).
trainset, valset = (
    ReviewDataset(split, 'distilbert-base-uncased', nonTextCols, targetCol)
    for split in (train, val)
)

# Only the training loader shuffles; validation keeps dataset order.
train_loader = DataLoader(trainset, shuffle=True, batch_size=8)
val_loader = DataLoader(valset, shuffle=False, batch_size=8)

## Train Neural-Meta, Text-Only and Text-Meta models to compare with the Linear Baseline

#### Neural Meta

In [4]:
# Metadata-only neural regressor, sized by the number of non-text columns and
# placed on the GPU.
model = UsefulScoreRegressorMetaOnly(num_meta_feats=len(nonTextCols)).cuda()
optimizer = torch.optim.Adam(model.parameters())  # Adam with library-default settings
criterion = nn.MSELoss()

train_meta_model(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=5,
    # NOTE(review): presumably a gradient-clipping threshold; at this size it
    # would effectively never trigger — confirm in utils.training.
    clip=10000.0,
    classify=False,
    save_path='../models/Neural_Meta.pt',
)

Epoch 0, val loss: inf -> 0.00009060, train loss: 0.00010412
Epoch 1, val loss: 0.00009225, train loss: 0.00009068
Epoch 2, val loss: 0.00009294, train loss: 0.00009031
Epoch 3, val loss: 0.00009060 -> 0.00008901, train loss: 0.00009000
Epoch 4, val loss: 0.00009021, train loss: 0.00008984


#### Text

In [3]:
# Pretrained DistilBERT encoder, frozen: requires_grad is switched off on
# every encoder parameter, so none of them receive gradients.
encoder = AutoModel.from_pretrained('distilbert-base-uncased', return_dict=True)
encoder.requires_grad_(False)

# Text-only regressor built around the frozen encoder and placed on the GPU.
model = UsefulScoreRegressorTextOnly(encoder).cuda()
optimizer = torch.optim.Adam(model.parameters())  # Adam with library-default settings
criterion = nn.MSELoss()

train_text_model(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=5,
    clip=1.0,  # NOTE(review): presumably the gradient-clipping threshold — confirm in utils.training
    classify=False,
    save_path='../models/distilBERT_Frozen_TextOnly.pt',
)

Epoch 0, val loss: inf -> 0.00009937, train loss: 0.00017522
Epoch 1, val loss: 0.00010440, train loss: 0.00010534
Epoch 2, val loss: 0.00010670, train loss: 0.00010561
Epoch 3, val loss: 0.00010261, train loss: 0.00010607
Epoch 4, val loss: 0.00014371, train loss: 0.00010637


KeyboardInterrupt: 

#### Text-Meta

In [4]:
# Pretrained DistilBERT encoder with all of its parameters frozen
# (requires_grad turned off), so they receive no gradients.
encoder = AutoModel.from_pretrained('distilbert-base-uncased', return_dict=True)
encoder.requires_grad_(False)

# Text + metadata regressor: takes the frozen encoder and the number of
# non-text feature columns, then moves to the GPU.
model = UsefulScoreRegressorAllFeat(encoder, num_meta_feats=len(nonTextCols)).cuda()
optimizer = torch.optim.Adam(model.parameters())  # Adam with library-default settings
criterion = nn.MSELoss()

train_text_meta_model(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=5,
    clip=1.0,  # NOTE(review): presumably the gradient-clipping threshold — confirm in utils.training
    classify=False,
    save_path='../models/distilBERT_Frozen_TextMeta.pt',
)

Epoch 0, val loss: inf -> 0.00009712, train loss: 0.00015047
Epoch 1, val loss: 0.00009712 -> 0.00009151, train loss: 0.00009867
Epoch 2, val loss: 0.00009151 -> 0.00008863, train loss: 0.00009960
Epoch 3, val loss: 0.00009045, train loss: 0.00009989
Epoch 4, val loss: 0.00009301, train loss: 0.00009969


## Evaluate Text and Text-Meta Models

In [4]:
# Held-out test file. load_data hands back two frames; they get test-specific
# names here so the `train` / `val` frames used for training are not silently
# replaced by test data in the kernel, and are then re-joined into one frame.
# NOTE(review): the training load for these models calls load_data with its
# defaults, while this call passes explicit year_range / usefulCount_range /
# usefulCount_quantile — confirm both apply the same filtering.
test_part_a, test_part_b = load_data('../data/drugsComTest_raw.csv', year_range=[2008, 2017],
                                     usefulCount_range=[0, 10000], usefulCount_quantile=None)
test = pd.concat((test_part_a, test_part_b), axis=0)

# Same dataset construction as for training; no shuffling at evaluation time.
testset = ReviewDataset(test, 'distilbert-base-uncased', nonTextCols, targetCol)
test_loader = DataLoader(dataset=testset, batch_size=8, shuffle=False)

#### Text

In [5]:
# Reload the text-only regressor from the path given as save_path to
# train_text_model. torch.load unpickles the stored object, so the model
# classes imported at the top of the notebook must be importable.
model = torch.load('../models/distilBERT_Frozen_TextOnly.pt')
model = model.cuda()
# Switch to inference mode so dropout layers are disabled. The module's
# `training` flag travels with the pickle, so this guards against a checkpoint
# saved in training mode; it is a no-op if get_reg_perf already does it.
model.eval()

# Regression metrics on the held-out test loader.
mae, rmse, r2 = get_reg_perf(model=model, loader=test_loader, model_type='text')
print(f'mae: {mae}')
print(f'rmse: {rmse}')
print(f'r2: {r2}')

mae: 0.02106926217675209
rmse: 0.03769624978303909
r2: 0.16104679665887278


#### Text-Meta

In [7]:
# Reload the text + metadata regressor from the path given as save_path to
# train_text_meta_model. torch.load unpickles the stored object, so the model
# classes imported at the top of the notebook must be importable.
model = torch.load('../models/distilBERT_Frozen_TextMeta.pt')
model = model.cuda()
# Switch to inference mode so dropout layers are disabled. The module's
# `training` flag travels with the pickle, so this guards against a checkpoint
# saved in training mode; it is a no-op if get_reg_perf already does it.
model.eval()

# Regression metrics on the held-out test loader.
mae, rmse, r2 = get_reg_perf(model=model, loader=test_loader, model_type='text-meta')
print(f'mae: {mae}')
print(f'rmse: {rmse}')
print(f'r2: {r2}')

mae: 0.01857740804553032
rmse: 0.03731502220034599
r2: 0.17792984195345796


# usefulCount capped at the 99th percentile, years 2009 to 2013 only

In [3]:
# Feature / target configuration for this section (same column list as the
# first section of the notebook).
nonTextCols = [
    'ADHD', 'Acne', 'Anxiety', 'Bipolar Disorde', 'Birth Control', 'Depression',
    'Insomnia', 'Obesity', 'Pain', 'Weight Loss', 'ratingNormalized',
]
targetCol = 'usefulCountCappedNormalized'

# Training file restricted via year_range=[2009, 2013] and usefulCount_quantile=0.99.
train, val = load_data(
    '../data/drugsComTrain_raw.csv',
    usefulCount_quantile=0.99,
    year_range=[2009, 2013],
)

# One ReviewDataset per split, built with identical arguments.
trainset, valset = (
    ReviewDataset(split, 'distilbert-base-uncased', nonTextCols, targetCol)
    for split in (train, val)
)

# Only the training loader shuffles; validation keeps dataset order.
train_loader = DataLoader(trainset, shuffle=True, batch_size=8)
val_loader = DataLoader(valset, shuffle=False, batch_size=8)

#### Text

In [4]:
# Pretrained DistilBERT encoder, frozen: requires_grad is switched off on
# every encoder parameter, so none of them receive gradients.
encoder = AutoModel.from_pretrained('distilbert-base-uncased', return_dict=True)
encoder.requires_grad_(False)

# Text-only regressor built around the frozen encoder and placed on the GPU.
model = UsefulScoreRegressorTextOnly(encoder).cuda()
optimizer = torch.optim.Adam(model.parameters())  # Adam with library-default settings
criterion = nn.MSELoss()

train_text_model(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=10,
    clip=1.0,  # NOTE(review): presumably the gradient-clipping threshold — confirm in utils.training
    classify=False,
    save_path='../models/distilBERT_Frozen_TextOnly_Cap99_2009-2013.pt',
)

Epoch 0, val loss: inf -> 0.00414581, train loss: 0.00399515
Epoch 1, val loss: 0.00414581 -> 0.00360033, train loss: 0.00367326
Epoch 2, val loss: 0.00382328, train loss: 0.00356392
Epoch 3, val loss: 0.00420140, train loss: 0.00359742
Epoch 4, val loss: 0.00371807, train loss: 0.00358709
Epoch 5, val loss: 0.00360033 -> 0.00359229, train loss: 0.00357028
Epoch 6, val loss: 0.00370034, train loss: 0.00354826
Epoch 7, val loss: 0.00362748, train loss: 0.00352941
Epoch 8, val loss: 0.00376224, train loss: 0.00354951
Epoch 9, val loss: 0.00363325, train loss: 0.00352178


#### Text-Meta

In [8]:
# Pretrained DistilBERT encoder with all of its parameters frozen
# (requires_grad turned off), so they receive no gradients.
encoder = AutoModel.from_pretrained('distilbert-base-uncased', return_dict=True)
encoder.requires_grad_(False)

# Text + metadata regressor: takes the frozen encoder and the number of
# non-text feature columns, then moves to the GPU.
model = UsefulScoreRegressorAllFeat(encoder, num_meta_feats=len(nonTextCols)).cuda()
optimizer = torch.optim.Adam(model.parameters())  # Adam with library-default settings
criterion = nn.MSELoss()

train_text_meta_model(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=10,
    clip=1.0,  # NOTE(review): presumably the gradient-clipping threshold — confirm in utils.training
    classify=False,
    save_path='../models/distilBERT_Frozen_TextMeta_Cap99_2009-2013.pt',
)

Epoch 0, val loss: inf -> 0.00493514, train loss: 0.00385133
Epoch 1, val loss: 0.00493514 -> 0.00356563, train loss: 0.00343594
Epoch 2, val loss: 0.00359935, train loss: 0.00340410
Epoch 3, val loss: 0.00356563 -> 0.00353534, train loss: 0.00336819
Epoch 4, val loss: 0.00353534 -> 0.00349383, train loss: 0.00332359
Epoch 5, val loss: 0.00356059, train loss: 0.00330114
Epoch 6, val loss: 0.00352374, train loss: 0.00326937
Epoch 7, val loss: 0.00349383 -> 0.00341862, train loss: 0.00325463
Epoch 8, val loss: 0.00347905, train loss: 0.00326640
Epoch 9, val loss: 0.00349266, train loss: 0.00323893


#### Neural Meta

In [7]:
# Metadata-only neural regressor, sized by the number of non-text columns and
# placed on the GPU.
model = UsefulScoreRegressorMetaOnly(num_meta_feats=len(nonTextCols)).cuda()
optimizer = torch.optim.Adam(model.parameters())  # Adam with library-default settings
criterion = nn.MSELoss()

train_meta_model(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=10,
    # NOTE(review): presumably a gradient-clipping threshold; at this size it
    # would effectively never trigger — confirm in utils.training.
    clip=10000.0,
    classify=False,
    save_path='../models/Neural_Meta_Cap99_2009-2013.pt',
)

Epoch 0, val loss: inf -> 0.00377398, train loss: 0.00432316
Epoch 1, val loss: 0.00377772, train loss: 0.00350335
Epoch 2, val loss: 0.00377398 -> 0.00372690, train loss: 0.00346241
Epoch 3, val loss: 0.00372690 -> 0.00371696, train loss: 0.00345475
Epoch 4, val loss: 0.00372163, train loss: 0.00342369
Epoch 5, val loss: 0.00371696 -> 0.00370639, train loss: 0.00341485
Epoch 6, val loss: 0.00371276, train loss: 0.00341115
Epoch 7, val loss: 0.00371667, train loss: 0.00338426
Epoch 8, val loss: 0.00370639 -> 0.00369019, train loss: 0.00338861
Epoch 9, val loss: 0.00369298, train loss: 0.00339327


#### Linear Meta

In [6]:
# Linear baseline over the metadata columns only, sized by the number of
# non-text columns and placed on the GPU.
model = UsefulScoreRegressorLinearBaseline(num_meta_feats=len(nonTextCols)).cuda()
optimizer = torch.optim.Adam(model.parameters())  # Adam with library-default settings
criterion = nn.MSELoss()

# Uses the same training routine as the neural metadata model.
train_meta_model(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=10,
    # NOTE(review): presumably a gradient-clipping threshold; at this size it
    # would effectively never trigger — confirm in utils.training.
    clip=10000.0,
    classify=False,
    save_path='../models/Linear_Meta_Cap99_2009-2013.pt',
)

Epoch 0, val loss: inf -> 0.00386727, train loss: 0.00474434
Epoch 1, val loss: 0.00386727 -> 0.00379575, train loss: 0.00347593
Epoch 2, val loss: 0.00380253, train loss: 0.00344485
Epoch 3, val loss: 0.00382117, train loss: 0.00344569
Epoch 4, val loss: 0.00379943, train loss: 0.00344794
Epoch 5, val loss: 0.00379575 -> 0.00377466, train loss: 0.00344950
Epoch 6, val loss: 0.00379991, train loss: 0.00345063
Epoch 7, val loss: 0.00378471, train loss: 0.00344849
Epoch 8, val loss: 0.00378576, train loss: 0.00344192
Epoch 9, val loss: 0.00379063, train loss: 0.00344910


# usefulCount capped at the 99th percentile, years 2013 to 2017, with the age feature

In [9]:
# Feature / target configuration for this section: the metadata columns used
# in the earlier sections plus 'ageScore' (12 columns in total).
nonTextCols = [
    'ADHD', 'Acne', 'Anxiety', 'Bipolar Disorde', 'Birth Control', 'Depression',
    'Insomnia', 'Obesity', 'Pain', 'Weight Loss', 'ratingNormalized', 'ageScore',
]
targetCol = 'usefulCountCappedNormalized'

# Training file restricted via year_range=[2013, 2017] and usefulCount_quantile=0.99.
train, val = load_data(
    '../data/drugsComTrain_raw.csv',
    usefulCount_quantile=0.99,
    year_range=[2013, 2017],
)

# One ReviewDataset per split, built with identical arguments.
trainset, valset = (
    ReviewDataset(split, 'distilbert-base-uncased', nonTextCols, targetCol)
    for split in (train, val)
)

# Only the training loader shuffles; validation keeps dataset order.
train_loader = DataLoader(trainset, shuffle=True, batch_size=8)
val_loader = DataLoader(valset, shuffle=False, batch_size=8)

#### Text

In [10]:
# Pretrained DistilBERT encoder, frozen: requires_grad is switched off on
# every encoder parameter, so none of them receive gradients.
encoder = AutoModel.from_pretrained('distilbert-base-uncased', return_dict=True)
encoder.requires_grad_(False)

# Text-only regressor: constructed from the encoder alone (no num_meta_feats),
# then placed on the GPU.
model = UsefulScoreRegressorTextOnly(encoder).cuda()
optimizer = torch.optim.Adam(model.parameters())  # Adam with library-default settings
criterion = nn.MSELoss()

train_text_model(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=10,
    clip=1.0,  # NOTE(review): presumably the gradient-clipping threshold — confirm in utils.training
    classify=False,
    save_path='../models/distilBERT_Frozen_TextOnly_Cap99_2013-2017_wAge.pt',
)

Epoch 0, val loss: inf -> 0.00320778, train loss: 0.00371177
Epoch 1, val loss: 0.00379869, train loss: 0.00342485
Epoch 2, val loss: 0.00334750, train loss: 0.00337896
Epoch 3, val loss: 0.00327002, train loss: 0.00334159
Epoch 4, val loss: 0.00326901, train loss: 0.00334327
Epoch 5, val loss: 0.00330397, train loss: 0.00332823
Epoch 6, val loss: 0.00320778 -> 0.00314411, train loss: 0.00332504
Epoch 7, val loss: 0.00314411 -> 0.00303444, train loss: 0.00329159
Epoch 8, val loss: 0.00324357, train loss: 0.00328624
Epoch 9, val loss: 0.00345990, train loss: 0.00329043


#### Text-Meta

In [11]:
# Pretrained DistilBERT encoder with all of its parameters frozen
# (requires_grad turned off), so they receive no gradients.
encoder = AutoModel.from_pretrained('distilbert-base-uncased', return_dict=True)
encoder.requires_grad_(False)

# Text + metadata regressor: takes the frozen encoder and the number of
# non-text feature columns, then moves to the GPU.
model = UsefulScoreRegressorAllFeat(encoder, num_meta_feats=len(nonTextCols)).cuda()
optimizer = torch.optim.Adam(model.parameters())  # Adam with library-default settings
criterion = nn.MSELoss()

train_text_meta_model(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=10,
    clip=1.0,  # NOTE(review): presumably the gradient-clipping threshold — confirm in utils.training
    classify=False,
    save_path='../models/distilBERT_Frozen_TextMeta_Cap99_2013-2017_wAge.pt',
)

Epoch 0, val loss: inf -> 0.00282732, train loss: 0.00295192
Epoch 1, val loss: 0.00282732 -> 0.00232343, train loss: 0.00243734
Epoch 2, val loss: 0.00244299, train loss: 0.00236318
Epoch 3, val loss: 0.00245175, train loss: 0.00232723
Epoch 4, val loss: 0.00232343 -> 0.00210326, train loss: 0.00231071
Epoch 5, val loss: 0.00222636, train loss: 0.00233250
Epoch 6, val loss: 0.00228119, train loss: 0.00227522
Epoch 7, val loss: 0.00230737, train loss: 0.00227874
Epoch 8, val loss: 0.00210518, train loss: 0.00226679
Epoch 9, val loss: 0.00223481, train loss: 0.00224247


### Baselines

#### Neural Meta

In [12]:
# Metadata-only neural regressor, sized by the number of non-text columns and
# placed on the GPU.
model = UsefulScoreRegressorMetaOnly(num_meta_feats=len(nonTextCols)).cuda()
optimizer = torch.optim.Adam(model.parameters())  # Adam with library-default settings
criterion = nn.MSELoss()

train_meta_model(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=10,
    # NOTE(review): presumably a gradient-clipping threshold; at this size it
    # would effectively never trigger — confirm in utils.training.
    clip=10000.0,
    classify=False,
    save_path='../models/Neural_Meta_Cap99_2013-2017_wAge.pt',
)

Epoch 0, val loss: inf -> 0.00238019, train loss: 0.00263480
Epoch 1, val loss: 0.00238019 -> 0.00233620, train loss: 0.00238714
Epoch 2, val loss: 0.00235048, train loss: 0.00235544
Epoch 3, val loss: 0.00233620 -> 0.00230630, train loss: 0.00231485
Epoch 4, val loss: 0.00230630 -> 0.00230355, train loss: 0.00230843
Epoch 5, val loss: 0.00232830, train loss: 0.00231691
Epoch 6, val loss: 0.00232797, train loss: 0.00230143
Epoch 7, val loss: 0.00230355 -> 0.00227928, train loss: 0.00228926
Epoch 8, val loss: 0.00227928 -> 0.00227811, train loss: 0.00230391
Epoch 9, val loss: 0.00230870, train loss: 0.00230553


#### Linear Meta

In [13]:
# Linear baseline over the metadata columns only, sized by the number of
# non-text columns and placed on the GPU.
model = UsefulScoreRegressorLinearBaseline(num_meta_feats=len(nonTextCols)).cuda()
optimizer = torch.optim.Adam(model.parameters())  # Adam with library-default settings
criterion = nn.MSELoss()

# Uses the same training routine as the neural metadata model.
train_meta_model(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_loader=train_loader,
    val_loader=val_loader,
    num_epochs=10,
    # NOTE(review): presumably a gradient-clipping threshold; at this size it
    # would effectively never trigger — confirm in utils.training.
    clip=10000.0,
    classify=False,
    save_path='../models/Linear_Meta_Cap99_2013-2017_wAge.pt',
)

Epoch 0, val loss: inf -> 0.00265827, train loss: 0.00380742
Epoch 1, val loss: 0.00266755, train loss: 0.00258430
Epoch 2, val loss: 0.00265827 -> 0.00265808, train loss: 0.00257895
Epoch 3, val loss: 0.00265808 -> 0.00265717, train loss: 0.00258296
Epoch 4, val loss: 0.00268114, train loss: 0.00258043
Epoch 5, val loss: 0.00267778, train loss: 0.00258328
Epoch 6, val loss: 0.00266439, train loss: 0.00258127
Epoch 7, val loss: 0.00266649, train loss: 0.00258429
Epoch 8, val loss: 0.00266351, train loss: 0.00258160
Epoch 9, val loss: 0.00265717 -> 0.00265645, train loss: 0.00258386


In [14]:
# Sanity check: number of metadata feature columns in this section
# (12 once 'ageScore' is included); this is the value passed as num_meta_feats.
len(nonTextCols)

12