In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import numpy as np
import pandas as pd
import transformers
from transformers import AutoModel
import sys
import math
import seaborn as sb
import matplotlib.pyplot as plt
import sklearn.metrics as perf
from scipy.stats import pearsonr
sys.path.insert(0,'..')

from utils.preprocessing import load_data
from utils.transformer_dataset import ReviewDataset
from utils.training import train_text_model, train_text_meta_model, train_meta_model
from utils.evaluation import get_cls_perf, get_reg_perf, get_predictions
from models.transformer_models import UsefulScoreRegressorTextOnly, UsefulScoreRegressorAllFeat, UsefulScoreRegressorMetaOnly
from models.transformer_models import UsefulScoreRegressorLinearBaseline, DrugLinearRegression

In [2]:
# Name of the label column handed to ReviewDataset as the prediction target.
targetCol = 'usefulCountClass'

# Non-text (metadata) feature columns handed to ReviewDataset; len(nonTextCols) is also
# used as num_meta_feats when the metadata-aware models are built.
nonTextCols = [
    # Presumably one indicator column per medical condition — confirm in utils.preprocessing.
    'ADHD',
    'Acne',
    'Anxiety',
    # NOTE(review): spelling looks truncated; presumably it mirrors the column name in the
    # data, so do not "correct" it without checking load_data.
    'Bipolar Disorde',
    'Birth Control',
    'Depression',
    'Insomnia',
    'Obesity',
    'Pain',
    'Weight Loss',
    # Presumably the review rating rescaled by load_data — confirm.
    'ratingNormalized',
]

# No usefulCount cap and no year filter

# usefulCount cap of 99th percentile and years 2009 to 2013 filter

#### Text

In [3]:
# Sweep over 2..5 target classes for the text-only classifier built on a frozen DistilBERT encoder.
# NOTE(review): the section heading mentions a 99th-percentile usefulCount cap while the call
# below passes usefulCount_range=[0, 10000] — confirm in load_data that the two agree.
for num_vals in range(2, 6):
    # Evenly spaced interior quantiles 1/k, 2/k, ..., (k-1)/k with k = num_vals.
    curr_quantiles = (1/num_vals) * np.arange(1, num_vals)
    print(f'num_vals: {num_vals}', f'curr_quantiles: {curr_quantiles}', sep='\n')

    train, val = load_data(
        '../data/drugsComTrain_raw.csv',
        year_range=[2009, 2013],
        usefulCount_range=[0, 10000],
        quantiles_for_class=curr_quantiles,
    )

    # The same pretrained checkpoint name is used for the datasets and for the encoder.
    encoder_name = 'distilbert-base-uncased'
    trainset = ReviewDataset(train, encoder_name, nonTextCols, targetCol)
    valset = ReviewDataset(val, encoder_name, nonTextCols, targetCol)
    train_loader = DataLoader(trainset, batch_size=8, shuffle=True)
    val_loader = DataLoader(valset, batch_size=8, shuffle=False)

    ##### Text-only Transformer Model
    encoder = AutoModel.from_pretrained(encoder_name, return_dict=True)

    # Keep the encoder weights fixed so training does not run out of CUDA memory.
    for param in encoder.parameters():
        param.requires_grad_(False)

    # One output per class: the "regressor" class is used here as a classifier.
    model = UsefulScoreRegressorTextOnly(encoder, outputs=num_vals).cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())

    # Single definition of the checkpoint location, shared by training and reloading.
    ckpt_path = f'../models/Classifiers/distilBERT_Frozen_TextOnly_Classify{num_vals}.pt'

    train_text_model(
        num_epochs=10,
        model=model,
        optimizer=optimizer,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        save_path=ckpt_path,
        clip=1.0,
        classify=True,
    )

    #### Reload the checkpoint written during training and score it on the validation set.
    # The loaded object is passed straight to get_cls_perf as the model, so the file holds a
    # whole pickled module rather than a state_dict.
    # NOTE(review): recent PyTorch releases default torch.load to weights_only=True, which would
    # presumably reject such a file — confirm against the installed version.
    model = torch.load(ckpt_path)
    f1, acc = get_cls_perf(model=model, loader=val_loader, model_type='TEXT')
    print(f'f1: {f1}, acc: {acc}\n')

num_vals: 2
curr_quantiles: [0.5]
Epoch 0, val loss: inf -> 0.07397957, train loss: 0.07862058
Epoch 1, val loss: 0.07397957 -> 0.07244341, train loss: 0.07569571
Epoch 2, val loss: 0.07464819, train loss: 0.07410152
Epoch 3, val loss: 0.07353341, train loss: 0.07354082
Epoch 4, val loss: 0.07244341 -> 0.07212694, train loss: 0.07356331
Epoch 5, val loss: 0.07239426, train loss: 0.07328345
Epoch 6, val loss: 0.07257499, train loss: 0.07250335
Epoch 7, val loss: 0.07244378, train loss: 0.07279101
Epoch 8, val loss: 0.07227632, train loss: 0.07209536
Epoch 9, val loss: 0.07458339, train loss: 0.07204516
f1: 0.7084690178343485, acc: 0.7093584177520502

num_vals: 3
curr_quantiles: [0.33333333 0.66666667]
Epoch 0, val loss: inf -> 0.12647324, train loss: 0.12629476
Epoch 1, val loss: 0.12647324 -> 0.12386728, train loss: 0.12218056
Epoch 2, val loss: 0.12386728 -> 0.11931118, train loss: 0.12074414
Epoch 3, val loss: 0.11931118 -> 0.11825148, train loss: 0.12030316
Epoch 4, val loss: 0.1206

#### Text-Meta

In [26]:
# Sweep over 2..5 target classes for the text + metadata classifier on a frozen DistilBERT encoder.
# NOTE(review): nonTextCols is reassigned further down the notebook (the age-feature section adds
# 'ageScore'), so num_meta_feats here depends on which definition was executed last.
for num_vals in range(2, 6):
    # Evenly spaced interior quantiles 1/k, 2/k, ..., (k-1)/k with k = num_vals.
    curr_quantiles = (1/num_vals) * np.arange(1, num_vals)
    print(f'num_vals: {num_vals}', f'curr_quantiles: {curr_quantiles}', sep='\n')

    train, val = load_data(
        '../data/drugsComTrain_raw.csv',
        year_range=[2009, 2013],
        usefulCount_range=[0, 10000],
        quantiles_for_class=curr_quantiles,
    )

    # The same pretrained checkpoint name is used for the datasets and for the encoder.
    encoder_name = 'distilbert-base-uncased'
    trainset = ReviewDataset(train, encoder_name, nonTextCols, targetCol)
    valset = ReviewDataset(val, encoder_name, nonTextCols, targetCol)
    train_loader = DataLoader(trainset, batch_size=8, shuffle=True)
    val_loader = DataLoader(valset, batch_size=8, shuffle=False)

    encoder = AutoModel.from_pretrained(encoder_name, return_dict=True)

    # Keep the encoder weights fixed so training does not run out of CUDA memory.
    for param in encoder.parameters():
        param.requires_grad_(False)

    # One output per class; the metadata width is taken from the current nonTextCols list.
    model = UsefulScoreRegressorAllFeat(
        encoder,
        num_meta_feats=len(nonTextCols),
        outputs=num_vals,
    ).cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())

    # Single definition of the checkpoint location, shared by training and reloading.
    ckpt_path = f'../models/Classifiers/distilBERT_Frozen_TextMeta_Classify{num_vals}.pt'

    train_text_meta_model(
        num_epochs=10,
        model=model,
        optimizer=optimizer,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        save_path=ckpt_path,
        clip=1.0,
        classify=True,
    )

    #### Reload the checkpoint written during training and score it on the validation set.
    # The loaded object is passed straight to get_cls_perf as the model, so the file holds a
    # whole pickled module rather than a state_dict.
    # NOTE(review): recent PyTorch releases default torch.load to weights_only=True, which would
    # presumably reject such a file — confirm against the installed version.
    model = torch.load(ckpt_path)
    f1, acc = get_cls_perf(model=model, loader=val_loader, model_type='TEXT-META')
    print(f'f1: {f1}, acc: {acc}\n')

num_vals: 2
curr_quantiles: [0.5]
Epoch 0, val loss: inf -> 0.07894914, train loss: 0.07412356
Epoch 1, val loss: 0.07894914 -> 0.07483272, train loss: 0.07099397
Epoch 2, val loss: 0.07483272 -> 0.07315473, train loss: 0.07054865
Epoch 3, val loss: 0.07315473 -> 0.07162509, train loss: 0.06930481
Epoch 4, val loss: 0.07162509 -> 0.07013874, train loss: 0.06984539
Epoch 5, val loss: 0.08012029, train loss: 0.06891072
Epoch 6, val loss: 0.07013874 -> 0.06934771, train loss: 0.06856231
Epoch 7, val loss: 0.07182907, train loss: 0.06897483
Epoch 8, val loss: 0.07228963, train loss: 0.06800907
Epoch 9, val loss: 0.07055551, train loss: 0.06779471
f1: 0.7241471284132024, acc: 0.7257597684515196

num_vals: 3
curr_quantiles: [0.33333333 0.66666667]
Epoch 0, val loss: inf -> 0.11995788, train loss: 0.12187148
Epoch 1, val loss: 0.11995788 -> 0.11339937, train loss: 0.11671966
Epoch 2, val loss: 0.11775153, train loss: 0.11595232
Epoch 3, val loss: 0.11472886, train loss: 0.11498868
Epoch 4, va

#### Neural Meta

In [25]:
# Sweep over 2..5 target classes for the metadata-only neural-network classifier baseline.
# NOTE(review): nonTextCols is reassigned further down the notebook (the age-feature section adds
# 'ageScore'), so num_meta_feats here depends on which definition was executed last.
for num_vals in range(2, 6):
    # Evenly spaced interior quantiles 1/k, 2/k, ..., (k-1)/k with k = num_vals.
    curr_quantiles = (1/num_vals) * np.arange(1, num_vals)
    print(f'num_vals: {num_vals}', f'curr_quantiles: {curr_quantiles}', sep='\n')

    train, val = load_data(
        '../data/drugsComTrain_raw.csv',
        year_range=[2009, 2013],
        usefulCount_range=[0, 10000],
        quantiles_for_class=curr_quantiles,
    )

    # NOTE(review): 'roberta-base' is handed to ReviewDataset even though the model below is
    # constructed without any encoder — presumably only a placeholder; confirm it is harmless.
    tokenizer_name = 'roberta-base'
    trainset = ReviewDataset(train, tokenizer_name, nonTextCols, targetCol)
    valset = ReviewDataset(val, tokenizer_name, nonTextCols, targetCol)
    train_loader = DataLoader(trainset, batch_size=8, shuffle=True)
    val_loader = DataLoader(valset, batch_size=8, shuffle=False)

    # One output per class; the input width is taken from the current nonTextCols list.
    model = UsefulScoreRegressorMetaOnly(num_meta_feats=len(nonTextCols), outputs=num_vals).cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())

    # Single definition of the checkpoint location, shared by training and reloading.
    ckpt_path = f'../models/Classifiers/MetaOnly_NNClassifierBaseline{num_vals}.pt'

    # NOTE(review): clip is 10000.0 here versus 1.0 for the text models in this notebook;
    # presumably a clipping threshold that is effectively switched off — confirm in train_meta_model.
    train_meta_model(
        num_epochs=10,
        model=model,
        optimizer=optimizer,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        save_path=ckpt_path,
        clip=10000.0,
        classify=True,
    )

    #### Reload the checkpoint written during training and score it on the validation set.
    # The loaded object is passed straight to get_cls_perf as the model, so the file holds a
    # whole pickled module rather than a state_dict.
    # NOTE(review): recent PyTorch releases default torch.load to weights_only=True, which would
    # presumably reject such a file — confirm against the installed version.
    model = torch.load(ckpt_path)
    f1, acc = get_cls_perf(model=model, loader=val_loader, model_type='META')
    print(f'f1: {f1}, acc: {acc}\n')

num_vals: 2
curr_quantiles: [0.5]
Epoch 0, val loss: inf -> 0.07052004, train loss: 0.07207060
Epoch 1, val loss: 0.07052004 -> 0.07033605, train loss: 0.06990190
Epoch 2, val loss: 0.07069781, train loss: 0.06976134
Epoch 3, val loss: 0.07033605 -> 0.07023999, train loss: 0.06962311
Epoch 4, val loss: 0.07023999 -> 0.07021992, train loss: 0.06960955
Epoch 5, val loss: 0.07022868, train loss: 0.06945290
Epoch 6, val loss: 0.07021992 -> 0.07010770, train loss: 0.06937224
Epoch 7, val loss: 0.07018366, train loss: 0.06951573
Epoch 8, val loss: 0.07010770 -> 0.07000520, train loss: 0.06945575
Epoch 9, val loss: 0.07015875, train loss: 0.06935509
f1: 0.7180882641386074, acc: 0.7240713941148095

num_vals: 3
curr_quantiles: [0.33333333 0.66666667]
Epoch 0, val loss: inf -> 0.11621064, train loss: 0.12217752
Epoch 1, val loss: 0.11621064 -> 0.11573562, train loss: 0.11665317
Epoch 2, val loss: 0.11605093, train loss: 0.11618662
Epoch 3, val loss: 0.11573562 -> 0.11529325, train loss: 0.116122

#### Linear Meta

In [23]:
# Sweep over 2..5 target classes for the metadata-only linear classifier baseline.
# NOTE(review): nonTextCols is reassigned further down the notebook (the age-feature section adds
# 'ageScore'), so num_meta_feats here depends on which definition was executed last.
for num_vals in range(2, 6):
    # Evenly spaced interior quantiles 1/k, 2/k, ..., (k-1)/k with k = num_vals.
    curr_quantiles = (1/num_vals) * np.arange(1, num_vals)
    print(f'num_vals: {num_vals}', f'curr_quantiles: {curr_quantiles}', sep='\n')

    train, val = load_data(
        '../data/drugsComTrain_raw.csv',
        year_range=[2009, 2013],
        usefulCount_range=[0, 10000],
        quantiles_for_class=curr_quantiles,
    )

    # NOTE(review): 'roberta-base' is handed to ReviewDataset even though the model below is
    # constructed without any encoder — presumably only a placeholder; confirm it is harmless.
    tokenizer_name = 'roberta-base'
    trainset = ReviewDataset(train, tokenizer_name, nonTextCols, targetCol)
    valset = ReviewDataset(val, tokenizer_name, nonTextCols, targetCol)
    train_loader = DataLoader(trainset, batch_size=8, shuffle=True)
    val_loader = DataLoader(valset, batch_size=8, shuffle=False)

    # One output per class; the input width is taken from the current nonTextCols list.
    model = UsefulScoreRegressorLinearBaseline(num_meta_feats=len(nonTextCols), outputs=num_vals).cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())

    # Single definition of the checkpoint location, shared by training and reloading.
    ckpt_path = f'../models/Classifiers/MetaOnly_LinearClassifierBaseline{num_vals}.pt'

    # NOTE(review): clip is 10000.0 here versus 1.0 for the text models in this notebook;
    # presumably a clipping threshold that is effectively switched off — confirm in train_meta_model.
    train_meta_model(
        num_epochs=10,
        model=model,
        optimizer=optimizer,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        save_path=ckpt_path,
        clip=10000.0,
        classify=True,
    )

    #### Reload the checkpoint written during training and score it on the validation set.
    # The loaded object is passed straight to get_cls_perf as the model, so the file holds a
    # whole pickled module rather than a state_dict.
    # NOTE(review): recent PyTorch releases default torch.load to weights_only=True, which would
    # presumably reject such a file — confirm against the installed version.
    model = torch.load(ckpt_path)
    f1, acc = get_cls_perf(model=model, loader=val_loader, model_type='META')
    print(f'f1: {f1}, acc: {acc}\n')

num_vals: 2
curr_quantiles: [0.5]
Epoch 0, val loss: inf -> 0.07131396, train loss: 0.07458621
Epoch 1, val loss: 0.07131396 -> 0.07051269, train loss: 0.06978545
Epoch 2, val loss: 0.07051269 -> 0.07040187, train loss: 0.06936264
Epoch 3, val loss: 0.07040187 -> 0.07034733, train loss: 0.06928108
Epoch 4, val loss: 0.07034733 -> 0.07028681, train loss: 0.06922653
Epoch 5, val loss: 0.07028681 -> 0.07028193, train loss: 0.06918832
Epoch 6, val loss: 0.07028791, train loss: 0.06915965
Epoch 7, val loss: 0.07031276, train loss: 0.06917242
Epoch 8, val loss: 0.07028193 -> 0.07026925, train loss: 0.06916486
Epoch 9, val loss: 0.07029370, train loss: 0.06915908
f1: 0.7189442112730411, acc: 0.7243125904486252

num_vals: 3
curr_quantiles: [0.33333333 0.66666667]
Epoch 0, val loss: inf -> 0.11900895, train loss: 0.12557110
Epoch 1, val loss: 0.11900895 -> 0.11664075, train loss: 0.11730793
Epoch 2, val loss: 0.11664075 -> 0.11610938, train loss: 0.11605261
Epoch 3, val loss: 0.11610938 -> 0.11

# usefulCount cap of 99th percentile and years 2013 to 2017 with Age Feature

In [5]:
# Name of the label column handed to ReviewDataset as the prediction target.
targetCol = 'usefulCountClass'

# Non-text (metadata) feature columns for the age-feature experiments. This reassigns the
# notebook-level nonTextCols, so every cell executed afterwards sees the extra 'ageScore' entry.
nonTextCols = [
    # Presumably one indicator column per medical condition — confirm in utils.preprocessing.
    'ADHD',
    'Acne',
    'Anxiety',
    # NOTE(review): spelling looks truncated; presumably it mirrors the column name in the
    # data, so do not "correct" it without checking load_data.
    'Bipolar Disorde',
    'Birth Control',
    'Depression',
    'Insomnia',
    'Obesity',
    'Pain',
    'Weight Loss',
    # Presumably the review rating rescaled by load_data — confirm.
    'ratingNormalized',
    # Additional age feature named in the section heading; its exact definition is not visible here.
    'ageScore',
]

#### Text-Meta

In [6]:
# Sweep over 2..5 target classes for the text + metadata classifier (frozen DistilBERT) on the
# 2013-2017 reviews, with nonTextCols expected to include the 'ageScore' column.
# NOTE(review): the section heading mentions a 99th-percentile usefulCount cap while the call
# below passes usefulCount_range=[0, 10000] — confirm in load_data that the two agree.
for num_vals in range(2, 6):
    # Evenly spaced interior quantiles 1/k, 2/k, ..., (k-1)/k with k = num_vals.
    curr_quantiles = (1/num_vals) * np.arange(1, num_vals)
    print(f'num_vals: {num_vals}', f'curr_quantiles: {curr_quantiles}', sep='\n')

    train, val = load_data(
        '../data/drugsComTrain_raw.csv',
        year_range=[2013, 2017],
        usefulCount_range=[0, 10000],
        quantiles_for_class=curr_quantiles,
    )

    # The same pretrained checkpoint name is used for the datasets and for the encoder.
    encoder_name = 'distilbert-base-uncased'
    trainset = ReviewDataset(train, encoder_name, nonTextCols, targetCol)
    valset = ReviewDataset(val, encoder_name, nonTextCols, targetCol)
    train_loader = DataLoader(trainset, batch_size=8, shuffle=True)
    val_loader = DataLoader(valset, batch_size=8, shuffle=False)

    encoder = AutoModel.from_pretrained(encoder_name, return_dict=True)

    # Keep the encoder weights fixed so training does not run out of CUDA memory.
    for param in encoder.parameters():
        param.requires_grad_(False)

    # One output per class; the metadata width is taken from the current nonTextCols list.
    model = UsefulScoreRegressorAllFeat(
        encoder,
        num_meta_feats=len(nonTextCols),
        outputs=num_vals,
    ).cuda()
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters())

    # Single definition of the checkpoint location, shared by training and reloading.
    ckpt_path = f'../models/Classifiers/distilBERT_Frozen_TextMeta_Classify{num_vals}_2013-2017_wAge.pt'

    train_text_meta_model(
        num_epochs=10,
        model=model,
        optimizer=optimizer,
        train_loader=train_loader,
        val_loader=val_loader,
        criterion=criterion,
        save_path=ckpt_path,
        clip=1.0,
        classify=True,
    )

    #### Reload the checkpoint written during training and score it on the validation set.
    # The loaded object is passed straight to get_cls_perf as the model, so the file holds a
    # whole pickled module rather than a state_dict.
    # NOTE(review): recent PyTorch releases default torch.load to weights_only=True, which would
    # presumably reject such a file — confirm against the installed version.
    model = torch.load(ckpt_path)
    f1, acc = get_cls_perf(model=model, loader=val_loader, model_type='TEXT-META')
    print(f'f1: {f1}, acc: {acc}\n')

num_vals: 2
curr_quantiles: [0.5]
Epoch 0, val loss: inf -> 0.05181526, train loss: 0.06054873
Epoch 1, val loss: 0.05279917, train loss: 0.05560468
Epoch 2, val loss: 0.05197458, train loss: 0.05457137
Epoch 3, val loss: 0.05181526 -> 0.05133883, train loss: 0.05420045
Epoch 4, val loss: 0.05414853, train loss: 0.05338410
Epoch 5, val loss: 0.05154473, train loss: 0.05308257
Epoch 6, val loss: 0.05201705, train loss: 0.05276312
Epoch 7, val loss: 0.05191622, train loss: 0.05284573
Epoch 8, val loss: 0.05620904, train loss: 0.05252063
Epoch 9, val loss: 0.05133883 -> 0.05133720, train loss: 0.05197617
f1: 0.8136900438206434, acc: 0.8136928539724811

num_vals: 3
curr_quantiles: [0.33333333 0.66666667]
Epoch 0, val loss: inf -> 0.09994811, train loss: 0.10484060
Epoch 1, val loss: 0.09994811 -> 0.09400103, train loss: 0.09878587
Epoch 2, val loss: 0.09400103 -> 0.09265329, train loss: 0.09779492
Epoch 3, val loss: 0.09265329 -> 0.09206702, train loss: 0.09685928
Epoch 4, val loss: 0.0932