In [1]:
import math
import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sb
import sklearn.metrics as perf
import torch
import transformers
from scipy.stats import pearsonr
from torch import nn
from torch.utils.data import DataLoader
from transformers import AutoModel

# Make the parent directory importable before pulling in the local packages.
sys.path.insert(0,'..')

from utils.preprocessing import load_data, get_buckets
from utils.transformer_dataset import ReviewDataset
from utils.training import train_text_model, train_text_meta_model, train_meta_model
from utils.evaluation import get_cls_perf, get_reg_perf, get_predictions, get_ordinal_cls_perf
from utils.evaluation import assign_bucket
from models.transformer_models import UsefulScoreRegressorTextOnly, UsefulScoreRegressorAllFeat, UsefulScoreRegressorMetaOnly
from models.transformer_models import UsefulScoreRegressorLinearBaseline, DrugLinearRegression

In [2]:
# Non-text columns handed to ReviewDataset in the cells below.
# NOTE(review): 'Bipolar Disorde' looks truncated but presumably matches the
# column name produced by preprocessing; confirm before "fixing" the spelling.
nonTextCols = [
    'ADHD',
    'Acne',
    'Anxiety',
    'Bipolar Disorde',
    'Birth Control',
    'Depression',
    'Insomnia',
    'Obesity',
    'Pain',
    'Weight Loss',
    'ratingNormalized',
]
# Name of the column used as the target by ReviewDataset.
targetCol = 'usefulCountCappedNormalized'

In [11]:
#### Check Model Caps
# For each training year window, print the largest value found in the
# 'usefulCountCapped' column of the first frame returned by load_data
# (loaded with usefulCount_quantile=0.99).
for year_range in ([2009, 2013], [2013, 2017]):
    train, val = load_data(
        '../data/drugsComTrain_raw.csv',
        year_range=year_range,
        usefulCount_quantile=0.99,
    )
    print(max(train['usefulCountCapped']))

192
162


# No usefulCount cap and no year filter

In [7]:
# Load the raw test file. load_data returns two frames; both hold test rows
# here, so they are stacked back into one frame and the whole file is evaluated.
train, val = load_data(
    '../data/drugsComTest_raw.csv',
    year_range=[2008, 2017],
    usefulCount_range=[0, 10000],
    usefulCount_quantile=None,
)
test = pd.concat([train, val], axis=0)

# Unshuffled loader over the combined test frame.
testset = ReviewDataset(test, 'distilbert-base-uncased', nonTextCols, targetCol)
test_loader = DataLoader(testset, batch_size=8, shuffle=False)

## BERT

#### Text

In [5]:
# Score the text-only DistilBERT checkpoint on the current test_loader.
# torch.load unpickles the full model object, so only load trusted files.
model = torch.load('../models/distilBERT_Frozen_TextOnly.pt').cuda()
mae, rmse, r2 = get_reg_perf(model=model, loader=test_loader, model_type='text')
print(f'mae: {mae}', f'rmse: {rmse}', f'r2: {r2}', sep='\n')

mae: 0.02106926217675209
rmse: 0.03769624978303909
r2: 0.16104679665887278


#### Text-Meta

In [None]:
# Score the text+metadata DistilBERT checkpoint on the current test_loader.
# torch.load unpickles the full model object, so only load trusted files.
model = torch.load('../models/distilBERT_Frozen_TextMeta.pt').cuda()
mae, rmse, r2 = get_reg_perf(model=model, loader=test_loader, model_type='text-meta')
print(f'mae: {mae}', f'rmse: {rmse}', f'r2: {r2}', sep='\n')

### Baseline

#### Linear

In [8]:
# This checkpoint is restored through load_state_dict, so the module has to be
# constructed first and the saved weights loaded into it.
model = DrugLinearRegression()
state_dict = torch.load('../models/Linear_Meta.pt')
model.load_state_dict(state_dict)
model = model.cuda()

# Regression metrics for the metadata-only linear baseline.
mae, rmse, r2 = get_reg_perf(model=model, loader=test_loader, model_type='meta')
print(f'mae: {mae}', f'rmse: {rmse}', f'r2: {r2}', sep='\n')

mae: 0.019120143726468086
rmse: 0.037572771310806274
r2: 0.16653393699683916


#### Neural

In [10]:
# Score the metadata-only neural baseline checkpoint on the current test_loader.
model = torch.load('../models/Neural_Meta.pt').cuda()
mae, rmse, r2 = get_reg_perf(model=model, loader=test_loader, model_type='meta')
print(f'mae: {mae}', f'rmse: {rmse}', f'r2: {r2}', sep='\n')

mae: 0.018023859709501266
rmse: 0.03678973391652107
r2: 0.20091181648239287


# usefulCount cap of 99th percentile and years 2009 to 2013 filter

In [3]:
# Reload the 2009-2013 training data only to recover the usefulCount cap.
train, val = load_data(
    '../data/drugsComTrain_raw.csv',
    year_range=[2009, 2013],
    usefulCount_quantile=0.99,
)
max_usefulCount = max(train['usefulCountCapped'])  # get the cap used in training

# Load the test file for the same years, bounded by that training cap, and
# stack the two returned frames back into one test set.
train, val = load_data(
    '../data/drugsComTest_raw.csv',
    year_range=[2009, 2013],
    usefulCount_range=[0, max_usefulCount],
)
test = pd.concat([train, val], axis=0)
testset = ReviewDataset(test, 'distilbert-base-uncased', nonTextCols, targetCol)
test_loader = DataLoader(testset, batch_size=8, shuffle=False)

#### Text

In [7]:
# Score the text-only DistilBERT checkpoint (Cap99, 2009-2013) on test_loader.
model = torch.load('../models/distilBERT_Frozen_TextOnly_Cap99_2009-2013.pt').cuda()
mae, rmse, r2 = get_reg_perf(model=model, loader=test_loader, model_type='text')
print(f'mae: {mae}', f'rmse: {rmse}', f'r2: {r2}', sep='\n')

mae: 0.11337287724018097
rmse: 0.16882476210594177
r2: 0.22647702923541813


#### Text-Meta

In [4]:
# Score the text+metadata DistilBERT checkpoint (Cap99, 2009-2013) on test_loader.
model = torch.load('../models/distilBERT_Frozen_TextMeta_Cap99_2009-2013.pt').cuda()
mae, rmse, r2 = get_reg_perf(model=model, loader=test_loader, model_type='text-meta')
print(f'mae: {mae}', f'rmse: {rmse}', f'r2: {r2}', sep='\n')

mae: 0.10368625074625015
rmse: 0.16133826971054077
r2: 0.29355932056986955


### Baseline

#### Linear

In [6]:
# Score the metadata-only linear baseline (Cap99, 2009-2013) on test_loader.
# Unlike the uncapped linear baseline, this checkpoint is loaded as a whole
# pickled model rather than through load_state_dict.
model = torch.load('../models/Linear_Meta_Cap99_2009-2013.pt').cuda()
mae, rmse, r2 = get_reg_perf(model=model, loader=test_loader, model_type='meta')
print(f'mae: {mae}', f'rmse: {rmse}', f'r2: {r2}', sep='\n')

mae: 0.11296074092388153
rmse: 0.17021065950393677
r2: 0.21372500760932356


#### Neural

In [8]:
# Score the metadata-only neural baseline (Cap99, 2009-2013) on test_loader.
model = torch.load('../models/Neural_Meta_Cap99_2009-2013.pt').cuda()
mae, rmse, r2 = get_reg_perf(model=model, loader=test_loader, model_type='meta')
print(f'mae: {mae}', f'rmse: {rmse}', f'r2: {r2}', sep='\n')

mae: 0.10836990177631378
rmse: 0.16831393539905548
r2: 0.2311508821727014


# usefulCount cap of 99th percentile and years 2013 to 2017 filter

In [2]:
# Non-text columns for the 2013-2017 runs: same list as before plus 'ageScore'.
# NOTE(review): 'Bipolar Disorde' looks truncated but presumably matches the
# column name produced by preprocessing; confirm before "fixing" the spelling.
nonTextCols = [
    'ADHD',
    'Acne',
    'Anxiety',
    'Bipolar Disorde',
    'Birth Control',
    'Depression',
    'Insomnia',
    'Obesity',
    'Pain',
    'Weight Loss',
    'ratingNormalized',
    'ageScore',
]
# Name of the column used as the target by ReviewDataset.
targetCol = 'usefulCountCappedNormalized'

In [6]:
# Reload the 2013-2017 training data only to recover the usefulCount cap.
train, val = load_data(
    '../data/drugsComTrain_raw.csv',
    year_range=[2013, 2017],
    usefulCount_quantile=0.99,
)
max_usefulCount = max(train['usefulCountCapped'])  # get the cap used in training

# Load the test file for the same years, bounded by that training cap, and
# stack the two returned frames back into one test set.
train, val = load_data(
    '../data/drugsComTest_raw.csv',
    year_range=[2013, 2017],
    usefulCount_range=[0, max_usefulCount],
)
test = pd.concat([train, val], axis=0)
testset = ReviewDataset(test, 'distilbert-base-uncased', nonTextCols, targetCol)
test_loader = DataLoader(testset, batch_size=8, shuffle=False)

#### Text

In [12]:
# Score the text-only DistilBERT checkpoint (Cap99, 2013-2017, wAge) on test_loader.
model = torch.load('../models/distilBERT_Frozen_TextOnly_Cap99_2013-2017_wAge.pt').cuda()
mae, rmse, r2 = get_reg_perf(model=model, loader=test_loader, model_type='text')
print(f'mae: {mae}', f'rmse: {rmse}', f'r2: {r2}', sep='\n')

mae: 0.12026678025722504
rmse: 0.17249713838100433
r2: 0.19245881819412813


#### Text-Meta

In [7]:
# Score the text+metadata DistilBERT checkpoint (Cap99, 2013-2017, wAge) on test_loader.
model = torch.load('../models/distilBERT_Frozen_TextMeta_Cap99_2013-2017_wAge.pt').cuda()
mae, rmse, r2 = get_reg_perf(model=model, loader=test_loader, model_type='text-meta')
print(f'mae: {mae}', f'rmse: {rmse}', f'r2: {r2}', sep='\n')

mae: 0.07737904787063599
rmse: 0.12650640308856964
r2: 0.5699776035372182


### Baseline

#### Linear

In [8]:
# Score the metadata-only linear baseline (Cap99, 2013-2017, wAge) on test_loader.
model = torch.load('../models/Linear_Meta_Cap99_2013-2017_wAge.pt').cuda()
mae, rmse, r2 = get_reg_perf(model=model, loader=test_loader, model_type='meta')
print(f'mae: {mae}', f'rmse: {rmse}', f'r2: {r2}', sep='\n')

mae: 0.09539668262004852
rmse: 0.14320163428783417
r2: 0.4489868519066883


#### Neural

In [9]:
# Score the metadata-only neural baseline (Cap99, 2013-2017, wAge) on test_loader.
model = torch.load('../models/Neural_Meta_Cap99_2013-2017_wAge.pt').cuda()
mae, rmse, r2 = get_reg_perf(model=model, loader=test_loader, model_type='meta')
print(f'mae: {mae}', f'rmse: {rmse}', f'r2: {r2}', sep='\n')

mae: 0.08046085387468338
rmse: 0.1320832520723343
r2: 0.5312281168152972


# Classification

## usefulCount cap of 99th percentile and years 2009 to 2013 filter

In [12]:
# Non-text columns for the 2009-2013 classification runs (no 'ageScore').
# NOTE(review): 'Bipolar Disorde' looks truncated but presumably matches the
# column name produced by preprocessing; confirm before "fixing" the spelling.
nonTextCols = [
    'ADHD',
    'Acne',
    'Anxiety',
    'Bipolar Disorde',
    'Birth Control',
    'Depression',
    'Insomnia',
    'Obesity',
    'Pain',
    'Weight Loss',
    'ratingNormalized',
]
# Classification runs use the class-label column as the target.
targetCol = 'usefulCountClass'

#### Text

In [None]:
# Evaluate the text-only classifiers, one saved checkpoint per class count (2-5).
# NOTE(review): quantiles_for_class is passed to load_data on the *test* file;
# if load_data derives class boundaries from the frame it loads, the test
# labels use test-set quantiles. Confirm this matches how training labels were built.
for num_vals in range(2, 6):
    # Interior cut points 1/n, 2/n, ..., (n-1)/n.
    curr_quantiles = (1/num_vals) * np.arange(1, num_vals)
    print(f'num_vals: {num_vals}', f'curr_quantiles: {curr_quantiles}', sep='\n')

    # Both frames returned by load_data hold test rows; merge them into one test set.
    train, val = load_data(
        '../data/drugsComTest_raw.csv',
        year_range=[2009, 2013],
        usefulCount_range=[0, 10000],
        quantiles_for_class=curr_quantiles,
    )
    test = pd.concat([train, val], axis=0)
    testset = ReviewDataset(test, 'distilbert-base-uncased', nonTextCols, targetCol)
    test_loader = DataLoader(testset, batch_size=8, shuffle=False)

    # The checkpoint file name is keyed by the number of classes.
    model = torch.load(f'../models/Classifiers/distilBERT_Frozen_TextOnly_Classify{num_vals}.pt').cuda()
    f1, acc = get_cls_perf(model=model, loader=test_loader, model_type='text')
    print(f'f1: {f1}', f'acc: {acc}', sep='\n')

    print()

#### Text-Meta

In [13]:
# Evaluate the text+metadata classifiers, one saved checkpoint per class count (2-5).
# NOTE(review): quantiles_for_class is passed to load_data on the *test* file;
# if load_data derives class boundaries from the frame it loads, the test
# labels use test-set quantiles. Confirm this matches how training labels were built.
for num_vals in range(2, 6):
    # Interior cut points 1/n, 2/n, ..., (n-1)/n.
    curr_quantiles = (1/num_vals) * np.arange(1, num_vals)
    print(f'num_vals: {num_vals}', f'curr_quantiles: {curr_quantiles}', sep='\n')

    # Both frames returned by load_data hold test rows; merge them into one test set.
    train, val = load_data(
        '../data/drugsComTest_raw.csv',
        year_range=[2009, 2013],
        usefulCount_range=[0, 10000],
        quantiles_for_class=curr_quantiles,
    )
    test = pd.concat([train, val], axis=0)
    testset = ReviewDataset(test, 'distilbert-base-uncased', nonTextCols, targetCol)
    test_loader = DataLoader(testset, batch_size=8, shuffle=False)

    # The checkpoint file name is keyed by the number of classes.
    model = torch.load(f'../models/Classifiers/distilBERT_Frozen_TextMeta_Classify{num_vals}.pt').cuda()
    f1, acc = get_cls_perf(model=model, loader=test_loader, model_type='text-meta')
    print(f'f1: {f1}', f'acc: {acc}', sep='\n')

    print()

num_vals: 2
curr_quantiles: [0.5]
f1: 0.7442669431370085
acc: 0.7456925337251236

num_vals: 3
curr_quantiles: [0.33333333 0.66666667]
f1: 0.5891198938700811
acc: 0.5948978228930145

num_vals: 4
curr_quantiles: [0.25 0.5  0.75]
f1: 0.4755736470840837
acc: 0.4881795111526646

num_vals: 5
curr_quantiles: [0.2 0.4 0.6 0.8]
f1: 0.3486130543906599
acc: 0.40857486309603314



### Baselines

#### Linear

In [14]:
# Evaluate the metadata-only linear classifier baselines for 2 to 5 classes.
# NOTE(review): quantiles_for_class is passed to load_data on the *test* file;
# if load_data derives class boundaries from the frame it loads, the test
# labels use test-set quantiles. Confirm this matches how training labels were built.
for num_vals in range(2, 6):
    # Interior cut points 1/n, 2/n, ..., (n-1)/n.
    curr_quantiles = (1/num_vals) * np.arange(1, num_vals)
    print(f'num_vals: {num_vals}', f'curr_quantiles: {curr_quantiles}', sep='\n')

    # Both frames returned by load_data hold test rows; merge them into one test set.
    train, val = load_data(
        '../data/drugsComTest_raw.csv',
        year_range=[2009, 2013],
        usefulCount_range=[0, 10000],
        quantiles_for_class=curr_quantiles,
    )
    test = pd.concat([train, val], axis=0)
    testset = ReviewDataset(test, 'distilbert-base-uncased', nonTextCols, targetCol)
    test_loader = DataLoader(testset, batch_size=8, shuffle=False)

    # The checkpoint file name is keyed by the number of classes.
    model = torch.load(f'../models/Classifiers/MetaOnly_LinearClassifierBaseline{num_vals}.pt').cuda()
    f1, acc = get_cls_perf(model=model, loader=test_loader, model_type='meta')
    print(f'f1: {f1}', f'acc: {acc}', sep='\n')

    print()

num_vals: 2
curr_quantiles: [0.5]
f1: 0.7323244946351537
acc: 0.7367436890610392

num_vals: 3
curr_quantiles: [0.33333333 0.66666667]
f1: 0.5657927783152747
acc: 0.572859623347135

num_vals: 4
curr_quantiles: [0.25 0.5  0.75]
f1: 0.44674062371685364
acc: 0.4709496460531588

num_vals: 5
curr_quantiles: [0.2 0.4 0.6 0.8]
f1: 0.33760113177370316
acc: 0.40056097235207694



#### Neural

In [15]:
# Evaluate the metadata-only neural classifier baselines for 2 to 5 classes.
# NOTE(review): quantiles_for_class is passed to load_data on the *test* file;
# if load_data derives class boundaries from the frame it loads, the test
# labels use test-set quantiles. Confirm this matches how training labels were built.
for num_vals in range(2, 6):
    # Interior cut points 1/n, 2/n, ..., (n-1)/n.
    curr_quantiles = (1/num_vals) * np.arange(1, num_vals)
    print(f'num_vals: {num_vals}', f'curr_quantiles: {curr_quantiles}', sep='\n')

    # Both frames returned by load_data hold test rows; merge them into one test set.
    train, val = load_data(
        '../data/drugsComTest_raw.csv',
        year_range=[2009, 2013],
        usefulCount_range=[0, 10000],
        quantiles_for_class=curr_quantiles,
    )
    test = pd.concat([train, val], axis=0)
    testset = ReviewDataset(test, 'distilbert-base-uncased', nonTextCols, targetCol)
    test_loader = DataLoader(testset, batch_size=8, shuffle=False)

    # The checkpoint file name is keyed by the number of classes.
    model = torch.load(f'../models/Classifiers/MetaOnly_NNClassifierBaseline{num_vals}.pt').cuda()
    f1, acc = get_cls_perf(model=model, loader=test_loader, model_type='meta')
    print(f'f1: {f1}', f'acc: {acc}', sep='\n')

    print()

num_vals: 2
curr_quantiles: [0.5]
f1: 0.7346525537214236
acc: 0.7395485508214238

num_vals: 3
curr_quantiles: [0.33333333 0.66666667]
f1: 0.571030924181377
acc: 0.5741952718044611

num_vals: 4
curr_quantiles: [0.25 0.5  0.75]
f1: 0.4443782479160957
acc: 0.4693468679043676

num_vals: 5
curr_quantiles: [0.2 0.4 0.6 0.8]
f1: 0.3346053023473036
acc: 0.40122879658073995



### 2013 to 2017 with age feature, Text-Meta

In [24]:
# Non-text columns for the 2013-2017 classification runs, including 'ageScore'.
# NOTE(review): 'Bipolar Disorde' looks truncated but presumably matches the
# column name produced by preprocessing; confirm before "fixing" the spelling.
nonTextCols = [
    'ADHD',
    'Acne',
    'Anxiety',
    'Bipolar Disorde',
    'Birth Control',
    'Depression',
    'Insomnia',
    'Obesity',
    'Pain',
    'Weight Loss',
    'ratingNormalized',
    'ageScore',
]
# Classification runs use the class-label column as the target.
targetCol = 'usefulCountClass'

In [25]:
# Evaluate the 2013-2017 text+metadata classifiers (wAge checkpoints) for 2 to 5 classes.
# NOTE(review): quantiles_for_class is passed to load_data on the *test* file;
# if load_data derives class boundaries from the frame it loads, the test
# labels use test-set quantiles. Confirm this matches how training labels were built.
for num_vals in range(2, 6):
    # Interior cut points 1/n, 2/n, ..., (n-1)/n.
    curr_quantiles = (1/num_vals) * np.arange(1, num_vals)
    print(f'num_vals: {num_vals}', f'curr_quantiles: {curr_quantiles}', sep='\n')

    # Both frames returned by load_data hold test rows; merge them into one test set.
    train, val = load_data(
        '../data/drugsComTest_raw.csv',
        year_range=[2013, 2017],
        usefulCount_range=[0, 10000],
        quantiles_for_class=curr_quantiles,
    )
    test = pd.concat([train, val], axis=0)
    testset = ReviewDataset(test, 'distilbert-base-uncased', nonTextCols, targetCol)
    test_loader = DataLoader(testset, batch_size=8, shuffle=False)

    # The checkpoint file name is keyed by the number of classes.
    model = torch.load(f'../models/Classifiers/distilBERT_Frozen_TextMeta_Classify{num_vals}_2013-2017_wAge.pt').cuda()
    f1, acc = get_cls_perf(model=model, loader=test_loader, model_type='text-meta')
    print(f'f1: {f1}', f'acc: {acc}', sep='\n')

    print()

num_vals: 2
curr_quantiles: [0.5]
f1: 0.8166952862120446
acc: 0.8166952980848763

num_vals: 3
curr_quantiles: [0.33333333 0.66666667]
f1: 0.6669123825030857
acc: 0.666984793535662

num_vals: 4
curr_quantiles: [0.25 0.5  0.75]
f1: 0.5529460367448011
acc: 0.5479417191576

num_vals: 5
curr_quantiles: [0.2 0.4 0.6 0.8]
f1: 0.4437772469145317
acc: 0.47871731246421073



# Ordinal Classification/Regression

### 2009 to 2013 with no age feature, Text-Meta

In [2]:
# Non-text columns for the 2009-2013 ordinal runs (no 'ageScore').
# NOTE(review): 'Bipolar Disorde' looks truncated but presumably matches the
# column name produced by preprocessing; confirm before "fixing" the spelling.
nonTextCols = [
    'ADHD',
    'Acne',
    'Anxiety',
    'Bipolar Disorde',
    'Birth Control',
    'Depression',
    'Insomnia',
    'Obesity',
    'Pain',
    'Weight Loss',
    'ratingNormalized',
]
# The class-label column is the target for the ordinal evaluation.
targetCol = 'usefulCountClass'

In [14]:
# Ordinal evaluation of the 2009-2013 Text-Meta regression checkpoint: for each
# class count, bucket boundaries are derived from the training data and passed
# to get_ordinal_cls_perf together with the training cap.
#
# The checkpoint path does not depend on num_vals, so the model is loaded and
# moved to the GPU once, instead of being re-read from disk on every iteration.
# torch.load unpickles the full model object, so only load trusted files.
model = torch.load('../models/distilBERT_Frozen_TextMeta_Cap99_2009-2013.pt')
model = model.cuda()

for num_vals in range(2, 6):
    # Interior cut points 1/n, 2/n, ..., (n-1)/n.
    curr_quantiles = np.array([(1/num_vals)*i for i in range(1, num_vals)])
    print(f'num_vals: {num_vals}')
    print(f'curr_quantiles: {curr_quantiles}')

    #### Derive the usefulCount cap and bucket boundaries from the training data
    train, val = load_data('../data/drugsComTrain_raw.csv', year_range=[2009, 2013], usefulCount_quantile=0.99,
                           quantiles_for_class=curr_quantiles)
    train_val = pd.concat((train, val), axis=0)
    max_usefulCount = max(train['usefulCountCapped'])
    curr_buckets = get_buckets(df=train_val, quantiles=curr_quantiles)

    #### Load test data and create loader
    test1, test2 = load_data('../data/drugsComTest_raw.csv', year_range=[2009, 2013], usefulCount_range=[0, max_usefulCount],
                           quantiles_for_class=curr_quantiles)
    test = pd.concat((test1, test2), axis=0)
    testset = ReviewDataset(test, 'distilbert-base-uncased', nonTextCols, targetCol)
    test_loader = DataLoader(dataset=testset, batch_size=8, shuffle=False)

    #### Score the regressor as a classifier using the training-derived buckets
    f1, acc = get_ordinal_cls_perf(model=model, loader=test_loader, model_type='text-meta',
                                   curr_buckets=curr_buckets, max_usefulCount=max_usefulCount)
    print(f'f1: {f1}')
    print(f'acc: {acc}')

    print()

num_vals: 2
curr_quantiles: [0.5]
f1: 0.7344548479788833
acc: 0.7415520235074129

num_vals: 3
curr_quantiles: [0.33333333 0.66666667]
f1: 0.5667544721135294
acc: 0.5705890209696808

num_vals: 4
curr_quantiles: [0.25 0.5  0.75]
f1: 0.4487997152533376
acc: 0.4493121410444771

num_vals: 5
curr_quantiles: [0.2 0.4 0.6 0.8]
f1: 0.37400672996791107
acc: 0.3743822625884867



### 2013 to 2017 with age feature, Text-Meta

In [16]:
# Non-text columns for the 2013-2017 ordinal runs, including 'ageScore'.
# NOTE(review): 'Bipolar Disorde' looks truncated but presumably matches the
# column name produced by preprocessing; confirm before "fixing" the spelling.
nonTextCols = [
    'ADHD',
    'Acne',
    'Anxiety',
    'Bipolar Disorde',
    'Birth Control',
    'Depression',
    'Insomnia',
    'Obesity',
    'Pain',
    'Weight Loss',
    'ratingNormalized',
    'ageScore',
]
# The class-label column is the target for the ordinal evaluation.
targetCol = 'usefulCountClass'

In [17]:
# Ordinal evaluation of the 2013-2017 (wAge) Text-Meta regression checkpoint:
# for each class count, bucket boundaries are derived from the training data
# and passed to get_ordinal_cls_perf together with the training cap.
#
# The checkpoint path does not depend on num_vals, so the model is loaded and
# moved to the GPU once, instead of being re-read from disk on every iteration.
# torch.load unpickles the full model object, so only load trusted files.
model = torch.load('../models/distilBERT_Frozen_TextMeta_Cap99_2013-2017_wAge.pt')
model = model.cuda()

for num_vals in range(2, 6):
    # Interior cut points 1/n, 2/n, ..., (n-1)/n.
    curr_quantiles = np.array([(1/num_vals)*i for i in range(1, num_vals)])
    print(f'num_vals: {num_vals}')
    print(f'curr_quantiles: {curr_quantiles}')

    #### Derive the usefulCount cap and bucket boundaries from the training data
    train, val = load_data('../data/drugsComTrain_raw.csv', year_range=[2013, 2017], usefulCount_quantile=0.99,
                           quantiles_for_class=curr_quantiles)
    train_val = pd.concat((train, val), axis=0)
    max_usefulCount = max(train['usefulCountCapped'])
    curr_buckets = get_buckets(df=train_val, quantiles=curr_quantiles)

    #### Load test data and create loader
    test1, test2 = load_data('../data/drugsComTest_raw.csv', year_range=[2013, 2017], usefulCount_range=[0, max_usefulCount],
                           quantiles_for_class=curr_quantiles)
    test = pd.concat((test1, test2), axis=0)
    testset = ReviewDataset(test, 'distilbert-base-uncased', nonTextCols, targetCol)
    test_loader = DataLoader(dataset=testset, batch_size=8, shuffle=False)

    #### Score the regressor as a classifier using the training-derived buckets
    f1, acc = get_ordinal_cls_perf(model=model, loader=test_loader, model_type='text-meta',
                                   curr_buckets=curr_buckets, max_usefulCount=max_usefulCount)
    print(f'f1: {f1}')
    print(f'acc: {acc}')

    print()

num_vals: 2
curr_quantiles: [0.5]
f1: 0.8136123422441912
acc: 0.8168861742062734

num_vals: 3
curr_quantiles: [0.33333333 0.66666667]
f1: 0.6489881444225284
acc: 0.6484061843863332

num_vals: 4
curr_quantiles: [0.25 0.5  0.75]
f1: 0.5445023270051742
acc: 0.5480053445313992

num_vals: 5
curr_quantiles: [0.2 0.4 0.6 0.8]
f1: 0.46532274237106563
acc: 0.4680282496659668



#### TextMeta with No Age

In [4]:
# 2-class Text-Meta classifier trained on 2013-2017 without the age feature,
# so 'ageScore' is left out of the non-text columns here.
nonTextCols = [
    'ADHD',
    'Acne',
    'Anxiety',
    'Bipolar Disorde',
    'Birth Control',
    'Depression',
    'Insomnia',
    'Obesity',
    'Pain',
    'Weight Loss',
    'ratingNormalized',
]
targetCol = 'usefulCountClass'

num_vals = 2
# Interior cut points 1/n, ..., (n-1)/n; for two classes this is just [0.5].
curr_quantiles = (1/num_vals) * np.arange(1, num_vals)
print(f'num_vals: {num_vals}', f'curr_quantiles: {curr_quantiles}', sep='\n')

# Training data is reloaded to recover the usefulCount cap and bucket boundaries.
# NOTE(review): curr_buckets is not passed to get_cls_perf below; it is only
# left bound in the kernel by this cell.
train, val = load_data(
    '../data/drugsComTrain_raw.csv',
    year_range=[2013, 2017],
    usefulCount_quantile=0.99,
    quantiles_for_class=curr_quantiles,
)
train_val = pd.concat([train, val], axis=0)
max_usefulCount = max(train['usefulCountCapped'])
curr_buckets = get_buckets(df=train_val, quantiles=curr_quantiles)

#### Load test data and create loader
test1, test2 = load_data(
    '../data/drugsComTest_raw.csv',
    year_range=[2013, 2017],
    usefulCount_range=[0, max_usefulCount],
    quantiles_for_class=curr_quantiles,
)
test = pd.concat([test1, test2], axis=0)
testset = ReviewDataset(test, 'distilbert-base-uncased', nonTextCols, targetCol)
test_loader = DataLoader(testset, batch_size=8, shuffle=False)

# Direct classification metrics for the saved classifier.
model = torch.load(f'../models/Classifiers/distilBERT_Frozen_TextMeta_Classify{num_vals}_2013-2017_NoAge.pt').cuda()
f1, acc = get_cls_perf(model=model, loader=test_loader, model_type='text-meta')
print(f'f1: {f1}', f'acc: {acc}', sep='\n')

print()

num_vals: 2
curr_quantiles: [0.5]
f1: 0.7988682954899657
acc: 0.7991346949163326



#### TextOnly with No Age

In [5]:
# 2-class text-only classifier trained on 2013-2017 without the age feature.
# nonTextCols is not set in this cell; the value already bound in the kernel is used.
targetCol = 'usefulCountClass'

num_vals = 2
# Interior cut points 1/n, ..., (n-1)/n; for two classes this is just [0.5].
curr_quantiles = (1/num_vals) * np.arange(1, num_vals)
print(f'num_vals: {num_vals}', f'curr_quantiles: {curr_quantiles}', sep='\n')

# Training data is reloaded to recover the usefulCount cap and bucket boundaries.
# NOTE(review): curr_buckets is not passed to get_cls_perf below; it is only
# left bound in the kernel by this cell.
train, val = load_data(
    '../data/drugsComTrain_raw.csv',
    year_range=[2013, 2017],
    usefulCount_quantile=0.99,
    quantiles_for_class=curr_quantiles,
)
train_val = pd.concat([train, val], axis=0)
max_usefulCount = max(train['usefulCountCapped'])
curr_buckets = get_buckets(df=train_val, quantiles=curr_quantiles)

#### Load test data and create loader
test1, test2 = load_data(
    '../data/drugsComTest_raw.csv',
    year_range=[2013, 2017],
    usefulCount_range=[0, max_usefulCount],
    quantiles_for_class=curr_quantiles,
)
test = pd.concat([test1, test2], axis=0)
testset = ReviewDataset(test, 'distilbert-base-uncased', nonTextCols, targetCol)
test_loader = DataLoader(testset, batch_size=8, shuffle=False)

# Direct classification metrics for the saved classifier.
model = torch.load(f'../models/Classifiers/distilBERT_Frozen_TextOnly_Classify{num_vals}_2013-2017_NoAge.pt').cuda()
f1, acc = get_cls_perf(model=model, loader=test_loader, model_type='text')
print(f'f1: {f1}', f'acc: {acc}', sep='\n')

print()

num_vals: 2
curr_quantiles: [0.5]
f1: 0.7679170691216542
acc: 0.768149137876185



#### Linear Baseline (TODO: no evaluation cell yet)

### Scratch Work

In [10]:
#### Set up buckets for regression prediction assignment
# nonTextCols and targetCol are not set here; the values already bound in the
# kernel are used.
num_vals = 4
# Interior cut points 1/n, 2/n, ..., (n-1)/n.
curr_quantiles = (1/num_vals) * np.arange(1, num_vals)

# Cap and bucket boundaries come from the 2009-2013 training data.
train, val = load_data(
    '../data/drugsComTrain_raw.csv',
    year_range=[2009, 2013],
    usefulCount_quantile=0.99,
    quantiles_for_class=curr_quantiles,
)
train_val = pd.concat([train, val], axis=0)
max_usefulCount = max(train['usefulCountCapped'])
curr_buckets = get_buckets(df=train_val, quantiles=curr_quantiles)

#### Load test data and create loader
test1, test2 = load_data(
    '../data/drugsComTest_raw.csv',
    year_range=[2009, 2013],
    usefulCount_range=[0, max_usefulCount],
    quantiles_for_class=curr_quantiles,
)
test = pd.concat([test1, test2], axis=0)
testset = ReviewDataset(test, 'distilbert-base-uncased', nonTextCols, targetCol)
test_loader = DataLoader(testset, batch_size=8, shuffle=False)

#### Get classification performance using assigned buckets

In [11]:
# Load the 2009-2013 Text-Meta regression checkpoint and move it to the GPU.
# torch.load unpickles the full model object, so only load trusted files.
model = torch.load(f'../models/distilBERT_Frozen_TextMeta_Cap99_2009-2013.pt').cuda()

In [12]:
# Score the loaded regressor as a classifier using the bucket boundaries and cap
# prepared in the scratch setup cell; only the F1 score is printed.
f1, acc = get_ordinal_cls_perf(
    model=model,
    loader=test_loader,
    model_type='text-meta',
    curr_buckets=curr_buckets,
    max_usefulCount=max_usefulCount,
)
print(f1)

0.4487997152533376


In [11]:
# Sanity-check assign_bucket on a few hand-picked values (assign_bucket is
# imported with the other utils.evaluation helpers in the top import cell).
# The tensor is created on the GPU, presumably to mimic model output;
# Tensor.apply_ only works on CPU tensors, hence the .cpu() copy before each
# value is mapped through assign_bucket. reg_pred itself is left unchanged.
reg_pred = torch.Tensor([0, 25, 17, 4, 5]).cuda()
reg_pred.cpu().apply_(lambda x: assign_bucket(x, curr_buckets))

tensor([0., 1., 0., 0., 0.])

In [4]:
# Display the bucket boundaries currently bound to curr_buckets in the kernel.
curr_buckets

[[0, 18.0], [18.0, inf]]