In [3]:
import pandas as pd
import transformers
import torch
from torch.utils.data import Dataset, DataLoader
from torchnlp.encoders.text import StaticTokenizerEncoder, stack_and_pad_tensors, pad_tensor
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer, AdamW, get_linear_schedule_with_warmup
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from time import time
from collections import defaultdict

In [2]:
tokenizer = AutoTokenizer.from_pretrained('google/electra-small-discriminator')

In [3]:
class TBSADataset(Dataset):
    def __init__(self, texts, targets, tokenizer, max_len=128):
        self.texts = texts
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        target = self.targets[idx]

        encoding = self.tokenizer.encode_plus(
          text,
          add_special_tokens=True,
          padding='max_length',
          return_token_type_ids=False,
          max_length=self.max_len,
          return_attention_mask=True,
          return_tensors='pt',
          truncation='only_first'
        )

        return {
          'review_text': text,
          'input_ids': encoding['input_ids'].flatten(),
          'attention_mask': encoding['attention_mask'].flatten(),
          'targets': torch.tensor(target, dtype=torch.long)
        }


In [4]:
def create_data_loader(df):
    ds = TBSADataset(
        texts=df.text.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=128
    )
    return DataLoader(
        ds,
        batch_size=32,
        shuffle=True
    )

In [5]:
df = pd.read_csv('data/data_no_tweet.tsv', sep='\t')
df.sentiment += 1
df_train, df_test = train_test_split(
  df,
  test_size=0.2,
  random_state=42
)

df_train, df_eval = train_test_split(
  df_train,
  test_size=0.2,
  random_state=42
)

In [6]:
train_dl = create_data_loader(df_train)

In [7]:
test_dl = create_data_loader(df_test)

In [8]:
eval_dl = create_data_loader(df_eval)

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")

In [7]:
class TBSA(nn.Module):

    def __init__(self):
        super(TBSA, self).__init__()
        self.transformer = AutoModel.from_pretrained('google/electra-small-discriminator', return_dict=False)
        
        self.drop = nn.Dropout(p=0.2)
        self.out = nn.Linear(self.transformer.config.hidden_size, 3)

    def forward(self, input_ids, attention_mask):
        pooled_output = self.transformer(
          input_ids=input_ids,
          attention_mask=attention_mask,
        )[0][:, 0, :]
        output = self.drop(pooled_output)
        return self.out(output)

In [6]:
loss_fn = nn.CrossEntropyLoss().to(device)
model = TBSA().to(device)
optimizer = AdamW(model.parameters(), lr=5e-5, correct_bias=False)

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=int(len(train_dl) * 1)
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


NameError: name 'train_dl' is not defined

In [8]:
def train_epoch(model, train_dl, acc_steps=1):
    model = model.train()
    losses = []
    avg_losses = []
    temp_preds = []
    temp_targets = []
    avg_accs = []
    acc_losses = []
    correct_predictions = 0
    i = 0
    t0 = time()
    train_size = len(df_train)
    for d in train_dl:
        input_ids = d["input_ids"].to(device)
        targets = d["targets"].to(device).view(-1)

        attention_mask = d["attention_mask"].to(device)
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        preds = outputs.argmax(1, keepdim = True).view(-1)
        loss = loss_fn(outputs, targets)
        loss = loss / acc_steps
        correct_predictions += torch.sum(preds == targets)

        temp_preds += preds.cpu().tolist()
        temp_targets += targets.cpu().tolist()

        acc_losses.append(loss.item())
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        i += 1
        if i % acc_steps == 0:
            losses.append(np.mean(acc_losses))
            acc_losses = []
            optimizer.step()
            #scheduler.step()
            optimizer.zero_grad()
        if i % (100 * acc_steps) == 0:
            acc = 0
            try:
                acc = accuracy_score(temp_targets, temp_preds)
            except ValueError:
                pass
            temp_preds = []
            temp_targets = []

            avg_accs.append(acc)
            avg_losses.append(np.mean(losses[i-100:i]))
            print(i, 'iters, auroc, loss, time : ', avg_accs[-1], avg_losses[-1], time()-t0)

    return correct_predictions.double() / train_size, np.mean(losses), avg_losses, avg_accs

In [9]:
def eval_model(model, eval_dl):
    model = model.eval()
    losses = []
    temp_preds = []
    temp_targets = []   
    correct_predictions = 0
    eval_size = len(df_eval)
    with torch.no_grad():
        for d in eval_dl:
            input_ids = d["input_ids"].to(device)
            targets = d["targets"].to(device).view(-1)
            outputs = torch.zeros_like(targets)

            attention_mask = d["attention_mask"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            preds = outputs.argmax(1, keepdim = True).view(-1)
            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

            temp_preds += preds.cpu().tolist()
            temp_targets += targets.cpu().tolist()


    acc = 0
    try:
        acc = accuracy_score(temp_targets, temp_preds)
    except ValueError:
        pass

    return correct_predictions.double() / eval_size, np.mean(losses)


In [10]:
def get_predictions(model, test_dl):
    model = model.eval()
    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []
    with torch.no_grad():
        for d in test_dl:
            texts = d["review_text"]
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)

            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )

            preds = outputs.argmax(1, keepdim = True).view(-1)
            review_texts.extend(texts)
            predictions.extend(preds)
            real_values.extend(targets)

    predictions = torch.stack(predictions).cpu()
    real_values = torch.stack(real_values).cpu()
    return review_texts, predictions, real_values


In [16]:
history = defaultdict(list)
best_acc = 0
epochs = 5

In [11]:
def run_epoch(epoch, best_acc):
    print(f'Epoch {epoch + 1}/{epochs}')
    print('-' * 10)
    train_acc, train_loss, train_avg_losses, avg_accs = train_epoch(model, train_dl)
    
    print(f'Train loss {train_loss} accuracy {train_acc}')
    
    val_acc, val_loss = eval_model(model, eval_dl)

    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()
    
    history['train_acc'].append(train_acc)
    history['train_loss'] += train_avg_losses
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc > best_acc:
        best_acc = val_acc
    return best_acc


In [25]:
for epoch in range(epochs):
    run_epoch(epoch, best_acc)

Epoch 1/5
----------
100 iters, auroc, loss, time :  0.7096875 0.7012013110518456 23.46341633796692
Train loss 0.6748644018270931 accuracy 0.7217368961973278
Val   loss 0.5442714143183923 accuracy 0.7882836587872559

Epoch 2/5
----------
100 iters, auroc, loss, time :  0.8121875 0.4827631662786007 23.28145956993103
Train loss 0.48346972502157337 accuracy 0.8111510791366907
Val   loss 0.5431040544663707 accuracy 0.7882836587872559

Epoch 3/5
----------
100 iters, auroc, loss, time :  0.80375 0.5017260302603245 23.578564882278442
Train loss 0.495240148462233 accuracy 0.8060123329907503
Val   loss 0.5385278895978005 accuracy 0.7882836587872559

Epoch 4/5
----------
100 iters, auroc, loss, time :  0.8059375 0.4934819088876247 23.377612829208374
Train loss 0.48639885287304396 accuracy 0.8101233299075026
Val   loss 0.5365809346399 accuracy 0.7882836587872559

Epoch 5/5
----------
100 iters, auroc, loss, time :  0.8153125 0.4842222927510738 23.436198234558105
Train loss 0.4895379302687332 acc

In [26]:
y_review_texts, y_pred, y_test = get_predictions(model, test_dl)
print(accuracy_score(y_test, y_pred))

0.80936729663106


In [27]:
f1_score(y_test, y_pred, average='macro')

0.685866444605009

In [28]:
confusion_matrix(y_test, y_pred)

array([[186,  10,  66],
       [ 49,  55,  51],
       [ 44,  12, 744]], dtype=int64)

In [12]:
df['target_text'] = df.apply(lambda x: x.text + ' [SEP] ' + x.target, axis=1)

# TBSA

In [18]:
def create_tb_data_loader(df):
    ds = TBSADataset(
        texts=df.target_text.to_numpy(),
        targets=df.sentiment.to_numpy(),
        tokenizer=tokenizer,
        max_len=128
    )
    return DataLoader(
        ds,
        batch_size=32,
        shuffle=True
    )

In [19]:
df_train, df_test = train_test_split(
  df,
  test_size=0.2,
  random_state=42
)

df_train, df_eval = train_test_split(
  df_train,
  test_size=0.2,
  random_state=42
)

train_dl = create_tb_data_loader(df_train)

test_dl = create_tb_data_loader(df_test)

eval_dl = create_tb_data_loader(df_eval)

In [20]:
#weights = torch.Tensor(compute_class_weight('balanced', [0,1,2], df_train.sentiment.values))
#weights = weights / weights.sum()
loss_fn = nn.CrossEntropyLoss().to(device)
model = TBSA().to(device)
optimizer = AdamW(model.parameters(), lr=5e-5, correct_bias=False)

#scheduler = get_linear_schedule_with_warmup(
#    optimizer,
#    num_warmup_steps=0,
#    num_training_steps=int(len(train_dl) * 1)
#)
history = defaultdict(list)
best_acc = 0
epochs = 5

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [23]:
for epoch in range(epochs):
    run_epoch(epoch, best_acc)

Epoch 1/5
----------
100 iters, auroc, loss, time :  0.94375 0.1646930056065321 6.757266521453857
Train loss 0.16879604820955973 accuracy 0.9427029804727647
Val   loss 0.6755291050480258 accuracy 0.828365878725591

Epoch 2/5
----------
100 iters, auroc, loss, time :  0.955625 0.12535545540042223 6.445020914077759
Train loss 0.13122159748750387 accuracy 0.9524665981500514
Val   loss 0.6031383843191208 accuracy 0.8324768756423433

Epoch 3/5
----------
100 iters, auroc, loss, time :  0.9628125 0.10862114432267844 6.639143705368042
Train loss 0.11192483143102316 accuracy 0.9617163412127441
Val   loss 0.7191731285664343 accuracy 0.8263103802672148

Epoch 4/5
----------
100 iters, auroc, loss, time :  0.97 0.089422884255182 6.57964825630188
Train loss 0.08958865710740267 accuracy 0.9707091469681398
Val   loss 0.743628466321576 accuracy 0.8437821171634121

Epoch 5/5
----------
100 iters, auroc, loss, time :  0.9778125 0.07244041429134086 6.425522804260254
Train loss 0.07732532101064982 accura

In [17]:
#batch=128
y_review_texts, y_pred, y_test = get_predictions(model, test_dl)
print(accuracy_score(y_test, y_pred))
f1_score(y_test, y_pred, average='macro')

0.8348397699260477


0.7507956592076185

In [22]:
#batch=64
y_review_texts, y_pred, y_test = get_predictions(model, test_dl)
print(accuracy_score(y_test, y_pred))
f1_score(y_test, y_pred, average='macro')

0.8373048479868529


0.752758858452138

In [28]:
#256
y_review_texts, y_pred, y_test = get_predictions(model, test_dl)
print(accuracy_score(y_test, y_pred))
f1_score(y_test, y_pred, average='macro')

0.8233360723089564


0.7414560150462578

In [25]:
#192
y_review_texts, y_pred, y_test = get_predictions(model, test_dl)
print(accuracy_score(y_test, y_pred))
f1_score(y_test, y_pred, average='macro')

0.8134757600657354


0.7300259866812592

In [20]:
#128
y_review_texts, y_pred, y_test = get_predictions(model, test_dl)
print(accuracy_score(y_test, y_pred))
f1_score(y_test, y_pred, average='macro')



0.8356614626129828


0.7674470928633134

In [17]:
#96
y_review_texts, y_pred, y_test = get_predictions(model, test_dl)
print(accuracy_score(y_test, y_pred))
f1_score(y_test, y_pred, average='macro')

0.8027937551355793


0.7024931239464111

In [22]:
#64
y_review_texts, y_pred, y_test = get_predictions(model, test_dl)
print(accuracy_score(y_test, y_pred))
f1_score(y_test, y_pred, average='macro')

0.8175842235004108


0.7256547521288529

In [26]:
confusion_matrix(y_test, y_pred)

array([[197,  15,  50],
       [ 33,  78,  44],
       [ 55,  28, 717]], dtype=int64)

In [27]:
#128
y_review_texts, y_pred, y_test = get_predictions(model, test_dl)
print(accuracy_score(y_test, y_pred))
f1_score(y_test, y_pred, average='macro')

0.8299096138044372


0.7577898094622112

In [22]:
y_review_texts, y_pred, y_test = get_predictions(model, test_dl)
print(accuracy_score(y_test, y_pred))
f1_score(y_test, y_pred, average='macro')

0.8307313064913723


0.751533864377337

In [24]:
y_review_texts, y_pred, y_test = get_predictions(model, test_dl)
print(accuracy_score(y_test, y_pred))
f1_score(y_test, y_pred, average='macro')

0.8364831552999178


0.752528986417904

In [29]:
compute_class_weight('balanced', [0,1,2], df_train.sentiment.values)



array([1.3627451 , 2.51421189, 0.53520352])

In [35]:
weights / weights.sum()

tensor([0.3089, 0.5698, 0.1213])

# Using the interface we-ve developed

In [1]:
from src.eval import eval_model, get_predictions
from src.transformer_model import TransformerTBSA
import torch
from sklearn.metrics import f1_score, accuracy_score
from types import SimpleNamespace
from src.data_loader import create_data_loader
import pandas as pd

In [2]:
args = SimpleNamespace()
args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
args.final_dropout = 0.2
args.model_name = 'yangheng/deberta-v3-base-absa-v1.1'

In [4]:
df = pd.read_csv('data/food_reviews_occ.tsv', sep='\t')
df.sentiment += 1

In [5]:
args.model = TransformerTBSA(args)
args.model.load_state_dict(torch.load('models/transformer_deberta-v3-base-absa-v1.1_32_5e-05.pt'))

<All keys matched successfully>

In [6]:
args.batch_size = 32
args.max_len = 128
args.lr = 5e-5

In [7]:
from transformers import AutoTokenizer

In [8]:
df['target_text'] = df.apply(lambda x: x.text + ' [SEP] ' + x.target, axis=1)

args.tokenizer = AutoTokenizer.from_pretrained(args.model_name)



In [9]:
args.test_dl = create_data_loader(df, args)

In [10]:
args.model.to(args.device)
y_review_texts, y_pred, y_test = get_predictions(args)

In [11]:
f1_score(y_test, y_pred, average='macro')

0.733745402430063

In [37]:
accuracy_score(y_test, y_pred)

0.74

In [30]:
np.column_stack([df.text.iloc[np.where(y_test != y_pred)].values, 
                y_test[np.where(y_test != y_pred)],
               y_pred[np.where(y_test != y_pred)]])

array([['we ordered two omelettes and two lemonades.', 1, 2],
       ["sorry but i don't want french fries with my omelette.", 1, 0],
       ['after half an hour we ask where our order is, the waitress that it will be in 10 minutes, then after 15 minutes we ask again and that they are "already putting on" ... frustrated after another 5 minutes we get up and say that we are leaving, the waitress brings food suddenly (already not the one who served us originally, the other one has already started avoiding us), only an omelette was served, the second course was served after another 5 minutes ... we ate quickly and paid equally quickly and left.',
        0, 1],
       ["we didn't eat the omelette to the end and we ended up visiting ... it's a pity but i won't use it anymore",
        1, 2],
       ['failure ... a long waiting period, 45 minutes for breakfast (fried eggs and an omelette is a bit of a lot) plus winter coffee, because the waitress forgot to bring ... dinner supposedly ok, bu

Looks like most of the mistakes are debatable. Luckily, it handles the positive comments nicely, so it should be pretty accurate combined with the beta distr

In [15]:
df.head()

Unnamed: 0,target,text,sentiment,stars,target_text
0,omelette,we ordered two omelettes and two lemonades.,1,1,we ordered two omelettes and two lemonades. [S...
1,omelette,"delicious breakfasts, scrambled eggs, omelette...",2,5,"delicious breakfasts, scrambled eggs, omelette..."
2,omelette,we took a baguette and an omelette with paprik...,2,5,we took a baguette and an omelette with paprik...
3,omelette,always a good spot for some delightful breakfa...,2,5,always a good spot for some delightful breakfa...
4,omelette,my omelette and grits were better than expected.,2,4,my omelette and grits were better than expecte...


In [12]:
import numpy as np
y_pred_bin = (y_pred > 0).to(int)
y_bin = (y_test > 0).to(int)
f1_score(y_bin, y_pred_bin)

0.8874388254486134

In [46]:
y_pred_bin = (y_pred < 1).to(int)
y_bin = (y_test < 1).to(int)
f1_score(y_bin, y_pred_bin)

0.759581881533101

In [48]:
# untrained model:
args.model = TransformerTBSA(args)
args.model.to(args.device)


TransformerTBSA(
  (transformer): DebertaV2ForSequenceClassification(
    (deberta): DebertaV2Model(
      (embeddings): DebertaV2Embeddings(
        (word_embeddings): Embedding(128100, 768, padding_idx=0)
        (LayerNorm): LayerNorm((768,), eps=1e-07, elementwise_affine=True)
        (dropout): StableDropout()
      )
      (encoder): DebertaV2Encoder(
        (layer): ModuleList(
          (0): DebertaV2Layer(
            (attention): DebertaV2Attention(
              (self): DisentangledSelfAttention(
                (query_proj): Linear(in_features=768, out_features=768, bias=True)
                (key_proj): Linear(in_features=768, out_features=768, bias=True)
                (value_proj): Linear(in_features=768, out_features=768, bias=True)
                (pos_dropout): StableDropout()
                (dropout): StableDropout()
              )
              (output): DebertaV2SelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
          

In [49]:
y_review_texts, y_pred, y_test = get_predictions(args)

In [50]:
f1_score(y_test, y_pred, average='macro')

0.6635894678729262

In [51]:
accuracy_score(y_test, y_pred)

0.6644444444444444

In [17]:
args.model_name = 'google/electra-small-discriminator'
args.model = TransformerTBSA(args)
args.model.load_state_dict(torch.load('models/transformer_electra-small-discriminator_32_5e-05.pt'))
args.model.to(args.device)
y_review_texts, y_pred, y_test = get_predictions(args)
f1_score(y_test, y_pred, average='macro')

Some weights of the model checkpoint at google/electra-small-discriminator were not used when initializing ElectraModel: ['discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense.weight']
- This IS expected if you are initializing ElectraModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [11]:
args.model_name = 'distilbert-base-uncased'
args.model = TransformerTBSA(args)
args.model.load_state_dict(torch.load('models/transformer_distilbert-base-uncased_32_5e-05.pt'))
args.model.to(args.device)
y_review_texts, y_pred, y_test = get_predictions(args)
f1_score(y_test, y_pred, average='macro')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.

In [16]:
args.model_name = 'yangheng/deberta-v3-base-absa-v1.1'
args.model = TransformerTBSA(args)
args.model.load_state_dict(torch.load('models/transformer_1671719673.6458306.pt'))
args.model.to(args.device)
y_review_texts, y_pred, y_test = get_predictions(args)
f1_score(y_test, y_pred, average='macro')

0.6766876805581461