## Set up

In [None]:
!pip install transformers==4.37.0
!pip install datasets

Collecting transformers==4.37.0
  Downloading transformers-4.37.0-py3-none-any.whl.metadata (129 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/129.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m129.4/129.4 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.19,>=0.14 (from transformers==4.37.0)
  Downloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)
Downloading transformers-4.37.0-py3-none-any.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading tokenizers-0.15.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m46.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers, transformers
  Attempting uninstall: tokenize

In [None]:
import torch
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, TensorDataset, DataLoader
from torch.nn import CrossEntropyLoss
from torch.nn.utils.rnn import pad_sequence
from tqdm import tqdm
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import AdamW
import os
from sklearn.metrics import classification_report, precision_score, recall_score, f1_score
from datasets import load_dataset

torch.manual_seed(42)
np.random.seed(42)

## Google Drive linking
Make sure you have a CSE_354_Project in your Google Drive


In [None]:
from google.colab import drive
drive.mount('/content/drive')
base_dir = "drive/MyDrive/CSE_354_Project/"
%cd $base_dir

Mounted at /content/drive
/content/drive/MyDrive/CSE_354_Project


## Loading the dataset
This make an API request to hugging face then we verify the data

In [None]:
train_dataset = load_dataset("community-datasets/per_sent", split="train")
test_random_dataset = load_dataset("community-datasets/per_sent", split="test_random")
test_fixed_dataset = load_dataset("community-datasets/per_sent", split="test_fixed")
validation_dataset = load_dataset("community-datasets/per_sent", split="validation")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/14.0k [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/8.84M [00:00<?, ?B/s]

test_random-00000-of-00001.parquet:   0%|          | 0.00/1.60M [00:00<?, ?B/s]

test_fixed-00000-of-00001.parquet:   0%|          | 0.00/2.27M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/1.41M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3355 [00:00<?, ? examples/s]

Generating test_random split:   0%|          | 0/579 [00:00<?, ? examples/s]

Generating test_fixed split:   0%|          | 0/827 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/578 [00:00<?, ? examples/s]

## Baseline Model Creation
Using DistillBERT with 3 classes, and a special mask_token which is represented by [TGT] in Masked document of the dataset

In [None]:
class DistillBERT():
  def __init__(self, model_name='distilbert-base-uncased', num_classes=3):
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    self.tokenizer.add_special_tokens({'mask_token': '[TGT]'})
    self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
    self.model.resize_token_embeddings(len(self.tokenizer))


  def get_tokenizer_and_model(self):
    return self.model, self.tokenizer

Using the model's tokenizer to tokenize the input dataset

In [None]:
class DatasetLoader(Dataset):

  def __init__(self, dataset, tokenizer):
    self.dataset = dataset
    self.tokenizer = tokenizer

  def tokenize_data(self):
    print("Tokenizing data...")
    tokens = []
    labels = []

    for row in tqdm(self.dataset):
      text = row['MASKED_DOCUMENT']
      label = row['TRUE_SENTIMENT']

      tokens.append(torch.tensor(self.tokenizer.encode(text, max_length=512, truncation=True, add_special_tokens=True)))
      labels.append(label)


    tokens = pad_sequence(tokens, batch_first=True)
    labels = torch.tensor(labels)
    dataset = TensorDataset(tokens, labels)
    return dataset

  def get_data_loaders(self, batch_size=32, shuffle=True):
    processed_dataset = self.tokenize_data()

    data_loader = DataLoader(
        processed_dataset,
        shuffle=shuffle,
        batch_size=batch_size
    )

    return data_loader

The model's fine-tuning stages by giving it the task specific data.

The Optimizer is created in the execution stage, which have the hyperparameters of learning rate and epsilon.

In [None]:
class Trainer():

  def __init__(self, options):
    self.device = options['device']
    self.train_data = options['train_data']
    self.batch_size = options['batch_size']
    self.epochs = options['epochs']
    self.save_path = options['save_path']
    transformer = DistillBERT()
    self.model, self.tokenizer = transformer.get_tokenizer_and_model()
    self.model.to(self.device)

  def get_performance_metrics(self, preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    precision = precision_score(labels_flat, pred_flat, zero_division=0, average='weighted')
    recall = recall_score(labels_flat, pred_flat, zero_division=0, average='weighted')
    f1 = f1_score(labels_flat, pred_flat, zero_division=0, average='weighted')
    return precision, recall, f1

  def set_training_parameters(self):
    for i, layer in enumerate(self.model.distilbert.transformer.layer):
      for param in layer.parameters():
        param.requires_grad = True

    for param in self.model.classifier.parameters():
      param.requires_grad = True

  def train(self, data_loader, optimizer):
    self.model.train()
    total_recall = 0
    total_precision = 0
    total_f1 = 0
    total_loss = 0

    for batch_idx, (text, labels) in enumerate(tqdm(data_loader)):
      self.model.zero_grad()

      text = text.to(self.device)
      labels = labels.to(self.device)

      outputs = self.model(text, labels=labels)
      predicted = outputs.logits.cpu().detach().numpy()

      loss = outputs.loss
      total_loss += loss.item()
      loss.backward()
      optimizer.step()

      precision, recall, f1 = self.get_performance_metrics(predicted, labels.cpu().numpy())
      total_precision += precision
      total_recall += recall
      total_f1 += f1

    precision = total_precision/len(data_loader)
    recall = total_recall/len(data_loader)
    f1 = total_f1/len(data_loader)
    loss = total_loss/len(data_loader)

    return precision, recall, f1, loss

  def eval(self, data_loader):
    self.model.eval()
    total_recall = 0
    total_precision = 0
    total_f1 = 0
    total_loss = 0

    with torch.no_grad():
      for batch_idx, (text, labels) in enumerate(tqdm(data_loader)):
        text = text.to(self.device)
        labels = labels.to(self.device)

        outputs = self.model(text, labels=labels)
        predicted = outputs.logits.cpu().detach().numpy()

        loss = outputs.loss
        total_loss += loss.item()

        precision, recall, f1 = self.get_performance_metrics(predicted, labels.cpu().numpy())
        total_precision += precision
        total_recall += recall
        total_f1 += f1

      precision = total_precision/len(data_loader)
      recall = total_recall/len(data_loader)
      f1 = total_f1/len(data_loader)
      loss = total_loss/len(data_loader)

      return precision, recall, f1, loss

  def save_transformer(self):
    self.model.save_pretrained(self.save_path)
    self.tokenizer.save_pretrained(self.save_path)

  def execute(self):
    last_best = 0
    train_dataset = DatasetLoader(self.train_data, self.tokenizer)
    train_data_loader = train_dataset.get_data_loaders(self.batch_size)

    val_dataset = DatasetLoader(validation_dataset, self.tokenizer)
    val_data_loader = val_dataset.get_data_loaders(self.batch_size)

    optimizer = torch.optim.AdamW(self.model.parameters(), lr = 3e-5, eps = 1e-8)
    self.set_training_parameters()

    for epoch_i in range(0, self.epochs):
      train_precision, train_recall, train_f1, train_loss = self.train(train_data_loader, optimizer)
      print(f'Epoch {epoch_i + 1}: train_loss: {train_loss:.4f} train_precision: {train_precision:.4f} train_recall: {train_recall:.4f} train_f1: {train_f1:.4f}')
      val_precision, val_recall, val_f1, val_loss = self.eval(val_data_loader)
      print(f'Epoch {epoch_i + 1}: val_loss: {val_loss:.4f} val_precision: {val_precision:.4f} val_recall: {val_recall:.4f} val_f1: {val_f1:.4f}')

      if val_f1 > last_best:
        print("Saving model..")
        self.save_transformer()
        last_best = val_f1
        print("Model saved.")

Hyper Params of
1. Batch size
2. Epochs

Then the additional of save path of the model which is set to *distilbert_per_sent_baseline* at the moment

In [None]:
BATCH_SIZE = 16
EPOCHS = 3

SAVE_PATH = 'distilbert_per_sent_baseline'

## Baseline Training
training on the train_dataset we've loaded up previously, then verfying it with the validation_dataset each epochs to see the improvement of the model

In [None]:
options = {}
options['device'] = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
options['train_data'] = train_dataset
options['batch_size'] = BATCH_SIZE
options['epochs'] = EPOCHS
options['save_path'] = SAVE_PATH

trainer = Trainer(options)
trainer.execute()



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing data...


100%|██████████| 3355/3355 [00:08<00:00, 416.24it/s]


Tokenizing data...


100%|██████████| 578/578 [00:00<00:00, 617.85it/s]
  0%|          | 0/210 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
100%|██████████| 210/210 [02:35<00:00,  1.35it/s]


Epoch 1: train_loss: 0.9196 train_precision: 0.4477 train_recall: 0.5349 train_f1: 0.4539


100%|██████████| 37/37 [00:09<00:00,  3.90it/s]


Epoch 1: val_loss: 0.8555 val_precision: 0.5779 val_recall: 0.6014 val_f1: 0.5526
Saving model..
Model saved.


100%|██████████| 210/210 [02:39<00:00,  1.32it/s]


Epoch 2: train_loss: 0.8439 train_precision: 0.5574 train_recall: 0.5917 train_f1: 0.5471


100%|██████████| 37/37 [00:09<00:00,  3.96it/s]


Epoch 2: val_loss: 0.8694 val_precision: 0.5858 val_recall: 0.5760 val_f1: 0.5439


100%|██████████| 210/210 [02:38<00:00,  1.32it/s]


Epoch 3: train_loss: 0.7192 train_precision: 0.6709 train_recall: 0.6720 train_f1: 0.6489


100%|██████████| 37/37 [00:09<00:00,  3.94it/s]


Epoch 3: val_loss: 0.9075 val_precision: 0.6223 val_recall: 0.5777 val_f1: 0.5762
Saving model..
Model saved.


## Baseline Testing
Verifying the model performance with fresh data from testing split.

In [None]:
class Tester:
  def __init__(self, options):
    self.device = options['device']
    self.test_data = options['test_data']
    self.batch_size = options['batch_size']
    self.save_path = options['save_path']

    transformer = DistillBERT()
    self.model, self.tokenizer = transformer.get_tokenizer_and_model()
    self.model.to(self.device)

  def load_transformer(self):
    self.model = AutoModelForSequenceClassification.from_pretrained(self.save_path)
    self.tokenizer = AutoTokenizer.from_pretrained(self.save_path)
    self.model.to(self.device)
    self.model.eval()

  def get_performance_metrics(self, preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    precision = precision_score(labels_flat, pred_flat, zero_division=0, average='weighted')
    recall = recall_score(labels_flat, pred_flat, zero_division=0, average='weighted')
    f1 = f1_score(labels_flat, pred_flat, zero_division=0, average='weighted')
    return precision, recall, f1

  def test(self, data_loader):
    self.model.eval()
    total_recall = 0
    total_precision = 0
    total_f1 = 0
    total_loss = 0

    y_true = []
    y_pred = []

    with torch.no_grad():
      for batch_idx, (text, labels) in enumerate(tqdm(data_loader)):
        text = text.to(self.device)
        labels = labels.to(self.device)

        outputs = self.model(text, labels=labels)
        predicted = outputs.logits.cpu().detach().numpy()
        loss = outputs.loss
        total_loss += loss.item()

        labels = labels.cpu().numpy()
        y_true.extend(labels)
        y_pred.extend(np.argmax(predicted, axis=1))

        precision, recall, f1 = self.get_performance_metrics(predicted, labels)
        total_precision += precision
        total_recall += recall
        total_f1 += f1

      precision = total_precision / len(data_loader)
      recall = total_recall / len(data_loader)
      f1 = total_f1 / len(data_loader)
      loss = total_loss / len(data_loader)
      report = classification_report(y_true, y_pred, target_names=["Negative", "Neutral", "Positive"])

      return precision, recall, f1, loss, report

  def execute(self):
    self.load_transformer()
    test_dataset = DatasetLoader(self.test_data, self.tokenizer)
    test_data_loader = test_dataset.get_data_loaders(self.batch_size)
    precision, recall, f1, loss, report = self.test(test_data_loader)
    print(f'precision: {precision:.4f} recall: {recall:.4f} f1: {f1:.4f}')
    print(f'Test loss: {loss:.4f}')
    print()
    print(report)


Testing on the test_random_dataset (random and different main target entity)

In [None]:
options={}
options['device'] = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
options['test_data'] = test_random_dataset
options['batch_size'] = BATCH_SIZE
options['save_path'] = SAVE_PATH

tester = Tester(options)
tester.execute()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing data...


100%|██████████| 579/579 [00:01<00:00, 560.63it/s]
100%|██████████| 37/37 [00:09<00:00,  3.93it/s]

precision: 0.5998 recall: 0.5552 f1: 0.5515
Test loss: 0.9371

              precision    recall  f1-score   support

    Negative       0.32      0.44      0.37        73
     Neutral       0.48      0.39      0.43       213
    Positive       0.68      0.70      0.69       293

    accuracy                           0.55       579
   macro avg       0.49      0.51      0.50       579
weighted avg       0.56      0.55      0.55       579






Testing on the test_fixed_dataset (stuff have repeat high frequence main target entity)

In [None]:
options={}
options['device'] = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
options['test_data'] = test_fixed_dataset
options['batch_size'] = BATCH_SIZE
options['save_path'] = SAVE_PATH

tester = Tester(options)
tester.execute()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing data...


100%|██████████| 827/827 [00:01<00:00, 547.01it/s]
100%|██████████| 52/52 [00:13<00:00,  3.76it/s]

precision: 0.5082 recall: 0.4768 f1: 0.4652
Test loss: 1.1159

              precision    recall  f1-score   support

    Negative       0.33      0.26      0.29       139
     Neutral       0.45      0.38      0.41       320
    Positive       0.53      0.64      0.58       368

    accuracy                           0.48       827
   macro avg       0.44      0.43      0.43       827
weighted avg       0.46      0.48      0.47       827






---
## Idea 1 Mask TGT Token for Classification
Below are the implementation for idea 1 (it will redefined the classes in Baseline)

Using DistillBERT with 3 classes, and a special mask_token which is represented by [TGT] in Masked document of the dataset

In [None]:
class DistillBERT():
  def __init__(self, model_name='distilbert-base-uncased', num_classes=3):
    self.tokenizer = AutoTokenizer.from_pretrained(model_name)
    self.tokenizer.add_special_tokens({'mask_token': '[TGT]'})
    self.model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_classes)
    self.model.resize_token_embeddings(len(self.tokenizer))


  def get_tokenizer_and_model(self):
    return self.model, self.tokenizer

Using the model's tokenizer to tokenize the input dataset

In [None]:
class DatasetLoader(Dataset):

  def __init__(self, dataset, tokenizer):
    self.dataset = dataset
    self.tokenizer = tokenizer

  def tokenize_data(self):
    print("Tokenizing data...")
    tokens = []
    labels = []

    data_not_have_tgt = 0
    data_does_not_have_tgt_before_limit = 0
    tgt_token_id = self.tokenizer.convert_tokens_to_ids("[TGT]")


    for row in tqdm(self.dataset):
      text = row['MASKED_DOCUMENT']
      label = row['TRUE_SENTIMENT']

      if '[TGT]' not in text:
        data_not_have_tgt += 1
        continue

      token = self.tokenizer.encode(text, max_length=512, truncation=True, add_special_tokens=True)

      if tgt_token_id not in token:
        data_does_not_have_tgt_before_limit += 1
        continue

      tokens.append(torch.tensor(token))
      labels.append(label)


    tokens = pad_sequence(tokens, batch_first=True)
    labels = torch.tensor(labels)
    dataset = TensorDataset(tokens, labels)
    print("Data thrown away: " + str(data_not_have_tgt) + ", Reason: due to not having [TGT] tokens")
    print("Data thrown away: " + str(data_does_not_have_tgt_before_limit) + ", Reason: due to not having [TGT] tokens before limit")
    return dataset

  def get_data_loaders(self, batch_size=32, shuffle=True):
    processed_dataset = self.tokenize_data()

    data_loader = DataLoader(
        processed_dataset,
        shuffle=shuffle,
        batch_size=batch_size
    )

    return data_loader

The model's fine-tuning stages by giving it the task specific data.

The Optimizer is created in the execution stage, which have the hyperparameters of learning rate and epsilon.

In [None]:
class Trainer():

  def __init__(self, options):
    self.device = options['device']
    self.train_data = options['train_data']
    self.batch_size = options['batch_size']
    self.epochs = options['epochs']
    self.save_path = options['save_path']
    transformer = DistillBERT()
    self.model, self.tokenizer = transformer.get_tokenizer_and_model()
    self.model.to(self.device)

  def get_performance_metrics(self, preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    precision = precision_score(labels_flat, pred_flat, zero_division=0, average='weighted')
    recall = recall_score(labels_flat, pred_flat, zero_division=0, average='weighted')
    f1 = f1_score(labels_flat, pred_flat, zero_division=0, average='weighted')
    return precision, recall, f1

  def set_training_parameters(self):
    for i, layer in enumerate(self.model.distilbert.transformer.layer):
      for param in layer.parameters():
        param.requires_grad = True

    for param in self.model.classifier.parameters():
      param.requires_grad = True

  def train(self, data_loader, optimizer):
    self.model.train()
    total_recall = 0
    total_precision = 0
    total_f1 = 0
    total_loss = 0

    for batch_idx, (text, labels) in enumerate(tqdm(data_loader)):
      self.model.zero_grad()

      text = text.to(self.device)
      labels = labels.to(self.device)

      outputs = self.model(text, labels=labels, output_hidden_states=True)
      last_layer = outputs.hidden_states[-1]

      tgt_token_id = self.tokenizer.convert_tokens_to_ids("[TGT]")
      tgt_mask = (text == tgt_token_id).unsqueeze(-1)

      # Commented out since was running into issue with the values, not needed anymore
      # issue was document 512 token does not have [TGT] within the limit but outside the limit

      # if (tgt_mask.sum(dim=1) == 0).any():
      #   print(f"Skipping batch {batch_idx} because [TGT] is missing.")

      #   missing_tgt_mask = (tgt_mask.sum(dim=1) == 0).squeeze(-1)
      #   missing_indices = missing_tgt_mask.nonzero(as_tuple=True)[0]

      #   for idx in missing_indices:
      #     decoded_text = self.tokenizer.decode(text[idx].tolist(), skip_special_tokens=False)
      #     print(f"Decoded text for row {idx}: {decoded_text}")

      #   continue

      masked_last_layer = last_layer * tgt_mask
      tgt_embeddings = masked_last_layer.sum(dim=1) / (tgt_mask.sum(dim=1))

      logits = self.model.classifier(tgt_embeddings)
      loss = CrossEntropyLoss()(logits, labels)

      total_loss += loss.item()

      loss.backward()
      optimizer.step()

      predicted = logits.cpu().detach().numpy()
      precision, recall, f1 = self.get_performance_metrics(predicted, labels.cpu().numpy())
      total_precision += precision
      total_recall += recall
      total_f1 += f1


    precision = total_precision/len(data_loader)
    recall = total_recall/len(data_loader)
    f1 = total_f1/len(data_loader)
    loss = total_loss/len(data_loader)

    return precision, recall, f1, loss

  def eval(self, data_loader):
    self.model.eval()
    total_recall = 0
    total_precision = 0
    total_f1 = 0
    total_loss = 0

    with torch.no_grad():
      for batch_idx, (text, labels) in enumerate(tqdm(data_loader)):
        text = text.to(self.device)
        labels = labels.to(self.device)

        outputs = self.model(text, labels=labels, output_hidden_states=True)
        last_layer = outputs.hidden_states[-1]

        tgt_token_id = self.tokenizer.convert_tokens_to_ids("[TGT]")
        tgt_mask = (text == tgt_token_id).unsqueeze(-1)

        # Commented out since was running into issue with the values, not needed anymore
        # issue was document 512 token does not have [TGT] within the limit but outside the limit

        # if (tgt_mask.sum(dim=1) == 0).any():
        #   print(f"Skipping batch {batch_idx} because [TGT] is missing.")
        #   continue

        masked_last_layer = last_layer * tgt_mask
        tgt_embeddings = masked_last_layer.sum(dim=1) / (tgt_mask.sum(dim=1))

        logits = self.model.classifier(tgt_embeddings)
        loss = CrossEntropyLoss()(logits, labels)

        total_loss += loss.item()

        predicted = logits.cpu().detach().numpy()
        precision, recall, f1 = self.get_performance_metrics(predicted, labels.cpu().numpy())
        total_precision += precision
        total_recall += recall
        total_f1 += f1

      precision = total_precision/len(data_loader)
      recall = total_recall/len(data_loader)
      f1 = total_f1/len(data_loader)
      loss = total_loss/len(data_loader)

      return precision, recall, f1, loss

  def save_transformer(self):
    self.model.save_pretrained(self.save_path)
    self.tokenizer.save_pretrained(self.save_path)

  def execute(self):
    last_best = 0
    train_dataset = DatasetLoader(self.train_data, self.tokenizer)
    train_data_loader = train_dataset.get_data_loaders(self.batch_size)

    val_dataset = DatasetLoader(validation_dataset, self.tokenizer)
    val_data_loader = val_dataset.get_data_loaders(self.batch_size)

    optimizer = torch.optim.AdamW(self.model.parameters(), lr = 3e-5, eps = 1e-8)
    self.set_training_parameters()

    for epoch_i in range(0, self.epochs):
      train_precision, train_recall, train_f1, train_loss = self.train(train_data_loader, optimizer)
      print(f'Epoch {epoch_i + 1}: train_loss: {train_loss:.4f} train_precision: {train_precision:.4f} train_recall: {train_recall:.4f} train_f1: {train_f1:.4f}')

      val_precision, val_recall, val_f1, val_loss = self.eval(val_data_loader)
      print(f'Epoch {epoch_i + 1}: val_loss: {val_loss:.4f} val_precision: {val_precision:.4f} val_recall: {val_recall:.4f} val_f1: {val_f1:.4f}')

      if val_f1 > last_best:
        print("Saving model..")
        self.save_transformer()
        last_best = val_f1
        print("Model saved.")

Hyper Params of
1. Batch size
2. Epochs

Then the additional of save path of the model which is set to *distilbert_per_sent_idea_1* at the moment

In [None]:
BATCH_SIZE = 16
EPOCHS = 3

SAVE_PATH = 'distilbert_per_sent_idea_1'

## Idea 1 Training

training on the train_dataset we've loaded up previously, then verfying it with the validation_dataset each epochs to see the improvement of the model

In [None]:
options = {}
options['device'] = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
options['train_data'] = train_dataset
options['batch_size'] = BATCH_SIZE
options['epochs'] = EPOCHS
options['save_path'] = SAVE_PATH

trainer = Trainer(options)
trainer.execute()

torch.cuda.empty_cache()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing data...


100%|██████████| 3355/3355 [00:05<00:00, 575.64it/s]


Data thrown away: 142, Reason: due to not having [TGT] tokens
Data thrown away: 28, Reason: due to not having [TGT] tokens before limit
Tokenizing data...


100%|██████████| 578/578 [00:01<00:00, 520.22it/s]


Data thrown away: 31, Reason: due to not having [TGT] tokens
Data thrown away: 3, Reason: due to not having [TGT] tokens before limit


100%|██████████| 200/200 [02:50<00:00,  1.17it/s]


Epoch 1: train_loss: 0.9236 train_precision: 0.4495 train_recall: 0.5306 train_f1: 0.4498


100%|██████████| 34/34 [00:09<00:00,  3.41it/s]


Epoch 1: val_loss: 0.8620 val_precision: 0.4731 val_recall: 0.5404 val_f1: 0.4473
Saving model..
Model saved.


100%|██████████| 200/200 [02:48<00:00,  1.19it/s]


Epoch 2: train_loss: 0.8359 train_precision: 0.5716 train_recall: 0.5894 train_f1: 0.5460


100%|██████████| 34/34 [00:10<00:00,  3.34it/s]


Epoch 2: val_loss: 0.8372 val_precision: 0.5753 val_recall: 0.5735 val_f1: 0.5508
Saving model..
Model saved.


100%|██████████| 200/200 [02:48<00:00,  1.18it/s]


Epoch 3: train_loss: 0.6856 train_precision: 0.6931 train_recall: 0.6747 train_f1: 0.6575


100%|██████████| 34/34 [00:10<00:00,  3.25it/s]


Epoch 3: val_loss: 0.8639 val_precision: 0.6047 val_recall: 0.5993 val_f1: 0.5897
Saving model..
Model saved.


## Idea 1 Testing

Verify the model performance with fresh data from testing split

In [None]:
class Tester:
  def __init__(self, options):
    self.device = options['device']
    self.test_data = options['test_data']
    self.batch_size = options['batch_size']
    self.save_path = options['save_path']

    transformer = DistillBERT()
    self.model, self.tokenizer = transformer.get_tokenizer_and_model()
    self.model.to(self.device)

  def load_transformer(self):
    self.model = AutoModelForSequenceClassification.from_pretrained(self.save_path)
    self.tokenizer = AutoTokenizer.from_pretrained(self.save_path)
    self.model.to(self.device)
    self.model.eval()

  def get_performance_metrics(self, preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    precision = precision_score(labels_flat, pred_flat, zero_division=0, average='weighted')
    recall = recall_score(labels_flat, pred_flat, zero_division=0, average='weighted')
    f1 = f1_score(labels_flat, pred_flat, zero_division=0, average='weighted')
    return precision, recall, f1

  def test(self, data_loader):
    self.model.eval()
    total_recall = 0
    total_precision = 0
    total_f1 = 0
    total_loss = 0

    y_true = []
    y_pred = []

    with torch.no_grad():
      for batch_idx, (text, labels) in enumerate(tqdm(data_loader)):
        text = text.to(self.device)
        labels = labels.to(self.device)

        outputs = self.model(text, labels=labels, output_hidden_states=True)
        last_layer = outputs.hidden_states[-1]

        tgt_token_id = self.tokenizer.convert_tokens_to_ids("[TGT]")
        tgt_mask = (text == tgt_token_id).unsqueeze(-1)

        masked_last_layer = last_layer * tgt_mask
        tgt_embeddings = masked_last_layer.sum(dim=1) / (tgt_mask.sum(dim=1))

        logits = self.model.classifier(tgt_embeddings)
        loss = CrossEntropyLoss()(logits, labels)

        total_loss += loss.item()

        predicted = logits.cpu().detach().numpy()
        labels = labels.cpu().numpy()
        y_true.extend(labels)
        y_pred.extend(np.argmax(predicted, axis=1))
        precision, recall, f1 = self.get_performance_metrics(predicted, labels)
        total_precision += precision
        total_recall += recall
        total_f1 += f1

      precision = total_precision/len(data_loader)
      recall = total_recall/len(data_loader)
      f1 = total_f1/len(data_loader)
      loss = total_loss/len(data_loader)
      report = classification_report(y_true, y_pred, target_names=["Negative", "Neutral", "Positive"])

      return precision, recall, f1, loss, report

  def execute(self):
    self.load_transformer()
    test_dataset = DatasetLoader(self.test_data, self.tokenizer)
    test_data_loader = test_dataset.get_data_loaders(self.batch_size)
    precision, recall, f1, loss, report = self.test(test_data_loader)
    print(f'precision: {precision:.4f} recall: {recall:.4f} f1: {f1:.4f}')
    print(f'Test loss: {loss:.4f}')
    print()
    print(report)


Testing on the test_random_dataset (random and different main target entity)

In [None]:
options={}
options['device'] = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
options['test_data'] = test_random_dataset
options['batch_size'] = BATCH_SIZE
options['save_path'] = SAVE_PATH

tester = Tester(options)
tester.execute()

torch.cuda.empty_cache()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing data...


100%|██████████| 579/579 [00:03<00:00, 172.05it/s]


Data thrown away: 26, Reason: due to not having [TGT] tokens
Data thrown away: 10, Reason: due to not having [TGT] tokens before limit


100%|██████████| 34/34 [00:10<00:00,  3.18it/s]

precision: 0.5846 recall: 0.5767 f1: 0.5621
Test loss: 0.9191

              precision    recall  f1-score   support

    Negative       0.33      0.09      0.15        64
     Neutral       0.47      0.63      0.54       203
    Positive       0.70      0.65      0.68       276

    accuracy                           0.58       543
   macro avg       0.50      0.46      0.45       543
weighted avg       0.57      0.58      0.56       543






Testing on the test_fixed_dataset (stuff have repeat high frequence main target entity)

In [None]:
options={}
options['device'] = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
options['test_data'] = test_fixed_dataset
options['batch_size'] = BATCH_SIZE
options['save_path'] = SAVE_PATH

tester = Tester(options)
tester.execute()

torch.cuda.empty_cache()

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Tokenizing data...


100%|██████████| 827/827 [00:02<00:00, 311.08it/s]


Data thrown away: 14, Reason: due to not having [TGT] tokens
Data thrown away: 5, Reason: due to not having [TGT] tokens before limit


100%|██████████| 51/51 [00:15<00:00,  3.37it/s]

precision: 0.4998 recall: 0.4583 f1: 0.4558
Test loss: 1.1089

              precision    recall  f1-score   support

    Negative       0.27      0.16      0.20       133
     Neutral       0.42      0.51      0.46       311
    Positive       0.54      0.53      0.53       364

    accuracy                           0.46       808
   macro avg       0.41      0.40      0.40       808
weighted avg       0.45      0.46      0.45       808




