In [1]:
%%capture
! pip install tqdm boto3 requests regex sentencepiece sacremoses
! pip install transformers

In [1]:
import os

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# os.chdir('/content/drive/MyDrive/FA 23/TTIC 31190 NLP/hw3')


## BERT Features

In this part, you will use BERT features to classify DBPedia articles.
The data is already pre-processed, and the data loader is implemented below.

In [3]:
import torch.nn.functional as F
import gc

In [4]:
# Basics: dataset, data loaders, Classifier
import collections
import json
import torch
import torch.nn as nn
import tqdm
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel


SPLITS = ['train', 'dev', 'test']

class DBPediaDataset(Dataset):
  '''DBPedia dataset.
    Args:
      path[str]: path to the original data.
  '''
  def __init__(self, path):
    with open(path) as fin:
      self._data = [json.loads(l) for l in fin]
    self._n_classes = len(set([datum['label'] for datum in self._data]))

  def __getitem__(self, index):
    return self._data[index]

  def __len__(self):
    return len(self._data)

  @property
  def n_classes(self):
    return self._n_classes

  @staticmethod
  def collate_fn(tokenizer, device, batch):
    '''The collate function that compresses a training batch.
      Args:
        batch[list[dict[str, Any]]]: data in the batch.
      Returns:
        labels[torch.LongTensor]: the labels in the batch.
        sentences[dict[str, torch.Tensor]]: sentences converted by tokenizers.
    '''
    labels = torch.tensor([datum['label'] for datum in batch]).long().to(device)
    sentences = tokenizer(
        [datum['sentence'] for datum in batch],
        return_tensors='pt',  # pt = pytorch style tensor
        padding=True)
    for key in sentences:
      sentences[key] = sentences[key].to(device)
    return labels, sentences

def construct_datasets(prefix, batch_size, tokenizer, device):
  '''Constructs datasets and data loaders.
    Args:
      prefix[str]: prefix of the dataset (e.g., dbpedia_).
      batch_size[int]: maximum number of examples in a batch.
      tokenizer: model tokenizer that converts sentences to integer tensors.
      device[torch.device]: the device (cpu/gpu) that the tensor should be on.
    Returns:
      datasets[dict[str, Dataset]]: a dict of constructed datasets.
      dataloaders[dict[str, DataLoader]]: a dict of constructed data loaders.
  '''
  datasets = collections.defaultdict()
  dataloaders = collections.defaultdict()
  for split in SPLITS:
    datasets[split] = DBPediaDataset(f'{prefix}{split}.json')
    dataloaders[split] = DataLoader(
        datasets[split],
        batch_size=batch_size,
        shuffle=(split == 'train'),
        collate_fn=lambda x:DBPediaDataset.collate_fn(tokenizer, device, x))
  return datasets, dataloaders

In [5]:
class Classifier(nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super(Classifier, self).__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

## Training and Evaluation

In [6]:
def predict(outputs):
    """
    Return predicted classes.
    """
    classes_pred = torch.argmax(outputs, dim=1)

    return classes_pred

In [10]:
def train_classifier(seed, datasets, dataloaders, bert_model, params=False, 
                     pooling="cls"):
    """
    Train the classifier.
    """
    torch.manual_seed(seed)

    classifier_hidden_size = 32

    classifier = Classifier(
        bert_model.config.hidden_size,
        classifier_hidden_size,
        datasets['train'].n_classes).to(bert_model.device)

    classifier.train()

    if params != False:
        optimizer = torch.optim.Adam(params + list(classifier.parameters()), 
                                     lr=5e-4)

    else:
        optimizer = torch.optim.Adam(classifier.parameters(), lr=5e-4)


    loss_func = nn.CrossEntropyLoss()

    pbar = tqdm.tqdm(dataloaders['train'])

    for labels, sentences in pbar:
        optimizer.zero_grad()

        if params:
            if pooling == "cls":
                features = bert_model(**sentences)['last_hidden_state'][:, 0, :]

            elif pooling == "mean":
                attention_mask = sentences["attention_mask"]
                features = mean_pooling(bert_model(**sentences)\
                                        ['last_hidden_state'], attention_mask)

            else:
                attention_mask = sentences["attention_mask"]
                features = max_pooling(bert_model(**sentences)\
                                       ['last_hidden_state'], attention_mask)

        else:
            with torch.no_grad():
                if pooling == "cls":
                    features = bert_model(**sentences)['last_hidden_state'][:, 0, :]

                elif pooling == "mean":
                    attention_mask = sentences["attention_mask"] 
                    features = mean_pooling(bert_model(**sentences) \
                                            ['last_hidden_state'], 
                                            attention_mask)

                else:
                    attention_mask = sentences["attention_mask"]
                    features = max_pooling(bert_model(**sentences) \
                                           ['last_hidden_state'], 
                                           attention_mask)

        outputs = classifier(features)
        loss = loss_func(outputs, labels)

        loss.backward()
        optimizer.step()

    torch.cuda.empty_cache()
    gc.collect()

    return classifier, bert_model

In [7]:
def eval_classifier(dataloaders, dataset, classifier, bert_model, pooling="cls"):
    """
    Evaluate the classifier.
    """
    classifier.eval()
    bert_model.eval()

    total_steps = len(dataloaders[dataset])
    correct_pred = 0
    total_pred = 0

    pbar = tqdm.tqdm(dataloaders[dataset])
    torch.cuda.empty_cache()

    for step, (labels, sentences) in enumerate(pbar):

        if pooling == "cls":
            features = bert_model(**sentences)['last_hidden_state'][:, 0, :]

        elif pooling == "mean":
            attention_mask = sentences["attention_mask"]
            features = mean_pooling(bert_model(**sentences)['last_hidden_state'], 
                                    attention_mask)

        else:
            attention_mask = sentences["attention_mask"]
            features = max_pooling(bert_model(**sentences)['last_hidden_state'], 
                                   attention_mask)


        outputs = classifier(features)

        classes_pred = predict(outputs)
        correct_pred += (classes_pred == labels).sum().item()
        total_pred += labels.size(0)


        pbar.set_description(f"Step {step+1}/{total_steps}")

    acc = correct_pred / total_pred

    print(f"{dataset} accuracy: {acc}")

    torch.cuda.empty_cache()

    return acc

In [8]:
def mean_pooling(features, attention_mask):
    """
    Apply mean pooling on the features.
    """

    mask_expanded = attention_mask.unsqueeze(-1).expand(features.size()).float()

    sum_embed = torch.sum(features * mask_expanded, 1)

    len_sentence = mask_expanded.sum(1)

    return sum_embed / len_sentence

In [9]:
def max_pooling(features, attention_mask):
    """
    Apply max pooling on the features.
    """
    mask_expanded = attention_mask.unsqueeze(-1).expand(features.size()).float()
    return torch.max(features, 1)[0]

In [14]:
def trials(seeds, params=False, pooling="cls"):
    """
    Train, evaluate, and test classifier with differet seeds.
    """
    best_acc = 0
    best_seed = None
    best_classifier = None
    acc_sum = 0
    acc_sum_x2 = 0
    n = len(seeds)
    batch_size = 32
    tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')


    for seed in seeds:
        torch.manual_seed(seed)
        bert_model = AutoModel.from_pretrained('bert-base-cased')
        if params:
            params = list()
            for name, param in bert_model.named_parameters():
                if name.startswith('encoder.layer.10') or \
                   name.startswith('encoder.layer.11'):
                        params.append(param)

        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
            bert_model = bert_model.cuda()

        datasets, dataloaders = construct_datasets(prefix='dbpedia_',
                                                   batch_size=batch_size,
                                                   tokenizer=tokenizer,
                                                   device=bert_model.device)

        classifier, bert_model = train_classifier(seed, datasets, dataloaders,
                                                  bert_model, params=params,
                                                  pooling=pooling)

        acc = eval_classifier(dataloaders, "dev", classifier, bert_model,
                              pooling=pooling)
        if acc > best_acc:
            best_acc = acc
            best_seed = seed
            best_classifier = classifier
            best_bert = bert_model

        acc_sum += acc
        acc_sum_x2 += acc ** 2
        torch.cuda.empty_cache()
        gc.collect()


    mean = acc_sum / n
    var = (acc_sum_x2 - acc_sum**2 / n) / (n - 1)
    std = var**0.5

    torch.cuda.empty_cache()
    gc.collect()

    test_acc = eval_classifier(dataloaders, "test", best_classifier, best_bert,
                               pooling=pooling)

    return mean, std, test_acc


### 1.1 CLS

In [10]:
seeds = [0, 42, 1, 3, 8]

In [None]:
avg_acc_cls, sd_acc_cls, test_acc_cls = trials(seeds)

tokenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

100%|██████████| 313/313 [00:46<00:00,  6.70it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.49it/s]


dev accuracy: 0.961


100%|██████████| 313/313 [00:49<00:00,  6.35it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.41it/s]


dev accuracy: 0.962


100%|██████████| 313/313 [00:47<00:00,  6.52it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.01it/s]


dev accuracy: 0.968


100%|██████████| 313/313 [00:48<00:00,  6.43it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.37it/s]


dev accuracy: 0.951


100%|██████████| 313/313 [00:48<00:00,  6.48it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.32it/s]


dev accuracy: 0.954


Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.42it/s]


test accuracy: 0.968


In [None]:
print("Average Accuracy on Dev: ", avg_acc_cls)
print("Standard Deviation of Accuracies on Dev: ", sd_acc_cls)
print("Test Accuracy: ", test_acc_cls)

Average Accuracy on Dev:  0.9592
Standard Deviation of Accuracies on Dev:  0.00676017751244857
Test Accuracy:  0.968


For CLS, the average accuracy on dev is 0.9592; the standard deviation is 0.0068; the test accuracy is 0.968.



### 1.2 Mean pooling and Max pooling

Mean Pooling

In [None]:
avg_acc_mean, sd_acc_mean, test_acc_mean = trials(seeds, pooling="mean")

100%|██████████| 313/313 [00:48<00:00,  6.46it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.35it/s]


dev accuracy: 0.967


100%|██████████| 313/313 [00:48<00:00,  6.49it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.27it/s]


dev accuracy: 0.956


100%|██████████| 313/313 [00:48<00:00,  6.47it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.30it/s]


dev accuracy: 0.971


100%|██████████| 313/313 [00:48<00:00,  6.41it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.33it/s]


dev accuracy: 0.969


100%|██████████| 313/313 [00:48<00:00,  6.47it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.28it/s]


dev accuracy: 0.967


Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.46it/s]


test accuracy: 0.964


In [None]:
print("Average Accuracy on Dev: ", avg_acc_mean)
print("Standard Deviation of Accuracies on Dev: ", sd_acc_mean)
print("Test Accuracy: ", test_acc_mean)

Average Accuracy on Dev:  0.966
Standard Deviation of Accuracies on Dev:  0.005830951894833936
Test Accuracy:  0.964


For mean pooling, the average accuracy on dev is 0.966; the standard deviation is 0.0058; the test accuracy is 0.964.

Max Pooling

In [None]:
avg_acc_max, sd_acc_max, test_acc_max = trials(seeds, pooling="max")

100%|██████████| 313/313 [00:48<00:00,  6.45it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.25it/s]


dev accuracy: 0.665


100%|██████████| 313/313 [00:48<00:00,  6.47it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.31it/s]


dev accuracy: 0.742


100%|██████████| 313/313 [00:48<00:00,  6.45it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.31it/s]


dev accuracy: 0.583


100%|██████████| 313/313 [00:48<00:00,  6.44it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.29it/s]


dev accuracy: 0.602


100%|██████████| 313/313 [00:48<00:00,  6.46it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.34it/s]


dev accuracy: 0.49


Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.43it/s]


test accuracy: 0.736


In [None]:
print("Average Accuracy on Dev: ", avg_acc_max)
print("Standard Deviation of Accuracies on Dev: ", sd_acc_max)
print("Test Accuracy: ", test_acc_max)

Average Accuracy on Dev:  0.6164
Standard Deviation of Accuracies on Dev:  0.09412385457470364
Test Accuracy:  0.736


For max pooling, the average accuracy on dev is 0.6164; the standard deviation is 0.0941; the test accuracy is 0.736.

#### 1.3 Comparison

| Feature Extraction Method | Mean Accuracy on Dev | Standard Deviation on Dev | Test Accuracy |
|---------------------------|----------------------|---------------------------|---------------|
| CLS                       | 0.9592               | 0.0068                    | 0.968         |
| Mean Pooling              | 0.966                | 0.0058                    | 0.964         |
| Max Pooling               | 0.6164               | 0.0941                    | 0.736         |


Based on the test accuracy, CLS has the best performance with frozen BERT feature.

### 1.4 Finetuning Bert

CLS

In [None]:
torch.cuda.empty_cache()
gc.collect()

0

In [None]:
avg_acc_cls_ft, sd_acc_cls_ft, test_acc_cls_ft = trials(seeds, params=True)

100%|██████████| 313/313 [01:53<00:00,  2.77it/s]
Step 32/32: 100%|██████████| 32/32 [00:03<00:00,  8.22it/s]


dev accuracy: 0.996


100%|██████████| 313/313 [01:52<00:00,  2.79it/s]
Step 32/32: 100%|██████████| 32/32 [00:03<00:00,  8.24it/s]


dev accuracy: 0.992


100%|██████████| 313/313 [01:52<00:00,  2.79it/s]
Step 32/32: 100%|██████████| 32/32 [00:03<00:00,  8.11it/s]


dev accuracy: 0.988


100%|██████████| 313/313 [01:52<00:00,  2.77it/s]
Step 32/32: 100%|██████████| 32/32 [00:03<00:00,  8.27it/s]


dev accuracy: 0.987


100%|██████████| 313/313 [01:52<00:00,  2.78it/s]
Step 32/32: 100%|██████████| 32/32 [00:03<00:00,  8.10it/s]


dev accuracy: 0.978


Step 32/32: 100%|██████████| 32/32 [00:03<00:00,  8.35it/s]


test accuracy: 0.992


In [None]:
print("Average Accuracy on Dev: ", avg_acc_cls_ft)
print("Standard Deviation of Accuracies on Dev: ", sd_acc_cls_ft)
print("Test Accuracy: ", test_acc_cls_ft)

Average Accuracy on Dev:  0.9882
Standard Deviation of Accuracies on Dev:  0.006723094525586226
Test Accuracy:  0.992


Mean Pooling

In [None]:
avg_acc_mean_ft, sd_acc_mean_ft, test_acc_mean_ft = trials(seeds, params=True, 
                                                           pooling="mean")

100%|██████████| 313/313 [01:51<00:00,  2.81it/s]
Step 32/32: 100%|██████████| 32/32 [00:03<00:00,  8.37it/s]


dev accuracy: 0.98


100%|██████████| 313/313 [01:52<00:00,  2.79it/s]
Step 32/32: 100%|██████████| 32/32 [00:03<00:00,  8.20it/s]


dev accuracy: 0.99


100%|██████████| 313/313 [01:52<00:00,  2.78it/s]
Step 32/32: 100%|██████████| 32/32 [00:03<00:00,  8.26it/s]


dev accuracy: 0.992


100%|██████████| 313/313 [01:53<00:00,  2.77it/s]
Step 32/32: 100%|██████████| 32/32 [00:03<00:00,  8.25it/s]


dev accuracy: 0.992


100%|██████████| 313/313 [01:52<00:00,  2.78it/s]
Step 32/32: 100%|██████████| 32/32 [00:03<00:00,  8.23it/s]


dev accuracy: 0.988


Step 32/32: 100%|██████████| 32/32 [00:03<00:00,  8.46it/s]


test accuracy: 0.993


In [None]:
print("Average Accuracy on Dev: ", avg_acc_mean_ft)
print("Standard Deviation of Accuracies on Dev: ", sd_acc_mean_ft)
print("Test Accuracy: ", test_acc_mean_ft)

Average Accuracy on Dev:  0.9884000000000001
Standard Deviation of Accuracies on Dev:  0.004979959839177919
Test Accuracy:  0.993


Max Pooling

In [None]:
avg_acc_max_ft, sd_acc_max_ft, test_acc_max_ft = trials(seeds, params=True, 
                                                        pooling="max")

100%|██████████| 313/313 [01:55<00:00,  2.70it/s]
Step 32/32: 100%|██████████| 32/32 [00:03<00:00,  8.26it/s]


dev accuracy: 0.991


100%|██████████| 313/313 [01:55<00:00,  2.71it/s]
Step 32/32: 100%|██████████| 32/32 [00:03<00:00,  8.05it/s]


dev accuracy: 0.99


100%|██████████| 313/313 [01:55<00:00,  2.71it/s]
Step 32/32: 100%|██████████| 32/32 [00:03<00:00,  8.27it/s]


dev accuracy: 0.981


100%|██████████| 313/313 [01:56<00:00,  2.69it/s]
Step 32/32: 100%|██████████| 32/32 [00:03<00:00,  8.25it/s]


dev accuracy: 0.977


100%|██████████| 313/313 [01:55<00:00,  2.70it/s]
Step 32/32: 100%|██████████| 32/32 [00:03<00:00,  8.11it/s]


dev accuracy: 0.984


Step 32/32: 100%|██████████| 32/32 [00:03<00:00,  8.39it/s]


test accuracy: 0.99


In [None]:
print("Average Accuracy on Dev: ", avg_acc_max_ft)
print("Standard Deviation of Accuracies on Dev: ", sd_acc_max_ft)
print("Test Accuracy: ", test_acc_max_ft)

Average Accuracy on Dev:  0.9846
Standard Deviation of Accuracies on Dev:  0.005941380311001844
Test Accuracy:  0.99


| Feature Extraction Method |  Finetune or Frozen       | Mean Accuracy on Dev | Standard Deviation on Dev | Test Accuracy |
|---------------------------|-----------------|----------------------|---------------------------|---------------|
| CLS                       | Frozen BERT     | 0.9592               | 0.0068                    | 0.968         |
| CLS                       | Fine-Tuned BERT | 0.9882               | 0.006723095               | 0.992         |
| Mean Pooling              | Frozen BERT     | 0.966                | 0.0058                    | 0.964         |
| Mean Pooling              | Fine-Tuned BERT | 0.9884               | 0.00497996                | 0.993         |
| Max Pooling               | Frozen BERT     | 0.6164               | 0.0941                    | 0.736         |
| Max Pooling               | Fine-Tuned BERT | 0.9846               | 0.00594138                | 0.99          |


- Fine-tuning BERT consistently improves performance across all feature extraction methods compared to using frozen features.

- CLS and Mean Pooling shows strong performance in both frozen and fine-tuned scenarios, where Mean Pooling has the highest accuracy with finetuned bert.
Max Pooling lags behind the other methods, indicating it might be less suitable for this specific task.

- Fine-tuning BERT leads to lower standard deviations, implying more stable performance across training runs.


### 1.5 GPT-2

CLS

In [11]:
from transformers import GPT2Model, GPT2Tokenizer

In [13]:
def train_classifier_gpt2(seed, datasets, dataloaders, gpt2_model, pooling="cls"):
    """
    Train the classifier.
    """
    torch.manual_seed(seed)

    classifier_hidden_size = 32

    classifier = Classifier(
        gpt2_model.config.hidden_size,
        classifier_hidden_size,
        datasets['train'].n_classes).to(gpt2_model.device)

    classifier.train()


    optimizer = torch.optim.Adam(classifier.parameters(), lr=5e-4)


    loss_func = nn.CrossEntropyLoss()

    pbar = tqdm.tqdm(dataloaders['train'])

    for labels, sentences in pbar:
        optimizer.zero_grad()

        with torch.no_grad():
            if pooling == "cls":
                features = gpt2_model(**sentences)['last_hidden_state'][:, 0, :]

            elif pooling == "mean":
                attention_mask = sentences["attention_mask"]
                features = mean_pooling(gpt2_model(**sentences) \
                                        ['last_hidden_state'],
                                        attention_mask)

            else:
                attention_mask = sentences["attention_mask"]
                features = max_pooling(gpt2_model(**sentences) \
                                       ['last_hidden_state'],
                                       attention_mask)

        outputs = classifier(features)
        loss = loss_func(outputs, labels)

        loss.backward()
        optimizer.step()

    torch.cuda.empty_cache()
    gc.collect()

    return classifier, gpt2_model

In [19]:
def trials_gpt2(seeds, pooling="cls"):
    """
    Train, evaluate, and test classifier with differet seeds.
    """
    best_acc = 0
    best_seed = None
    best_classifier = None
    acc_sum = 0
    acc_sum_x2 = 0
    n = len(seeds)
    batch_size = 32
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
    tokenizer.pad_token = tokenizer.eos_token


    for seed in seeds:
        torch.manual_seed(seed)
        gpt2_model = GPT2Model.from_pretrained("gpt2")

        if torch.cuda.is_available():
            torch.cuda.manual_seed_all(seed)
            feature_model = gpt2_model.cuda()

        datasets, dataloaders = construct_datasets(prefix='dbpedia_',
                                                   batch_size=batch_size,
                                                   tokenizer=tokenizer,
                                                   device=gpt2_model.device)

        classifier, feature_model = train_classifier_gpt2(seed, datasets, 
                                                          dataloaders,
                                                          gpt2_model,
                                                          pooling=pooling)

        acc = eval_classifier(dataloaders, "dev", classifier, gpt2_model,
                              pooling=pooling)
        if acc > best_acc:
            best_acc = acc
            best_seed = seed
            best_classifier = classifier
            best_gpt2 = gpt2_model

        acc_sum += acc
        acc_sum_x2 += acc ** 2
        torch.cuda.empty_cache()
        gc.collect()


    mean = acc_sum / n
    var = (acc_sum_x2 - acc_sum**2 / n) / (n - 1)
    std = var**0.5

    torch.cuda.empty_cache()
    gc.collect()

    test_acc = eval_classifier(dataloaders, "test", best_classifier, best_gpt2,
                               pooling=pooling)

    return mean, std, test_acc


In [18]:
torch.cuda.empty_cache()
gc.collect()

0

In [20]:
avg_acc_cls_gpt2, sd_acc_cls_gpt2, test_acc_cls_gpt2 = trials_gpt2(seeds)

100%|██████████| 313/313 [00:44<00:00,  7.05it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.82it/s]


dev accuracy: 0.215


100%|██████████| 313/313 [00:45<00:00,  6.92it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.05it/s]


dev accuracy: 0.217


100%|██████████| 313/313 [00:44<00:00,  7.03it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.80it/s]


dev accuracy: 0.225


100%|██████████| 313/313 [00:45<00:00,  6.95it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.80it/s]


dev accuracy: 0.222


100%|██████████| 313/313 [00:48<00:00,  6.39it/s]
Step 32/32: 100%|██████████| 32/32 [00:05<00:00,  5.40it/s]


dev accuracy: 0.216


Step 32/32: 100%|██████████| 32/32 [00:05<00:00,  6.18it/s]


test accuracy: 0.249


In [21]:
print("Average Accuracy on Dev: ", avg_acc_cls_gpt2)
print("Standard Deviation of Accuracies on Dev: ", sd_acc_cls_gpt2)
print("Test Accuracy: ", test_acc_cls_gpt2)

Average Accuracy on Dev:  0.219
Standard Deviation of Accuracies on Dev:  0.0043011626335210445
Test Accuracy:  0.249


Mean Pooling

In [22]:
avg_acc_mean_gpt2, sd_acc_mean_gpt2, test_acc_mean_gpt2 = trials_gpt2(seeds, 
                                                                pooling="mean")

100%|██████████| 313/313 [00:55<00:00,  5.61it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  6.52it/s]


dev accuracy: 0.894


100%|██████████| 313/313 [00:44<00:00,  6.99it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.52it/s]


dev accuracy: 0.896


100%|██████████| 313/313 [00:44<00:00,  7.00it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.76it/s]


dev accuracy: 0.879


100%|██████████| 313/313 [00:45<00:00,  6.91it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.81it/s]


dev accuracy: 0.903


100%|██████████| 313/313 [00:45<00:00,  6.94it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.77it/s]


dev accuracy: 0.895


Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.26it/s]


test accuracy: 0.912


In [25]:
print("Average Accuracy on Dev: ", avg_acc_mean_gpt2)
print("Standard Deviation of Accuracies on Dev: ", sd_acc_mean_gpt2)
print("Test Accuracy: ", test_acc_mean_gpt2)

Average Accuracy on Dev:  0.8934000000000001
Standard Deviation of Accuracies on Dev:  0.008792041856130285
Test Accuracy:  0.912


Max Pooling

In [23]:
avg_acc_max_gpt2, sd_acc_max_gpt2, test_acc_max_gpt2 = trials_gpt2(seeds, 
                                                                pooling="max")

100%|██████████| 313/313 [00:46<00:00,  6.78it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.70it/s]


dev accuracy: 0.516


100%|██████████| 313/313 [00:44<00:00,  6.98it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.81it/s]


dev accuracy: 0.664


100%|██████████| 313/313 [00:44<00:00,  7.01it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.78it/s]


dev accuracy: 0.34


100%|██████████| 313/313 [00:45<00:00,  6.95it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.52it/s]


dev accuracy: 0.601


100%|██████████| 313/313 [00:44<00:00,  7.00it/s]
Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.69it/s]


dev accuracy: 0.574


Step 32/32: 100%|██████████| 32/32 [00:04<00:00,  7.64it/s]

test accuracy: 0.656





In [24]:
print("Average Accuracy on Dev: ", avg_acc_max_gpt2)
print("Standard Deviation of Accuracies on Dev: ", sd_acc_max_gpt2)
print("Test Accuracy: ", test_acc_max_gpt2)

Average Accuracy on Dev:  0.539
Standard Deviation of Accuracies on Dev:  0.12331261087171858
Test Accuracy:  0.656


| Feature Extraction Method | Mean Accuracy on Dev | Standard Deviation on Dev | Test Accuracy |
|---------------------------|----------------------|---------------------------|---------------|
| CLS                       | 0.219                | 0.004301163               | 0.249         |
| Mean Pooling              | 0.8934               | 0.008792042               | 0.912         |
| Max Pooling               | 0.539                | 0.123312611               | 0.656         |
