# Assignment 3
## Practical Deep Learning for Language Processing

01/23/2023


# Part A

In [1]:
import time
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

#### 1

In [2]:
if torch.cuda.is_available(): 
    device = "cuda"
else:
    device = "cpu"

#### 2

In [3]:
from transformers import Trainer, TrainingArguments, pipeline, DistilBertForSequenceClassification, DistilBertTokenizerFast
sequence_clf_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-cased", 
                                                                         num_labels = 2, 
                                                                         id2label={0: "negative", 1: "positive"}, 
                                                                         label2id={"negative": 0, "positive": 1})

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifier.w

#### 3, 4

In [4]:
from datasets import load_dataset

# load the data with the predefined train-test split
imdb_train, imdb_test = load_dataset('imdb', split=['train', 'test'])

Found cached dataset imdb (/home/tu/tu_tu/tu_zxobe27/.cache/huggingface/datasets/imdb/plain_text/1.0.0/2fdd8b9bcadd6e7055e742a706876ba43f19faee861df134affd7a3f60fc38a1)


  0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# further perform a test-validation split

# get the number of examples per class
n_split = int(len(imdb_test) / 2)
# set the sizes for subsetting
subset_num = (4000, 1000, 1000)
# get a randomized order
shuffle = torch.randperm(n_split)


In [6]:
# get the subsets
imdb_train = imdb_train.select(torch.concat([shuffle, (shuffle + n_split)])).\
    select(torch.concat([torch.arange(0, subset_num[0]/2), torch.arange(0, subset_num[0]/2) + n_split]))
imdb_val = imdb_test.select(torch.concat([shuffle, (shuffle + n_split)])).\
    select(torch.concat([torch.arange(subset_num[1]/2, subset_num[1]/2 + subset_num[2]/2), torch.arange(subset_num[1]/2, subset_num[1]/2 + subset_num[2]/2) + n_split]))
imdb_test = imdb_test.select(torch.concat([shuffle, (shuffle + n_split)])).\
    select(torch.concat([torch.arange(0, subset_num[1]/2), torch.arange(0, subset_num[1]/2) + n_split]))

#### 5

In [7]:
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

In [8]:
# simple function to batch tokenize utterances with truncation
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation = True, padding = True)

In [9]:
# apply the tokenization in batches to all data sets
imdb_train = imdb_train.map(preprocess_function, batched=True).remove_columns("text")
imdb_val = imdb_val.map(preprocess_function, batched=True).remove_columns("text")
imdb_test = imdb_test.map(preprocess_function, batched=True).remove_columns("text")

  0%|          | 0/4 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

## 6, 7, 8

In [10]:
training_args = TrainingArguments(
    # setting the epochs for task 7
    num_train_epochs = 3,
    # setting the checkpoint directory and interval for task 8
    output_dir = "./MyIMDBModel",
    save_strategy = "steps",
    save_steps = 50,
    evaluation_strategy='steps',
    eval_steps = 50,
    logging_steps = 50,
    log_level='info',
    # proposed settings from task 6
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64, 
    warmup_steps=100, 
    weight_decay=0.01, 
    logging_strategy='steps', 
    logging_dir='./logs', 
    fp16 = (device == "cuda"), 
    load_best_model_at_end=True
)

## 9

In [11]:
import evaluate
# load the metric
f1_metric = evaluate.load("f1")

# a function to compute the metrics from predicted logits and ground truth
def compute_metrics(eval_pred):
    # separate the items
    logits, labels = eval_pred
    # logits are given in two columns: get the prediction as the column with the higher logit
    predictions = np.argmax(logits, axis=-1)
    # compute the f1 metric
    return f1_metric.compute(predictions = predictions, references = labels, average = "macro")

## 10

In [12]:
# compile the trainer class
trainer = Trainer(
    model = sequence_clf_model,
    args = training_args,
    train_dataset = imdb_train,
    eval_dataset = imdb_test,
    compute_metrics = compute_metrics
)

Using cuda_amp half precision backend


## 11, 12
Because Tensor Board requires opening a port and I compute this assignment on the cluster where this is not easily accomplished, I do not use it for evaluation.

In [13]:
# get evaluation prior to training
# this will use the loaded weights
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64


{'eval_loss': 0.6942612528800964,
 'eval_f1': 0.46486765996849333,
 'eval_runtime': 2.5766,
 'eval_samples_per_second': 388.11,
 'eval_steps_per_second': 6.21}

In [14]:
trainer.train()

***** Running training *****
  Num examples = 4000
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 375
  Number of trainable parameters = 65783042


Step,Training Loss,Validation Loss,F1
50,0.6834,0.641838,0.529858
100,0.4673,0.463496,0.782286
150,0.3062,0.397723,0.838112
200,0.2826,0.327772,0.864997
250,0.2547,0.334565,0.862997
300,0.1247,0.429809,0.871975
350,0.0897,0.413689,0.886991


***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
Saving model checkpoint to ./MyIMDBModel/checkpoint-50
Configuration saved in ./MyIMDBModel/checkpoint-50/config.json
Model weights saved in ./MyIMDBModel/checkpoint-50/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
Saving model checkpoint to ./MyIMDBModel/checkpoint-100
Configuration saved in ./MyIMDBModel/checkpoint-100/config.json
Model weights saved in ./MyIMDBModel/checkpoint-100/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
Saving model checkpoint to ./MyIMDBModel/checkpoint-150
Configuration saved in ./MyIMDBModel/checkpoint-150/config.json
Model weights saved in ./MyIMDBModel/checkpoint-150/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
Saving model checkpoint to ./MyIMDBModel/checkpoint-200
Configuration saved in ./MyIMDBModel/checkpoint-200/config.json
Model weights saved in ./MyIM

TrainOutput(global_step=375, training_loss=0.3012816460927327, metrics={'train_runtime': 95.4594, 'train_samples_per_second': 125.708, 'train_steps_per_second': 3.928, 'total_flos': 1589608783872000.0, 'train_loss': 0.3012816460927327, 'epoch': 3.0})

**Overall loss decreased from 0.694 pre-training to a low of 0.328 after step 200. The F1 score improved from 0.465 to 0.865.**

## 13

In [15]:
pipe = pipeline("text-classification", sequence_clf_model, tokenizer=tokenizer, device=0)

In [16]:
pipe("Fargo is amazingly entertaining.")

[{'label': 'positive', 'score': 0.9862398505210876}]

The first text is correctly predicted to be positive.

In [17]:
pipe("Overall I think that Star Wars 8 is the worst thing that happened in 2017.")

[{'label': 'negative', 'score': 0.9294151067733765}]

The second text is correctly predicted to be negative.

In [18]:
pipe('The Room is probably the worst movie that I ever loved.')

[{'label': 'negative', 'score': 0.9377620816230774}]

Finally the third text is a very complicated example which the model predicts to be negative. I would however argue that the overall sentiment of the sentence is positive because the focal word is "loved". I would have at least expected the score of the prediction to be much lower. Perhaps BERT has had some bad experiences with love? ;)

## 14

In [19]:
import joblib
# export the tokenizer and model
joblib.dump(tokenizer, "tokenizer.pkl")
joblib.dump(sequence_clf_model, "model.pkl")

['model.pkl']

---
# Part B
## 1

In [20]:
from datasets import load_dataset
# loading the data
stsb_train, stsb_val = load_dataset('glue', 'stsb', split=["train", "validation"])

Found cached dataset glue (/home/tu/tu_tu/tu_zxobe27/.cache/huggingface/datasets/glue/stsb/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/2 [00:00<?, ?it/s]

In [21]:
import torch
# defining the length of the validation set
n_split = len(stsb_val)
# taking a subset of 5000 observations in the training sample
stsb_train = stsb_train.select(torch.arange(5000))
# splitting the validation sample in half and assigning the first half as the testing set
stsb_test = stsb_val.select(torch.randperm(n_split)[:int(n_split / 2)])
stsb_val = stsb_val.select(torch.randperm(n_split)[int(n_split / 2):])

## 2

In [22]:
from transformers import Trainer, TrainingArguments, pipeline, DistilBertForSequenceClassification, DistilBertTokenizerFast, DataCollatorWithPadding
# loading the model with the number of labels set as 1 to achieve regression
regression_clf_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', num_labels = 1)

loading configuration file config.json from cache at /home/tu/tu_tu/tu_zxobe27/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "label2id": {
    "LABEL_0": 0
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.0",
  "vocab_size": 28996
}

loading weights file pytorch_model.bin from cache at /home/tu/tu_tu/tu_zxobe27/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/pytorch_model.bin
Some weights of the model checkpoint at distilbe

## 3

In [23]:
# loading the tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
# loading the collator which we use for efficient padding
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

# simple function to batch tokenize utterances with truncation
## merging with [SEP] is done automatically by the tokenizer as per the documentation (https://huggingface.co/docs/transformers/v4.26.0/en/main_classes/tokenizer#transformers.PreTrainedTokenizerFast)
def preprocess_function(example):
    return tokenizer(example["sentence1"], example["sentence2"], truncation = True)

# apply the tokenization in batches and remove columns that are not needed after
stsb_train = stsb_train.map(preprocess_function, batched=True).remove_columns(["sentence1", "sentence2", "idx"])
stsb_val = stsb_val.map(preprocess_function, batched=True).remove_columns(["sentence1", "sentence2", "idx"])
stsb_test = stsb_test.map(preprocess_function, batched=True).remove_columns(["sentence1", "sentence2", "idx"])

loading file vocab.txt from cache at /home/tu/tu_tu/tu_zxobe27/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/vocab.txt
loading file tokenizer.json from cache at /home/tu/tu_tu/tu_zxobe27/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home/tu/tu_tu/tu_zxobe27/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/tokenizer_config.json
loading configuration file config.json from cache at /home/tu/tu_tu/tu_zxobe27/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim

## 4

In [24]:
# setting the training arguments
training_args = TrainingArguments(
    # setting epochs and new output directory
    num_train_epochs = 4,
    output_dir = "./MySTSBModel",
    # everything below is identical
    save_strategy = "steps",
    save_steps = 50,
    evaluation_strategy='steps',
    eval_steps = 50,
    logging_steps = 50,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64, 
    warmup_steps=100, 
    weight_decay=0.01, 
    logging_strategy='steps', 
    logging_dir='./logs', 
    fp16 = (device == "cuda"), 
    load_best_model_at_end=True,
    log_level='info'
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


## 5

In [25]:
import evaluate
import numpy as np
# load the metrics
corr_pearson = evaluate.load("pearsonr")
corr_spearman = evaluate.load("spearmanr")

# a function to compute the metrics from predicted logits and ground truth
def compute_metrics(eval_pred):
    # separate the items
    reg_preds, labels = eval_pred
    # compute the two correlation coefficients
    return {"pearson": corr_pearson.compute(predictions = np.squeeze(reg_preds), references = labels)["pearsonr"], 
            "spearman": corr_spearman.compute(predictions = np.squeeze(reg_preds), references = labels)["spearmanr"]}

## 6

In [26]:
# define the trainer:
trainer = Trainer(
    model=regression_clf_model,
    args=training_args,
    train_dataset = stsb_train,
    eval_dataset = stsb_val,
    compute_metrics=compute_metrics,
    data_collator=data_collator
)

Using cuda_amp half precision backend


## 7

In [27]:
# perform the training
trainer.train()

***** Running training *****
  Num examples = 5000
  Num Epochs = 4
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 628
  Number of trainable parameters = 65782273
You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Pearson,Spearman
50,5.6851,2.269089,0.162055,0.131543
100,1.3975,0.992972,0.776604,0.775306
150,0.9619,0.978886,0.783636,0.774144
200,0.7774,0.688031,0.831395,0.825778
250,0.6626,1.058583,0.819995,0.828979
300,0.6441,0.699469,0.843661,0.839887
350,0.5214,0.591362,0.854371,0.850726
400,0.3485,0.68575,0.846772,0.845007
450,0.3934,0.611058,0.852351,0.849114
500,0.2748,0.635973,0.85521,0.851626


***** Running Evaluation *****
  Num examples = 750
  Batch size = 64
Saving model checkpoint to ./MySTSBModel/checkpoint-50
Configuration saved in ./MySTSBModel/checkpoint-50/config.json
Model weights saved in ./MySTSBModel/checkpoint-50/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 750
  Batch size = 64
Saving model checkpoint to ./MySTSBModel/checkpoint-100
Configuration saved in ./MySTSBModel/checkpoint-100/config.json
Model weights saved in ./MySTSBModel/checkpoint-100/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 750
  Batch size = 64
Saving model checkpoint to ./MySTSBModel/checkpoint-150
Configuration saved in ./MySTSBModel/checkpoint-150/config.json
Model weights saved in ./MySTSBModel/checkpoint-150/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 750
  Batch size = 64
Saving model checkpoint to ./MySTSBModel/checkpoint-200
Configuration saved in ./MySTSBModel/checkpoint-200/config.json
Model weights saved in ./MySTSBMo

TrainOutput(global_step=628, training_loss=0.9787004130661108, metrics={'train_runtime': 46.0847, 'train_samples_per_second': 433.983, 'train_steps_per_second': 13.627, 'total_flos': 391966112142384.0, 'train_loss': 0.9787004130661108, 'epoch': 4.0})

In [28]:
# evaluate on train data
trainer.evaluate(stsb_train)

***** Running Evaluation *****
  Num examples = 5000
  Batch size = 64


{'eval_loss': 0.15423133969306946,
 'eval_pearson': 0.9619616158330276,
 'eval_spearman': 0.9533286223393135,
 'eval_runtime': 1.1424,
 'eval_samples_per_second': 4376.561,
 'eval_steps_per_second': 69.15,
 'epoch': 4.0}

In [29]:
# evaluate on test data
trainer.evaluate(stsb_test)

***** Running Evaluation *****
  Num examples = 750
  Batch size = 64


{'eval_loss': 0.6391530632972717,
 'eval_pearson': 0.8461880126641562,
 'eval_spearman': 0.840871929112593,
 'eval_runtime': 0.2236,
 'eval_samples_per_second': 3353.476,
 'eval_steps_per_second': 53.656,
 'epoch': 4.0}

In [30]:
# evaluate on validation data
trainer.evaluate(stsb_val)

***** Running Evaluation *****
  Num examples = 750
  Batch size = 64


{'eval_loss': 0.5868598818778992,
 'eval_pearson': 0.8569211303648085,
 'eval_spearman': 0.8523696778320894,
 'eval_runtime': 0.2302,
 'eval_samples_per_second': 3258.393,
 'eval_steps_per_second': 52.134,
 'epoch': 4.0}

I find that loss and correlation are expectedly significantly better in the training set. However correlation coefficients are still good at above 0.84 and similar between the test and validation sets.

## 8

In [31]:
from datasets import Dataset
# compile a dataset from the two sentences
dataset = Dataset.from_list([{"sentence1": "Tom Brady is a football player.", "sentence2": "Tom Brady is an American Football player."}])
# get a prediction for that dataset
trainer.predict(dataset.map(preprocess_function).remove_columns(["sentence1", "sentence2"]))

  0%|          | 0/1 [00:00<?, ?ex/s]

***** Running Prediction *****
  Num examples = 1
  Batch size = 64


PredictionOutput(predictions=array([[3.719]], dtype=float16), label_ids=None, metrics={'test_runtime': 0.0083, 'test_samples_per_second': 120.081, 'test_steps_per_second': 120.081})

These two sentences are expectedly predicted to be very similar.

In [32]:
from datasets import Dataset

dataset = Dataset.from_list([{"sentence1": "A technology destroying humandkind.", "sentence2": "The BERT transformer-based neural network."}])
trainer.predict(dataset.map(preprocess_function).remove_columns(["sentence1", "sentence2"]))

  0%|          | 0/1 [00:00<?, ?ex/s]

***** Running Prediction *****
  Num examples = 1
  Batch size = 64


PredictionOutput(predictions=array([[1.018]], dtype=float16), label_ids=None, metrics={'test_runtime': 0.0083, 'test_samples_per_second': 120.419, 'test_steps_per_second': 120.419})

It appears that there is indeed little risk that BERT will destroy humandkind.

## 9

In [33]:
import joblib
# export the tokenizer and model
joblib.dump(tokenizer, "stsb_tokenizer.pkl")
joblib.dump(regression_clf_model, "stsb_model.pkl")

['stsb_model.pkl']

---
# Part C
## 1

In [34]:
from transformers import Trainer, TrainingArguments, pipeline, DistilBertForSequenceClassification, DistilBertTokenizerFast, DataCollatorWithPadding
# loading the model
classification_clf_model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-cased', 
                                                                               num_labels = 2, 
                                                                               id2label={0: "negative", 1: "positive"}, 
                                                                               label2id={"negative": 0, "positive": 1})

loading configuration file config.json from cache at /home/tu/tu_tu/tu_zxobe27/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "negative",
    "1": "positive"
  },
  "initializer_range": 0.02,
  "label2id": {
    "negative": 0,
    "positive": 1
  },
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.0",
  "vocab_size": 28996
}

loading weights file pytorch_model.bin from cache at /home/tu/tu_tu/tu_zxobe27/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/pytorch_model.bin
Some w

## 2

In [35]:
from datasets import load_dataset
import evaluate
# load datasets and metric
sst2 = load_dataset("glue", "sst2")
# the proposed way of loading metrics seems to be outdated; I use a more recent one
sst2_metric = evaluate.load("glue", "sst2")

Found cached dataset glue (/home/tu/tu_tu/tu_zxobe27/.cache/huggingface/datasets/glue/sst2/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad)


  0%|          | 0/3 [00:00<?, ?it/s]

## 3

In [36]:
# splitting the dataset as proposed
texts = sst2['train']['sentence'] 
labels = sst2['train']['label'] 
val_texts = sst2['validation']['sentence'] 
val_labels = sst2['validation']['label']

## 4

In [37]:
from torch.utils.data import Dataset

# define a custom dataset class
class MyCustomDataset(Dataset):
    
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_masks = attention_masks
        self.values = labels

    def __len__(self):
        return len(self.values)

    def __getitem__(self, idx):
        ## I choose to not return tensors on GPU here as I have the tokenizer do this in the previous step
        return self.input_ids[idx], self.attention_masks[idx], self.values[idx]

## 5

In [38]:
import torch
# set default so that every tensor is initialized on the GPU if available
if torch.cuda.is_available(): 
    torch.set_default_tensor_type(torch.cuda.HalfTensor)
    random_gen = torch.Generator(device = "cuda").manual_seed(42)
    device = torch.device("cuda")
else:
    torch.set_default_tensor_type(torch.HalfTensor)
    random_gen = torch.Generator(device = "cpu").manual_seed(42)
    device = torch.device("cpu")

In [39]:
# load and instantiate the tokenizer
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

# tokenize, pad, truncate and move results to GPU tensors if available
texts = tokenizer(texts, padding = True, truncation = True, return_tensors = "pt")
val_texts = tokenizer(val_texts, padding = True, truncation = True, return_tensors = "pt")

loading file vocab.txt from cache at /home/tu/tu_tu/tu_zxobe27/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/vocab.txt
loading file tokenizer.json from cache at /home/tu/tu_tu/tu_zxobe27/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /home/tu/tu_tu/tu_zxobe27/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/tokenizer_config.json
loading configuration file config.json from cache at /home/tu/tu_tu/tu_zxobe27/.cache/huggingface/hub/models--distilbert-base-cased/snapshots/9d7568e4b20ed5db15ee30e99c7219bde9990762/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-cased",
  "activation": "gelu",
  "attention_dropout": 0.1,
  "dim

In [40]:
# create datasets from the tokenzied tensors
train_dataset = MyCustomDataset(texts["input_ids"], texts["attention_mask"], labels)
val_dataset = MyCustomDataset(val_texts["input_ids"], val_texts["attention_mask"], val_labels)

## 6

In [41]:
from torch.utils.data import DataLoader

# create data loaders from the datasets
train_dataloader = DataLoader(train_dataset, batch_size = 16, shuffle = True, generator = random_gen)
val_dataloader = DataLoader(val_dataset, batch_size = 16, shuffle = True, generator = random_gen)

## 7

In [42]:
import torch

# define the AdamW optimizer from PyTorch
optimizer = torch.optim.AdamW(classification_clf_model.parameters(), lr = 5e-5)

# define the Cross-Entropy loss function
criterion = torch.nn.CrossEntropyLoss()

# Softmax Metric for evaluation
softmax = torch.nn.Softmax(dim = -1)

## 8, 9, 10

In [43]:
from tqdm import tqdm

# set the number of epochs
n_epochs = 4

# send the model to the selected device
classification_clf_model.to(device)

# iterate over epochs
for epoch in range(n_epochs):
    
    print(f"starting epoch: {epoch + 1}")
    
    ## set training mode
    classification_clf_model.train()
    
    # start counting loss and batches before each epoch
    sum_loss = 0
    n_batches = 0
    
    # establish a progress bar over the dataloader items        
    with tqdm(train_dataloader, unit = "batch", miniters = 10) as tepoch:
        
        # iterate over the items of the dataloader
        for batch in tepoch:
            
            # reset the gradients
            optimizer.zero_grad()
            
            # forward pass the items of the batch
            outputs = classification_clf_model(batch[0], batch[1])
            
            # the criterion defined above does not require any Softmax beforehands because it does this step by itself
            loss = criterion(outputs.logits, batch[2])
            
            # compute the gradients
            loss.backward()
            
            # update the weights
            optimizer.step()
            
            # add to counters
            n_batches += 1
            sum_loss += loss.item()
            
            # update tqdm with loss information every 100 batches
            if (n_batches % 100) == 0:
                tepoch.set_postfix_str(f" loss: {(sum_loss / n_batches)}", refresh = True)
        
    ## set evaluation mode
    classification_clf_model.eval()
    
    # iterate over items in the dataloader
    for idx, batch in enumerate(val_dataloader):
                
        # evaluate without computing gradients
        with torch.no_grad():
            # forward pass
            outputs = classification_clf_model(batch[0], batch[1])

        # extract logits
        logits = outputs.logits
        # do softmax
        scaled_logits = softmax(logits)
        # get the columns of the highest scaled logit as prediction
        predictions = torch.argmax(logits, dim = -1)
        # add the predicted batch to the metric
        sst2_metric.add_batch(predictions = predictions, references = batch[2])

    # compute and print the metric
    print(f"epoch: {epoch + 1} | metric: {sst2_metric.compute()}")

starting epoch: 1


100%|██████████| 4210/4210 [03:03<00:00, 22.88batch/s,  loss: 0.2325539238689955] 


epoch: 1 | metric: {'accuracy': 0.8979357798165137}
starting epoch: 2


100%|██████████| 4210/4210 [03:04<00:00, 22.88batch/s,  loss: 0.12242913089980859]


epoch: 2 | metric: {'accuracy': 0.8887614678899083}
starting epoch: 3


100%|██████████| 4210/4210 [03:04<00:00, 22.86batch/s,  loss: 0.08599835398649919]


epoch: 3 | metric: {'accuracy': 0.893348623853211}
starting epoch: 4


100%|██████████| 4210/4210 [03:04<00:00, 22.88batch/s,  loss: 0.06463774304200169] 


epoch: 4 | metric: {'accuracy': 0.8864678899082569}


I finally classify the same sentences as in Part A and get similar results. This is of course very encouraging.

In [44]:
pipe = pipeline("text-classification", classification_clf_model, tokenizer = tokenizer, device = 0)

In [45]:
pipe("Fargo is amazingly entertaining.")

[{'label': 'positive', 'score': 0.9998492002487183}]

In [46]:
pipe("Overall I think that Star Wars 8 is the worst thing that happened in 2017.")

[{'label': 'negative', 'score': 0.9997366070747375}]

In [47]:
pipe('The Room is probably the worst movie that I ever loved.')

[{'label': 'negative', 'score': 0.9997199177742004}]

# 11

In [48]:
import joblib
# export the tokenizer and model
joblib.dump(tokenizer, "sst2_tokenizer.pkl")
joblib.dump(classification_clf_model, "sst2_model.pkl")

['sst2_model.pkl']