# T5 Fine-Tuning Organized Notebook

### Check if GPU is currently available

In [29]:
!nvidia-smi

Wed Jul 20 18:48:12 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.103.01   Driver Version: 470.103.01   CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Quadro RTX 6000     Off  | 00000000:D8:00.0 Off |                  Off |
| 35%   41C    P8     7W / 260W |  17493MiB / 24220MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [30]:
%ls

[0m[01;34mDatasets[0m/
[01;34mGermanT5[0m/
[01;34mGermanT5-RP-Mod[0m/
[01;34mRP-Crowd-3-results[0m/
[01;34mRP-Mod[0m/
[01;34mRP-Mod-GermanT5-oscar-german-small-el32[0m/
[01;34mRP-Mod-results[0m/
T5FineTuner.py
[01;34m__pycache__[0m/
[01;34maclImdb[0m/
aclImdb_v1.tar.gz
[01;34marguments_test_dir[0m/
[01;34maugmented[0m/
[01;34mbestmodels[0m/
classification_classes.py
create_t5_embeddings.ipynb
[01;34mfalse_pos[0m/
[01;34mgerman-t5-oscar-ep1-prompted-germanquadRP-Crowd-2-learning_rate-0.0004[0m/
[01;34mgerman-t5-oscar-ep1-prompted-germanquadRP-Crowd-2-learning_rate-5.6e-05[0m/
[01;34mgerman-t5-oscar-ep1-prompted-germanquadRP-Crowd-3-learning_rate-0.0004[0m/
[01;34mgerman-t5-oscar-ep1-prompted-germanquadRP-Crowd-3-learning_rate-5.6e-05[0m/
[01;34mgerman-t5-oscar-ep1-prompted-germanquadRP-Mod-learning_rate-0.0004[0m/
[01;34mgerman-t5-oscar-ep1-prompted-germanquadRP-Mod-learning_rate-5.6e-05[0m/
[01;34mgerman-t5-oscar-ep1-prompted-germanquadno-gradien

### Import statements

In [31]:
import argparse
import glob
import os
import json
import time
import logging
import random
import re
from itertools import chain
from string import punctuation

import nltk
nltk.download('punkt')
from nltk.tokenize import sent_tokenize
import csv

import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl

# eval packages
import textwrap
from tqdm.auto import tqdm
from sklearn import metrics
from torch.optim import AdamW

from transformers import (
    # AdamW,
    T5ForConditionalGeneration,
    T5Tokenizer,
    get_linear_schedule_with_warmup
)

def set_seed(seed):
  random.seed(seed)
  np.random.seed(seed)
  torch.manual_seed(seed)
  if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

set_seed(42)

[nltk_data] Downloading package punkt to /home/dobby/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Important Class Definitions

In [32]:
class T5FineTuner(pl.LightningModule):
  def __init__(self, hparams):
    super(T5FineTuner, self).__init__()
    # self.hparams.save_hyperparameters(hparams)
    self.save_hyperparameters(hparams)
    
    self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
    self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
    # self.train_dataset = train_dataset
    # self.val_dataset = val_dataset
    
  def is_logger(self):
    return self.trainer.global_rank <= 0
  

  def forward(
      self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, labels=None
  ):
    return self.model(
        input_ids,
        attention_mask=attention_mask,
        decoder_input_ids=decoder_input_ids,
        decoder_attention_mask=decoder_attention_mask,
        labels=labels,
        # lm_labels=lm_labels,
    )

  def _step(self, batch):
    labels = batch["target_ids"]
    labels[labels[:, :] == self.tokenizer.pad_token_id] = -100

    outputs = self(
        input_ids=batch["source_ids"],
        attention_mask=batch["source_mask"],
        labels=labels,
        # lm_labels=lm_labels,
        decoder_attention_mask=batch['target_mask']
    )

    loss = outputs[0]

    return loss
  
  def get_accuracy(self, batch):
    labels = batch["target_ids"]

    outs = self.model.generate(input_ids=batch["source_ids"], 
                                attention_mask=batch["source_mask"], 
                                max_length=2)

    dec = [self.tokenizer.decode(ids) for ids in outs]
    target = [self.tokenizer.decode(label) for label in labels]

    new_outputs = [s[6:] for s in dec]
    new_targets = [s[:-4] for s in target]

    accuracy_score = metrics.accuracy_score(new_targets, new_outputs)
    f1 = metrics.f1_score(new_targets, new_outputs, average="micro")
    # rec = metrics.recall_score(new_targets, new_outputs, average="micros")
    return accuracy_score, f1
  # def computer_accuracy(self):
  #   self.model.model.eval()

  def training_step(self, batch, batch_idx):
    loss = self._step(batch)
    self.log("train/loss", loss)
    return {"loss": loss}
  
  def training_epoch_end(self, outputs):
    avg_train_loss = torch.stack([x["loss"] for x in outputs]).mean()
    tensorboard_logs = {"avg_train_loss": avg_train_loss}
    self.log("avg_train_loss", avg_train_loss)
    
  def validation_step(self, batch, batch_idx):
    loss = self._step(batch)
    accuracy, f1 = torch.tensor(self.get_accuracy(batch))
    self.log("val_loss", loss, logger=True)
    self.log("val_accuracy", accuracy, logger=True)
    return {"val_loss": loss, "val_accuracy": accuracy}
  
  def validation_epoch_end(self, outputs):
    avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
    avg_accuracy = torch.stack([x["val_accuracy"] for x in outputs]).mean()
    # tensorboard_logs = {"val_loss": avg_loss}
    self.log("avg_val_loss", avg_loss)
    self.log("avg_val_accuracy", avg_accuracy)
    # self.log("log", tensorboard_logs)
    # self.log("progress_bar", tensorboard_logs)
    # self.log({"avg_val_loss": avg_loss, 
    #           "log": tensorboard_logs,
    #           'progress_bar': tensorboard_logs}, logger=True, prog_bar=True)
    return {"avg_val_loss": avg_loss, "avg_val_accuracy": avg_accuracy}

  def configure_optimizers(self):
    "Prepare optimizer and schedule (linear warmup and decay)"

    model = self.model
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": self.hparams.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
    self.opt = optimizer
    return [optimizer]
  
  def optimizer_step(self, 
                      epoch,
                      batch_idx,
                      optimizer,
                      optimizer_idx,
                      second_order_closure=None,
                      on_tpu=None,
                      using_native_amp=None,
                      using_lbfgs=None):
    # if self.trainer.use_tpu:
    #   xm.optimizer_step(optimizer)
    # else:
    optimizer.step(closure=second_order_closure)
    optimizer.zero_grad()
    self.lr_scheduler.step()
  
  # def closure(self):
  #   return "closure"
  
  def get_tqdm_dict(self):
    tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

    return tqdm_dict

  def train_dataloader(self):
    # train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
    dataloader = DataLoader(self.hparams.train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
    t_total = (
        (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
        // self.hparams.gradient_accumulation_steps
        * float(self.hparams.num_train_epochs)
    )
    scheduler = get_linear_schedule_with_warmup(
        self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
    )
    self.lr_scheduler = scheduler
    return dataloader

  def val_dataloader(self):
    # val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
    return DataLoader(self.hparams.val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

In [33]:
class RPDataset(Dataset):
    def __init__(self, tokenizer, inputs, outputs, max_len=512):
        self.max_len = max_len
        self.tokenizer = tokenizer

        self.inputs = inputs
        self.outputs = outputs

        self.tokenized_inputs = tokenizer.batch_encode_plus(
            inputs, max_length=max_len, padding=True, truncation=True, return_tensors="pt"
        )
        self.tokenized_targets = tokenizer.batch_encode_plus(
            outputs, max_length=2, padding=True, truncation=True, return_tensors="pt"
        )
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, index):
        # tokenize input and output
        
        source_ids = self.tokenized_inputs["input_ids"][index].squeeze()
        target_ids = self.tokenized_targets["input_ids"][index].squeeze()

        src_mask    = self.tokenized_inputs["attention_mask"][index].squeeze()  # might need to squeeze
        target_mask = self.tokenized_targets["attention_mask"][index].squeeze()  # might need to squeeze

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}
        

## Important Constants

In [34]:
DATASET = "augmented/500short-RP-Mod"
SUFFIX = "t5-efficient-oscar-german-small-el32"
MODEL_NAME_OR_PATH = f"GermanT5/{SUFFIX}"
dataset = DATASET.replace("/", "-")
WANDB_PROJECT_NAME = f"{dataset}-t5-efficient-oscar-german-small-el32"
OUTPUT_DIR = f"./{DATASET}-results/t5-efficient-oscar-german-small-el32/"

## Load Dataset

In [35]:
SOURCE = f"./Datasets/{DATASET}-folds.csv"
tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME_OR_PATH)

def get_folds(csv_path):
    #TODO: check for URLs and possibly turn them into text!?
    val_inputs = []
    val_targets = []
    train_inputs = []
    train_targets = []

    REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
    REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

    LABELS = ["unproblematisch", "problematisch"]

    with open(csv_path, encoding="utf-8") as f_source:
        reader = csv.DictReader(f_source)
        for row in reader:
            line = row["text"]
            line = REPLACE_NO_SPACE.sub("", line)
            line = REPLACE_WITH_SPACE.sub(" ", line)
            # line = "classification: " + line
            print(line)

            target = LABELS[int(float(row['label']))]

            if int(row["ten_folds"]) <= 7:
                train_inputs.append(line)
                train_targets.append(target)
            else:
                val_inputs.append(line)
                val_targets.append(target)
        
        return train_inputs, train_targets, val_inputs, val_targets

train_inputs, train_targets, val_inputs, val_targets = get_folds(SOURCE)

train_dataset = RPDataset(tokenizer, train_inputs, train_targets)
valid_dataset = RPDataset(tokenizer, val_inputs, val_targets)

print(len(train_dataset), len(valid_dataset))

Warum gehen die dann nach Lybien Man muss sie ja nicht nach Lybien zurück schicken 
Und warum funktioniert diese Section Control einwandfrei und ohne datenschutzrechtliche Problem zB in den Niederlanden und Österreich   beides EU Mitglieder in beiden Ländern gilt dieselbe DSGVO Heißt es nicht Bundesrecht bricht Landesrecht und EU Recht bricht Bundesrecht Irgendetwas stimmt doch in diesem ganzen Rechtsgefüge nicht   blicken etwa unsere Parlamentarier wieder einmal nicht mehr durch   wie schon bei den Abstimmungen über den Maastricht Vertrag und den Lissabon Vertrag
„Ferndiagnosen sind deplatziert“ das stimmt Wieso wurden und werden über Donald Trump solche erstellt Es geistern seit der Amtseinführung solche Diagnosen durch die Presse aber bei Frau Merkel ist es deplatziert
Weil in deren Kulturkreis Karfreitag kein Feiertag ist Ist doch recht simpel Vielleicht erleben wir dazu auch noch ein paar Blockaden auf der Autobahn und Grill  Happenings von „großen“ Familien im Park und die Bereic

In [36]:
print(train_inputs[0], train_targets[0], val_inputs[0], val_targets[0])

Warum gehen die dann nach Lybien Man muss sie ja nicht nach Lybien zurück schicken  unproblematisch kultuuuuuuuuur in remscheid mit alkohoooooooool   bieeeeeeeeeer und cooooooover   miuuuuuusik  das ist kultur genau dem stadtzustand angepasst   herrlich   krank  keine kultur   problematisch


## Hyperparameter Selection

Learning Rate: I found the given learning rate, by using pytorch lightning's auto_lr_finder method https://pytorch-lightning.readthedocs.io/en/stable/advanced/training_tricks.html#learning-rate-finder and then I experimented with learning rates in that ballpark to find the optimal learning rat. 

Weight Decay: I chose to use weight decay, because the fine-tuned T5 model was overfitting the data after only a few epochs. I experimented with values between 0.01 and 10 and found that 0.1 was the best value.

Note: on RP-Mod dataset the validation loss starts to diverge after about 5 epochs even though the vaildation accuracy tends continues to increase. This indicates overfitting on the RP-Mod dataset. 

In [37]:
from pytorch_lightning.loggers import WandbLogger
import wandb

possible_learning_rates = [4e-4]
possible_weight_decays = [0.1]

for lr in possible_learning_rates:
    for wd in possible_weight_decays:
        wandb.finish()
        run_name = f"learning_rate-{lr}-weight_decay-{wd}"
        wandb_logger = WandbLogger(project=WANDB_PROJECT_NAME, 
        name=run_name)

        wandb.define_metric("val_accuracy", summary="max")

        checkpoint_callback = pl.callbacks.ModelCheckpoint(
            dirpath=OUTPUT_DIR + run_name, filename="{epoch}-{val_accuracy:.2f}-{val_loss:.2f}", monitor="val_accuracy", mode="max", save_top_k=5
        )

        args_dict = dict(
            data_dir="", # path for data files
            output_dir=OUTPUT_DIR, # path to save the checkpoints
            model_name_or_path=MODEL_NAME_OR_PATH,
            tokenizer_name_or_path=MODEL_NAME_OR_PATH,
            dataset_name=DATASET,
            max_seq_length=512,
            learning_rate=lr,
            weight_decay=wd,
            adam_epsilon=1e-8,
            warmup_steps=0,
            train_batch_size=8,
            eval_batch_size=8,
            num_train_epochs=20,
            gradient_accumulation_steps=32,
            n_gpu=1,
            early_stop_callback=False,
            fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
            opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
            max_grad_norm=0.5, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
            seed=42,
            train_dataset=train_dataset, 
            val_dataset=valid_dataset
        )
        args = argparse.Namespace(**args_dict)

        train_params = dict(
            accumulate_grad_batches=args.gradient_accumulation_steps,
            auto_lr_find=True,
            gpus=args.n_gpu,
            max_epochs=args.num_train_epochs,
            default_root_dir=f"/home/dobby/RP-Mod/t5-efficient-oscar-german-small-el32",
            # early_stop_callback=False,
            precision= 16 if args.fp_16 else 32,
            amp_level=args.opt_level,
            gradient_clip_val=args.max_grad_norm,
            # checkpoint_callback=checkpoint_callback,
            logger=wandb_logger,
            enable_checkpointing=checkpoint_callback,
            callbacks=[checkpoint_callback],
            # callbacks=[raytuner_callback],
            # callbacks=[LoggingCallback()],
            amp_backend="apex"
        )

        # model = T5FineTuner(args, train_dataset, valid_dataset)
        model = T5FineTuner(args)
        trainer = pl.Trainer(**train_params)

        print("begin training!!")
        trainer.fit(model)

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
avg_train_loss,█▁▁▁▁▁▁
avg_val_accuracy,▁▁▁▁▁▁▁
avg_val_loss,▁▇█████
epoch,▁▁▂▂▃▃▃▅▅▆▆▆▇▇███
train/loss,█▃▁
trainer/global_step,▁▁▂▂▂▃▃▅▅▅▆▆▇▇▇██
val_accuracy,▁▁▁▁▁▁▁
val_loss,▁▇█████

0,1
avg_train_loss,0.0
avg_val_accuracy,0.5
avg_val_loss,6.1223
epoch,6.0
train/loss,0.0
trainer/global_step,160.0
val_loss,6.1223


GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(


begin training!!


  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 142 M 
-----------------------------------------------------
142 M     Trainable params
0         Non-trainable params
142 M     Total params
569.289   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

### Loading Models from Checkpoint

In [38]:
checkpoint_path = "./GermanT5-RP-Mod/t5-efficient-oscar-german-small-el32/lr-0.0004-wd-0.1/epoch=3-val_accuracy=0.74-val_loss=0.28.ckpt"

new_model = T5FineTuner.load_from_checkpoint(checkpoint_path=checkpoint_path)
new_modelo

AttributeError: 'dict' object has no attribute 'model_name_or_path'

In [None]:
checkpoint = torch.load(checkpoint_path)
checkpoint

{'epoch': 3,
 'global_step': 360,
 'pytorch-lightning_version': '1.6.4',
 'state_dict': OrderedDict([('model.shared.weight',
               tensor([[  2.8209,  -7.1442,  -2.9547,  ...,   2.6787,   2.8289,  -6.9014],
                       [ -9.3156,   6.5471, -14.4285,  ..., -13.8214, -21.0871, -15.7351],
                       [ -8.2654,   1.9326,  -0.5002,  ..., -13.3861,   4.5573, -27.7593],
                       ...,
                       [  6.4472, -17.3983, -15.4240,  ...,  -9.0693, -15.9176, -10.7968],
                       [  6.4781, -19.2491, -13.4497,  ...,  -9.1927, -14.4985,  -9.9947],
                       [  5.2750, -17.6450, -15.0538,  ...,  -9.3778, -14.4369, -10.4266]],
                      device='cuda:0')),
              ('model.encoder.embed_tokens.weight',
               tensor([[  2.8209,  -7.1442,  -2.9547,  ...,   2.6787,   2.8289,  -6.9014],
                       [ -9.3156,   6.5471, -14.4285,  ..., -13.8214, -21.0871, -15.7351],
                       [ 