### Important Classes

In [16]:
import pytorch_lightning as pl
import torch
import torch.nn as nn
import torch.nn.functional as F
from collections import OrderedDict
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import get_linear_schedule_with_warmup
from transformers import T5EncoderModel, T5Tokenizer, AutoTokenizer
import csv

LABELS_ATTR = "label"

def compute_accuracy(out, yb):
    preds = torch.argmax(out, dim=1)
    return (preds == yb).float().mean()

class Enc1T5(pl.LightningModule):
    def __init__(self, hparams):
        super(Enc1T5, self).__init__()
        self.save_hyperparameters(hparams)
        self.k = 512

        self.tokenizer = AutoTokenizer.from_pretrained(hparams.model_name_or_path)

        self.t5encoder = T5EncoderModel.from_pretrained(hparams.model_name_or_path)
        self.classifier = nn.Linear(self.k, 2)

    def forward(
        self, input_ids, attention_mask=None
    ): 
        # print("forward method of Enc1T5")
        model_output = self.t5encoder(input_ids, attention_mask=attention_mask, output_attentions=True)
        last_hidden_state = model_output.last_hidden_state
        attentions = model_output.attentions

        averaged = last_hidden_state.mean(dim=1)
        classification_output = self.classifier(averaged)
        # print(f"size after classification: {classification_output.size()}")
        output = {"prediction": classification_output, "attention": attentions}
        return output
    
    def _step(self, batch):
        labels = batch[LABELS_ATTR]
        output = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"]
            )
        pred = output["prediction"]
        loss = F.cross_entropy(pred, labels)
        accuracy = compute_accuracy(pred, labels)
        return loss, accuracy
    
    def training_step(self, batch, batch_idx=None):
        loss, accuracy = self._step(batch)
        self.log("train/loss", loss)
        self.log("train/accuracy", accuracy)
        return {"loss": loss}
    
    def validation_step(self, batch, batch_idx):
        loss, accuracy = self._step(batch)
        self.log("val_loss", loss, logger=True)
        self.log("val_accuracy", accuracy, logger=True)
        return {"val_loss": loss, 
              "val_accuracy": accuracy
                }
    
    def configure_optimizers(self):
        "Prepare optimizer and schedule (linear warmup and decay)"

        model = self.t5encoder
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": self.hparams.weight_decay,
            },
            {
                "params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(optimizer_grouped_parameters, lr=self.hparams.learning_rate, \
                            eps=self.hparams.adam_epsilon, betas=self.hparams.adam_betas)
        self.opt = optimizer
        return [optimizer]
    
    def optimizer_step(self, 
                        epoch,
                        batch_idx,
                        optimizer,
                        optimizer_idx,
                        second_order_closure=None,
                        on_tpu=None,
                        using_native_amp=None,
                        using_lbfgs=None):
        optimizer.step(closure=second_order_closure)
        optimizer.zero_grad()
        self.lr_scheduler.step()

    def get_tqdm_dict(self):
      tqdm_dict = {"loss": "{:.3f}".format(self.trainer.avg_loss), "lr": self.lr_scheduler.get_last_lr()[-1]}

      return tqdm_dict

    def train_dataloader(self):
      # train_dataset = get_dataset(tokenizer=self.tokenizer, type_path="train", args=self.hparams)
      dataloader = DataLoader(self.hparams.train_dataset, batch_size=self.hparams.train_batch_size, drop_last=True, shuffle=True, num_workers=4)
      t_total = (
          (len(dataloader.dataset) // (self.hparams.train_batch_size * max(1, self.hparams.n_gpu)))
          // self.hparams.gradient_accumulation_steps
          * float(self.hparams.num_train_epochs)
      )
      scheduler = get_linear_schedule_with_warmup(
          self.opt, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=t_total
      )
      self.lr_scheduler = scheduler
      return dataloader

    def val_dataloader(self):
      # val_dataset = get_dataset(tokenizer=self.tokenizer, type_path="val", args=self.hparams)
      return DataLoader(self.hparams.val_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

class Enc2T5(Enc1T5):
    def __init__(self, hparams):
        super(Enc1T5, self).__init__()
        self.save_hyperparameters(hparams)
        self.k = 512

        self.tokenizer = AutoTokenizer.from_pretrained(hparams.model_name_or_path)

        self.t5encoder = T5EncoderModel.from_pretrained(hparams.model_name_or_path)
        self.classifier = nn.Sequential(
            nn.Linear(self.k, self.k//2),
            nn.GELU(),
            nn.Linear(self.k//2, 2)
        )

class EncMoreAttention(Enc1T5):
    def __init__(self, hparams):
        super(Enc1T5, self).__init__()
        self.save_hyperparameters(hparams)
        self.k = 512

        self.tokenizer = AutoTokenizer.from_pretrained(hparams.model_name_or_path)
        self.t5encoder = T5EncoderModel.from_pretrained(hparams.model_name_or_path)

        self.attnlayer = nn.MultiheadAttention(self.k, 8, 0.1)
        self.classifier = nn.Sequential(
            nn.Linear(self.k, self.k//2),
            nn.GELU(),
            nn.Linear(self.k//2, 2)
        )
    
    def forward(
        self, input_ids, attention_mask=None
    ): 
        # print("forward method of Enc1T5")
        last_hidden_state = self.t5encoder(input_ids, attention_mask=attention_mask).last_hidden_state
        attnoutput = self.attnlayer(last_hidden_state, last_hidden_state, last_hidden_state)
        # averaged = last_hidden_state.mean(dim=1)
        classification_output = self.classifier(attnoutput)
        # print(f"size after classification: {classification_output.size()}")
        return classification_output

class EncConvT5(Enc1T5):
    def __init__(self, hparams):
        super(Enc1T5, self).__init__()
        self.save_hyperparameters(hparams)

        self.k = 512

        self.tokenizer = AutoTokenizer.from_pretrained(hparams.model_name_or_path)
        self.t5encoder = T5EncoderModel.from_pretrained(hparams.model_name_or_path)

        self.convlayer = nn.Conv1d()

In [17]:
class RPClassificationDataset(Dataset):
    def __init__(self, tokenizer, inputs, outputs, max_len=512):
        self.max_len = max_len
        self.tokenizer = tokenizer

        self.inputs = inputs
        self.outputs = outputs

        self.tokenized_inputs = tokenizer.batch_encode_plus(
            inputs, max_length=max_len, padding=True, truncation=True, return_tensors="pt"
        )
        # self.tokenized_targets = tokenizer.batch_encode_plus(
        #     outputs, max_length=2, padding=True, truncation=True, return_tensors="pt"
        # )
    
    def __len__(self):
        return len(self.inputs)
    
    def __getitem__(self, index):
        # tokenize input and output
        
        source_ids = self.tokenized_inputs["input_ids"][index].squeeze()
        target = self.outputs[index]
        # target_ids = self.tokenized_targets["input_ids"][index].squeeze()

        src_mask    = self.tokenized_inputs["attention_mask"][index].squeeze()  # might need to squeeze
        # target_mask = self.tokenized_targets["attention_mask"][index].squeeze()  # might need to squeeze
        # what inputs does BERT model expect...
        return {"source_ids": source_ids, "source_mask": src_mask, LABELS_ATTR: target}

### Load Datasets

In [18]:
import re
def get_folds_classification(csv_path):
    val_inputs = []
    val_targets = []
    train_inputs = []
    train_targets = []

    REPLACE_NO_SPACE = re.compile("[.;:!\'?,\"()\[\]]")
    REPLACE_WITH_SPACE = re.compile("(<br\s*/><br\s*/>)|(\-)|(\/)")

    LABELS = ["unproblematisch", "problematisch"]

    with open(csv_path, encoding="utf-8") as f_source:
        reader = csv.DictReader(f_source)
        for row in reader:
            line = row["text"]
            line = REPLACE_NO_SPACE.sub("", line)
            line = REPLACE_WITH_SPACE.sub(" ", line)
            line = line

            target = int(float(row['label']))

            if int(row["ten_folds"]) <= 7:
                train_inputs.append(line)
                train_targets.append(target)
            else:
                val_inputs.append(line)
                val_targets.append(target)
        
        return train_inputs, train_targets, val_inputs, val_targets

def load_dataset(source):
# source = f"./Datasets/{dataset}-folds.csv"
    train_inputs, train_targets, val_inputs, val_targets = get_folds_classification(source)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)

    train_dataset = RPClassificationDataset(tokenizer, train_inputs, train_targets)
    valid_dataset = RPClassificationDataset(tokenizer, val_inputs, val_targets)
    return train_dataset, valid_dataset

### Important Constants

In [19]:
MODEL_NAME_OR_PATH = "GermanT5/t5-efficient-oscar-german-small-el32"
DATASETS = ["RP-Crowd-3", "RP-Crowd-2", "RP-Mod"]
# WANDB_PROJECT_NAME = f"{MODEL_NAME_OR_PATH}-all-datasets"
OUTPUT_DIR = f"./{MODEL_NAME_OR_PATH}-encoder-1-results/"
TUNING_LEARNING_RATE = True

In [20]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [21]:
import argparse
args_dict = dict(
                model_name_or_path=MODEL_NAME_OR_PATH,
                gradient_accumulation_steps=16,
                weight_decay=0.1,
                learning_rate=1e-5,
                adam_epsilon=1e-8,
                adam_betas=(0.9,0.999),
                num_train_epochs=30,
                n_gpu=1,
                train_batch_size=8,
                eval_batch_size=8,
                data_dir="", # path for data files
                output_dir=OUTPUT_DIR, # path to save the checkpoints
                # dataset_name=dataset,
                max_seq_length=512,
                early_stop_callback=True,
                fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
                opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
                max_grad_norm=0.5, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
                seed=42,
                # train_dataset=train_dataset,
                # val_dataset=valid_dataset,
                warmup_steps=0
                )
args = argparse.Namespace(**args_dict)

train_params = dict(
                accumulate_grad_batches=args.gradient_accumulation_steps,
                auto_lr_find=True,
                gpus=args.n_gpu,
                max_epochs=args.num_train_epochs,
                # default_root_dir=f"/home/dobby/RP-Mod/t5-efficient-oscar-german-small-el32",
                precision= 16 if args.fp_16 else 32,
                amp_level=args.opt_level,
                gradient_clip_val=args.max_grad_norm,
                # enable_checkpointing=checkpoint_callback,
                # callbacks=[EarlyStopping(monitor="val/accuracy", patience=5, mode="max")],
                # callbacks=[raytuner_callback],
                # callbacks=[LoggingCallback()],
                amp_backend="apex"
                )

### Modular Hyperparameter Tuning

In [22]:
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping
import wandb
def train(model_class, config, wandb_project_name):
    # load train & test datasets
    run_name = ""
    for key in config.keys():
        run_name += f"-{key}-{config[key]}"
        args_dict[key] = config[key]
    args = argparse.Namespace(**args_dict)

    # get train params and update with wandb logger, checkpoint callback, and early stopping callback
    early_stop_callback = EarlyStopping(monitor="val_accuracy", patience=5, mode="max")
    wandb.finish()
                        
    wandb_logger = WandbLogger(project=wandb_project_name, 
            name=run_name)
    wandb.define_metric("val_accuracy", summary="max")

    checkpoint_callback = pl.callbacks.ModelCheckpoint(
                                dirpath=args.output_dir + "/" + wandb_project_name + "/" + run_name, 
                                filename="{epoch}-{val_accuracy:.2f}-{val_loss:.2f}", 
                                monitor="val_accuracy", mode="max", save_top_k=5
                                )
    
    train_params["logger"] = wandb_logger
    train_params["callbacks"] = [early_stop_callback, checkpoint_callback]

    model = model_class(args)
    trainer = pl.Trainer(**train_params)
    trainer.fit(model)

def do_one_training_run(dataset, model_class, config, wandb_project_name):
    source = f"./Datasets/{dataset}-folds.csv"

    train_dataset, valid_dataset = load_dataset(source)
    args_dict["train_dataset"] = train_dataset
    args_dict["val_dataset"] = valid_dataset

    train(model_class, config, wandb_project_name)

In [23]:
# config_test = {
#     "learning_rate": [1e-7, 1e-4],
#     "weight_decay": [1e-3, 0],
#     "gradient_accumulation_steps" : 16
# }

# for dataset in DATASETS:
#     source = f"./Datasets/{dataset}-folds.csv"

#     train_dataset, valid_dataset = load_dataset(source)



In [24]:
import numpy.random as rand
import numpy as np
np.exp(rand.uniform(-7, -3, 2))

array([0.01652095, 0.0094098 ])

In [25]:
arr = rand.uniform(-6, -3, 2)
arr

array([-3.56499735, -4.52826678])

In [26]:
np.power(10, arr)

array([2.72271793e-04, 2.96301071e-05])

### Run once

In [27]:
run_once = True
if run_once:
    config = {
        "learning_rate": 0.0001,
        "weight_decay": 0.1,
        "num_train_epochs": 15
    }

    for dataset in DATASETS:
        do_one_training_run(dataset, Enc1T5, config, f"{dataset}-enc1t5-attention-output")

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

Some weights of the model checkpoint at GermanT5/t5-efficient-oscar-german-small-el32 were not used when initializing T5EncoderModel: ['decoder.block.3.layer.1.layer_norm.weight', 'lm_head.weight', 'decoder.final_layer_norm.weight', 'decoder.block.3.layer.2.DenseReluDense.wo.weight', 'decoder.block.5.layer.0.SelfAttention.k.weight', 'decoder.block.0.layer.1.EncDecAttention.q.weight', 'decoder.block.3.layer.0.SelfAttention.v.weight', 'decoder.block.1.layer.1.EncDecAttention.v.weight', 'decoder.block.2.layer.2.DenseReluDense.wi.weight', 'decoder.block.5.layer.1.EncDecAttention.o.weight', 'decoder.block.5.layer.0.SelfAttention.q.weight', 'decoder.block.1.layer.0.SelfAttention.q.weight', 'decoder.block.1.layer.2.DenseReluDense.wi.weight', 'decoder.block.0.layer.2.DenseReluDense.wo.weight', 'decoder.block.1.layer.1.EncDecAttention.q.weight', 'decoder.block.4.layer.0.SelfAttention.q.weight', 'decoder.block.2.layer.1.EncDecAttention.q.weight', 'decoder.block.5.layer.2.DenseReluDense.wo.weight

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▂▂▂▂▃▃▃▃▄▅▅▅▅▆▆▆▆▇▇▇██
train/accuracy,▁▅▁▅▅█▅▅█▅
train/loss,█▄█▄▃▁▃▄▂▃
trainer/global_step,▁▁▂▂▂▃▃▃▃▄▄▅▅▅▆▆▆▆▇▇▇██
val_accuracy,▁▆▇██████████
val_loss,█▃▁▁▁▁▁▂▂▂▃▃▃

0,1
epoch,12.0
train/accuracy,0.875
train/loss,0.17638
trainer/global_step,519.0
val_loss,0.44697


Some weights of the model checkpoint at GermanT5/t5-efficient-oscar-german-small-el32 were not used when initializing T5EncoderModel: ['decoder.block.3.layer.1.layer_norm.weight', 'lm_head.weight', 'decoder.final_layer_norm.weight', 'decoder.block.3.layer.2.DenseReluDense.wo.weight', 'decoder.block.5.layer.0.SelfAttention.k.weight', 'decoder.block.0.layer.1.EncDecAttention.q.weight', 'decoder.block.3.layer.0.SelfAttention.v.weight', 'decoder.block.1.layer.1.EncDecAttention.v.weight', 'decoder.block.2.layer.2.DenseReluDense.wi.weight', 'decoder.block.5.layer.1.EncDecAttention.o.weight', 'decoder.block.5.layer.0.SelfAttention.q.weight', 'decoder.block.1.layer.0.SelfAttention.q.weight', 'decoder.block.1.layer.2.DenseReluDense.wi.weight', 'decoder.block.0.layer.2.DenseReluDense.wo.weight', 'decoder.block.1.layer.1.EncDecAttention.q.weight', 'decoder.block.4.layer.0.SelfAttention.q.weight', 'decoder.block.2.layer.1.EncDecAttention.q.weight', 'decoder.block.5.layer.2.DenseReluDense.wo.weight

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇████
train/accuracy,▅▅▁▅▅▄▄▇▅▅██▅▅▇▇▇█▇█▇███▇█
train/loss,▅▅█▅▅▆▄▂▆▄▂▂▄█▄▄▄▁▃▁▃▁▁▂▂▁
trainer/global_step,▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇███
val_accuracy,▁▅▆▇▆▇████▇▇
val_loss,▂▁▁▂▂▃▃▅▆▆▇█

0,1
epoch,11.0
train/accuracy,1.0
train/loss,0.07521
trainer/global_step,1307.0
val_loss,0.63238


Some weights of the model checkpoint at GermanT5/t5-efficient-oscar-german-small-el32 were not used when initializing T5EncoderModel: ['decoder.block.3.layer.1.layer_norm.weight', 'lm_head.weight', 'decoder.final_layer_norm.weight', 'decoder.block.3.layer.2.DenseReluDense.wo.weight', 'decoder.block.5.layer.0.SelfAttention.k.weight', 'decoder.block.0.layer.1.EncDecAttention.q.weight', 'decoder.block.3.layer.0.SelfAttention.v.weight', 'decoder.block.1.layer.1.EncDecAttention.v.weight', 'decoder.block.2.layer.2.DenseReluDense.wi.weight', 'decoder.block.5.layer.1.EncDecAttention.o.weight', 'decoder.block.5.layer.0.SelfAttention.q.weight', 'decoder.block.1.layer.0.SelfAttention.q.weight', 'decoder.block.1.layer.2.DenseReluDense.wi.weight', 'decoder.block.0.layer.2.DenseReluDense.wo.weight', 'decoder.block.1.layer.1.EncDecAttention.q.weight', 'decoder.block.4.layer.0.SelfAttention.q.weight', 'decoder.block.2.layer.1.EncDecAttention.q.weight', 'decoder.block.5.layer.2.DenseReluDense.wo.weight

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

### Auto Learning Rate finder

In [None]:
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import EarlyStopping
import wandb
import argparse
import re

TUNING_LEARNING_RATE = True
model_name = "enc2t5"
for dataset in DATASETS:
        source = f"./Datasets/{dataset}-folds.csv"
        train_inputs, train_targets, val_inputs, val_targets = get_folds_classification(source)

        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME_OR_PATH)

        train_dataset = RPClassificationDataset(tokenizer, train_inputs, train_targets)
        valid_dataset = RPClassificationDataset(tokenizer, val_inputs, val_targets)
        args_dict = dict(
                model_name_or_path=MODEL_NAME_OR_PATH,
                gradient_accumulation_steps=16,
                weight_decay=0.1,
                learning_rate=1e-5,
                adam_epsilon=1e-8,
                adam_betas=(0.9,0.999),
                num_train_epochs=30,
                n_gpu=1,
                train_batch_size=8,
                eval_batch_size=8,
                data_dir="", # path for data files
                output_dir=OUTPUT_DIR, # path to save the checkpoints
                dataset_name=dataset,
                max_seq_length=512,
                early_stop_callback=True,
                fp_16=False, # if you want to enable 16-bit training then install apex and set this to true
                opt_level='O1', # you can find out more on optimisation levels here https://nvidia.github.io/apex/amp.html#opt-levels-and-properties
                max_grad_norm=0.5, # if you enable 16-bit training then set this to a sensible value, 0.5 is a good default
                seed=42,
                train_dataset=train_dataset,
                val_dataset=valid_dataset,
                warmup_steps=0
                )
        args = argparse.Namespace(**args_dict)
        args.auto_lr_find = "learning_rate"

        train_params = dict(
                accumulate_grad_batches=args.gradient_accumulation_steps,
                auto_lr_find=True,
                gpus=args.n_gpu,
                max_epochs=args.num_train_epochs,
                # default_root_dir=f"/home/dobby/RP-Mod/t5-efficient-oscar-german-small-el32",
                precision= 16 if args.fp_16 else 32,
                amp_level=args.opt_level,
                gradient_clip_val=args.max_grad_norm,
                # enable_checkpointing=checkpoint_callback,
                callbacks=[],
                # callbacks=[EarlyStopping(monitor="val/accuracy", patience=5, mode="max")],
                # callbacks=[raytuner_callback],
                # callbacks=[LoggingCallback()],
                amp_backend="apex"
                )
        wandb_project_name = f"{dataset}-hyperparameter-search-{model_name}"
        possible_weight_decays = [0.1, 0]
        early_stop_callback = EarlyStopping(monitor="val_accuracy", patience=1, mode="max")
        for wd in possible_weight_decays:
                if TUNING_LEARNING_RATE:
                        
                        args.weight_decay = wd
                        model = Enc1T5(args)
                        init_trainer = pl.Trainer(**train_params)
                        print("*" * 100)
                        print(f"{dataset} Learning Rate Tuning")
                        lr_finder = init_trainer.tuner.lr_find(model)
                        # print(lr_finder.results)
                        fig = lr_finder.plot(suggest=True)
                        fig.show()
                        new_lr = lr_finder.suggestion()
                        print(f"Best Learning Rate is: {new_lr}")

                        # update with the best learning rate
                        possible_learning_rates = [1e-4, new_lr, 1e-5]
                        # possible_learning_rates = np.power(10, rand.uniform(-6, np.log10(new_lr) + 1, 3))
                else:
                        possible_learning_rates = [1e-4, 1e-5, 1e-6]
                
                for lr in possible_learning_rates:
                        config = {
                                "learning_rate": lr,
                                "weight_decay": wd, 
                                "num_train_epochs": 3
                        }
                        train(Enc2T5, config, wandb_project_name)
                        # model.hparams.learning_rate = lr
                        # run_name = f"learning_rate-{lr}-weight_decay-{wd}"

                        # # set up wandb logging
                        # wandb.finish()
                        
                        # wandb_logger = WandbLogger(project=f"hyperparameter-search-{args.dataset_name}-encoder-1-t5", 
                        #         name=run_name)
                        # wandb.define_metric("val/accuracy", summary="max")
                        # train_params_copy = train_params.copy()
                        # train_params_copy["logger"] = wandb_logger

                        # # set up checkpointing
                        # checkpoint_callback = pl.callbacks.ModelCheckpoint(
                        #         dirpath=args.output_dir + "/" + run_name, filename="checkpoint", monitor="val/accuracy", mode="max", save_top_k=5
                        #         )
                        # train_params_copy["callbacks"] = [early_stop_callback, checkpoint_callback]

                        # # make the trainer
                        # trainer = pl.Trainer(**train_params_copy)
                        # # fit the model
                        # trainer.fit(model)
        

Some weights of the model checkpoint at GermanT5/t5-efficient-oscar-german-small-el32 were not used when initializing T5EncoderModel: ['decoder.block.3.layer.1.layer_norm.weight', 'lm_head.weight', 'decoder.final_layer_norm.weight', 'decoder.block.3.layer.2.DenseReluDense.wo.weight', 'decoder.block.5.layer.0.SelfAttention.k.weight', 'decoder.block.0.layer.1.EncDecAttention.q.weight', 'decoder.block.3.layer.0.SelfAttention.v.weight', 'decoder.block.1.layer.1.EncDecAttention.v.weight', 'decoder.block.2.layer.2.DenseReluDense.wi.weight', 'decoder.block.5.layer.1.EncDecAttention.o.weight', 'decoder.block.5.layer.0.SelfAttention.q.weight', 'decoder.block.1.layer.0.SelfAttention.q.weight', 'decoder.block.1.layer.2.DenseReluDense.wi.weight', 'decoder.block.0.layer.2.DenseReluDense.wo.weight', 'decoder.block.1.layer.1.EncDecAttention.q.weight', 'decoder.block.4.layer.0.SelfAttention.q.weight', 'decoder.block.2.layer.1.EncDecAttention.q.weight', 'decoder.block.5.layer.2.DenseReluDense.wo.weight

****************************************************************************************************
RP-Crowd-3 Learning Rate Tuning


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Finding best initial lr:   0%|          | 0/100 [00:00<?, ?it/s]

KeyboardInterrupt: 