In [111]:
import os

In [112]:
# Set the TOKENIZERS_PARALLELISM environment variable to "false" for this process.
# NOTE(review): presumably meant to switch off the Hugging Face tokenizers' internal parallelism
# (and its fork warning when DataLoader workers are used) — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [113]:
%pwd

'/Users/heddafiedler/Documents/MASTER_DATA_SCIENCE/Semester_3/DL'

# Purpose of the Notebook

In this notebook I create the pipeline for using the model built in the MAGPIE Repository in order to understand its components and test how the elements work together.
Besides, I already look into the parts I want to change, like more detailed logging / debugging steps to better understand the process. Apart from that I will use the code provided by the Repository.

In order to understand the core elements of the model architecture and pipeline, I will only display the main parts here and import utils and other functions.

# Data Ingestion and Preprocessing
Since the repository already provides the datasets in a preprocessed way, I will use these files for the model training according to the data sets I chose (see README file).
Nevertheless, I will need to include data preprocessing in order to do inference on new data. Therefore, the following part tests the preprocessing of random text input and the tokenization of the text. This test is later used to try out how the model works with the data.

In [115]:
# Tokenization
from transformers import DistilBertTokenizerFast

# Initialize the fast tokenizer from the pretrained 'distilbert-base-uncased' checkpoint.
# The module-level name `tokenizer` is read directly by the load_data methods of the SubTask classes.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')



In [116]:
# Smoke test: tokenize a single sentence and request PyTorch tensors ("pt") as the return type.
text = "This is a test"
tokenized = tokenizer(text, truncation=True, return_tensors="pt")
# Show the encoding; the recorded output contains 'input_ids' and 'attention_mask'.
print(tokenized)

{'input_ids': tensor([[ 101, 2023, 2003, 1037, 3231,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}


# Data Initialization with Task and Subtask Classes
Since the MTL approach is about combining different tasks, the data needs to contain the information about which model head it needs in the model pipeline.
I will use the Task and Subtask classes from the MAGPIE Repository to create the tasks I want to use. The tasks are defined in the following part.

The Sub Tasks I am going to use according to the datasets I chose are:
- Token-Level Classification (POS) --> TODO: understand better what exactly this is
- Binary Classification
- Multi-Class Classification
- (Regression) - not used in current implementation
- (Masked Language Modelling) - not used in current implementation

The Subtask class defines how to load and structure the respective data set, as well as other functions like weight scaling and class weights for imbalanced datasets

The Task class is a wrapper for the subtasks and contains the task id and the subtasks list. Since I am using only one dataset for each subtask, the subtask list contains only one subtask. Nevertheless, I am leaving the wrapper in the code in order to use the other steps in the same way as in the repository.

The SubTaskDataset class creates then the actual data loaders and also contains the BatchList class for Training and for Evaluation.



In [117]:
import re
from typing import List, Tuple

import pandas as pd
import torch
import numpy as np
import random

from config.config import DEV_RATIO, MAX_LENGTH, REGRESSION_SCALAR, TRAIN_RATIO
from old_utils.common import get_class_weights
from old_utils.enums import Split
from old_utils.common import set_random_seed
from old_utils.logger import general_logger


In [118]:
"""This part contains the Task class."""

class Task:
    """Wrap the subtasks that belong to one task.

    The object only stores references to the constructor arguments: `task_id` identifies the
    task and `subtasks_list` holds its subtasks.
    """

    def __init__(self, task_id, subtasks_list):
        """Initialize a Task.

        @param task_id: Identifier of the task.
        @param subtasks_list: The subtasks wrapped by this task.
        """
        self.task_id = task_id
        self.subtasks_list = subtasks_list

    def __repr__(self):
        """Represent a task, e.g. 'Task 10 with 2 subtasks'."""
        n_subtasks = len(self.subtasks_list)
        # Only exactly one subtask is singular; zero subtasks is plural as well.
        plural = "" if n_subtasks == 1 else "s"
        return f"Task {self.task_id} with {n_subtasks} subtask{plural}"

    def __str__(self) -> str:
        """Return the task id as a string."""
        return str(self.task_id)

In [119]:
"""This part contains the Subtask."""

def get_pos_idxs(pos: str, text: str):
    """
    Mark which whitespace-separated tokens of a text start inside a given span.

    The first literal occurrence of `pos` in `text` is located on character level. A token is
    flagged with 1 if its first character lies inside that occurrence and with 0 otherwise.

    @param pos: The span to look for. It is matched literally, not as a regular expression.
    @param text: The text to search through.
    @return: A list with one 0/1 entry per token of `text.split()`.
    @raise ValueError: If `pos` does not occur in `text`.
    """
    if pos == text:
        # The span is the complete text, so every character (and therefore every token) is marked.
        mask = np.ones(len(text), dtype=int)
    else:
        # re.escape neutralises every regex metacharacter (also '.', '|', ']', '{', '^' and '\\'),
        # so the span is always searched as plain text.
        match = re.search(re.escape(pos), text)
        if match is None:
            raise ValueError(f"POS {pos!r} is not contained in text {text!r}.")
        start, end = match.span()

        mask = np.zeros(len(text), dtype=int)
        mask[start:end] = 1

    # Look up the real start offset of every token instead of assuming that tokens are separated
    # by exactly one space; otherwise repeated or leading whitespace shifts all later offsets.
    token_starts, cursor = [], 0
    for token in text.split():
        token_start = text.index(token, cursor)
        token_starts.append(token_start)
        cursor = token_start + len(token)
    return [mask[i] for i in token_starts]


def align_labels_with_tokens(labels: List[int], word_ids: List[int]):
    """Expand word-level labels to one label per (sub-)token.

    Adapted from https://huggingface.co/course/chapter7/2

    @param labels: One label per word.
    @param word_ids: For every token the index of the word it belongs to, or None for special tokens.
    @return: One label per token. Special tokens get -100, the first token of a word gets the
        label of the word, and further tokens of the same word get the label raised by one if it is odd.
    """
    aligned = []
    previous_word = None
    for word_id in word_ids:
        if word_id is None:
            # Special token: -100 is an index that will be ignored by cross entropy.
            previous_word = None
            aligned.append(-100)
            continue

        word_label = labels[word_id]
        if word_id != previous_word:
            # First token of a new word keeps the label of the word.
            previous_word = word_id
            aligned.append(word_label)
        else:
            # Continuation of the previous word: an odd (B-XXX) label becomes the following (I-XXX) one.
            aligned.append(word_label + 1 if word_label % 2 == 1 else word_label)

    return aligned


def get_tokens_and_labels(pos_list_list, text_list, labels):
    """Get tokens and labels for scattered POS.

    Every observation comes with a list of consecutive spans. For each span the tokens of the
    text that belong to it are looked up with `get_pos_idxs`; the per-span results are then
    combined with a bitwise or ('union').

    @param pos_list_list: For every observation the list of its spans (strings).
    @param text_list: The texts, aligned with `pos_list_list`.
    @param labels: One label per observation, indexable by position. A label of 0 is the neutral class.
    @return: A tuple of (the whitespace-split tokens of every text, the token-level labels of every text).
    """
    mask_idxs_list = []
    for i, pos_list in enumerate(pos_list_list):
        label = labels[i]
        text = text_list[i]
        observation_mask_idxs = []
        # Each span is looked up exactly once. A nested second loop over the same list would only
        # add duplicates, which the union below ignores anyway.
        for pos in pos_list:
            pos_idxs = get_pos_idxs(pos, text)
            if len(pos) > 0 and label != 0:
                # Non-neutral class: replace the 0/1 marker with the label of the observation.
                pos_idxs = [label if idx == 1 else 0 for idx in pos_idxs]
            observation_mask_idxs.append(pos_idxs)

        # Reduce the per-span masks of this observation to a single one.
        mask_idxs_list.append(np.bitwise_or.reduce(observation_mask_idxs, axis=0))

    return [t.split() for t in text_list], mask_idxs_list


class SubTask:
    """Abstract base class of all subtasks.

    A subtask couples one data file with the columns to read from it. After `process` has run it
    holds the inputs (`X`), attention masks and targets (`Y`), each as a dict keyed by `Split`.
    """

    def __init__(self, id, task_id, filename, src_col="text", tgt_cols_list=None, *args, **kwargs):
        """Store the configuration of the subtask; no data is loaded here.

        Additional positional and keyword arguments are accepted and ignored.

        @param id: Identifier of this subtask.
        @param task_id: Identifier of the task this subtask belongs to.
        @param filename: Path of the data file relative to the 'datasets' directory.
        @param src_col: Name of the column that holds the input.
        @param tgt_cols_list: Names of the target columns; defaults to ["label"].
        @raise RuntimeError: If `SubTask` itself instead of a subclass is instantiated.
        """
        general_logger.info(f"Initializing SubTask {id} for task {task_id}")
        if type(self) is SubTask:
            raise RuntimeError("Abstract class <SubTask> must not be instantiated.")
        self.attention_masks = None
        self.Y = None
        self.X = None
        self.class_weights = None
        self.id = id
        self.src_col = src_col
        # Build the default list per instance: a list literal in the signature would be one shared
        # object for all instances.
        self.tgt_cols_list = ["label"] if tgt_cols_list is None else tgt_cols_list
        self.task_id = task_id
        self.filename = os.path.join("datasets", filename)
        self.processed = False

    def process(self, force_download: bool = False):
        """Process a SubTask.

        Load the data for this subtask and set the properties X, Y and attention_masks, each split
        into train, dev and test. The first TRAIN_RATIO share of the rows is train, the following
        DEV_RATIO share is dev and the remainder is test. Afterwards the class weights are created.

        @param force_download: Not used by this implementation.
        """
        general_logger.info(f"Processing SubTask {self.id}")
        X, Y, attention_masks = self.load_data()

        train_split = int(len(X) * TRAIN_RATIO)
        dev_split = train_split + int(len(X) * DEV_RATIO)

        self.X = {Split.TRAIN: X[:train_split], Split.DEV: X[train_split:dev_split], Split.TEST: X[dev_split:]}
        self.attention_masks = {
            Split.TRAIN: attention_masks[:train_split],
            Split.DEV: attention_masks[train_split:dev_split],
            Split.TEST: attention_masks[dev_split:],
        }
        self.Y = {Split.TRAIN: Y[:train_split], Split.DEV: Y[train_split:dev_split], Split.TEST: Y[dev_split:]}
        self.create_class_weights()
        self.processed = True
        general_logger.info(f"SubTask {self.id} processed successfully")

    def load_data(self) -> Tuple:
        """Load the data of a SubTask.

        Must be implemented by subclasses; `process` unpacks the result as (X, Y, attention_masks).
        """
        raise NotImplementedError

    def create_class_weights(self):
        """Compute the weights for imbalanced classes.

        The base implementation does nothing; subclasses may override it.
        """
        pass

    def get_scaling_weight(self):
        """Get the scaling weight of a Subtask.

        Needs to be overwritten.
        """
        raise NotImplementedError

    def get_X(self, split: Split):
        """Get all X of a given split."""
        return self.X[split]

    def get_att_mask(self, split: Split):
        """Get attention_masks for inputs of a given split."""
        return self.attention_masks[split]

    def get_Y(self, split: Split):
        """Get all Y of a given split."""
        return self.Y[split]

    def __str__(self) -> str:
        """Return the id of the subtask as a string."""
        return str(self.id)


# a[43485:43500]
class ClassificationSubTask(SubTask):
    """A subtask with a single target column holding class indices (binary or multi-class)."""

    def __init__(self, num_classes=2, *args, **kwargs):
        """Initialize a ClassificationSubTask.

        @param num_classes: Number of distinct classes expected in the target column.
        """
        super().__init__(*args, num_classes=num_classes, **kwargs)
        self.num_classes = num_classes

    def load_data(self) -> Tuple[torch.LongTensor, torch.LongTensor, torch.LongTensor]:
        """Read the csv file, tokenize the source column and return (input ids, targets, attention masks).

        Asserts that the target column contains exactly `num_classes` distinct values and that its
        smallest value is 0.
        """
        general_logger.info(f"Loading data from {self.filename}")
        frame = pd.read_csv(self.filename)

        texts = frame[self.src_col]
        targets = frame[self.tgt_cols_list]
        encoded = tokenizer(texts.to_list(), padding="max_length", truncation=True, max_length=MAX_LENGTH)
        input_ids = encoded.get("input_ids")
        attention_masks = encoded.get("attention_mask")

        assert targets.nunique().squeeze() == self.num_classes
        assert targets[self.tgt_cols_list[0]].min(axis=0) == 0

        # Binary targets are converted directly, all others after selecting the target columns once more.
        selected = targets if self.num_classes == 2 else targets[self.tgt_cols_list]
        target_array = selected.to_numpy()

        general_logger.info(f"Data loaded successfully: {len(input_ids)} samples")
        return torch.LongTensor(input_ids), torch.LongTensor(target_array), torch.LongTensor(attention_masks)

    def __repr__(self):
        """Return 'Binary Classification' for two classes, otherwise 'Multi-class Classification'."""
        kind = "Binary" if self.num_classes == 2 else "Multi-class"
        return f"{kind} Classification"

    def create_class_weights(self):
        """Compute the class weights from the training targets with the 'isns' method."""
        train_targets = self.Y[Split.TRAIN]
        self.class_weights = get_class_weights(train_targets, method="isns")

    def get_scaling_weight(self):
        """Get the weight of a Classification Subtask.

        The weight is the inverse of the natural logarithm of the number of classes.
        """
        log_domain_size = np.log(self.num_classes)
        return 1 / log_domain_size


# in current implementation, the regression subtask is not used
class RegressionSubTask(SubTask):
    """A subtask whose targets are continuous values."""

    def __init__(self, *args, **kwargs):
        """Initialize a RegressionSubTask; all arguments are forwarded to SubTask."""
        super().__init__(*args, **kwargs)

    def load_data(self) -> Tuple[torch.LongTensor, torch.FloatTensor, torch.LongTensor]:
        """Read the csv file, tokenize the source column and return (input ids, targets, attention masks).

        The targets are min-max scaled per column to the range from 0 to 1 and cast to float32.
        """
        general_logger.info(f"Loading data from {self.filename}")
        frame = pd.read_csv(self.filename)
        texts = frame[self.src_col]
        targets = frame[self.tgt_cols_list]

        encoded = tokenizer(texts.to_list(), padding="max_length", truncation=True, max_length=MAX_LENGTH)
        input_ids = encoded.get("input_ids")
        attention_masks = encoded.get("attention_mask")

        # Scale from 0 to 1.
        shifted = targets - targets.min()
        value_range = targets.max() - targets.min()
        scaled = (shifted / value_range).to_numpy().astype("float32")

        general_logger.info(f"Data loaded successfully: {len(input_ids)} samples")
        return torch.LongTensor(input_ids), torch.FloatTensor(scaled), torch.LongTensor(attention_masks)

    def __repr__(self):
        """Return the name of the subtask type."""
        return "Regression"

    def get_scaling_weight(self):
        """Get the scaling weight of a Regression Subtask.

        The weight is the constant REGRESSION_SCALAR from the configuration, a heuristic value.
        """
        return REGRESSION_SCALAR


class MultiLabelClassificationSubTask(SubTask):
    """A subtask with several target columns (labels), each holding a class index."""

    def __init__(self, num_classes=2, num_labels=2, *args, **kwargs):
        """Initialize a MultiLabelClassificationSubTask.

        @param num_classes: Number of classes per label.
        @param num_labels: Number of labels (target columns).
        """
        # Forward the actual values instead of hard-coded twos, as ClassificationSubTask does.
        super(MultiLabelClassificationSubTask, self).__init__(
            num_classes=num_classes, num_labels=num_labels, *args, **kwargs
        )
        self.num_classes = num_classes
        self.num_labels = num_labels

    def load_data(self) -> Tuple[torch.LongTensor, torch.LongTensor, torch.LongTensor]:
        """Load the data of a MultiLabelClassificationSubTask.

        Reads the csv file, tokenizes the source column and returns (input ids, targets, attention masks).
        Asserts that the largest value over all target columns is 1.
        """
        general_logger.info(f"Loading data from {self.filename}")
        df = pd.read_csv(self.filename)
        X, Y = df[self.src_col], df[self.tgt_cols_list]
        tokenized_inputs = tokenizer(X.to_list(), padding="max_length", truncation=True,
                                     max_length=MAX_LENGTH)
        X = tokenized_inputs.get("input_ids")
        attention_masks = tokenized_inputs.get("attention_mask")
        assert Y.max(axis=0).to_numpy().max() == 1
        Y = Y.to_numpy()
        general_logger.info(f"Data loaded successfully: {len(X)} samples")
        return torch.LongTensor(X), torch.LongTensor(Y), torch.LongTensor(attention_masks)

    def __repr__(self):
        """Represent a Multi-label Classification Subtask."""
        return "Multi-label Classification"

    def get_scaling_weight(self):
        """Get the weight of a Multi-label Classification Subtask.

        The weight is the inverse of the natural logarithm of num_classes * num_labels.
        """
        return 1 / np.log(self.num_classes * self.num_labels)


class POSSubTask(SubTask):
    """A POSSubTask (token-level classification).

    Each POSSubTask can be either binary classification or multiclass classification.
    If it is binary classification, zero (0) must be the neutral class.
    This neutral class is also applied to all other, 'normal' tokens.
    """

    def __init__(self, tgt_cols_list, label_col=None, *args, **kwargs):
        """Initialize a POSSubTask.

        Normally, we have 3 classes: (0=no-tag, 1=tag-start, 2=tag-continue)
        However, we have POS-tasks where we have more than just 'binary token level classification'.
        In these scenarios, each class has two tags: 'tag-start' and 'tag-continue'.
        The 'no-class' tag has no 'tag-continue'.

        @param tgt_cols_list: List with exactly one column name; the column holds the spans of a
            row, separated by ';'.
        @param label_col: Optional name of a column with the label of every row.
        """
        super(POSSubTask, self).__init__(tgt_cols_list=tgt_cols_list, *args, **kwargs)
        self.num_classes = 3   # Default (0=no-tag, 1=tag-start, 2=tag-continue); load_data recomputes it.
        self.label_col = label_col
        assert len(tgt_cols_list) == 1

    def load_data(self) -> Tuple[torch.LongTensor, torch.LongTensor, torch.LongTensor]:
        """Load the data of a POSSubTask.

        Rows for which at least one span is not contained in the source text are dropped (the
        number of dropped rows is logged). The remaining texts are split on whitespace, labelled
        on token level and tokenized. `num_classes` is updated from the labels that were found.

        @return: A tuple of (input ids, token-level labels, attention masks).
        """
        general_logger.info(f"Loading data from {self.filename}")
        df = pd.read_csv(self.filename)
        pos_col = self.tgt_cols_list[0]

        df[self.tgt_cols_list] = df[self.tgt_cols_list].fillna("")
        mask = df.apply(lambda row: all(p in row[self.src_col] for p in row[pos_col].split(";")), axis=1)
        n_rows_before = len(df)
        df = df[mask].reset_index(drop=True)
        # An assertion placed after the filtering can never fail, so report the dropped rows instead.
        n_dropped = n_rows_before - len(df)
        if n_dropped > 0:
            general_logger.info(f"Dropped {n_dropped} rows with a POS that is not contained in the source column.")

        pos_list_list = df[pos_col].apply(lambda x: x.split(";")).to_list()
        X = df[self.src_col].values
        # If we do not provide a labels column, we assume that, whenever a pos is present, that is the non-neutral class
        labels = (
            df[self.label_col]
            if self.label_col
            else [1 if len(pos) > 0 else 0 for pos in df[pos_col].to_list()]
        )
        tokens, word_labels = get_tokens_and_labels(pos_list_list=pos_list_list, text_list=X, labels=labels)
        tokenized_inputs = tokenizer(
            tokens, padding="max_length", is_split_into_words=True, truncation=True,
            max_length=MAX_LENGTH
        )
        # Spread the word-level labels of every observation over its (sub-)tokens.
        new_labels = [
            align_labels_with_tokens(observation_labels, tokenized_inputs.word_ids(i))
            for i, observation_labels in enumerate(word_labels)
        ]
        Y = np.array(new_labels)
        # This should in most cases not alter self.num_classes, as we only use binary tags (+ tag-continue = 3 classes).
        # However, we leave this generic implementation for future tasks.
        # One is subtracted for the value -100, which marks tokens without a label.
        self.num_classes = len(np.unique(Y)) - 1
        X = tokenized_inputs.get("input_ids")
        attention_masks = tokenized_inputs.get("attention_mask")
        general_logger.info(f"Data loaded successfully: {len(X)} samples")
        return torch.LongTensor(X), torch.LongTensor(Y), torch.LongTensor(attention_masks)

    def __repr__(self):
        """Represent a Token-level classification Subtask."""
        return "Token-level classification"

    def create_class_weights(self):
        """Compute the class weights from the training labels, leaving out the ignored value -100."""
        labels = self.Y[Split.TRAIN]
        only_class_labels = labels[labels != -100]
        self.class_weights = get_class_weights(only_class_labels, method="isns")

    def get_scaling_weight(self):
        """Get the weight of a POS Subtask.

        The weight is the inverse of the natural logarithm of the number of classes.
        """
        return 1 / np.log(self.num_classes)

# not used in current implementation, but important for testing?
class MLMSubTask(SubTask):
    """A subtask for Masked Language Modelling."""

    def __init__(self, *args, **kwargs):
        """Initialize a MLMSubTask; all arguments are forwarded to SubTask."""
        super().__init__(*args, **kwargs)

    def load_data(self) -> Tuple[torch.LongTensor, torch.LongTensor, torch.LongTensor]:
        """Read the csv file, tokenize the source column and mask tokens at random.

        A position is masked when its random draw is below 0.15 and it holds neither the SEP, CLS
        nor PAD token. Masked positions of the input are replaced by the mask token; the targets
        keep the original id at masked positions and are -100 everywhere else.

        @return: A tuple of (masked input ids, targets, attention masks).
        """
        general_logger.info(f"Loading data from {self.filename}")
        frame = pd.read_csv(self.filename)
        texts = frame[self.src_col]
        encoded = tokenizer(texts.to_list(), padding="max_length", truncation=True, max_length=MAX_LENGTH)
        input_ids = torch.LongTensor(encoded.get("input_ids"))
        attention_masks = encoded.get("attention_mask")

        mask_id = tokenizer.mask_token_id
        protected_ids = (tokenizer.sep_token_id, tokenizer.cls_token_id, tokenizer.pad_token_id)

        targets = input_ids.clone()
        draws = torch.rand(input_ids.shape)
        masking_mask = draws < 0.15
        for protected_id in protected_ids:
            # Special tokens are never masked.
            masking_mask = masking_mask & (input_ids != protected_id)

        input_ids[masking_mask] = mask_id
        targets[~masking_mask] = -100
        general_logger.info(f"Data loaded successfully: {len(input_ids)} samples")
        return torch.LongTensor(input_ids), torch.LongTensor(targets), torch.LongTensor(attention_masks)

    def __repr__(self):
        """Return the name of the subtask type."""
        return "Masked Language Modelling"

    def get_scaling_weight(self):
        """Get the scaling weight of a MLM Subtask.

        The weight is the inverse of the natural logarithm of the length of the tokenizer.
        """
        vocabulary_size = len(tokenizer)
        return 1 / np.log(vocabulary_size)

In [120]:
"""This module contains the SubTaskDataset."""

from typing import List

from torch.utils.data import DataLoader, Dataset

from old_utils.logger import general_logger
from old_utils.enums import Split


class SubTaskDataset(Dataset):
    """A Dataset that serves the observations of one split of a single SubTask.

    The observations are handed out in a shuffled order that is tracked by an internal counter;
    the index passed to `__getitem__` is not used.
    """

    def __init__(self, subtask: SubTask, split: Split):
        """Initialize a SubTaskDataset for the given subtask and split."""
        general_logger.info(f"Initializing dataset for subtask {subtask.id} with split {split}")
        self.split = split
        self.subtask = subtask
        self.observations: List = []
        self._reset()

    def __len__(self):
        """Return the number of observations in the split."""
        return len(self.observations)

    def __getitem__(self, item):
        """Return the next observation as (x, attention mask, y, subtask id).

        Once every observation has been handed out, the order is reset and starts again.
        """
        if self._counter == len(self.observations):
            self._reset()
        position = self.observations[self._counter]
        split = self.split
        x = self.subtask.get_X(split=split)[position]
        masks = self.subtask.get_att_mask(split=split)[position]
        y = self.subtask.get_Y(split=split)[position]
        self._counter += 1
        return x, masks, y, self.subtask.id

    def _reset(self):
        """Rebuild the shuffled order of the observations and restart the counter."""
        general_logger.info(f"Resetting dataset for subtask {self.subtask.id}")
        n_observations = len(self.subtask.get_X(split=self.split))
        self.observations = list(range(n_observations))
        # The seed is set before every shuffle, so this is not a real 'reshuffling':
        # the observations are always arranged the same way.
        set_random_seed()
        np.random.shuffle(self.observations)
        self._counter = 0


class BatchList:
    """A BatchList is a wrapper around one dataloader per subtask.

    This BatchList will never stop; every call of `__next__` yields a super-batch that contains
    one sub-batch per subtask. An exhausted dataloader is restarted.
    """

    def __init__(self, subtask_list: List[SubTask], sub_batch_size, split=Split.TRAIN):
        """Create a dataset, a dataloader and a dataloader iterator for every subtask.

        @param subtask_list: The subtasks to draw sub-batches from.
        @param sub_batch_size: Batch size of every sub-batch.
        @param split: The split of the data to use.
        """
        general_logger.info(f"Creating BatchList with {len(subtask_list)} subtasks, batch size {sub_batch_size}")
        self.sub_batch_size = sub_batch_size
        self.datasets = {}
        for st in subtask_list:
            self.datasets[f"{st.id}"] = SubTaskDataset(subtask=st, split=split)
        self.dataloaders = {}
        for st_id, dataset in self.datasets.items():
            self.dataloaders[st_id] = DataLoader(dataset, batch_size=self.sub_batch_size)
        self._reset()

    def __next__(self):
        """Return a list with one sub-batch per subtask, in random subtask order."""
        shuffled = list(self.iter_dataloaders.items())  # List of (subtask id, iterator) tuples
        random.shuffle(shuffled)
        super_batch = []
        for st_id, loader_iter in shuffled:
            try:
                sub_batch = next(loader_iter)
            except StopIteration:
                # The dataloader of this subtask is exhausted: start a new pass over it.
                fresh_iter = iter(self.dataloaders[st_id])
                self.iter_dataloaders[st_id] = fresh_iter
                sub_batch = next(fresh_iter)
            super_batch.append(sub_batch)
        general_logger.info(f"Generated batch with {len(super_batch)} sub-batches")
        return super_batch

    def _reset(self):
        """Reset this BatchList by creating a new iterator for every dataloader."""
        self.iter_dataloaders = {st_id: iter(loader) for st_id, loader in self.dataloaders.items()}

class BatchListEvalTest:
    """A BatchListEvalTest is a wrapper around one dataloader per subtask.

    If one task is exhausted, it will stop yielding sub-batches from this task.
    Instead, it will continue until it has yielded all sub-batches from all tasks.
    """

    def __init__(self, subtask_list: List[SubTask], sub_batch_size, split=Split.TRAIN):
        """Create a dataset, a dataloader and a dataloader iterator for every subtask.

        @param subtask_list: The subtasks to draw sub-batches from.
        @param sub_batch_size: Batch size of every sub-batch.
        @param split: The split of the data to use.
        """
        general_logger.info(f"Creating BatchListEvalTest with {len(subtask_list)} subtasks")
        self.sub_batch_size = sub_batch_size
        self.datasets = {}
        for st in subtask_list:
            self.datasets[f"{st.id}"] = SubTaskDataset(subtask=st, split=split)
        self.dataloaders = {}
        for st_id, dataset in self.datasets.items():
            self.dataloaders[st_id] = DataLoader(dataset, batch_size=self.sub_batch_size)
        self._reset()

    def __len__(self):
        """Return the length of this BatchListEvalTest.

        The length is the sum of the lengths of all subtask dataloaders.
        """
        return sum(len(loader) for loader in self.dataloaders.values())

    def _reset(self):
        """Reset this BatchListEvalTest by creating a new iterator for every dataloader."""
        self.iter_dataloaders = {st_id: iter(loader) for st_id, loader in self.dataloaders.items()}

In [121]:
import itertools
# Sub-tasks: one object per (dataset, objective) pair. Creating them only stores the
# configuration; the data files are read later by `process`.
st_1_cw_hard_03 = ClassificationSubTask(
    task_id=3,
    id=300001,
    filename="03_CW_HARD/preprocessed.csv",
)
st_1_me_too_ma_108 = MultiLabelClassificationSubTask(
    task_id=108,
    id=10801,
    filename="108_MeTooMA/preprocessed.csv",
    tgt_cols_list=["hate_speech_label", "sarcasm_label"],
    num_classes=2,
    num_labels=2,
)
st_1_mdgender_116 = ClassificationSubTask(
    task_id=116,
    id=11601,
    filename="116_MDGender/preprocessed.csv",
    num_classes=6,
)
st_1_mpqa_103 = ClassificationSubTask(
    task_id=103,
    id=10301,
    filename="103_MPQA/preprocessed.csv",
)
st_1_stereotype_109 = ClassificationSubTask(
    task_id=109,
    id=10901,
    filename="109_stereotype/preprocessed.csv",
)
# NOTE(review): the same column is listed twice below — confirm whether a second,
# different label column was intended.
st_2_stereotype_109 = MultiLabelClassificationSubTask(
    task_id=109,
    id=10902,
    filename="109_stereotype/preprocessed.csv",
    tgt_cols_list=["stereotype_explicit_label", "stereotype_explicit_label"],
    num_classes=2,
    num_labels=2,
)
st_1_good_news_everyone_42 = POSSubTask(
    task_id=42,
    id=42001,
    filename="42_GoodNewsEveryone/preprocessed.csv",
    tgt_cols_list=["cue_pos"],
)
st_2_good_news_everyone_42 = POSSubTask(
    task_id=42,
    id=42002,
    filename="42_GoodNewsEveryone/preprocessed.csv",
    tgt_cols_list=["experiencer_pos"],
)
st_1_pheme_12 = ClassificationSubTask(
    task_id=12,
    id=12001,
    filename="12_PHEME/preprocessed.csv",
)
st_2_pheme_12 = ClassificationSubTask(
    task_id=12,
    id=12002,
    filename="12_PHEME/preprocessed.csv",
    tgt_cols_list=["veracity_label"],
    num_classes=3,
)
st_1_babe_10 = ClassificationSubTask(
    task_id=10,
    id=10001,
    filename="10_BABE/preprocessed.csv",
    num_classes=2,
)
st_2_babe_10 = POSSubTask(
    task_id=10,
    id=10002,
    filename="10_BABE/preprocessed.csv",
    tgt_cols_list=["biased_words"],
)
st_1_gwsd_128 = ClassificationSubTask(
    task_id=128,
    id=12801,
    filename="128_GWSD/preprocessed.csv",
    num_classes=3,
)

# Tasks: every task wraps the subtasks that share one dataset.
cw_hard_03 = Task(task_id=3, subtasks_list=[st_1_cw_hard_03])
babe_10 = Task(task_id=10, subtasks_list=[st_1_babe_10, st_2_babe_10])
me_too_ma_108 = Task(task_id=108, subtasks_list=[st_1_me_too_ma_108])
mdgender_116 = Task(task_id=116, subtasks_list=[st_1_mdgender_116])
pheme_12 = Task(task_id=12, subtasks_list=[st_2_pheme_12, st_1_pheme_12])
mpqa_103 = Task(task_id=103, subtasks_list=[st_1_mpqa_103])
stereotype_109 = Task(
    task_id=109,
    subtasks_list=[st_1_stereotype_109, st_2_stereotype_109],
)
good_news_everyone_42 = Task(
    task_id=42,
    subtasks_list=[st_1_good_news_everyone_42, st_2_good_news_everyone_42],
)
gwsd_128 = Task(task_id=128, subtasks_list=[st_1_gwsd_128])


# MBIB ###
# st_linguistic = ClassificationSubTask(task_id=11111, id=11111, filename="mbib_linguistic/preprocessed.csv", num_classes=2)
# mbib_lingustic = Task(task_id=11111, subtasks_list=[st_linguistic])

# All tasks, in the order in which they are used.
all_tasks = [
    babe_10,
    cw_hard_03,
    me_too_ma_108,
    pheme_12,
    mdgender_116,
    mpqa_103,
    stereotype_109,
    good_news_everyone_42,
    gwsd_128,
]

# Flat list of the subtasks of all tasks, in task order.
all_subtasks = [subtask for task in all_tasks for subtask in task.subtasks_list]

# Task families
media_bias = [babe_10]
subjective_bias = [cw_hard_03]
hate_speech = [me_too_ma_108]
gender_bias = [mdgender_116]
sentiment_analysis = [mpqa_103]
fake_news = [pheme_12]
group_bias = [stereotype_109]
emotionality = [good_news_everyone_42]
stance_detection = [gwsd_128]
#mlm = [mlm_0]

[2024-12-04 18:48:54,254: INFO: 3233285385: Initializing SubTask 300001 for task 3]
[2024-12-04 18:48:54,254: INFO: 3233285385: Initializing SubTask 10801 for task 108]
[2024-12-04 18:48:54,255: INFO: 3233285385: Initializing SubTask 11601 for task 116]
[2024-12-04 18:48:54,255: INFO: 3233285385: Initializing SubTask 10301 for task 103]
[2024-12-04 18:48:54,256: INFO: 3233285385: Initializing SubTask 10901 for task 109]
[2024-12-04 18:48:54,256: INFO: 3233285385: Initializing SubTask 10902 for task 109]
[2024-12-04 18:48:54,256: INFO: 3233285385: Initializing SubTask 42001 for task 42]
[2024-12-04 18:48:54,256: INFO: 3233285385: Initializing SubTask 42002 for task 42]
[2024-12-04 18:48:54,257: INFO: 3233285385: Initializing SubTask 12001 for task 12]
[2024-12-04 18:48:54,257: INFO: 3233285385: Initializing SubTask 12002 for task 12]
[2024-12-04 18:48:54,257: INFO: 3233285385: Initializing SubTask 10001 for task 10]
[2024-12-04 18:48:54,258: INFO: 3233285385: Initializing SubTask 10002 

# Building the Model
After initializing the datasets I want to use in the last step, I now build the model:
- The backbone is changed to DistilBERT as it is a smaller model and therefore faster to train.
- For each task a specific model head is needed to fulfill the task. For this, a head factory is used to decide which head to use for the specific task type.
- Apart from that the model needs a GradsWrapper to get and set the gradients of the weights and biases of all trainable layers.
- In the model factory the model is then instantiated by combining the backbone with the different model heads for the different tasks.

In [122]:
from typing import Dict
from torch import nn
from old_utils.common import rsetattr

class GradsWrapper(nn.Module):
    """Abstract module that can export and import the gradients of its parameters.

    This class must be extended and not instantiated.
    """

    def __init__(self, *args, **kwargs):
        """Initialize the module; raise RuntimeError if GradsWrapper itself is instantiated."""
        if self.__class__ is GradsWrapper:
            raise RuntimeError("Abstract class <GradsWrapper> must not be instantiated.")
        super().__init__()

    def get_grads(self) -> Dict:
        """Return a dict that maps every parameter name to a copy of its gradient.

        Parameters without a gradient are mapped to None.
        """
        grads = {}
        for name, parameter in self.named_parameters():
            grads[name] = None if parameter.grad is None else parameter.grad.clone()
        return grads

    def set_grads(self, grads: Dict):
        """Assign the given gradients to the parameters.

        @param grads: Dict that maps parameter names to the gradient to set.
        """
        for name, grad in grads.items():
            rsetattr(self, f"{name}.grad", grad)

In [123]:
"""This module contains the implementation of the heads for specific tasks as well a factory-method for deciding which head to use."""

from torch import nn
from torch.nn import CrossEntropyLoss, MSELoss
from torchmetrics import Accuracy, F1Score, MeanSquaredError, Perplexity, R2Score



def HeadFactory(st: SubTask, *args, **kwargs):
    """Decide which head to use for the specific task type.

    Additional positional and keyword arguments are forwarded to the constructor of the head.

    @param st: The subtask to create a head for.
    @return: A new head instance that matches the type of the subtask.
    @raise TypeError: If there is no head for the type of the subtask.
    """
    if isinstance(st, ClassificationSubTask):
        return ClassificationHead(num_classes=st.num_classes, class_weights=st.class_weights, *args, **kwargs)
    elif isinstance(st, MultiLabelClassificationSubTask):
        return ClassificationHead(
            num_classes=st.num_classes, num_labels=st.num_labels, class_weights=st.class_weights, *args, **kwargs)
    elif isinstance(st, POSSubTask):
        return TokenClassificationHead(num_classes=st.num_classes, class_weights=st.class_weights, *args, **kwargs)
    elif isinstance(st, RegressionSubTask):
        return RegressionHead(*args, **kwargs)
    elif isinstance(st, MLMSubTask):
        return LanguageModellingHead(*args, **kwargs)
    # Fail here instead of returning None, which would only surface later as an obscure error.
    raise TypeError(f"No head available for subtask of type {type(st).__name__}.")


class ClassificationHead(GradsWrapper):
    """Sequence-level classification head inspired by the one used in RoBERTa.

    The head reads the hidden state at sequence position 0 and projects it to
    ``num_classes * num_labels`` logits.
    """

    def __init__(
        self,
        input_dimension: int,
        hidden_dimension: int,
        dropout_prob: float,
        num_classes=2,
        num_labels=1,
        class_weights=None,
    ):
        """Initialize the head.

        Args:
            input_dimension: Size of the last dimension of the encoder output.
            hidden_dimension: Output size of the intermediate dense layer.
            dropout_prob: Dropout probability applied before each linear layer.
            num_classes: Number of classes per label.
            num_labels: Number of labels predicted per sample.
            class_weights: Optional per-class weights passed to ``CrossEntropyLoss``.
        """
        super().__init__()
        self.dense = nn.Linear(input_dimension, hidden_dimension)
        self.dropout = nn.Dropout(p=dropout_prob)
        self.out_proj = nn.Linear(hidden_dimension, num_classes * num_labels)
        self.num_classes = num_classes
        self.num_labels = num_labels
        self.loss = CrossEntropyLoss(weight=class_weights)
        self.metrics = {
            "f1": F1Score(num_classes=num_classes, mdmc_reduce="global", average="macro"),
            "acc": Accuracy(mdmc_reduce="global"),
        }

    def forward(self, X, y):
        """Run the head on the encoder output and compute loss and metrics.

        Args:
            X: Encoder output; indexed as ``X[:, 0, :]``, so it must have three dimensions.
            y: Targets; ``y.shape[0]`` is taken as the batch size.

        Returns:
            Tuple ``(logits, loss, metrics_values)`` where ``logits`` has shape
            ``(batch_size, num_classes, num_labels)``.
        """
        batch_size = y.shape[0]  # size of data in this subbatch

        x = X[:, 0, :]  # take <s> token (equiv. to [CLS])

        # pass CLS through classifier
        x = self.dropout(x)
        x = self.dense(x)
        x = torch.tanh(x)
        x = self.dropout(x)
        logits = self.out_proj(x)

        loss = self.loss(logits.view(-1, self.num_classes), y.view(-1))
        # The loss above interprets the flat output as (batch, num_labels, num_classes).
        # Keep that interpretation and only move the class axis to dim 1 for the metrics.
        # A plain view(batch, num_classes, num_labels) would re-read the same buffer with a
        # different layout and mix up classes and labels whenever num_labels > 1.
        # For num_labels == 1 both forms produce the same tensor.
        logits = logits.view(batch_size, self.num_labels, self.num_classes).transpose(1, 2)
        metrics_values = {k: metric(logits.cpu(), y.cpu()) for k, metric in self.metrics.items()}
        return logits, loss, metrics_values


class TokenClassificationHead(GradsWrapper):
    """Per-token classification head inspired by the one used in RoBERTa."""

    def __init__(self, num_classes: int, class_weights, hidden_dimension: int, dropout_prob: float, *args, **kwargs):
        """Build the dropout layer, the linear classifier, the loss and the metrics.

        Args:
            num_classes: Number of classes predicted per token.
            class_weights: Per-class weights passed to ``CrossEntropyLoss``.
            hidden_dimension: Input size of the classifier layer.
            dropout_prob: Dropout probability applied to the encoder output.
            *args: Forwarded to the parent constructor.
            **kwargs: Forwarded to the parent constructor.
        """
        super().__init__(*args, **kwargs)
        self.dropout_LM = nn.Dropout(p=dropout_prob)
        # NOTE(review): the classifier input size is hidden_dimension, so this presumably
        # assumes the encoder output width equals hidden_dimension -- confirm with the config.
        self.classifier = nn.Linear(hidden_dimension, num_classes)
        self.loss = CrossEntropyLoss(weight=class_weights)
        self.num_classes = num_classes
        self.metrics = {
            "f1": F1Score(num_classes=num_classes, mdmc_reduce="global", average="macro"),
            "acc": Accuracy(mdmc_reduce="global"),
        }

    def forward(self, X, y):
        """Classify every token and compute loss and metrics.

        Positions whose target equals -100 are removed before the metrics are computed.

        Returns:
            Tuple ``(logits, loss, metrics_values)`` where ``logits`` only contains the
            rows of the kept positions, shaped ``(number_of_kept_positions, num_classes)``.
        """
        all_logits = self.classifier(self.dropout_LM(X))
        loss = self.loss(all_logits.view(-1, self.num_classes), y.view(-1))

        # Boolean mask of the positions that carry a real label (target != -100).
        keep = y != -100
        kept_values = torch.masked_select(all_logits, keep.unsqueeze(-1).expand(all_logits.size()))
        kept_targets = torch.masked_select(y, keep)
        kept_logits = kept_values.view(kept_targets.shape[0], self.num_classes)

        metrics_values = {}
        for name, metric in self.metrics.items():
            metrics_values[name] = metric(kept_logits.cpu(), kept_targets.cpu())

        return kept_logits, loss, metrics_values


class RegressionHead(GradsWrapper):
    """Regression head inspired by the classifier used in RoBERTa; predicts one value per sample."""

    def __init__(self, input_dimension: int, hidden_dimension: int, dropout_prob: float):
        """Build the layers, the MSE loss and the metrics.

        Args:
            input_dimension: Size of the last dimension of the encoder output.
            hidden_dimension: Output size of the intermediate dense layer.
            dropout_prob: Dropout probability applied before each linear layer.
        """
        super().__init__()
        self.dense = nn.Linear(input_dimension, hidden_dimension)
        self.dropout = nn.Dropout(p=dropout_prob)
        self.out_proj = nn.Linear(hidden_dimension, 1)
        self.loss = MSELoss()
        self.metrics = {"R2": R2Score(), "MSE": MeanSquaredError()}  # Needs at least 2 samples

    def forward(self, X, y):
        """Run the head on the encoder output and compute loss and metrics.

        Returns:
            Tuple ``(logits, loss, metrics_values)``; the metric values are detached.
        """
        first_token = X[:, 0, :]  # take <s> token (equiv. to [CLS])
        hidden = torch.tanh(self.dense(self.dropout(first_token)))
        logits = self.out_proj(self.dropout(hidden))

        loss = self.loss(logits.squeeze(), y.squeeze())

        metrics_values = {}
        for name, metric in self.metrics.items():
            metrics_values[name] = metric(logits.cpu(), y.cpu()).detach()

        return logits, loss, metrics_values


class LanguageModellingHead(GradsWrapper):
    """RoBERTa-style head for masked language modelling."""

    def __init__(self, input_dimension: int, hidden_dimension: int, dropout_prob: float):
        """Build the dense layer, layer norm, activation, loss and vocabulary decoder.

        ``dropout_prob`` is accepted but not used by this head. The output size is
        read from the module-level ``tokenizer``.
        """
        super().__init__()
        self.dense = nn.Linear(input_dimension, hidden_dimension)
        self.layer_norm = nn.LayerNorm(hidden_dimension, eps=1e-5)
        self.gelu = torch.nn.GELU()
        self.loss = CrossEntropyLoss()

        # one output unit per token of the vocabulary
        vocab_size = tokenizer.vocab_size
        self.decoder = nn.Linear(hidden_dimension, vocab_size)
        # the decoder bias is replaced by a zero-initialised parameter that is also
        # reachable as ``self.bias``
        self.bias = nn.Parameter(torch.zeros(tokenizer.vocab_size))
        self.decoder.bias = self.bias
        self.metrics = {"perplexity": Perplexity()}

    def forward(self, X, y):
        """Transform the encoder output and project it to vocabulary size.

        Returns:
            Tuple ``(logits, loss, metrics_values)``.
        """
        hidden = self.layer_norm(self.gelu(self.dense(X)))

        # project back to size of vocabulary
        logits = self.decoder(hidden)
        loss = self.loss(logits.view(-1, tokenizer.vocab_size), y.view(-1))

        metrics_values = {}
        for name, metric in self.metrics.items():
            metrics_values[name] = metric(logits.cpu(), y.cpu())

        return logits, loss, metrics_values


In [124]:
class BackboneLM(GradsWrapper):
    """Wrapper around the language encoder that is shared across all tasks."""

    def __init__(self):
        """Load the pretrained ``distilbert-base-uncased`` encoder from Hugging Face."""
        super().__init__()
        # imported here so that transformers is only needed once a backbone is created
        from transformers import DistilBertModel

        pretrained_name = 'distilbert-base-uncased'
        self.backbone = DistilBertModel.from_pretrained(pretrained_name)

class Model(nn.Module):
    """Multi-task model: one shared backbone encoder plus one head per subtask."""

    def __init__(self, stl: List, *args, **kwargs):
        """Create the backbone and one head per subtask.

        Args:
            stl: List of subtasks; each must expose an ``id``.
            *args: Forwarded to ``HeadFactory`` for every head.
            **kwargs: Forwarded to ``HeadFactory`` for every head.
        """
        super().__init__()
        self.stl = stl

        # lookup table from integer subtask id to the subtask object
        id_lookup = {}
        for subtask in stl:
            id_lookup[int(f"{subtask.id}")] = subtask
        self.subtask_id_to_subtask = id_lookup

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # shared encoder; its embedding matrix is resized to the module-level tokenizer
        self.language_model = BackboneLM()
        self.language_model.backbone.resize_token_embeddings(len(tokenizer))

        # task-specific heads, keyed by the stringified subtask id
        heads = {}
        for subtask in stl:
            heads[str(subtask.id)] = HeadFactory(subtask, *args, **kwargs)
        self.heads = nn.ModuleDict(heads)

    def forward(self, X, attention_masks, Y, st_id):
        """Encode the input and evaluate the head selected by ``st_id``.

        Args:
            X: Token ids passed to the backbone as ``input_ids``.
            attention_masks: Attention mask passed to the backbone.
            Y: Targets handed to the head.
            st_id: Tensor holding the subtask id; read with ``.item()``.

        Returns:
            Tuple ``(loss, metric_values)`` as produced by the head; the logits are dropped.
        """
        encoder_output = self.language_model.backbone(input_ids=X, attention_mask=attention_masks)
        x_enc = encoder_output.last_hidden_state

        head_key = str(st_id.item())
        _, loss, metric_values = self.heads[head_key](x_enc, Y)
        return loss, metric_values

In [125]:
"""Module for creating instantiating the appropriate model defined by the task list only."""

from typing import List


def ModelFactory(
    task_list: List,
    sub_batch_size: int,
    eval_batch_size: int,
    pretrained_path: str = None,
    *args,
    **kwargs
):
    """Create the model and the batch lists for all splits.

    Args:
        task_list: Tasks whose ``subtasks_list`` entries define the heads and the data.
        sub_batch_size: Batch size of the TRAIN batch list and of both evaluation batch lists.
        eval_batch_size: Batch size of the DEV batch list.
        pretrained_path: Optional path to weights that are loaded into the model.
        *args: Forwarded to ``Model`` (and from there to the heads).
        **kwargs: Forwarded to ``Model`` (and from there to the heads).

    Returns:
        Tuple ``(model, batch_list_train, batch_list_dev, batch_list_eval, batch_list_test)``.

    Raises:
        AssertionError: If a subtask has not been processed yet.
    """
    # Get all subtasks from task list
    subtask_list = [st for t in task_list for st in t.subtasks_list]

    # Verify data is processed
    for st in subtask_list:
        assert st.processed, "Data must be loaded at this point."

    # Create model. Extra positional arguments used to be dropped here although the
    # signature accepts them; they are now forwarded just like the keyword arguments.
    model = Model(subtask_list, *args, **kwargs)

    if pretrained_path is not None:
        model = load_pretrained_weights(model, pretrained_path=pretrained_path)

    # Move model to appropriate device
    model.to(model.device)

    # Create dataloaders
    batch_list_train = BatchList(
        subtask_list=subtask_list,
        sub_batch_size=sub_batch_size,
        split=Split.TRAIN
    )

    batch_list_dev = BatchList(
        subtask_list=subtask_list,
        sub_batch_size=eval_batch_size,
        split=Split.DEV
    )

    # NOTE(review): the two evaluation lists below use sub_batch_size, only the DEV list
    # above uses eval_batch_size -- confirm this is intended.
    batch_list_eval = BatchListEvalTest(
        subtask_list=subtask_list,
        sub_batch_size=sub_batch_size,
        split=Split.DEV
    )

    batch_list_test = BatchListEvalTest(
        subtask_list=subtask_list,
        sub_batch_size=sub_batch_size,
        split=Split.TEST
    )

    return model, batch_list_train, batch_list_dev, batch_list_eval, batch_list_test


def save_head_initializations(model):
    """Write the state dict of every head to ``model_files/heads/<head name>_init.pth``.

    Only needed once to store the initial weights of all tasks; the target
    directory is not created by this function.
    """
    for head_name, head in model.heads.items():
        target_path = f"model_files/heads/{head_name}_init.pth"
        torch.save(head.state_dict(), target_path)
    
def load_head_initializations(model):
    """Load the stored initial weights of every head to ensure reproducibility.

    For each head the file ``model_files/heads/<head name>_init.pth`` is loaded
    with ``strict=True``, so missing or unexpected keys raise an error.
    """
    for head_name, head in model.heads.items():
        weights_path = 'model_files/heads/' + head_name + '_init.pth'
        # Load onto the CPU first: without map_location a file saved from a CUDA device
        # cannot be read on a machine without CUDA. load_state_dict copies the values into
        # the existing parameters, so the device of the head itself is not changed.
        head_weights = torch.load(weights_path, map_location="cpu")
        head.load_state_dict(head_weights, strict=True)

def load_pretrained_weights(model, pretrained_path):
    """Load the weights stored at ``pretrained_path`` into ``model`` and return the model.

    The state dict is applied with ``strict=False``, so keys that are missing from
    or unknown to the model are ignored.

    Args:
        model: Module that receives the weights.
        pretrained_path: Path of a file written with ``torch.save``.

    Returns:
        The same ``model`` object.
    """
    # torch.load unpickles the file: only pass files from a trusted source.
    # map_location="cpu" keeps the load working on machines without CUDA for files saved
    # from a CUDA device; load_state_dict copies the values into the existing parameters.
    weight_dict = torch.load(pretrained_path, map_location="cpu")
    model.load_state_dict(weight_dict, strict=False)
    return model


# Training the Model

For the model training, the MAGPIE repository first introduces some helper functions. Since they are specific to the training, I include them in the notebook instead of importing them from a separate module like the other utility functions.

In [126]:
from urllib.parse import urlparse

"""This module contains helper classes for model training."""

import copy
import logging
import math
from enum import Enum
from typing import Dict, List

import torch
from old_utils.enums import Split
import wandb

        
class Logger:
    """Logger to keep track of metrics, losses and artifacts.

    This logger is used as an abstraction. If we want to integrate with third party providers (wandb, GCS, ...),
    use this logger.
    """

    def __init__(self, experiment_name: str):
        """Initialize a Logger that writes to ``logging/<experiment_name>/train_data.log``.

        The directory is created if it does not exist. All instances share the
        ``logging`` logger named ``"experiment_logger"``.

        Args:
            experiment_name: Name of the sub-directory below ``logging/``.
        """
        PATH = "logging/" + experiment_name
        os.makedirs(PATH, exist_ok=True)

        self.experiment_logfilename = PATH + "/train_data.log"
        self.experiment_logger = logging.getLogger("experiment_logger")
        self.experiment_logger.setLevel("INFO")

        # getLogger returns the same object on every call. Adding a new FileHandler on each
        # instantiation (e.g. when a notebook cell is re-run) made every message appear once
        # per instantiation in the log file, so only attach a handler for this file once.
        target = os.path.abspath(self.experiment_logfilename)
        already_attached = any(
            isinstance(handler, logging.FileHandler) and handler.baseFilename == target
            for handler in self.experiment_logger.handlers
        )
        if not already_attached:
            experiment_logfile_handler = logging.FileHandler(filename=self.experiment_logfilename)
            experiment_logfile_handler.setFormatter(logging.Formatter(fmt="%(message)s"))
            self.experiment_logger.addHandler(experiment_logfile_handler)

    def log(self, out):
        """Write ``out`` to the experiment log and send it to wandb."""
        self.experiment_logger.info(out)
        wandb.log(out)


class EarlyStopperSingle:
    """
    Early stopping state for a single branch of the model.

    Inspired by .https://stackoverflow.com/questions/71998978/early-stopping-in-pytorch.
    """

    def __init__(self, patience: int, min_delta: int, resurrection: bool):
        """Store the settings and start with empty counters.

        Args:
            patience: Number of worsening dev losses after which ``early_stop`` returns True.
            min_delta: Margin above the best dev loss that a loss must exceed to count as worse.
            resurrection: Whether ``resurrect`` may ever return True.
        """
        self.patience = patience
        self.patience_zombie = 10
        self.min_delta = min_delta
        self.counter = 0
        self.counter_zombie = 0
        self.min_dev_loss = math.inf
        self.min_dev_loss_zombie = math.inf
        self.resurrection = resurrection

    def early_stop(self, dev_loss):
        """Update the state with ``dev_loss`` and report whether training should stop.

        NaN losses are ignored. A new minimum resets the counter; a loss above
        ``min_dev_loss + min_delta`` increments it.
        """
        if math.isnan(dev_loss):
            return False

        if dev_loss < self.min_dev_loss:
            self.min_dev_loss = dev_loss
            self.counter = 0
            return False

        if dev_loss > (self.min_dev_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True

        return False

    def resurrect(self, dev_loss):
        """Update the zombie state with ``dev_loss`` and report whether a dead task should resurrect.

        Always False for NaN losses or when resurrection is disabled.
        """
        if not self.resurrection or math.isnan(dev_loss):
            return False

        if dev_loss < self.min_dev_loss_zombie:
            self.min_dev_loss_zombie = dev_loss
            self.counter_zombie = 0
            return False

        if dev_loss > self.min_dev_loss_zombie:
            self.counter_zombie += 1
            if self.counter_zombie >= self.patience_zombie:
                return True

        return False

    def reset_early_stopper(self):
        """Reset both counters and both tracked minima.

        As zombies can resurrect and die multiple times, the internal state has to
        be cleared each time a zombie resurrects or dies.
        """
        self.counter = 0
        self.counter_zombie = 0
        self.min_dev_loss = math.inf
        self.min_dev_loss_zombie = math.inf


class EarlyStoppingMode(Enum):
    """Enum for early stopping mode."""

    HEADS = "heads"  # Only stop heads
    BACKBONE = "backbone"  # Also stop backbone
    NONE = "none"  # Disabled: EarlyStopper never reports a stop or a resurrection in this mode


class EarlyStopper:
    """Container holding one ``EarlyStopperSingle`` per head."""

    def __init__(self, st_ids: List[str], mode: EarlyStoppingMode, patience, resurrection: bool, min_delta=0):
        """Create one early stopper per subtask id.

        Args:
            st_ids: Subtask ids to track.
            mode: Early stopping mode; with ``EarlyStoppingMode.NONE`` nothing is ever stopped.
            patience: Mapping from subtask id to the patience of that subtask.
            resurrection: Whether stopped subtasks may resurrect.
            min_delta: Margin shared by all subtasks.
        """
        self.mode = mode
        self.early_stoppers = {}
        for st_id in st_ids:
            self.early_stoppers[st_id] = EarlyStopperSingle(
                patience=patience[st_id], min_delta=min_delta, resurrection=resurrection
            )

    def early_stop(self, st_id, dev_loss):
        """Return True if the subtask should stop; always False when the mode is NONE."""
        if self.mode == EarlyStoppingMode.NONE:
            return False
        return self.early_stoppers[st_id].early_stop(dev_loss=dev_loss)

    def resurrect(self, st_id, dev_loss):
        """Return True if a dead subtask should resurrect; always False when the mode is NONE."""
        if self.mode == EarlyStoppingMode.NONE:
            return False
        return self.early_stoppers[st_id].resurrect(dev_loss=dev_loss)

    def reset_early_stopper(self, st_id):
        """Reset the early stopper of one subtask."""
        single = self.early_stoppers[st_id]
        single.reset_early_stopper()


class Accumulator:
    """Abstract base for gradient accumulators."""

    def __init__(self):
        """Start without gradients and with an update count of zero.

        Raises:
            RuntimeError: If ``Accumulator`` itself (not a subclass) is instantiated.
        """
        if type(self) is Accumulator:
            raise RuntimeError("Abstract class <Accumulator> must not be instantiated.")
        self.gradients = None
        self.n = 0

    def update(self, gradients):
        """Add a new set of gradients.

        Must be overwritten by concrete implementation.
        """
        raise NotImplementedError

    def get_avg_gradients(self):
        """Return a copy of the gradients divided by the update count, with axis 0 squeezed.

        The stored gradients are left untouched.
        """
        averaged = copy.deepcopy(self.gradients)
        for name in list(averaged):
            averaged[name] /= self.n
            averaged[name] = averaged[name].squeeze(dim=0)
        return averaged

    def get_gradients(self):
        """Return the stored gradients as they are (no copy)."""
        return self.gradients


class StackedAccumulator(Accumulator):
    """Accumulate the gradients for one SubTask within one Super-Batch by stacking them."""

    def __init__(self):
        """Initialize a StackedAccumulator."""
        super(StackedAccumulator, self).__init__()

    def update(self, gradients, weight=1.0):
        """Append a new set of gradients along axis 0.

        Args:
            gradients: Mapping from parameter name to gradient tensor. It is not modified.
            weight: Factor every gradient is multiplied with before it is stored.
        """
        if not self.gradients:
            # Build a new dict instead of adopting and rewriting the caller's dict, which
            # used to leave the caller with unsqueezed, weighted tensors after the first update.
            self.gradients = {k: v.unsqueeze(dim=0) * weight for k, v in gradients.items()}
        else:
            for k, v in self.gradients.items():
                new_value = gradients[k].unsqueeze(dim=0) * weight
                self.gradients[k] = torch.cat((v, new_value), dim=0)
        self.n += 1

    def set_gradients(self, gradients: Dict[str, torch.tensor]):
        """Replace every stored gradient by the matching entry of ``gradients`` (with a new axis 0).

        The update count ``n`` is left unchanged.
        """
        for k, v in self.gradients.items():
            self.gradients[k] = gradients[k].unsqueeze(dim=0)


class RunningSumAccumulator(Accumulator):
    """Keep track of the running sum of gradients."""

    def __init__(self):
        """Initialize a RunningSumAccumulator."""
        super(RunningSumAccumulator, self).__init__()

    def update(self, gradients: Dict[str, torch.tensor], weight=1.0) -> None:
        """Add a new set of gradients to the running sum.

        Args:
            gradients: Mapping from parameter name to gradient tensor. It is not modified.
            weight: Factor every gradient is multiplied with before it is added.
        """
        if not self.gradients:
            # Build a new dict instead of adopting and rewriting the caller's dict, which
            # used to leave the caller with unsqueezed, weighted tensors after the first update.
            # The leading axis of size 1 is kept so the sum has the same layout as a stack.
            self.gradients = {k: v.unsqueeze(dim=0) * weight for k, v in gradients.items()}
        else:
            for k, v in self.gradients.items():
                new_value = gradients[k].unsqueeze(dim=0) * weight
                self.gradients[k] = torch.add(v, new_value)
        self.n += 1


class AverageMeter:
    """Collects the values of one metric and reports their means."""

    def __init__(self, name):
        """Create an empty meter called ``name``."""
        self.values = []
        self.name = name

    def mean_last_k(self, k=10):
        """Return the mean of the last ``k`` values, or NaN if fewer than ``k`` were recorded."""
        assert k >= 1
        if len(self.values) < k:
            return float("NaN")
        recent = self.values[-k:]
        return np.mean(recent)

    def mean_all(self):
        """Return the mean of all recorded values."""
        return np.mean(self.values)

    def update(self, value=0):
        """Record a new value."""
        self.values.append(value)

    def reset(self):
        """Discard all recorded values."""
        self.values.clear()

    def __repr__(self):
        """Format the most recent value with two decimals."""
        latest = self.mean_last_k(1)
        return f"{latest:.2f}"


class Tracker:
    """Keep track of all metrics and losses of an epoch."""

    def __init__(self, heads, logger: Logger):
        """Initialize a Tracker.

        Args:
            heads: Mapping from subtask id to head; every head must expose a ``metrics`` dict.
            logger: Object whose ``log`` method receives the dictionaries built by ``log``.
        """
        self.metrics = self.init_metrics(heads=heads)
        self.losses, self.combined_losses = self.init_losses(heads=heads)
        self.logger = logger

    @staticmethod
    def _split_tags():
        """Return (split, name tag) pairs in the order used by all per-split dictionaries."""
        return (
            (Split.TRAIN, "train"),
            (Split.DEV, "dev"),
            (Split.TEST, "test"),
            (Split.EVAL, "eval"),
        )

    def init_losses(self, heads):
        """Create the loss meters.

        Returns:
            Tuple ``(losses, combined_losses)``: ``losses[split][str(st_id)]`` is the meter of
            one subtask, ``combined_losses[split]`` the meter of the combined loss.
        """
        losses = {
            split: {f"{st_id}": AverageMeter(name=f"{st_id}_{tag}_loss") for st_id, _ in heads.items()}
            for split, tag in self._split_tags()
        }
        combined_losses = {
            split: AverageMeter(name=f"combined_{tag}_loss") for split, tag in self._split_tags()
        }
        return losses, combined_losses

    def init_metrics(self, heads: Dict):
        """Create one meter per split, subtask and metric name.

        Returns:
            Nested dict addressed as ``metrics[split][st_id][metric_name]``.
        """
        return {
            split: {
                st_id: {m: AverageMeter(name=f"{st_id}_{tag}_{m}") for m in head.metrics.keys()}
                for st_id, head in heads.items()
            }
            for split, tag in self._split_tags()
        }

    def update_metric(self, split, st_id, metric, value):
        """Update the metric, given a split and subtask id."""
        self.metrics[split][st_id][metric].update(value=value)

    def update_loss(self, split, st_id, value):
        """Update the loss, given a split and subtask id."""
        self.losses[split][st_id].update(value)

    def update_combined_loss(self, split, value):
        """Update the combined losses, given a split."""
        self.combined_losses[split].update(value)

    def get_last_st_loss(self, split, st_id, k):
        """Get the mean of the last ``k`` losses of a subtask."""
        return self.losses[split][st_id].mean_last_k(k=k)

    def get_last_st_metric(self, split, st_id, k):
        """Get the mean of the last ``k`` values of the first metric registered for a subtask."""
        return self.metrics[split][st_id][next(iter(self.metrics[split][st_id]))].mean_last_k(k=k)

    def __repr__(self):
        """Represent a Tracker."""
        return f"TRAIN LOSS: {self.combined_losses[Split.TRAIN]} - DEV LOSS: {self.combined_losses[Split.DEV]} - EVAL LOSS: {self.combined_losses[Split.EVAL]}"

    def log(self, splits: List[Split], additional_payload: Dict[str, float] = None):
        """Log the metrics & losses of a list of splits.

        For the TRAIN and DEV splits the most recent value of every meter is logged,
        for all other splits the mean over all recorded values.

        Args:
            splits: Splits to include.
            additional_payload: Optional extra entries that are placed first in the logged dict.
        """
        out: Dict[str, float] = {**(additional_payload or {})}
        for split in splits:
            use_latest = split in [Split.DEV, Split.TRAIN]

            def summarize(meter, use_latest=use_latest):
                """Reduce one meter to the value that is logged for the current split."""
                return meter.mean_last_k(1) if use_latest else meter.mean_all()

            metrics = {m.name: summarize(m) for d in self.metrics[split].values() for m in d.values()}
            combined_losses = self.combined_losses[split]
            losses = {v.name: summarize(v) for v in self.losses[split].values()}
            out = {**out, **metrics, combined_losses.name: summarize(combined_losses), **losses}

        self.logger.log(out)

In [127]:
from typing import Dict, Optional
import torch
from old_utils.enums import AggregationMethod
import random  # Added missing import

class GradientAggregator:
    """Aggregator class for combining possibly conflicting gradients into one "optimal" grad."""

    def __init__(self, aggregation_method: AggregationMethod = AggregationMethod.MEAN):
        """Initialize GradientAggregator.

        Args:
            aggregation_method: Strategy used to combine the gradients. MEAN uses a running
                sum, every other method keeps the individual gradients stacked.
        """
        self.aggregation_method = aggregation_method
        self.reset_accumulator()
        self._conflicting_gradient_count = 0
        self._nonconflicting_gradient_count = 0

    def reset_accumulator(self) -> None:
        """Replace the accumulator by a new, empty one that fits the aggregation method."""
        self.accumulator = (
            RunningSumAccumulator() if self.aggregation_method == AggregationMethod.MEAN
            else StackedAccumulator()
        )

    def find_nonconflicting_grad(self, grad_tensor: torch.Tensor) -> torch.Tensor:
        """Use one of the algorithms to find a nonconflicting gradient.

        Args:
            grad_tensor: Gradients of one layer, stacked along axis 0 (one entry per subtask).

        Raises:
            ValueError: If the aggregation method is neither PCGRAD nor PCGRAD_ONLINE.
        """
        if self.aggregation_method == AggregationMethod.PCGRAD:
            return self.pcgrad(grad_tensor).mean(dim=0)
        elif self.aggregation_method == AggregationMethod.PCGRAD_ONLINE:
            assert len(grad_tensor) == 2
            return self.pcgrad_online(grad_tensor)
        else:
            raise ValueError(
                f"No conflict resolution is available for aggregation method {self.aggregation_method}."
            )

    def aggregate_gradients(self) -> Dict[str, torch.Tensor]:
        """Aggregate the accumulated, possibly conflicting gradients into one gradient per layer.

        Returns:
            Mapping from layer key to the aggregated gradient.

        Raises:
            ValueError: If the aggregation method is not supported.
        """
        conflicting_grads = self.accumulator.get_gradients()
        length = len(conflicting_grads[list(conflicting_grads.keys())[0]])

        if (self.aggregation_method == AggregationMethod.PCGRAD_ONLINE
                or self.aggregation_method == AggregationMethod.MEAN):
            # both methods keep a single row per layer in the accumulator
            assert length == 1
            return self.accumulator.get_avg_gradients()
        elif self.aggregation_method == AggregationMethod.PCGRAD:
            # turn the dict of stacked tensors into one dict per stacked entry
            conflicting_grads = [{k: v[i, ...] for k, v in conflicting_grads.items()}
                               for i in range(length)]
            final_grad: Dict[str, torch.Tensor] = {}

            if len(conflicting_grads) == 1:
                return conflicting_grads[0]

            keys = list(conflicting_grads[0].keys())
            for layer_key in keys:
                list_of_st_grads = [st_grad[layer_key] for st_grad in conflicting_grads]
                final_grad.update({
                    layer_key: self.find_nonconflicting_grad(torch.stack(list_of_st_grads, dim=0))
                })
            return final_grad
        else:
            raise ValueError(f"Unsupported aggregation method {self.aggregation_method}.")

    def pcgrad(self, grad_tensor: torch.Tensor) -> torch.Tensor:
        """Project conflicting gradients onto the normal plane of the gradients they conflict with.

        Every gradient is compared with all stacked gradients in random order; a negative
        dot product counts as a conflict and triggers a projection.

        Returns:
            Tensor of the same shape as ``grad_tensor`` holding the projected gradients.
        """
        pc_grads, num_of_tasks = grad_tensor.clone(), len(grad_tensor)
        original_shape = grad_tensor.shape
        pc_grads = pc_grads.view(num_of_tasks, -1)
        grad_tensor = grad_tensor.view(num_of_tasks, -1)

        for g_i in range(num_of_tasks):
            task_index = list(range(num_of_tasks))
            random.shuffle(task_index)
            for g_j in task_index:
                dot_product = pc_grads[g_i].dot(grad_tensor[g_j])
                if dot_product < 0:
                    pc_grads[g_i] -= ((dot_product / (grad_tensor[g_j].norm() ** 2))
                                    * grad_tensor[g_j])
                    self._conflicting_gradient_count += 1
                else:
                    self._nonconflicting_gradient_count += 1
        return pc_grads.view(original_shape)

    def pcgrad_online(self, grad_tensor: torch.Tensor) -> torch.Tensor:
        """Perform the pcgrad (online) algorithm on exactly two stacked gradients.

        The first entry is projected if it conflicts with the last one, then the last
        entry is added to it.
        """
        assert len(grad_tensor) == 2
        p = grad_tensor[0]
        g = grad_tensor[-1]

        p = p.view(-1)
        g = g.view(-1)

        dot_product = p.dot(g)
        if dot_product < 0:
            p = p - (dot_product / (g.norm() ** 2)) * g
            self._conflicting_gradient_count += 1
        else:
            self._nonconflicting_gradient_count += 1

        p += g
        return p.view(grad_tensor[0].shape)

    def aggregate_gradients_online(self) -> Dict[str, torch.Tensor]:
        """Aggregate the current overall gradient with a new gradient.

        Raises:
            ValueError: If the accumulator holds more than two stacked gradients.
        """
        conflicting_grads = self.accumulator.get_gradients()
        length = len(conflicting_grads[list(conflicting_grads.keys())[0]])
        conflicting_grads = [{k: v[i, ...] for k, v in conflicting_grads.items()}
                           for i in range(length)]
        current_overall_grad: Dict[str, torch.Tensor] = {}

        if length == 1:
            return conflicting_grads[0]
        elif length == 2:
            keys = list(conflicting_grads[0].keys())
            for layer_key in keys:
                list_of_st_grads = [st_grad[layer_key] for st_grad in conflicting_grads]
                current_overall_grad.update({
                    layer_key: self.find_nonconflicting_grad(torch.stack(list_of_st_grads, dim=0))
                })
            return current_overall_grad
        else:
            raise ValueError(f"Online aggregation expects 1 or 2 stacked gradients, got {length}.")

    def update(self, gradients: Dict[str, torch.Tensor], scaling_weight: float) -> None:
        """Add a new set of gradients to the accumulator.

        With PCGRAD_ONLINE the accumulator content is immediately collapsed into the
        aggregated gradient.
        """
        self.accumulator.update(gradients=gradients, weight=scaling_weight)
        if self.aggregation_method == AggregationMethod.PCGRAD_ONLINE:
            self.accumulator.set_gradients(gradients=self.aggregate_gradients_online())

    def get_conflicting_gradients_ratio(self) -> Optional[float]:
        """Get the share of gradient comparisons that were conflicting.

        Raises:
            RuntimeError: If the aggregation method is MEAN or if no comparison was counted yet.
        """
        if self.aggregation_method == AggregationMethod.MEAN:
            raise RuntimeError("The conflicting-gradient ratio is not tracked for AggregationMethod.MEAN.")
        if self._conflicting_gradient_count + self._nonconflicting_gradient_count == 0:
            raise RuntimeError("No gradient comparison has been counted yet.")
        return (self._conflicting_gradient_count /
                (self._conflicting_gradient_count + self._nonconflicting_gradient_count))

In [141]:
 """This module contains the trainer class."""
import statistics as stats
from typing import Any, Dict, List
from config.config import MAX_NUMBER_OF_STEPS

import numpy as np
import torch
from tqdm import tqdm
from transformers import get_polynomial_decay_schedule_with_warmup

from old_utils.enums import AggregationMethod, LossScaling, Split


class Trainer:
    """Trainer class to train and evaluate a model.

    The trainer owns:
      - the model and the per-split batch lists returned by ``ModelFactory``,
      - one AdamW optimizer + polynomial-decay scheduler for the shared backbone,
      - one AdamW optimizer + polynomial-decay scheduler per head,
      - per-subtask "alive" / "zombie" flags that drive early stopping and resurrection,
      - a ``Tracker`` for losses/metrics and a ``GradientAggregator`` for the backbone gradients.
    """

    def __init__(
        self,
        task_list: List[Task],
        initial_lr,
        model_name: str,
        pretrained_path: str,
        sub_batch_size: int,
        eval_batch_size: int,
        early_stopping_mode,
        resurrection: bool,
        aggregation_method: AggregationMethod,
        loss_scaling: LossScaling,
        num_warmup_steps: int,
        head_specific_lr_dict: Dict[str, float],
        head_specific_patience_dict: Dict[str, int],
        head_specific_max_epoch_dict: Dict[str, int],
        logger: Logger,
        *args,
        **kwargs,
    ):
        """Initialize a Trainer.

        @param task_list: Tasks to train on; forwarded to ModelFactory. Their subtasks also
            provide the static scaling weights via ``get_scaling_weight()``.
        @param initial_lr: Learning rate of the backbone (language model) optimizer.
        @param model_name: Name used for the file written by ``save_model``.
        @param pretrained_path: Forwarded to ModelFactory.
        @param sub_batch_size: Forwarded to ModelFactory.
        @param eval_batch_size: Forwarded to ModelFactory.
        @param early_stopping_mode: Passed to the EarlyStopper as ``mode``; also compared against
            ``EarlyStoppingMode.BACKBONE`` when deciding whether to feed gradients to the aggregator.
        @param resurrection: Forwarded to the EarlyStopper.
        @param aggregation_method: Forwarded to the GradientAggregator.
        @param loss_scaling: If ``LossScaling.STATIC``, the per-subtask scaling weights are used;
            otherwise every subtask is weighted with 1.0.
        @param num_warmup_steps: Warmup steps for the backbone scheduler and every head scheduler.
        @param head_specific_lr_dict: Learning rate per head, indexed with the head's key.
        @param head_specific_patience_dict: Passed to the EarlyStopper as ``patience``.
        @param head_specific_max_epoch_dict: Max epochs per head; used to size the schedulers.
        @param logger: Stored on the trainer and handed to the Tracker.
        @param args: Extra positional arguments, forwarded to ModelFactory.
        @param kwargs: Extra keyword arguments, forwarded to ModelFactory.
        """
        self.early_stopping_mode = early_stopping_mode
        self.logger = logger
        self.loss_scaling = loss_scaling
        # ModelFactory returns the model plus one batch list per split, in this fixed order.
        self.model, batch_list_train, batch_list_dev, batch_list_eval, batch_list_test = ModelFactory(
            task_list=task_list,
            sub_batch_size=sub_batch_size,
            eval_batch_size=eval_batch_size,
            pretrained_path=pretrained_path,
            *args,
            **kwargs,
        )
        self.batch_lists = {
            Split.TRAIN: batch_list_train,
            Split.DEV: batch_list_dev,
            Split.EVAL: batch_list_eval,
            Split.TEST: batch_list_test,
        }

        # shared backbone model optimizer
        self.lm_optimizer = torch.optim.AdamW(self.model.language_model.backbone.parameters(), lr=initial_lr)
        # Scheduler horizon for the backbone: (length of the longest train dataloader)
        # * (median of the per-head max epochs).
        self.lm_lr_scheduler = get_polynomial_decay_schedule_with_warmup(
            optimizer=self.lm_optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=max([len(dl) for dl in self.batch_lists[Split.TRAIN].dataloaders.values()])
            * stats.median(head_specific_max_epoch_dict.values()),
        )

        # task-specifics optimizers
        # NOTE(review): the dict is keyed by str(st_id) but indexed below with the raw st_id;
        # this only works if the keys of self.model.heads are already strings - confirm.
        self.head_optimizers = {
            str(st_id): torch.optim.AdamW(head.parameters(), lr=head_specific_lr_dict[st_id])
            for st_id, head in self.model.heads.items()
        }
        # Scheduler horizon per head: (length of that head's train dataloader) * (its max epochs).
        self.head_lr_schedulers = {
            str(st_id): get_polynomial_decay_schedule_with_warmup(
                optimizer=self.head_optimizers[st_id],
                num_warmup_steps=num_warmup_steps,
                num_training_steps=len(self.batch_lists[Split.TRAIN].dataloaders[st_id])
                * head_specific_max_epoch_dict[st_id],
            )
            for st_id in self.model.heads.keys()
        }

        # flags controlling stopping and resurrection
        # Every subtask starts alive and not zombie.
        self.task_alive_flags = {str(st_id): True for st_id in self.model.heads.keys()}
        self.task_zombie_flags = {str(st_id): False for st_id in self.model.heads.keys()}
        self.early_stopper = EarlyStopper(
            st_ids=self.model.heads.keys(),
            mode=self.early_stopping_mode,
            patience=head_specific_patience_dict,
            resurrection=resurrection,
        )

        self.tracker = Tracker(heads=self.model.heads, logger=logger)
        self.GA = GradientAggregator(aggregation_method=aggregation_method)
        # One progress-bar tick per head; the bar is advanced when a subtask dies.
        self.progress_bar = tqdm(range(len(self.model.heads)))
        self.model_name = model_name
        self.scaling_weights = {str(st.id): st.get_scaling_weight() for t in task_list for st in t.subtasks_list}
        self.MAX_NUMBER_OF_STEPS = MAX_NUMBER_OF_STEPS
        # Passed as `k` when reading the last DEV loss from the tracker
        # (presumably a smoothing window - confirm against Tracker).
        self.k = 50

    def head_specific_optimization(self, st_id: str, lm_grads, scaling_weight) -> Dict[str, Any]:
        """
        Perform the optimization of a task-specific head.

        This method is only called when mode is training. It updates the alive/zombie flags of
        the subtask from the early stopper's decisions, steps the head optimizer and scheduler
        if the subtask is still optimized, and hands the LM gradients to the gradient aggregator.

        @param st_id: The subtask id.
        @param lm_grads: The LM gradients.
        @param scaling_weight: The scaling weight of that subtask.
        @return: A dictionary with additional payload; it may contain the keys
            ``<st_id>_ZOMBIE`` or ``<st_id>_DEAD_ZOMBIE`` and is empty otherwise.
        """
        additional_payload = {}
        last_dev_loss = self.tracker.get_last_st_loss(split=Split.DEV, st_id=st_id, k=self.k)
        # The early stopper is only consulted for subtasks that are currently optimized
        # (alive or zombie).
        should_stop_now = (
            self.early_stopper.early_stop(st_id=st_id, dev_loss=last_dev_loss)
            if (self.task_alive_flags[st_id] or self.task_zombie_flags[st_id])
            else False
        )

        # Resurrection is only considered for subtasks that are neither alive nor zombie.
        should_resurrect_now = (
            self.early_stopper.resurrect(st_id=st_id, dev_loss=last_dev_loss)
            if (not self.task_zombie_flags[st_id] and not self.task_alive_flags[st_id])
            else False
        )

        # A zombie stays a zombie unless the early stopper just stopped it.
        should_stay_zombie = not self.task_alive_flags[st_id] and self.task_zombie_flags[st_id] and not should_stop_now

        # Eval + Log task when it DIES
        if should_stop_now and self.task_alive_flags[st_id]:
            print(f"Subtask {st_id} is now DEAD.")
            self.eval_st(split=Split.EVAL, st_id=st_id)
            self.tracker.log(splits=[Split.EVAL], additional_payload={st_id + "_STOPPED": 0})
            self.progress_bar.update()

        # Task RESURRECTS: record it in the payload and reset its early stopper
        elif should_resurrect_now and not self.task_zombie_flags[st_id]:
            print(f"Subtask {st_id} is now ZOMBIE.")
            additional_payload[st_id + "_ZOMBIE"] = 0
            self.early_stopper.reset_early_stopper(st_id=st_id)

        # A ZOMBIE DIES: record it in the payload and reset its early stopper
        elif should_stop_now and self.task_zombie_flags[st_id]:
            print(f"Subtask {st_id} is now DEAD AGAIN.")
            additional_payload[st_id + "_DEAD_ZOMBIE"] = 0
            self.early_stopper.reset_early_stopper(st_id=st_id)

        # A subtask also stops being alive when the tracker reports a DEV metric of exactly 1
        # (queried with k=10; presumably a perfect score over the last 10 values - confirm).
        self.task_alive_flags[st_id] = self.task_alive_flags[st_id] and not (
            should_stop_now or self.tracker.get_last_st_metric(split=Split.DEV, st_id=st_id, k=10) == 1
        )
        self.task_zombie_flags[st_id] = should_resurrect_now or should_stay_zombie

        # We optimize a task if it is alive or zombie
        optimize_task = self.task_alive_flags[str(st_id)] or self.task_zombie_flags[str(st_id)]
        if optimize_task:
            self.head_optimizers[st_id].step()
            self.head_lr_schedulers[st_id].step()

        # In BACKBONE mode only optimized subtasks contribute their LM gradients;
        # in every other mode the gradients are always passed to the aggregator.
        if self.early_stopping_mode != EarlyStoppingMode.BACKBONE or optimize_task:
            self.GA.update(lm_grads, scaling_weight=scaling_weight)

        return additional_payload

    def backbone_optimization(self) -> Dict[str, Any]:
        """
        Perform the optimization of the backbone.

        This method is only called when mode is training.
        @return: A dictionary with additional payload containing the conflicting gradients ratio
            (only for the PCGRAD and PCGRAD_ONLINE aggregation methods; empty otherwise).
        """
        # Optimize the LM such that: we aggregate gradients from subtasks and set the final
        # gradient to the LM and subsequently optimize (only the LM)
        additional_payload = {}
        # The backbone is only stepped while at least one subtask is alive (zombies do not count here).
        if any(self.task_alive_flags.values()):
            aggregated_gradients = self.GA.aggregate_gradients()
            self.model.language_model.set_grads(aggregated_gradients)
            self.lm_optimizer.step()
            self.lm_lr_scheduler.step()
        # The ratio is requested for the PCGrad variants whether or not a step was taken above.
        if self.GA.aggregation_method in [AggregationMethod.PCGRAD, AggregationMethod.PCGRAD_ONLINE]:
            conflicting_gradients_ratio = self.GA.get_conflicting_gradients_ratio()
            additional_payload["conflicting_gradients_ratio"] = conflicting_gradients_ratio

        return additional_payload

    def handle_batch(self, batch, split: Split = Split.TRAIN) -> Dict[str, Any]:
        """Handle a batch.

        (always) Pass every sub-batch of the batch through the network.
        (in train-mode) Reset the gradient accumulator before the first sub-batch.
        For each sub-batch (which belongs to exactly one subtask):
            - (in train-mode) run the head-specific optimization, which also hands the
              LM gradients of that sub-batch to the gradient aggregator,
            - (always) record the metrics of the respective head in the tracker,
            - (always) record the loss of the respective head in the tracker.
        (in train-mode) After all sub-batches are processed, run the backbone optimization.
        (always) Record the mean loss over all sub-batches as the combined loss of the split.

        This method only updates the tracker; it does not call ``tracker.log`` itself.

        @param batch: The batch containing sub-batches.
        @param split: The split (TRAIN, DEV, EVAL, TEST); anything but TRAIN runs without gradients.
        @return: A dictionary containing additional payload that needs to be logged.
        """
        training = split == Split.TRAIN
        losses = []
        additional_payloads: Dict[str, Any] = {}
        # reset accumulator only if it's a new batch for training, otherwise eval drops accumulated gradients
        if training:
            self.GA.reset_accumulator()

        # sub_batch consists of data of one subtask only
        for sub_batch in batch:
            X, attention_masks, Y, st_id = sub_batch
            loss, metric_values, lm_grads = self._step((X, attention_masks, Y, st_id.unique()), training=training)
            # .item() requires that the sub-batch holds exactly one distinct subtask id.
            st_id = str(st_id.unique().item())
            scaling_weight = self.scaling_weights[st_id] if self.loss_scaling == LossScaling.STATIC else 1.0

            if training:
                additional_payload = self.head_specific_optimization(
                    st_id=st_id, lm_grads=lm_grads, scaling_weight=scaling_weight
                )
                # On key clashes the value collected earlier wins.
                additional_payloads = {**additional_payload, **additional_payloads}

            # Update losses & metrics
            for metric, value in metric_values.items():
                self.tracker.update_metric(split=split, st_id=st_id, metric=metric, value=value)
            self.tracker.update_loss(split=split, st_id=st_id, value=loss.item())
            losses.append(loss.item())

        if training:
            additional_payload = self.backbone_optimization()
            additional_payloads = {**additional_payload, **additional_payloads}

        self.tracker.update_combined_loss(split=split, value=np.mean(losses))
        return additional_payloads

    def fit(self) -> None:
        """Fit a model.

        Runs at most ``MAX_NUMBER_OF_STEPS`` steps and stops early as soon as no subtask is
        alive. Every step handles one TRAIN batch, every third step additionally one DEV batch,
        and both splits are logged after each step. Afterwards the EVAL split is evaluated.
        """
        step = 0

        for i in range(self.MAX_NUMBER_OF_STEPS):
            additional_payload_train, additional_payload_dev = {}, {}
            # Check if any task is still training
            if not any(self.task_alive_flags.values()):
                break
            step += 1
            batch = next(self.batch_lists[Split.TRAIN])
            additional_payload_train = self.handle_batch(batch=batch, split=Split.TRAIN)
            # A DEV batch is evaluated only on every third step.
            if step % 3 == 0:
                batch = next(self.batch_lists[Split.DEV])
                additional_payload_dev = self.handle_batch(batch=batch, split=Split.DEV)
            self.refresh_pbar()
            self.tracker.log(
                splits=[Split.TRAIN, Split.DEV],
                additional_payload={**additional_payload_train, **additional_payload_dev},
            )

        self.eval(split=Split.EVAL)

    def _step(self, batch, training: bool = True):
        """
        Make one forward pass and, in training mode, one backward pass.

        @param batch: A 4-tuple (X, attention_masks, Y, st_id); every element is moved to the
            model's device.
        @param training: If True, the model is put in train mode, all optimizer gradients are
            zeroed and the loss is back-propagated. If False, the model is put in eval mode and
            the forward pass runs under ``torch.no_grad()``.
        @return: A tuple (loss, heads_metrics_values, lm_gradients); lm_gradients is None when
            ``training`` is False.
        """
        inputs = {"X": batch[0], "attention_masks": batch[1], "Y": batch[2], "st_id": batch[3]}
        inputs = {k: v.to(self.model.device) for k, v in inputs.items()}

        if training:
            self.model.train()
            loss, heads_metrics_values = self.model(**inputs)
            # Clear the gradients of the backbone and of every head before the backward pass.
            self.lm_optimizer.zero_grad()
            for st_id, optim in self.head_optimizers.items():
                optim.zero_grad()
            loss.backward()
            lm_gradients = self.model.language_model.get_grads()
        else:
            self.model.eval()
            lm_gradients = None
            with torch.no_grad():
                loss, heads_metrics_values = self.model(**inputs)

        del inputs
        return loss, heads_metrics_values, lm_gradients

    def eval(self, split) -> None:
        """Evaluate the model on every subtask of the EVAL or TEST split and log the result.

        @param split: Must be ``Split.EVAL`` or ``Split.TEST`` (enforced by an assertion).
        """
        assert split in [Split.EVAL, Split.TEST]

        for st_id in self.batch_lists[split].iter_dataloaders.keys():
            self.eval_st(split=split, st_id=st_id)

        self.tracker.log(splits=[split])

    def eval_st(self, split, st_id) -> None:
        """Evaluate on a subtask, given a certain split.

        Resets the split's batch list, then feeds every batch of the subtask's iterable
        dataloader through ``handle_batch`` as a single-sub-batch batch.
        """
        batch_list = self.batch_lists[split]
        batch_list._reset()
        idl = batch_list.iter_dataloaders[st_id]
        for batch in idl:
            # handle_batch expects a list of sub-batches, hence the one-element list.
            _ = self.handle_batch(batch=[batch], split=split)

    def refresh_pbar(self) -> None:
        """Update the progress bar description with the tracker's string representation."""
        desc = str(self.tracker)
        self.progress_bar.set_description(desc=desc)
        self.progress_bar.refresh()

    def fit_debug(self, k: int) -> None:
        """Fit for k iterations only to check if a model can process the data.

        Unlike ``fit``, this handles one DEV batch after every TRAIN batch, ignores the alive
        flags and does not log or evaluate.

        @param k: Number of TRAIN/DEV iterations to run.
        """
        step = 0
        for _ in range(k):
            step += 1
            batch = next(self.batch_lists[Split.TRAIN])
            self.handle_batch(batch=batch, split=Split.TRAIN)
            # Evaluate on dev-batch
            batch = next(self.batch_lists[Split.DEV])
            self.handle_batch(batch=batch, split=Split.DEV)

    def save_model(self) -> None:
        """Save the model's state dict to ``model_files/<model_name>.pth``."""
        # Create the target directory first so torch.save does not fail on a fresh checkout.
        os.makedirs("model_files", exist_ok=True) # added
        model_files_path = "model_files/" + self.model_name + ".pth"
        torch.save(self.model.state_dict(), model_files_path)

# Running the experiment
To run the experiment, the configuration from "cotrain_random_tasks.py" was taken and adapted to the changes made in this notebook (MLflow logging etc.).
Instead of the .fit() method of the Trainer class, I use the .fit_debug() method to check the model's general ability to process the data.
The experiment was run on the local machine.

In [142]:
# Show the kernel's working directory before it is changed to the project root.
print(f"Current working directory: {os.getcwd()}")

Current working directory: /Users/heddafiedler/Documents/MASTER_DATA_SCIENCE/Semester_3/DL/DL_Project


In [143]:
# Change the working directory to the project root so that relative paths used by the
# pipeline (e.g. "datasets/..." and "model_files/...") resolve correctly.
# The location can be overridden with the DL_PROJECT_ROOT environment variable; when it is
# unset, the original local path is used, so the default behaviour is unchanged.
PROJECT_ROOT = os.environ.get(
    "DL_PROJECT_ROOT",
    "/Users/heddafiedler/Documents/MASTER_DATA_SCIENCE/Semester_3/DL/DL_Project",
)
os.chdir(PROJECT_ROOT)

In [144]:
# Confirm that the working directory now points at the project root.
print(f"Current working directory: {os.getcwd()}")

Current working directory: /Users/heddafiedler/Documents/MASTER_DATA_SCIENCE/Semester_3/DL/DL_Project


In [145]:
"""Script for executing the experiment 1. Run co-training of all families."""
import os
import wandb
from old_utils.enums import Split, AggregationMethod, LossScaling
from old_utils.common import set_random_seed
from config.config import (
    head_specific_lr,
    head_specific_max_epoch,
    head_specific_patience)
from dotenv import load_dotenv



EXPERIMENT_NAME = "experiment_baseline_check"
# Single source for the model/run name: used as the wandb run name and, through the
# Trainer's `model_name`, for the checkpoint file written by save_model().
MODEL_NAME = "YOUR_MODEL_NAME"

# Tasks to co-train; the task objects are expected to be defined in earlier cells.
selected_tasks = [
    babe_10,
    cw_hard_03,
]
tasks = selected_tasks

# Run the processing step of every subtask before the Trainer is built.
for t in tasks:
    for st in t.subtasks_list:
        st.process()


# training config
config = {
    "sub_batch_size": 32,
    "eval_batch_size": 128,
    "initial_lr": 4e-5,
    "dropout_prob": 0.1,
    "hidden_dimension": 768,
    "input_dimension": 768,
    "aggregation_method": AggregationMethod.MEAN,
    "early_stopping_mode": EarlyStoppingMode.HEADS,
    "loss_scaling": LossScaling.STATIC,
    "num_warmup_steps": 10,
    "pretrained_path": None,
    "resurrection": True,
    "model_name": MODEL_NAME,
    "head_specific_lr_dict": head_specific_lr,
    "head_specific_patience_dict": head_specific_patience,
    "head_specific_max_epoch_dict": head_specific_max_epoch,
    "logger": Logger(EXPERIMENT_NAME),
}

set_random_seed()  # default is 321
wandb.init(project=EXPERIMENT_NAME, name=MODEL_NAME)
try:
    trainer = Trainer(task_list=tasks, **config)
    # Smoke test: a single train/dev iteration instead of the full fit().
    trainer.fit_debug(k=1)
    trainer.eval(split=Split.TEST)
    trainer.save_model()
except BaseException:
    # Close the wandb run (marked as failed) even when training raises or is interrupted,
    # so a crashed cell does not leave a dangling run behind; then re-raise the error.
    wandb.finish(exit_code=1)
    raise
wandb.finish()

[2024-12-04 19:01:26,252: INFO: 3233285385: Processing SubTask 10001]
[2024-12-04 19:01:26,253: INFO: 3233285385: Loading data from datasets/10_BABE/preprocessed.csv]
[2024-12-04 19:01:26,533: INFO: 3233285385: Data loaded successfully: 3672 samples]
[2024-12-04 19:01:26,574: INFO: 3233285385: SubTask 10001 processed successfully]
[2024-12-04 19:01:26,575: INFO: 3233285385: Processing SubTask 10002]
[2024-12-04 19:01:26,575: INFO: 3233285385: Loading data from datasets/10_BABE/preprocessed.csv]
[2024-12-04 19:01:27,258: INFO: 3233285385: Data loaded successfully: 3672 samples]
[2024-12-04 19:01:27,310: INFO: 3233285385: SubTask 10002 processed successfully]
[2024-12-04 19:01:27,311: INFO: 3233285385: Processing SubTask 300001]
[2024-12-04 19:01:27,312: INFO: 3233285385: Loading data from datasets/03_CW_HARD/preprocessed.csv]
[2024-12-04 19:01:27,714: INFO: 3233285385: Data loaded successfully: 6843 samples]
[2024-12-04 19:01:27,792: INFO: 3233285385: SubTask 300001 processed successful

0,1
10001_test_acc,▁
10001_test_f1,▁
10001_test_loss,▁
10002_test_acc,▁
10002_test_f1,▁
10002_test_loss,▁
300001_test_acc,▁
300001_test_f1,▁
300001_test_loss,▁
combined_test_loss,▁

0,1
10001_test_acc,0.52344
10001_test_f1,0.43736
10001_test_loss,0.68885
10002_test_acc,0.14876
10002_test_f1,0.10404
10002_test_loss,1.15814
300001_test_acc,0.30802
300001_test_f1,0.29569
300001_test_loss,0.7169
combined_test_loss,0.82469


Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[2024-12-04 19:01:31,730: INFO: 4155765917: Creating BatchList with 3 subtasks, batch size 32]
[2024-12-04 19:01:31,731: INFO: 4155765917: Initializing dataset for subtask 10001 with split Split.TRAIN]
[2024-12-04 19:01:31,731: INFO: 4155765917: Resetting dataset for subtask 10001]
[2024-12-04 19:01:31,733: INFO: 4155765917: Initializing dataset for subtask 10002 with split Split.TRAIN]
[2024-12-04 19:01:31,734: INFO: 4155765917: Resetting dataset for subtask 10002]
[2024-12-04 19:01:31,735: INFO: 4155765917: Initializing dataset for subtask 300001 with split Split.TRAIN]
[2024-12-04 19:01:31,736: INFO: 4155765917: Resetting dataset for subtask 300001]
[2024-12-04 19:01:31,737: INFO: 4155765917: Creating BatchList with 3 subtasks, batch size 128]
[2024-12-04 19:01:31,738: INFO: 4155765917: Initializing dataset for subtask 10001 with split Split.DEV]
[2024-12-04 19:01:31,738: INFO: 4155765917: Resetting dataset for subtask 10001]
[2024-12-04 19:01:31,740: INFO: 4155765917: Initializing 

  0%|          | 0/3 [00:00<?, ?it/s]

[2024-12-04 19:01:31,755: INFO: 4155765917: Generated batch with 3 sub-batches]
[2024-12-04 19:01:38,826: INFO: 4155765917: Generated batch with 3 sub-batches]
[2024-12-04 19:02:12,734: INFO: 2804225146: {'10001_test_f1': 0.4373633, '10001_test_acc': 0.5234375, '10002_test_f1': 0.10404127, '10002_test_acc': 0.14875808, '300001_test_f1': 0.29568994, '300001_test_acc': 0.3080201, 'combined_test_loss': 0.8246867604877638, '10001_test_loss': 0.6888488233089447, '10002_test_loss': 1.1581404407819111, '300001_test_loss': 0.7168963551521301}]


0,1
10001_test_acc,▁
10001_test_f1,▁
10001_test_loss,▁
10002_test_acc,▁
10002_test_f1,▁
10002_test_loss,▁
300001_test_acc,▁
300001_test_f1,▁
300001_test_loss,▁
combined_test_loss,▁

0,1
10001_test_acc,0.52344
10001_test_f1,0.43736
10001_test_loss,0.68885
10002_test_acc,0.14876
10002_test_f1,0.10404
10002_test_loss,1.15814
300001_test_acc,0.30802
300001_test_f1,0.29569
300001_test_loss,0.7169
combined_test_loss,0.82469
