# Classify group mention multi-label attribute with NLI

based on https://colab.research.google.com/github/MoritzLaurer/less-annotating-with-bert-nli/blob/master/BERT_NLI_demo.ipynb




In [1]:
from types import SimpleNamespace

args = SimpleNamespace()

# args.model_name = 'MoritzLaurer/ModernBERT-base-zeroshot-v2.0'
# args.model_name = 'MoritzLaurer/deberta-v3-large-zeroshot-v2.0'
args.model_name = 'MoritzLaurer/roberta-large-zeroshot-v2.0-c'

args.data_file = '../../data/annotations/group_mention_categorization/consolidated_annotations.tsv'

args.experiment_name = 'mention_attributes_nli'
args.experiment_model_path = './../../models/'

args.test_size = 0.15
args.dev_size = 0.15
args.seed = 1234

args.metric = 'f1_macro'
args.epochs = 5
args.learning_rate = 4e-5
args.train_batch_size = 16
args.eval_batch_size = 64
args.weight_decay = 0.3

In [2]:
import numpy as np
import pandas as pd
import re

# from transformers.trainer_utils import PredictionOutput
# from sklearn.metrics import balanced_accuracy_score, f1_score, precision_recall_fscore_support, accuracy_score, classification_report
# from transformers.pipelines.base import ArgumentHandler # , ChunkPipeline, build_pipeline_init_args

from typing import List, Dict, Union, Tuple, Union

def clean_text(text: str) -> str:
    text = re.sub(r'"+', '"', text)
    text = re.sub(r'\s+', ' ', text)
    return text


def tokenize_nli_format(examples, tokenizer, **kwargs):
  return tokenizer(examples["text_prepared"], examples["hypothesis"], truncation=True, **kwargs)


# def format_nli_trainset(
#         df: pd.DataFrame,
#         hypo_label_dict: Dict,
#         label_text_col: str='label_text',
#         text_col: str='text_prepared',
#         keep_label_text_col: bool=False,
#         random_seed: int=42,
#         verbose: bool=False
#     ) -> pd.DataFrame:
#     """
#     Formats the training data for NLI task.

#     Args:
#     df: pd.DataFrame
#         The training data.
#     hypo_label_dict: Dict
#         A dictionary with keys as label_text and values as hypothesis template.
#         Note that the hypothesis template should have a placeholder which can be replace with the string values in column `entity_col`.
#     text_col: str
#         The column name which contains the texts.
#     random_seed: int
#         Random seed for reproducibility.
#     verbose: bool
#         Whether to print the logs.

#     Returns:
#     pd.DataFrame
#         The formatted training data.
#     """

#     assert len(df) > 0, "The training data is empty."
#     assert len(hypo_label_dict) > 0, "The hypo_label_dict is empty."
#     required_cols = [label_text_col, text_col]
#     missing_cols = [c for c in required_cols if c not in df.columns]
#     assert len(missing_cols) == 0, f"Missing columns: {missing_cols}"

#     if verbose: print(f"Length of df before formatting step: {len(df)}.")
#     length_original_data_train = len(df)

#     dfs = []
#     for label_text, hypothesis in hypo_label_dict.items():
#         ## entailment
#         df_step = df[df.label_text == label_text].copy(deep=True)
#         df_step["hypothesis"] = hypothesis # df_step[entity_col].apply(lambda m: hypothesis % m)
#         df_step["label"] = [0] * len(df_step)
#         ## not entailment
#         df_step_not_entail = df[df.label_text != label_text].copy(deep=True)
#         # down-sample not-entailment examples (if needed)
#         df_step_not_entail = df_step_not_entail.sample(n=min(len(df_step), len(df_step_not_entail)), random_state=random_seed)
#         df_step_not_entail["hypothesis"] = hypothesis # df_step_not_entail[entity_col].apply(lambda m: hypothesis % m)
#         df_step_not_entail["label"] = [1] * len(df_step_not_entail)
#         # append
#         dfs.append(pd.concat([df_step, df_step_not_entail]))
#     df = pd.concat(dfs)

#     # encode label
#     df["label"] = df.label.apply(int)

#     # shuffle
#     df = df.sample(frac=1, random_state=random_seed).reset_index()

#     if verbose:
#         print(f"After adding not_entailment training examples, the training data was augmented to {len(df)} texts.")
#         print(f"Max augmentation could be: len(df) * 2 = {length_original_data_train*2}. It can also be lower, if there are more entail examples than not-entail for a majority class.")
#     cols = ['label', 'hypothesis', text_col]
#     if keep_label_text_col:
#         cols.append(label_text_col)
#     return df[cols].copy(deep=True)

# def format_nli_testset(
#         df: pd.DataFrame,
#         hypo_label_dict: Dict,
#         label_text_col: str='label_text',
#         text_col: str='text_prepared',
#         verbose: bool=False
#     ):
#     """
#     Formats the test data for NLI task.

#     Args:
#     df: pd.DataFrame
#         The training data.
#     hypo_label_dict: Dict
#         A dictionary with keys as label_text and values as hypothesis template.
#         Note that the hypothesis template should have a placeholder which can be replace with the string values in column `entity_col`.
#     text_col: str
#         The column name which contains the texts.
#     verbose: bool
#         Whether to print the logs.

#     Returns:
#     pd.DataFrame
#         The formatted training data.
#     """

#     assert len(df) > 0, "The training data is empty."
#     assert len(hypo_label_dict) > 0, "The hypo_label_dict is empty."
#     required_cols = [label_text_col, text_col]
#     missing_cols = [c for c in required_cols if c not in df.columns]
#     assert len(missing_cols) == 0, f"Missing columns: {missing_cols}"

#     ## explode test dataset for N hypotheses
#     hypothesis_list = list(hypo_label_dict.values())
#     if verbose: print("Number of hypothesis_list/classes: ", len(hypothesis_list))

#     # label lists with 0 at alphabetical position of their true hypo, 1 for not-true hypos
#     label_text_label_dict_explode = {}
#     for key, value in hypo_label_dict.items():
#         labels = [0 if value == hypo else 1 for hypo in hypothesis_list]
#         label_text_label_dict_explode[key] = labels

#     df["label"] = df.label_text.map(label_text_label_dict_explode)
#     df["hypothesis"] = [list(hypo_label_dict.values())]*len(df)  # list of lists, one for each row
#     if verbose: print(f"Original test set size: {len(df)}")

#     # explode dataset to have K-1 additional rows with not_entail label and K-1 other hypothesis_list
#     # ! after exploding, cannot sample anymore, because distorts the order to true label values, which needs to be preserved for evaluation code
#     df = df.explode(["hypothesis", "label"])    # multi-column explode requires pd.__version__ >= '1.3.0'
#     if verbose: print(f"Test set size for NLI classification: {len(df)}\n")

#     # df["label_nli_explicit"] = ["True" if label == 0 else "Not-True" for label in df["label"]]    # adding this just to simplify readibility

#     cols = ['label', 'hypothesis', text_col]
#     return df[cols].copy(deep=True)



# from sklearn.metrics import balanced_accuracy_score, f1_score, precision_recall_fscore_support, accuracy_score, classification_report

# def compute_metrics_binary(eval_pred: PredictionOutput, label_classes: List[str]):
#     predictions, labels = eval_pred

#     label_text_alphabetical = sorted(label_classes)

#     ### reformat model output to enable calculation of standard metrics
#     # split in chunks with predictions for each hypothesis for one unique premise
#     def chunks(x: List, n):  # Yield successive n-sized chunks from lst. https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
#         for i in range(0, len(x), n):
#             yield x[i:i + n]

#     # for each chunk/premise, select the most likely hypothesis
#     softmax = Softmax(dim=1)
#     prediction_chunks_lst = list(chunks(predictions, len(set(label_text_alphabetical)) ))
#     hypo_position_highest_prob = []
#     for i, chunk in enumerate(prediction_chunks_lst):
#         hypo_position_highest_prob.append(np.argmax(np.array(chunk)[:, 0]))  # only accesses the first column of the array, i.e. the entailment/true prediction logit of all hypos and takes the highest one

#     label_chunks_lst = list(chunks(labels, len(set(label_text_alphabetical)) ))
#     label_position_gold = []
#     for chunk in label_chunks_lst:
#         label_position_gold.append(np.argmin(chunk))  # argmin to detect the position of the 0 among the 1s

#     ### calculate standard metrics
#     f1_macro = f1_score(label_position_gold, hypo_position_highest_prob, average='macro', zero_division=0.0)
#     f1_micro = f1_score(label_position_gold, hypo_position_highest_prob, average='micro', zero_division=0.0)
#     acc_balanced = balanced_accuracy_score(label_position_gold, hypo_position_highest_prob)
#     acc_not_balanced = accuracy_score(label_position_gold, hypo_position_highest_prob)
#     metrics = {'f1_macro': f1_macro,
#                'f1_micro': f1_micro,
#                'accuracy': acc_not_balanced,
#                'balanced_accuracy': acc_balanced,
#                }
#     tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, output_dict=True, zero_division=0.0)

#     tmp = {
#         str(f'f1_{re.sub(",? ", "_", l)}'): v
#         for l in label_classes
#         for m, v in tmp[l].items() if 'f1' in m
#       }
#     metrics.update(tmp)

#     metrics['worst_class_f1'] = min([metrics[f"f1_{l}"] for l in label_classes])


#     return metrics



# # NOTE: the only thing we need to modify when using the zero-shot pipeline for NLI is that the hypothesis template must allow including the mentioned entity
# class ZeroShotMentionClassificationArgumentHandler(ArgumentHandler):
#     """
#     Handles arguments for zero-shot for text classification by turning each possible label into an NLI
#     premise/hypothesis pair.

#     based on ZeroShotClassificationArgumentHandler from transformers.pipelines.zero_shot_classification
#      (see https://github.com/huggingface/transformers/blob/fc689d75a04e846f63f8d7a4a420da0cf796f86b/src/transformers/pipelines/zero_shot_classification.py#L14)
#     """

#     def _parse_labels(self, labels):
#         if isinstance(labels, str):
#             labels = [label.strip() for label in labels.split(",") if label.strip()]
#         return labels

#     def __call__(self, sequences: Union[Tuple[str, str], List[Tuple[str, str]]], labels: List[str], hypothesis_template: str = "{entity} is {label}."):
#         if isinstance(sequences, tuple):
#             sequences = [sequences]

#         if len(labels) == 0 or len(sequences) == 0:
#             raise ValueError("You must include at least one label and at least one sequence.")
#         if any(len(sequence) != 2 for sequence in sequences):
#             raise ValueError("the sequence inputs must be a list of tuples with two elements: the text and the mentioned entity.")
#         entities = [entity for _, entity in sequences]
#         sequences = [sequence for sequence, _ in sequences]

#         if hypothesis_template.format(entity=entities[0], label=labels[0]) == hypothesis_template:
#             raise ValueError(
#                 # TODO: change the error message
#                 (
#                     'The provided hypothesis_template "{}" was not able to be formatted with the target labels. '
#                     "Make sure the passed template includes formatting syntax such as {{}} where the label should go."
#                 ).format(hypothesis_template)
#             )

#         sequence_pairs = []
#         for sequence, entity in zip(sequences, entities):
#             sequence_pairs.extend([[sequence, hypothesis_template.format(label=label, entity=entity)] for label in labels])

#         return sequence_pairs, sequences


In [3]:
!pip -q install transformers==4.50.3 datasets accelerate

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/10.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/10.2 MB[0m [31m209.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━[0m [32m9.2/10.2 MB[0m [31m134.8 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m10.2/10.2 MB[0m [31m138.3 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m97.0 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/491.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m40.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K 

In [4]:
## Load general packages
import pandas as pd

import torch
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    set_seed,
)

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
# from utils.nli import (
#     clear_memory,
#     clean_text,
#     tokenize_nli_format
# )
# from utils.nli.attributes import (
#     format_nli_trainset,
#     format_nli_testset,
#     tokenize_nli_format,
#     compute_metrics_binary
# )

In [5]:
set_seed(args.seed)

In [6]:
!wget -O data.tsv https://www.dropbox.com/scl/fi/ocemhfa9vr8pfn9zqkl4w/consolidated_annotations.tsv?rlkey=kk6lyf4m2yuu6s0lkqo8gwu0z&st=kyvm7k73&dl=1
args.data_file = 'data.tsv'

--2025-05-01 09:09:47--  https://www.dropbox.com/scl/fi/ocemhfa9vr8pfn9zqkl4w/consolidated_annotations.tsv?rlkey=kk6lyf4m2yuu6s0lkqo8gwu0z
Resolving www.dropbox.com (www.dropbox.com)... 162.125.81.18, 2620:100:6031:18::a27d:5112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.81.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://uc16dd00fa260dfad0005d9bc90d.dl.dropboxusercontent.com/cd/0/inline/Co2XvnQdO5fMVD-ysu8chsSqGQdfuDpnT1GauVmfx-kSzhb2gO_lOx7a3NRbszesnKeGdT24RHsPqS3qUGmXBhU3tYxB6X89vpwHnxb6_fuhtt-yW4GntFB9NMh0h0B4RKQ/file# [following]
--2025-05-01 09:09:48--  https://uc16dd00fa260dfad0005d9bc90d.dl.dropboxusercontent.com/cd/0/inline/Co2XvnQdO5fMVD-ysu8chsSqGQdfuDpnT1GauVmfx-kSzhb2gO_lOx7a3NRbszesnKeGdT24RHsPqS3qUGmXBhU3tYxB6X89vpwHnxb6_fuhtt-yW4GntFB9NMh0h0B4RKQ/file
Resolving uc16dd00fa260dfad0005d9bc90d.dl.dropboxusercontent.com (uc16dd00fa260dfad0005d9bc90d.dl.dropboxusercontent.com)... 162.125.81.15, 2620:100:6031:15::a27d:510f


## Prepare the data

In [7]:
df = pd.read_csv(args.data_file, sep="\t", )

df = df[df.q_id!='stance']

df = df[df.category!='other']
df = df[df.attribute!='universal'] # !!!

df.groupby('mention_id').filter(lambda x: sum(x.label=='Yes') > 0)

df.loc[:, 'attribute_combination'] = df.attribute_combination.str.removesuffix(': ')

label_classes = df.attribute_combination.unique().tolist()

df['attribute_combination'] = pd.Categorical(df['attribute_combination'], categories=label_classes, ordered=True)

df.sort_values(by=['mention_id', 'attribute_combination'], inplace=True)


In [8]:
df = df[['mention_id', 'text', 'mention', 'attribute', 'category', 'label']]

In [9]:
PREMISE_TEMPLATE = 'The sentence """{sentence}""" contains the quote "{mention}" that mentions a social group.'
ATTRIBUTE_HYPOTHESIS_TEMPLATE = 'The group mentioned in the quote is defined using the {attribute} attribute {category}.'

In [10]:
df['text_prepared'] = df.apply(lambda x: PREMISE_TEMPLATE.format(sentence=clean_text(x.text), mention=clean_text(x.mention)), axis=1)
df['hypothesis'] = df.apply(lambda x: ATTRIBUTE_HYPOTHESIS_TEMPLATE.format(attribute=x.attribute, category=x.category), axis=1)

In [11]:
config = AutoConfig.from_pretrained(args.model_name)
config.label2id = {'not_entailment': 1, 'entailment': 0} # like Laurer
label2entailment = {'No': config.label2id['not_entailment'], 'Yes': config.label2id['entailment']}
df['label_text'] = df['label']
df['label'] = df['label_text'].map(label2entailment)

config.json:   0%|          | 0.00/884 [00:00<?, ?B/s]

In [12]:
df.head()

Unnamed: 0,mention_id,text,mention,attribute,category,label,text_prepared,hypothesis,label_text
0,11110_198809-390636-1,Give parents the right to become municipal day...,parents,economic,class membership,1,"The sentence """"""Give parents the right to beco...",The group mentioned in the quote is defined us...,No
1,11110_198809-390636-1,Give parents the right to become municipal day...,parents,economic,ecology of group,1,"The sentence """"""Give parents the right to beco...",The group mentioned in the quote is defined us...,No
2,11110_198809-390636-1,Give parents the right to become municipal day...,parents,economic,education level,1,"The sentence """"""Give parents the right to beco...",The group mentioned in the quote is defined us...,No
3,11110_198809-390636-1,Give parents the right to become municipal day...,parents,economic,employment status,1,"The sentence """"""Give parents the right to beco...",The group mentioned in the quote is defined us...,No
4,11110_198809-390636-1,Give parents the right to become municipal day...,parents,economic,income/wealth/economic status,1,"The sentence """"""Give parents the right to beco...",The group mentioned in the quote is defined us...,No


In [13]:
df['label_text'].value_counts()

Unnamed: 0_level_0,count
label_text,Unnamed: 1_level_1
No,8879
Yes,737


## Train/dev/test split

In [14]:
# determine split indexes
mention_ids = df.mention_id.unique().tolist()
tmp, test_ids = train_test_split(mention_ids, test_size=args.test_size, random_state=args.seed)
train_ids, dev_ids = train_test_split(tmp, test_size=args.dev_size, random_state=args.seed)

# get split data frames
df_train = df[df.mention_id.isin(train_ids)]
df_dev = df[df.mention_id.isin(dev_ids)]
df_test = df[df.mention_id.isin(test_ids)]
df_train = df_train.reset_index(drop=True)
df_dev = df_dev.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
print(len(df_train), len(df_dev), len(df_test))

# # covnert to NLI format
# df_train = format_nli_trainset(df=df_train, hypo_label_dict=hypothesis_label_dict, random_seed=args.seed, keep_label_text_col=True)
# df_dev = format_nli_testset(df=df_dev, hypo_label_dict=hypothesis_label_dict)
# df_test = format_nli_testset(df=df_test, hypo_label_dict=hypothesis_label_dict)
# print(len(df_train), len(df_dev), len(df_test))

6928 1232 1456


In [15]:
df_train.value_counts(['label_text', 'label'], sort=False, normalize=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,proportion
label_text,label,Unnamed: 2_level_1
No,1,0.924221
Yes,0,0.075779


In [16]:
# # balance
# min_size, max_size = 30, 60
# df_train = df_train.groupby(['label_text', 'label']).apply(lambda x: x.sample(n=min(max_size, max(min_size, len(x))), replace=len(x)<min_size, random_state=args.seed)).reset_index(drop=True)
# df_train = df_train.sample(frac=1.0, random_state=args.seed).reset_index(drop=True)

## Fine-tuning

In [17]:
tokenizer = AutoTokenizer.from_pretrained(args.model_name, model_max_length=512)

tokenizer_config.json:   0%|          | 0.00/1.22k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

**Tokenize data**

In [18]:
# convert pandas dataframes to Hugging Face datasets object to facilitate pre-processing
datasets = DatasetDict({
    "train": Dataset.from_pandas(df_train, preserve_index=False),
    "dev": Dataset.from_pandas(df_dev, preserve_index=False),
    "test": Dataset.from_pandas(df_test, preserve_index=False)
})

datasets = datasets.map(lambda ex: tokenize_nli_format(ex, tokenizer, max_length=512), batched=True)

# remove unnecessary columns for model training
datasets = datasets.select_columns(["input_ids", "attention_mask", "label", "attribute", "category"])
datasets.set_format("torch")

Map:   0%|          | 0/6928 [00:00<?, ? examples/s]

Map:   0%|          | 0/1232 [00:00<?, ? examples/s]

Map:   0%|          | 0/1456 [00:00<?, ? examples/s]

### Setting training arguments / hyperparameters

In [19]:
args.model_name

'MoritzLaurer/roberta-large-zeroshot-v2.0-c'

The following cell sets several important hyperparameters. We chose parameters that work well in general to avoid the need for hyperparameter search. Further below, we also provide code for hyperparameter search, if researchers want to try to increase performance by a few percentage points.

In [20]:
import os

model_path = os.path.join(args.experiment_model_path, args.experiment_name)

train_args = TrainingArguments(
#     output_dir=os.path.join(model_path, 'results'),
#     logging_dir=os.path.join(model_path, 'logs'),
#     # hyperparameters
#     learning_rate=2e-5, # args.learning_rate,
#     lr_scheduler_type="cosine",
#     per_device_train_batch_size=32, #args.train_batch_size,
#     # gradient_accumulation_steps=1,
#     per_device_eval_batch_size=args.eval_batch_size,
#     num_train_epochs=5, #args.epochs,
#     warmup_ratio=0.1,
#     weight_decay=0.1,
#     # reproducibility
#     seed=args.seed,
#     data_seed=args.seed,
    report_to="none",
    # full_determinism=True,
#     # model storing and loading
#     eval_strategy="steps",
#     eval_steps=100,
#     save_strategy="epoch",
#     save_total_limit=2,
#     load_best_model_at_end=True,
#     metric_for_best_model='worst_class_f1', # args.metric,
#     greater_is_better=True,
#   fp16=True,
#    fp16_full_eval=True,
#    bf16=False
)

### Custom function to compute metrics for NLI

We multiplied each text N times for each class in the test set and NLI can only predict 2 or 3 classes: true/not-true or true/neutral/false. This means that we cannot use standard functions for computing metrics. The following function reformats the model's output in a way that allows for the calculation of standard metrics like accuracy, F1-macro etc.

### Fine-tuning and evaluation

In [21]:
config.torch_dtype
trainer.model.dtype

NameError: name 'trainer' is not defined

In [None]:
# use GPU (cuda) if available, otherwise use CPU
model_init = lambda: AutoModelForSequenceClassification.from_pretrained(args.model_name, torch_dtype="auto", device_map="auto")

In [None]:
from torch.nn import CrossEntropyLoss
class ClassWeightsTrainer(Trainer):

    def __init__(self, class_weights: Union[List, Dict[Union[int, str], float]], **kwargs):
        """
        argument ``class_weights`` should be a dictionary mapping class labels to weights or a list of only the weights
        """
        super().__init__(**kwargs)
        # self.model = self.model.to(self.model.device)
        if len(class_weights) != self.model.config.num_labels:
            raise ValueError(f'length of `class_weights` must be {self.model.config.num_labels}')
        if isinstance(class_weights, dict):
            if set(class_weights.keys()) != set(self.model.config.id2label.keys()):
                raise ValueError(f'keys of `class_weights` mismatch label classes {list(self.model.config.id2label.keys())}')
            class_weights = [v for k, v in sorted(class_weights.items(), key=lambda item: item[1])]
        self.class_weights = torch.tensor(class_weights, dtype=torch.float32).to(self.model.device)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop('labels')
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get('logits')
        # compute custom loss
        loss_fct = CrossEntropyLoss(weight=self.class_weights)
        loss = loss_fct(logits.float().view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
prev = df_train.label.value_counts(normalize=True)
weights = prev.max() / prev
weights = weights / weights.sum()
class_weights = weights.to_dict()

Let's start fine-tuning the model!

If you get an 'out-of-memory' error, reduce the 'per_device_train_batch_size' to 8 or 4 in the TrainingArguments above and restart the runtime. If you don't restart your runtime (menu to the to left 'Runtime' > 'Restart runtime') and rerun the entire script, the 'out-of-memory' error will probably not go away.

In [None]:


# training
trainer = Trainer(
    model_init=model_init,
    # class_weights=class_weights,
    processing_class=tokenizer,
    args=train_args,
    train_dataset=datasets["train"], #.shard(index=1, num_shards=100),  # could shard data for faster testing https://huggingface.co/docs/datasets/processing.html#sharding-the-dataset-shard
    # eval_dataset=datasets["dev"],  #.shard(index=1, num_shards=100),
    # compute_metrics=lambda p: compute_metrics_binary(p, label_classes=list(label2id.keys())),
    # callbacks = [EarlyStoppingCallback(early_stopping_patience=5)]
)
trainer.train()



<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:wandb: Paste an API key from your profile and hit enter:wandb: Paste an API key from your profile and hit enter:wandb: Paste an API key from your profile and hit enter:wandb: Paste an API key from your profile and hit enter:

In [None]:
n_ = len(datasets["test"]) # len(label_classes) * 20
eval_pred = trainer.predict(datasets["test"].select(range(n_)))

In [None]:
eval_pred

In [None]:
# predictions, labels = eval_pred
predictions, labels = eval_pred.predictions, eval_pred.label_ids

### reformat model output to enable calculation of standard metrics
# split in chunks with predictions for each hypothesis for one unique premise
def chunks(x: List, n):  # Yield successive n-sized chunks from lst. https://stackoverflow.com/questions/312443/how-do-you-split-a-list-into-evenly-sized-chunks
    for i in range(0, len(x), n):
        yield x[i:i + n]

predictions = (predictions.argmax(axis=1)==0).astype(int).tolist()
labels = labels.astype(int).tolist()
y_pred = list(chunks(predictions, len(set(label_classes)) ))
y_true = list(chunks(labels, len(set(label_classes)) ))

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_true, y_pred, target_names=label_classes, zero_division=0.0))

In [None]:
# for each chunk/premise, select the most likely hypothesis
prediction_chunks_lst = list(chunks(predictions, len(set(label_classes)) ))
hypo_position_highest_prob = []
for i, chunk in enumerate(prediction_chunks_lst):
    hypo_position_highest_prob.append(np.argmax(np.array(chunk)[:, 0]))  # only accesses the first column of the array, i.e. the entailment/true prediction logit of all hypos and takes the highest one

label_chunks_lst = list(chunks(labels, len(set(label_text_alphabetical)) ))
label_position_gold = []
for chunk in label_chunks_lst:
    label_position_gold.append(np.argmin(chunk))  # argmin to detect the position of the 0 among the 1s

### calculate standard metrics
f1_macro = f1_score(label_position_gold, hypo_position_highest_prob, average='macro', zero_division=0.0)
f1_micro = f1_score(label_position_gold, hypo_position_highest_prob, average='micro', zero_division=0.0)
acc_balanced = balanced_accuracy_score(label_position_gold, hypo_position_highest_prob)
acc_not_balanced = accuracy_score(label_position_gold, hypo_position_highest_prob)
metrics = {'f1_macro': f1_macro,
            'f1_micro': f1_micro,
            'accuracy': acc_not_balanced,
            'balanced_accuracy': acc_balanced,
            }
tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, output_dict=True, zero_division=0.0)

tmp = {
    str(f'f1_{re.sub(",? ", "_", l)}'): v
    for l in label_classes
    for m, v in tmp[l].items() if 'f1' in m
    }
metrics.update(tmp)

metrics['worst_class_f1'] = min([metrics[f"f1_{l}"] for l in label_classes])


return metrics

In [None]:
fully_heldout = dict(
  text = 'Those who live on Ibiza and party hard all year long should get their pensions cut.',
  mention = 'Those who live on Ibiza and party hard all year long',
)
fully_heldout['text_prepared'] = PREMISE_TEMPLATE.format(sentence=clean_text(fully_heldout['text']), mention=clean_text(fully_heldout['mention']))
fully_heldout = pd.DataFrame(fully_heldout, index=[0])
fully_heldout['hypothesis'] = [label_classes]
fully_heldout = fully_heldout.explode('hypothesis')
fully_heldout = Dataset.from_pandas(fully_heldout)
fully_heldout = fully_heldout.map(lambda ex: tokenize_nli_format(ex, tokenizer, max_length=512), batched=True)
fully_heldout.select_columns(["input_ids", "attention_mask"])
fully_heldout.set_format("torch")

In [None]:
trainer.predict(fully_heldout)

In [None]:
preds._asdict().keys()

In [None]:
preds.predictions.argmax(axis=1)

In [None]:
## Evaluate the fine-tuned model on the held-out test set
results = trainer.evaluate(eval_dataset=datasets["test"], metric_key_prefix='test')
{k.removeprefix('test_'): v for k, v in results.items() if 'f1' in k}

In [None]:
import shutil
shutil.rmtree(model_path)
trainer.model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
trainer.model.to('cpu');
del trainer
clean_memory()