# Classify group mention polarity with NLI

based on https://colab.research.google.com/github/MoritzLaurer/less-annotating-with-bert-nli/blob/master/BERT_NLI_demo.ipynb




In [1]:
from types import SimpleNamespace

args = SimpleNamespace()

# you can choose any of the NLI models here: https://huggingface.co/MoritzLaurer
# model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-docnli-ling-2c"  # English model: "MoritzLaurer/DeBERTa-v3-base-mnli-fever-docnli-ling-2c"; multilingual model: "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
args.model_name = 'MoritzLaurer/DeBERTa-v3-base-mnli-fever-docnli-ling-2c'

args.data_file = '../../data/annotations/group_mention_categorization/consolidated_annotations.tsv'

args.experiment_name = 'mention_stance_nli'
args.experiment_results_path = './../../models/'

args.test_size = 0.10
args.dev_size = 0.10
args.seed = 1234

args.metric = 'f1_macro'
args.epochs = 5
args.learning_rate = 4e-5
args.train_batch_size = 16
args.eval_batch_size = 64
args.weight_decay = 0.3

In [3]:
## Load general packages
import re

import pandas as pd
import numpy as np

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    set_seed,
)

from utils.io import read_jsonlines
from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [4]:
from typing import List, Dict, Union

def format_nli_trainset(
        df: pd.DataFrame, 
        hypo_label_dict: Dict, 
        text_col: str='text_prepared',
        entity_col: str='mention',
        random_seed: int=0, 
        verbose: bool=False
    ) -> pd.DataFrame:
    """
    Formats the training data for NLI task.

    Args:
    df: pd.DataFrame
        The training data.
    hypo_label_dict: Dict
        A dictionary with keys as label_text and values as hypothesis template.
        Note that the hypothesis template should have a placeholder which can be replace with the string values in column `entity_col`.
    text_col: str
        The column name which contains the texts.
    entity_col: str
        The column name which contains the string values to be replaced in the hypothesis template.
    random_seed: int
        Random seed for reproducibility.
    verbose: bool
        Whether to print the logs.

    Returns:
    pd.DataFrame
        The formatted training data.
    """

    assert len(df) > 0, "The training data is empty."
    assert len(hypo_label_dict) > 0, "The hypo_label_dict is empty."
    required_cols = ['label_text', entity_col, text_col]
    missing_cols = [c for c in required_cols if c not in df.columns]
    assert len(missing_cols) == 0, f"Missing columns: {missing_cols}"

    if verbose: print(f"Length of df before formatting step: {len(df)}.")
    length_original_data_train = len(df)

    dfs = []
    for label_text, hypothesis in hypo_label_dict.items():
        ## entailment
        df_step = df[df.label_text == label_text].copy(deep=True)
        df_step["hypothesis"] = df_step[entity_col].apply(lambda m: hypothesis % m)
        df_step["label"] = [0] * len(df_step)
        ## not entailment
        df_step_not_entail = df[df.label_text != label_text].copy(deep=True)
        # down-sample not-entailment examples (if needed)
        df_step_not_entail = df_step_not_entail.sample(n=min(len(df_step), len(df_step_not_entail)), random_state=random_seed)
        df_step_not_entail["hypothesis"] = df_step_not_entail[entity_col].apply(lambda m: hypothesis % m)
        df_step_not_entail["label"] = [1] * len(df_step_not_entail)
        # append
        dfs.append(pd.concat([df_step, df_step_not_entail]))
    df = pd.concat(dfs)

    # encode label
    df["label"] = df.label.apply(int)

    # shuffle
    df = df.sample(frac=1, random_state=random_seed).reset_index()

    if verbose: 
        print(f"After adding not_entailment training examples, the training data was augmented to {len(df)} texts.")
        print(f"Max augmentation could be: len(df) * 2 = {length_original_data_train*2}. It can also be lower, if there are more entail examples than not-entail for a majority class.")

    return df[['label', 'hypothesis', text_col]].copy(deep=True)

def format_nli_testset(
        df: pd.DataFrame, 
        hypo_label_dict: Dict, 
        text_col: str='text_prepared',
        entity_col: str='mention',
        verbose: bool=False
    ):
    """
    Formats the test data for NLI task.

    Args:
    df: pd.DataFrame
        The training data.
    hypo_label_dict: Dict
        A dictionary with keys as label_text and values as hypothesis template.
        Note that the hypothesis template should have a placeholder which can be replace with the string values in column `entity_col`.
    text_col: str
        The column name which contains the texts.
    entity_col: str
        The column name which contains the string values to be replaced in the hypothesis template.
    verbose: bool
        Whether to print the logs.

    Returns:
    pd.DataFrame
        The formatted training data.
    """

    assert len(df) > 0, "The training data is empty."
    assert len(hypo_label_dict) > 0, "The hypo_label_dict is empty."
    required_cols = ['label_text', entity_col, text_col]
    missing_cols = [c for c in required_cols if c not in df.columns]
    assert len(missing_cols) == 0, f"Missing columns: {missing_cols}"

    ## explode test dataset for N hypotheses
    hypothesis_list = list(hypo_label_dict.values())
    if verbose: print("Number of hypothesis_list/classes: ", len(hypothesis_list))

    # label lists with 0 at alphabetical position of their true hypo, 1 for not-true hypos
    label_text_label_dict_explode = {}
    for key, value in hypo_label_dict.items():
        labels = [0 if value == hypo else 1 for hypo in hypothesis_list]
        label_text_label_dict_explode[key] = labels

    df["label"] = df.label_text.map(label_text_label_dict_explode)
    df["hypothesis"] = df[entity_col].apply(lambda m: [hypo % m for hypo in hypo_label_dict.values()]).values
    if verbose: print(f"Original test set size: {len(df)}")

    # explode dataset to have K-1 additional rows with not_entail label and K-1 other hypothesis_list
    # ! after exploding, cannot sample anymore, because distorts the order to true label values, which needs to be preserved for evaluation code
    df = df.explode(["hypothesis", "label"])    # multi-column explode requires pd.__version__ >= '1.3.0'
    if verbose: print(f"Test set size for NLI classification: {len(df)}\n")

    # df["label_nli_explicit"] = ["True" if label == 0 else "Not-True" for label in df["label"]]    # adding this just to simplify readibility

    return df[['label', 'hypothesis', text_col]].copy(deep=True)

In [5]:
set_seed(args.seed)

## Prepare the data

In [40]:
df = pd.read_csv(args.data_file, sep="\t", )

df = df[df.q_id=='stance']
df = df[df.label!='Unsure']

len(df)

299

In [41]:
import regex
df[['s', 'e']] = df.apply(lambda x: regex.search(regex.escape(x.mention), x.text).span(), axis=1, result_type='expand')

In [42]:
def format_input(text, s, e):
    return text[:s]+'[['+text[s:e]+']]'+text[e:]

def clean_text(text: str):
    text = re.sub(r'"+', '"', text)
    text = re.sub(r'\s+', ' ', text)
    return text

In [43]:
df['input'] = df.apply(lambda x: clean_text(format_input(x['text'], x['s'], x['e'])), axis=1)
df = df[['mention_id', 'input', 'mention', 'label']]
df.loc[:, 'label'] = df.label.str.lower()
df.label.value_counts()

label
positive    242
negative     34
neutral      23
Name: count, dtype: int64

In [44]:
id2label = dict(enumerate(df.label.unique()))
label2id = {l: i for i, l in id2label.items()}

## Create NLI hypotheses

In [45]:
# dictionary mapping the dataset's label to manually formulated hypotheses based on the codebook
hypothesis_label_dict = {l: f"How the author of the quote talkes about the group [[%s]] is {l}."  for l in label2id.keys()}

In [46]:
# prepare the input text
df.loc[:, "text_prepared"] = 'The quote: """' + df.input.fillna("") + '""" - end of the quote.'

In [47]:
df.rename(columns={'label': 'label_text'}, inplace=True)

## Train/dev/test split

In [48]:
args.test_size

0.1

In [49]:
# determine split sizes
n_test = int(len(df)*args.test_size)
n_dev = int(len(df)*args.dev_size)
n_train = len(df)-n_dev-n_test

# determine split indexes
tmp, test_idxs = train_test_split(df.index, test_size=n_test, stratify=df.label_text)
train_idxs, dev_idxs = train_test_split(tmp, test_size=n_dev, stratify=df.label_text[tmp])

# get split data frames
df_train = df.loc[train_idxs]
df_dev = df.loc[dev_idxs]
df_test = df.loc[test_idxs]
print(len(df_train), len(df_dev), len(df_test))

# covnert to NLI format
df_train = format_nli_trainset(df=df_train, hypo_label_dict=hypothesis_label_dict, random_seed=args.seed)
df_dev = format_nli_testset(df=df_dev, hypo_label_dict=hypothesis_label_dict)
df_test = format_nli_testset(df=df_test, hypo_label_dict=hypothesis_label_dict)
print(len(df_train), len(df_dev), len(df_test))

241 29 29
335 87 87


## Fine-tuning

In [50]:
tokenizer = AutoTokenizer.from_pretrained(args.model_name, use_fast=True, model_max_length=512)

ValueError: Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path with a SentencePiece tokenizer.model file.Currently available slow->fast convertors: ['AlbertTokenizer', 'BartTokenizer', 'BarthezTokenizer', 'BertTokenizer', 'BigBirdTokenizer', 'BlenderbotTokenizer', 'CamembertTokenizer', 'CLIPTokenizer', 'CodeGenTokenizer', 'ConvBertTokenizer', 'DebertaTokenizer', 'DebertaV2Tokenizer', 'DistilBertTokenizer', 'DPRReaderTokenizer', 'DPRQuestionEncoderTokenizer', 'DPRContextEncoderTokenizer', 'ElectraTokenizer', 'FNetTokenizer', 'FunnelTokenizer', 'GPT2Tokenizer', 'HerbertTokenizer', 'LayoutLMTokenizer', 'LayoutLMv2Tokenizer', 'LayoutLMv3Tokenizer', 'LayoutXLMTokenizer', 'LongformerTokenizer', 'LEDTokenizer', 'LxmertTokenizer', 'MarkupLMTokenizer', 'MBartTokenizer', 'MBart50Tokenizer', 'MPNetTokenizer', 'MobileBertTokenizer', 'MvpTokenizer', 'NllbTokenizer', 'OpenAIGPTTokenizer', 'PegasusTokenizer', 'Qwen2Tokenizer', 'RealmTokenizer', 'ReformerTokenizer', 'RemBertTokenizer', 'RetriBertTokenizer', 'RobertaTokenizer', 'RoFormerTokenizer', 'SeamlessM4TTokenizer', 'SqueezeBertTokenizer', 'T5Tokenizer', 'UdopTokenizer', 'WhisperTokenizer', 'XLMRobertaTokenizer', 'XLNetTokenizer', 'SplinterTokenizer', 'XGLMTokenizer', 'LlamaTokenizer', 'CodeLlamaTokenizer', 'GemmaTokenizer', 'Phi3Tokenizer']

**Tokenize data**

In [17]:
# convert pandas dataframes to Hugging Face dataset object to facilitate pre-processing
dataset = DatasetDict({
    "train": Dataset.from_pandas(df_train, preserve_index=False),
    "dev": Dataset.from_pandas(df_dev, preserve_index=False),
    "test": Dataset.from_pandas(df_test, preserve_index=False)
})

# tokenize
def tokenize_nli_format(examples):
  return tokenizer(examples["text_prepared"], examples["hypothesis"], truncation=True, max_length=512)

dataset = dataset.map(tokenize_nli_format, batched=True)

# remove unnecessary columns for model training
dataset = dataset.remove_columns(["hypothesis", "text_prepared"])
dataset.set_format("torch")

Map:   0%|          | 0/5066 [00:00<?, ? examples/s]

Map:   0%|          | 0/1580 [00:00<?, ? examples/s]

Map:   0%|          | 0/1580 [00:00<?, ? examples/s]

### Setting training arguments / hyperparameters

The following cell sets several important hyperparameters. We chose parameters that work well in general to avoid the need for hyperparameter search. Further below, we also provide code for hyperparameter search, if researchers want to try to increase performance by a few percentage points.

In [18]:
import os

# FP16 is a hyperparameter which can increase training speed and reduce memory consumption, but only on GPU and if batch-size > 8, see here: https://huggingface.co/transformers/performance.html?#fp16
fp16_bool = bool(torch.cuda.is_available())
# FP16 does not work on CPU or for multilingual mDeBERTa models
if "mdeberta" in args.model_name.lower(): fp16_bool = False  # multilingual mDeBERTa does not support FP16 yet: https://github.com/microsoft/DeBERTa/issues/77

results_path = os.path.join(args.experiment_results_path, args.experiment_name)

train_args = TrainingArguments(
    output_dir=os.path.join(results_path, 'results'),
    logging_dir=os.path.join(results_path, 'logs'),
    # hyperparameters
    learning_rate=args.learning_rate,
    per_device_train_batch_size=args.train_batch_size,
    # gradient_accumulation_steps=4,
    per_device_eval_batch_size=args.eval_batch_size,
    num_train_epochs=args.epochs,
    warmup_ratio=0.25,
    weight_decay=0.1,
    # reproducibility
    seed=args.seed,
    data_seed=args.seed,
    full_determinism=True,
    # model storing and loading
    evaluation_strategy="epoch",
    report_to="all",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model=args.metric,
    greater_is_better=True,
    fp16=fp16_bool,
    fp16_full_eval=False,
)

In [19]:
# helper function to clean memory and reduce risk of out-of-memory error
import gc
def clean_memory():
  #del(model)
  if torch.cuda.is_available():
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()
  gc.collect()

clean_memory()

### Custom function to compute metrics for NLI

We multiplied each text N times for each class in the test set and NLI can only predict 2 or 3 classes: true/not-true or true/neutral/false. This means that we cannot use standard functions for computing metrics. The following function reformats the model's output in a way that allows for the calculation of standard metrics like accuracy, F1-macro etc.

In [20]:
from utils.nli_metrics import compute_metrics_nli_binary

### Fine-tuning and evaluation

In [21]:
# use GPU (cuda) if available, otherwise use CPU
device = "cuda" if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else "cpu"
model_init = lambda: AutoModelForSequenceClassification.from_pretrained(args.model_name).to(device)

Let's start fine-tuning the model!

If you get an 'out-of-memory' error, reduce the 'per_device_train_batch_size' to 8 or 4 in the TrainingArguments above and restart the runtime. If you don't restart your runtime (menu to the to left 'Runtime' > 'Restart runtime') and rerun the entire script, the 'out-of-memory' error will probably not go away.

In [22]:
# training
trainer = Trainer(
    model_init=model_init,
    tokenizer=tokenizer,
    args=train_args,
    train_dataset=dataset["train"].select(range(500)),  #.shard(index=1, num_shards=100),  # could shard data for faster testing https://huggingface.co/docs/datasets/processing.html#sharding-the-dataset-shard
    eval_dataset=dataset["dev"].select(range(500)),  #.shard(index=1, num_shards=100),
    compute_metrics=lambda p: compute_metrics_nli_binary(p, label_classes=LABEL_CLASSES),
    callbacks = [EarlyStoppingCallback(early_stopping_patience=3)]
)

trainer.train()


dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/160 [00:00<?, ?it/s]

  0%|          | 0/8 [00:00<?, ?it/s]

  tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, output_dict=True, zero_division=0.0)


{'eval_loss': 0.9617348313331604, 'eval_f1_macro': 0.35759371579146293, 'eval_f1_micro': 0.48, 'eval_accuracy': 0.48, 'eval_balanced_accuracy': 0.3806149732620321, 'eval_f1_very_positive': 0.0, 'eval_f1_rather_positive': 0.2962962962962963, 'eval_f1_neutral': 0.5882352941176471, 'eval_f1_rather_negative': 0.3076923076923077, 'eval_f1_very_negative': 0.5957446808510638, 'eval_runtime': 103.6072, 'eval_samples_per_second': 4.826, 'eval_steps_per_second': 0.077, 'epoch': 1.0}


  0%|          | 0/8 [00:00<?, ?it/s]

  tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, output_dict=True, zero_division=0.0)


{'eval_loss': 0.5620649456977844, 'eval_f1_macro': 0.38510256815904653, 'eval_f1_micro': 0.54, 'eval_accuracy': 0.54, 'eval_balanced_accuracy': 0.4046791443850267, 'eval_f1_very_positive': 0.0, 'eval_f1_rather_positive': 0.6046511627906976, 'eval_f1_neutral': 0.2222222222222222, 'eval_f1_rather_negative': 0.40816326530612246, 'eval_f1_very_negative': 0.6904761904761905, 'eval_runtime': 88.1719, 'eval_samples_per_second': 5.671, 'eval_steps_per_second': 0.091, 'epoch': 2.0}


  0%|          | 0/8 [00:00<?, ?it/s]

  tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, output_dict=True, zero_division=0.0)


{'eval_loss': nan, 'eval_f1_macro': 0.5398777279058968, 'eval_f1_micro': 0.63, 'eval_accuracy': 0.63, 'eval_balanced_accuracy': 0.543048128342246, 'eval_f1_very_positive': 0.0, 'eval_f1_rather_positive': 0.6153846153846154, 'eval_f1_neutral': 0.8571428571428571, 'eval_f1_rather_negative': 0.6071428571428571, 'eval_f1_very_negative': 0.6197183098591549, 'eval_runtime': 86.1275, 'eval_samples_per_second': 5.805, 'eval_steps_per_second': 0.093, 'epoch': 3.0}


  0%|          | 0/8 [00:00<?, ?it/s]

  tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, output_dict=True, zero_division=0.0)


{'eval_loss': 0.7215349078178406, 'eval_f1_macro': 0.5398369661527556, 'eval_f1_micro': 0.63, 'eval_accuracy': 0.63, 'eval_balanced_accuracy': 0.5460561497326204, 'eval_f1_very_positive': 0.0, 'eval_f1_rather_positive': 0.6818181818181818, 'eval_f1_neutral': 0.8148148148148148, 'eval_f1_rather_negative': 0.5964912280701754, 'eval_f1_very_negative': 0.6060606060606061, 'eval_runtime': 86.5926, 'eval_samples_per_second': 5.774, 'eval_steps_per_second': 0.092, 'epoch': 4.0}


  0%|          | 0/8 [00:00<?, ?it/s]

  tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, output_dict=True, zero_division=0.0)


{'eval_loss': 0.7493066191673279, 'eval_f1_macro': 0.5344738862817053, 'eval_f1_micro': 0.62, 'eval_accuracy': 0.62, 'eval_balanced_accuracy': 0.5403743315508021, 'eval_f1_very_positive': 0.0, 'eval_f1_rather_positive': 0.6521739130434783, 'eval_f1_neutral': 0.8571428571428571, 'eval_f1_rather_negative': 0.5660377358490566, 'eval_f1_very_negative': 0.5970149253731343, 'eval_runtime': 88.8546, 'eval_samples_per_second': 5.627, 'eval_steps_per_second': 0.09, 'epoch': 5.0}
{'train_runtime': 1646.6724, 'train_samples_per_second': 1.518, 'train_steps_per_second': 0.097, 'train_loss': 0.5769464492797851, 'epoch': 5.0}


TrainOutput(global_step=160, training_loss=0.5769464492797851, metrics={'train_runtime': 1646.6724, 'train_samples_per_second': 1.518, 'train_steps_per_second': 0.097, 'train_loss': 0.5769464492797851, 'epoch': 5.0})

In [23]:
## Evaluate the fine-tuned model on the held-out test set
results = trainer.evaluate(eval_dataset=dataset["test"], metric_key_prefix='test')
{k.removeprefix('test_f1_'): v for k, v in results.items() if 'f1' in k}

  0%|          | 0/25 [00:00<?, ?it/s]

  tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, output_dict=True, zero_division=0.0)
early stopping required metric_for_best_model, but did not find eval_f1_macro so early stopping is disabled


{'macro': 0.5231935026908083,
 'micro': 0.629746835443038,
 'very_positive': 0.10526315789473684,
 'rather_positive': 0.5263157894736842,
 'neutral': 0.6666666666666666,
 'rather_negative': 0.5838509316770186,
 'very_negative': 0.7338709677419355}

In [26]:
# model.cpu(); del model; clean_memory()
trainer.model.cpu(); del trainer; clean_memory()