# Classify group mention polarity with NLI

based on https://colab.research.google.com/github/MoritzLaurer/less-annotating-with-bert-nli/blob/master/BERT_NLI_demo.ipynb




In [2]:
from types import SimpleNamespace

args = SimpleNamespace()

args.model_name = 'MoritzLaurer/ModernBERT-large-zeroshot-v2.0'

args.data_file = '../../data/annotations/group_mention_categorization/consolidated_annotations.tsv'

args.experiment_name = 'mention_stance_nli'
args.experiment_model_path = './../../models/'

args.test_size = 0.15
args.dev_size = 0.15
args.seed = 1234

args.metric = 'f1_macro'
args.epochs = 5
args.learning_rate = 4e-5
args.train_batch_size = 16
args.eval_batch_size = 64
args.weight_decay = 0.3

In [None]:
## Load general packages
import pandas as pd

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    set_seed,
)

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from utils.nli import STANCE_LABEL_CLASSES as label_classes
from utils.nli import (
    clean_memory,
    clean_text,
    format_nli_trainset, 
    format_nli_testset, 
    tokenize_nli_format,
    compute_metrics_binary
)

In [4]:
set_seed(args.seed)

## Prepare the data

In [None]:
df = pd.read_csv(args.data_file, sep="\t", )

df = df[df.q_id=='stance']
df = df[df.label.str.lower().isin(label_classes)]

len(df)

299

In [7]:
df['input'] = df.text.apply(clean_text)
df = df[['mention_id', 'input', 'mention', 'label']]
df.loc[:, 'label'] = df.label.str.lower()

In [8]:
id2label = dict(enumerate(label_classes))
label2id = {l: i for i, l in id2label.items()}

## Create NLI hypotheses

In [None]:
from utils.nli import STANCE_HYPOTHESIS_TEMPLATE
hypothesis_label_dict = {l: STANCE_HYPOTHESIS_TEMPLATE.format(label=l, entity='%s')  for l in label2id.keys()}

{'positive': 'The author of the quote takes a positive stance towards "%s".',
 'neutral': 'The author of the quote takes a neutral stance towards "%s".',
 'negative': 'The author of the quote takes a negative stance towards "%s".'}

In [12]:
# prepare the input text
df.loc[:, "text_prepared"] = 'The quote: """' + df.input.fillna("") + '""" - end of the quote.'

In [13]:
df.rename(columns={'label': 'label_text'}, inplace=True)

## Train/dev/test split

In [15]:
# determine split sizes
n_test = int(len(df)*args.test_size)
n_dev = int(len(df)*args.dev_size)
n_train = len(df)-n_dev-n_test

# determine split indexes
tmp, test_idxs = train_test_split(df.index, test_size=n_test, stratify=df.label_text)
train_idxs, dev_idxs = train_test_split(tmp, test_size=n_dev, stratify=df.label_text[tmp])

# get split data frames
df_train = df.loc[train_idxs]
df_dev = df.loc[dev_idxs]
df_test = df.loc[test_idxs]
print(len(df_train), len(df_dev), len(df_test))

# covnert to NLI format
df_train = format_nli_trainset(df=df_train, hypo_label_dict=hypothesis_label_dict, random_seed=args.seed, keep_label_text_col=True)
df_dev = format_nli_testset(df=df_dev, hypo_label_dict=hypothesis_label_dict)
df_test = format_nli_testset(df=df_test, hypo_label_dict=hypothesis_label_dict)
print(len(df_train), len(df_dev), len(df_test))

211 44 44
293 132 132


In [16]:
df_train.value_counts(['label_text', 'label'], sort=False)

label_text  label
negative    0         24
            1         25
neutral     0         17
            1         20
positive    0        170
            1         37
Name: count, dtype: int64

In [17]:
# balance
min_size, max_size = 30, 60
df_train = df_train.groupby(['label_text', 'label']).apply(lambda x: x.sample(n=min(max_size, max(min_size, len(x))), replace=len(x)<min_size, random_state=args.seed)).reset_index(drop=True)
df_train = df_train.sample(frac=1.0, random_state=args.seed).reset_index(drop=True)

  df_train = df_train.groupby(['label_text', 'label']).apply(lambda x: x.sample(n=min(max_size, max(min_size, len(x))), replace=len(x)<min_size, random_state=args.seed)).reset_index(drop=True)


In [18]:
del df_train['label_text']

## Fine-tuning

In [19]:
# !pip install tiktoken==0.9.0
# args.model_name = 'answerdotai/ModernBERT-base'

In [20]:
tokenizer = AutoTokenizer.from_pretrained(args.model_name, model_max_length=512)

**Tokenize data**

In [None]:
# convert pandas dataframes to Hugging Face dataset object to facilitate pre-processing
dataset = DatasetDict({
    "train": Dataset.from_pandas(df_train, preserve_index=False),
    "dev": Dataset.from_pandas(df_dev, preserve_index=False),
    "test": Dataset.from_pandas(df_test, preserve_index=False)
})

dataset = dataset.map(lambda ex: tokenize_nli_format(ex, tokenizer), batched=True)

# remove unnecessary columns for model training
dataset = dataset.remove_columns(["hypothesis", "text_prepared"])
dataset.set_format("torch")

Map: 100%|██████████| 217/217 [00:00<00:00, 28981.50 examples/s]
Map: 100%|██████████| 132/132 [00:00<00:00, 30222.62 examples/s]
Map: 100%|██████████| 132/132 [00:00<00:00, 31484.11 examples/s]


### Setting training arguments / hyperparameters

The following cell sets several important hyperparameters. We chose parameters that work well in general to avoid the need for hyperparameter search. Further below, we also provide code for hyperparameter search, if researchers want to try to increase performance by a few percentage points.

In [None]:
import os

# FP16 is a hyperparameter which can increase training speed and reduce memory consumption, but only on GPU and if batch-size > 8, see here: https://huggingface.co/transformers/performance.html?#fp16
fp16_bool = bool(torch.cuda.is_available())
# FP16 does not work on CPU or for multilingual mDeBERTa models
if "mdeberta" in args.model_name.lower(): fp16_bool = False  # multilingual mDeBERTa does not support FP16 yet: https://github.com/microsoft/DeBERTa/issues/77

model_path = os.path.join(args.experiment_model_path, args.experiment_name)

train_args = TrainingArguments(
    output_dir=os.path.join(model_path, 'results'),
    logging_dir=os.path.join(model_path, 'logs'),
    # hyperparameters
    learning_rate=2e-5, # args.learning_rate,
    lr_scheduler_type="cosine",
    per_device_train_batch_size=16, #args.train_batch_size,
    # gradient_accumulation_steps=4,
    per_device_eval_batch_size=args.eval_batch_size,
    num_train_epochs=15, #args.epochs,
    warmup_ratio=0.1,
    weight_decay=0.1,
    # reproducibility
    seed=args.seed,
    data_seed=args.seed,
    # full_determinism=True,
    # model storing and loading
    evaluation_strategy="epoch",
    report_to="all",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='worst_class_f1', # args.metric,
    greater_is_better=True,
    # fp16=fp16_bool,
    # fp16_full_eval=False,
    bf16=True
)



In [93]:
args.train_batch_size

16

In [95]:
df_train.iloc[0].to_dict()

{'label': 0,
 'hypothesis': 'The author of the quote takes a negative stance towards "some who believe there is nothing we can do to change things for the better".',
 'text_prepared': 'The quote: """There are some who believe there is nothing we can do to change things for the better.""" - end of the quote.'}

### Custom function to compute metrics for NLI

We multiplied each text N times for each class in the test set and NLI can only predict 2 or 3 classes: true/not-true or true/neutral/false. This means that we cannot use standard functions for computing metrics. The following function reformats the model's output in a way that allows for the calculation of standard metrics like accuracy, F1-macro etc.

### Fine-tuning and evaluation

In [96]:
# use GPU (cuda) if available, otherwise use CPU
device = "cuda" if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else "cpu"
model_init = lambda: AutoModelForSequenceClassification.from_pretrained(args.model_name, torch_dtype="auto", device_map="auto")#.to(device)

Let's start fine-tuning the model!

If you get an 'out-of-memory' error, reduce the 'per_device_train_batch_size' to 8 or 4 in the TrainingArguments above and restart the runtime. If you don't restart your runtime (menu to the to left 'Runtime' > 'Restart runtime') and rerun the entire script, the 'out-of-memory' error will probably not go away.

In [None]:
# training
trainer = Trainer(
    model_init=model_init,
    tokenizer=tokenizer,
    args=train_args,
    train_dataset=dataset["train"],  #.shard(index=1, num_shards=100),  # could shard data for faster testing https://huggingface.co/docs/datasets/processing.html#sharding-the-dataset-shard
    eval_dataset=dataset["dev"],  #.shard(index=1, num_shards=100),
    compute_metrics=lambda p: compute_metrics_binary(p, label_classes=list(label2id.keys())),
    callbacks = [EarlyStoppingCallback(early_stopping_patience=5)]
)

  trainer = Trainer(


In [98]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro,Accuracy,Balanced Accuracy,F1 Positive,F1 Neutral,F1 Negative,Worst Class F1
1,No log,0.890543,0.450142,0.545455,0.545455,0.598148,0.461538,0.222222,0.666667,0.222222
2,No log,0.957434,0.588889,0.681818,0.681818,0.768519,0.714286,0.285714,0.766667,0.285714
3,No log,0.818312,0.63444,0.75,0.75,0.796296,0.714286,0.363636,0.825397,0.363636
4,No log,0.70978,0.673423,0.795455,0.795455,0.814815,0.714286,0.444444,0.861538,0.444444
5,No log,0.69642,0.505051,0.75,0.75,0.592593,0.666667,0.0,0.848485,0.0
6,No log,0.684749,0.505051,0.75,0.75,0.592593,0.666667,0.0,0.848485,0.0
7,No log,0.661712,0.505051,0.75,0.75,0.592593,0.666667,0.0,0.848485,0.0
8,No log,0.673053,0.510779,0.772727,0.772727,0.601852,0.666667,0.0,0.865672,0.0
9,No log,0.683464,0.510779,0.772727,0.772727,0.601852,0.666667,0.0,0.865672,0.0


  tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, output_dict=True, zero_division=0.0)
  tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, output_dict=True, zero_division=0.0)
  tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, output_dict=True, zero_division=0.0)
  tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, output_dict=True, zero_division=0.0)
  tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(l

TrainOutput(global_step=126, training_loss=0.5250408233158173, metrics={'train_runtime': 26.7454, 'train_samples_per_second': 121.703, 'train_steps_per_second': 7.852, 'total_flos': 291420712744308.0, 'train_loss': 0.5250408233158173, 'epoch': 9.0})

In [101]:
## Evaluate the fine-tuned model on the held-out test set
results = trainer.evaluate(eval_dataset=dataset["test"], metric_key_prefix='test')
{k.removeprefix('test_'): v for k, v in results.items() if 'f1' in k}

  tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, output_dict=True, zero_division=0.0)
early stopping required metric_for_best_model, but did not find eval_worst_class_f1 so early stopping is disabled


{'f1_macro': 0.6349363197189285,
 'f1_micro': 0.8409090909090909,
 'f1_positive': 0.7272727272727273,
 'f1_neutral': 0.25,
 'f1_negative': 0.927536231884058,
 'worst_class_f1': 0.25}

In [None]:
import shutil
shutil.rmtree(model_path)
trainer.model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('./../../models/mention_stance_nli/tokenizer_config.json',
 './../../models/mention_stance_nli/special_tokens_map.json',
 './../../models/mention_stance_nli/tokenizer.json')

In [None]:
trainer.model.to('cpu');
del trainer
clean_memory()

## Inference

In [156]:
# import inspect
from typing import List, Union, Tuple
from transformers.pipelines.base import ArgumentHandler # , ChunkPipeline, build_pipeline_init_args

# NOTE: the only thing we need to modify when using the zero-shot pipeline for NLI is that the hypothesis template must allow including the mentioned entity
class ZeroShotMentionClassificationArgumentHandler(ArgumentHandler):
    """
    Handles arguments for zero-shot for text classification by turning each possible label into an NLI
    premise/hypothesis pair.

    based on ZeroShotClassificationArgumentHandler from transformers.pipelines.zero_shot_classification
     (see https://github.com/huggingface/transformers/blob/fc689d75a04e846f63f8d7a4a420da0cf796f86b/src/transformers/pipelines/zero_shot_classification.py#L14)
    """

    def _parse_labels(self, labels):
        if isinstance(labels, str):
            labels = [label.strip() for label in labels.split(",") if label.strip()]
        return labels

    def __call__(self, sequences: Union[Tuple[str, str], List[Tuple[str, str]]], labels: List[str], hypothesis_template: str = "{entity} is {label}."):
        if isinstance(sequences, tuple):
            sequences = [sequences]

        if len(labels) == 0 or len(sequences) == 0:
            raise ValueError("You must include at least one label and at least one sequence.")
        if any(len(sequence) != 2 for sequence in sequences):
            raise ValueError("the sequence inputs must be a list of tuples with two elements: the text and the mentioned entity.")
        entities = [entity for _, entity in sequences]
        sequences = [sequence for sequence, _ in sequences]

        if hypothesis_template.format(entity=entities[0], label=labels[0]) == hypothesis_template:
            raise ValueError(
                # TODO: change the error message
                (
                    'The provided hypothesis_template "{}" was not able to be formatted with the target labels. '
                    "Make sure the passed template includes formatting syntax such as {{}} where the label should go."
                ).format(hypothesis_template)
            )

        sequence_pairs = []
        for sequence, entity in zip(sequences, entities):
            sequence_pairs.extend([[sequence, hypothesis_template.format(label=label, entity=entity)] for label in labels])

        return sequence_pairs, sequences

In [None]:
from transformers import pipeline

classifier = pipeline(
    task='zero-shot-classification',
    model=model_path,
    # model=trainer.model.eval(),
    # tokenizer=trainer.tokenizer,
    framework='pt',
    #device=device,
    args_parser=ZeroShotMentionClassificationArgumentHandler()
)

Device set to use cuda:0


In [186]:
fp = '../../data/labeled/manifesto_sentences_predicted_group_mentions_spans.tsv'
df = pd.read_csv(fp, sep="\t") #, nrows=1000)

max_spans = df['span_nr'].max()
df['mention_id'] = df['sentence_id'].astype(str) + '-' + df['span_nr'].astype(str).str.zfill(len(str(max_spans)))

df = df[df.label=='social group']
del df['label']

df.rename(columns={'text': 'mention', 'sentence_text': 'text'}, inplace=True)
cols = ['mention_id', 'text', 'mention']
df = df[cols]

In [187]:
df.loc[:, "text_prepared"] = 'The quote: """' + df.text.fillna("") + '""" - end of the quote.'
cols.append('text_prepared')
df = df[cols]

In [188]:
def _predict_batch(df, batch_size=64):
    inputs = df[['text_prepared', 'mention']].apply(tuple, axis=1).to_list()
    preds = classifier(
        inputs,
        candidate_labels=list(hypothesis_label_dict.keys()),
        hypothesis_template=HYPOTHESIS_TEMPLATE,
        multi_label=False,
        batch_size=batch_size
    )
    preds_df = pd.concat(pd.DataFrame(pred['scores'], index=pred['labels']).T for pred in preds)
    preds_df['pred'] = preds_df.columns[preds_df.values.argmax(axis=1)]

    preds_df = pd.concat([
        df[['mention_id', 'text', 'mention']].reset_index(drop=True),
        preds_df.reset_index(drop=True)
    ], axis=1)

    return preds_df

In [None]:
from tqdm import tqdm
n = len(df)
batch_size = 64
chunk_size = batch_size*20

preds_dfs = []
for i in tqdm(range(0, n, chunk_size)):
    if i + chunk_size > n:
        chunk_size = n - i
    # print(f"Processing {i} to {i+chunk_size} of {n}")
    preds_df = _predict_batch(df.iloc[i:i+chunk_size], batch_size=batch_size)
    preds_dfs.append(preds_df)
    del preds_df
    clean_memory()
    # preds_df.to_csv(os.path.join(model_path, f'predictions_{i}.tsv'), sep="\t", index=False)

  2%|▏         | 4/164 [00:39<25:44,  9.65s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
 60%|█████▉    | 98/164 [14:04<10:43,  9.75s/it]

In [185]:
pd.concat(preds_dfs)

Unnamed: 0,mention_id,text,mention,positive,neutral,negative,pred
0,63110_200410-00001-1,The Greens believe that everyone has a right t...,everyone,0.930811,0.053947,0.015241,positive
1,63110_200410-00010-1,The most obvious example in Australia is the a...,Indigenous peoples,0.031075,0.097694,0.871231,negative
2,63110_200410-00017-1,It promotes paper recycling and incorporates r...,workers,0.824241,0.164518,0.011241,positive
3,63110_200410-00030-1,Australians are the world's highest per capita...,Australians,0.029262,0.336320,0.634418,negative
4,63110_200410-00030-2,Australians are the world's highest per capita...,the world's highest per capita users of water,0.104216,0.345450,0.550335,negative
...,...,...,...,...,...,...,...
235,63110_201008-01711-1,the promotion amongst Australians of a broad u...,Australians,0.605993,0.378024,0.015982,positive
236,63110_201008-01711-2,the promotion amongst Australians of a broad u...,their people,0.584369,0.404133,0.011497,positive
237,63110_201008-01713-1,support the growth within our region of locali...,disenfranchised communities,0.945091,0.042553,0.012356,positive
238,63110_201008-01713-2,support the growth within our region of locali...,women,0.880113,0.109260,0.010627,positive
