# Classify group mention polarity with NLI

based on https://colab.research.google.com/github/MoritzLaurer/less-annotating-with-bert-nli/blob/master/BERT_NLI_demo.ipynb




In [3]:
from types import SimpleNamespace

args = SimpleNamespace()

args.model_name = 'MoritzLaurer/ModernBERT-large-zeroshot-v2.0'

args.data_file = '../../data/annotations/group_mention_categorization/consolidated_annotations.tsv'

args.experiment_name = 'mention_stance_nli'
args.experiment_model_path = './../../models/'

args.test_size = 0.15
args.dev_size = 0.15
args.seed = 1234

args.metric = 'f1_macro'
args.epochs = 5
args.learning_rate = 4e-5
args.train_batch_size = 16
args.eval_batch_size = 64
args.weight_decay = 0.3

In [5]:
## Load general packages
import pandas as pd

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    EarlyStoppingCallback,
    set_seed,
)

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict
from utils.nli import (
    clear_memory,
    clean_text,
    tokenize_nli_format,
    compute_metrics_binary
) 
from utils.nli.stance import STANCE_LABEL_CLASSES as label_classes
from utils.nli.stance import (
    format_nli_trainset, 
    format_nli_testset, 
)

In [6]:
set_seed(args.seed)

## Prepare the data

In [7]:
df = pd.read_csv(args.data_file, sep="\t", )

df = df[df.q_id=='stance']
df = df[df.label.str.lower().isin(label_classes)]

len(df)

299

In [8]:
df['input'] = df.text.apply(clean_text)
df = df[['mention_id', 'input', 'mention', 'label']]
df.loc[:, 'label'] = df.label.str.lower()

In [10]:
id2label = dict(enumerate(label_classes))
label2id = {l: i for i, l in id2label.items()}
label2id

{'positive': 0, 'neutral': 1, 'negative': 2}

## Create NLI hypotheses

In [12]:
from utils.nli.stance import STANCE_HYPOTHESIS_TEMPLATE
hypothesis_label_dict = {l: STANCE_HYPOTHESIS_TEMPLATE.format(label=l, entity='%s')  for l in label2id.keys()}
hypothesis_label_dict

{'positive': 'The author of the quote takes a positive stance towards "%s".',
 'neutral': 'The author of the quote takes a neutral stance towards "%s".',
 'negative': 'The author of the quote takes a negative stance towards "%s".'}

In [13]:
# prepare the input text
df.loc[:, "text_prepared"] = 'The quote: """' + df.input.fillna("") + '""" - end of the quote.'

In [14]:
df.rename(columns={'label': 'label_text'}, inplace=True)

## Train/dev/test split

In [15]:
# determine split sizes
n_test = int(len(df)*args.test_size)
n_dev = int(len(df)*args.dev_size)
n_train = len(df)-n_dev-n_test

# determine split indexes
tmp, test_idxs = train_test_split(df.index, test_size=n_test, stratify=df.label_text)
train_idxs, dev_idxs = train_test_split(tmp, test_size=n_dev, stratify=df.label_text[tmp])

# get split data frames
df_train = df.loc[train_idxs]
df_dev = df.loc[dev_idxs]
df_test = df.loc[test_idxs]
print(len(df_train), len(df_dev), len(df_test))

# covnert to NLI format
df_train = format_nli_trainset(df=df_train, hypo_label_dict=hypothesis_label_dict, random_seed=args.seed, keep_label_text_col=True)
df_dev = format_nli_testset(df=df_dev, hypo_label_dict=hypothesis_label_dict)
df_test = format_nli_testset(df=df_test, hypo_label_dict=hypothesis_label_dict)
print(len(df_train), len(df_dev), len(df_test))

211 44 44
293 132 132


In [None]:
# prevalence
print(df_train.label.mean())

0.27986348122866894


In [25]:
df_train.iloc[[290],:]

Unnamed: 0,label,hypothesis,text_prepared,label_text
290,1,The author of the quote takes a positive stanc...,"The quote: """"""The inevitable bill must be paid...",negative


In [22]:
print(df_train.iloc[290].to_json(indent=4))

{
    "label":1,
    "hypothesis":"The author of the quote takes a positive stance towards \"those who have done well in the last 5 years\".",
    "text_prepared":"The quote: \"\"\"The inevitable bill must be paid by those who have done well in the last 5 years.\"\"\" - end of the quote.",
    "label_text":"negative"
}


In [32]:
df_train.loc[df_train.text_prepared==df_train.text_prepared.iloc[290], ['label', 'hypothesis']].to_dict(orient='records')

[{'label': 0,
  'hypothesis': 'The author of the quote takes a negative stance towards "those who have done well in the last 5 years".'},
 {'label': 1,
  'hypothesis': 'The author of the quote takes a positive stance towards "those who have done well in the last 5 years".'}]

In [16]:
df_train.value_counts(['label_text', 'label'], sort=False)

label_text  label
negative    0         24
            1         25
neutral     0         17
            1         20
positive    0        170
            1         37
Name: count, dtype: int64

In [17]:
# balance
min_size, max_size = 30, 60
df_train = df_train.groupby(['label_text', 'label']).apply(lambda x: x.sample(n=min(max_size, max(min_size, len(x))), replace=len(x)<min_size, random_state=args.seed)).reset_index(drop=True)
df_train = df_train.sample(frac=1.0, random_state=args.seed).reset_index(drop=True)

  df_train = df_train.groupby(['label_text', 'label']).apply(lambda x: x.sample(n=min(max_size, max(min_size, len(x))), replace=len(x)<min_size, random_state=args.seed)).reset_index(drop=True)


In [18]:
del df_train['label_text']

## Fine-tuning

In [19]:
# !pip install tiktoken==0.9.0
# args.model_name = 'answerdotai/ModernBERT-base'

In [20]:
tokenizer = AutoTokenizer.from_pretrained(args.model_name, model_max_length=512)

**Tokenize data**

In [None]:
# convert pandas dataframes to Hugging Face dataset object to facilitate pre-processing
dataset = DatasetDict({
    "train": Dataset.from_pandas(df_train, preserve_index=False),
    "dev": Dataset.from_pandas(df_dev, preserve_index=False),
    "test": Dataset.from_pandas(df_test, preserve_index=False)
})

dataset = dataset.map(lambda ex: tokenize_nli_format(ex, tokenizer), batched=True)

# remove unnecessary columns for model training
dataset = dataset.remove_columns(["hypothesis", "text_prepared"])
dataset.set_format("torch")

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 217/217 [00:00<00:00, 28981.50 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 132/132 [00:00<00:00, 30222.62 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 132/132 [00:00<00:00, 31484.11 examples/s]


### Setting training arguments / hyperparameters

The following cell sets several important hyperparameters. We chose parameters that work well in general to avoid the need for hyperparameter search. Further below, we also provide code for hyperparameter search, if researchers want to try to increase performance by a few percentage points.

In [None]:
import os

# FP16 is a hyperparameter which can increase training speed and reduce memory consumption, but only on GPU and if batch-size > 8, see here: https://huggingface.co/transformers/performance.html?#fp16
fp16_bool = bool(torch.cuda.is_available())
# FP16 does not work on CPU or for multilingual mDeBERTa models
if "mdeberta" in args.model_name.lower(): fp16_bool = False  # multilingual mDeBERTa does not support FP16 yet: https://github.com/microsoft/DeBERTa/issues/77

model_path = os.path.join(args.experiment_model_path, args.experiment_name)

train_args = TrainingArguments(
    output_dir=os.path.join(model_path, 'results'),
    logging_dir=os.path.join(model_path, 'logs'),
    # hyperparameters
    learning_rate=2e-5, # args.learning_rate,
    lr_scheduler_type="cosine",
    per_device_train_batch_size=16, #args.train_batch_size,
    # gradient_accumulation_steps=4,
    per_device_eval_batch_size=args.eval_batch_size,
    num_train_epochs=15, #args.epochs,
    warmup_ratio=0.1,
    weight_decay=0.1,
    # reproducibility
    seed=args.seed,
    data_seed=args.seed,
    # full_determinism=True,
    # model storing and loading
    evaluation_strategy="epoch",
    report_to="all",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='worst_class_f1', # args.metric,
    greater_is_better=True,
    # fp16=fp16_bool,
    # fp16_full_eval=False,
    bf16=True
)



In [93]:
args.train_batch_size

16

In [95]:
df_train.iloc[0].to_dict()

{'label': 0,
 'hypothesis': 'The author of the quote takes a negative stance towards "some who believe there is nothing we can do to change things for the better".',
 'text_prepared': 'The quote: """There are some who believe there is nothing we can do to change things for the better.""" - end of the quote.'}

### Custom function to compute metrics for NLI

We multiplied each text N times for each class in the test set and NLI can only predict 2 or 3 classes: true/not-true or true/neutral/false. This means that we cannot use standard functions for computing metrics. The following function reformats the model's output in a way that allows for the calculation of standard metrics like accuracy, F1-macro etc.

### Fine-tuning and evaluation

In [96]:
# use GPU (cuda) if available, otherwise use CPU
device = "cuda" if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else "cpu"
model_init = lambda: AutoModelForSequenceClassification.from_pretrained(args.model_name, torch_dtype="auto", device_map="auto")#.to(device)

Let's start fine-tuning the model!

If you get an 'out-of-memory' error, reduce the 'per_device_train_batch_size' to 8 or 4 in the TrainingArguments above and restart the runtime. If you don't restart your runtime (menu to the to left 'Runtime' > 'Restart runtime') and rerun the entire script, the 'out-of-memory' error will probably not go away.

In [None]:
# training
trainer = Trainer(
    model_init=model_init,
    tokenizer=tokenizer,
    args=train_args,
    train_dataset=dataset["train"],  #.shard(index=1, num_shards=100),  # could shard data for faster testing https://huggingface.co/docs/datasets/processing.html#sharding-the-dataset-shard
    eval_dataset=dataset["dev"],  #.shard(index=1, num_shards=100),
    compute_metrics=lambda p: compute_metrics_binary(p, label_classes=list(label2id.keys())),
    callbacks = [EarlyStoppingCallback(early_stopping_patience=5)]
)

  trainer = Trainer(


In [98]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1 Macro,F1 Micro,Accuracy,Balanced Accuracy,F1 Positive,F1 Neutral,F1 Negative,Worst Class F1
1,No log,0.890543,0.450142,0.545455,0.545455,0.598148,0.461538,0.222222,0.666667,0.222222
2,No log,0.957434,0.588889,0.681818,0.681818,0.768519,0.714286,0.285714,0.766667,0.285714
3,No log,0.818312,0.63444,0.75,0.75,0.796296,0.714286,0.363636,0.825397,0.363636
4,No log,0.70978,0.673423,0.795455,0.795455,0.814815,0.714286,0.444444,0.861538,0.444444
5,No log,0.69642,0.505051,0.75,0.75,0.592593,0.666667,0.0,0.848485,0.0
6,No log,0.684749,0.505051,0.75,0.75,0.592593,0.666667,0.0,0.848485,0.0
7,No log,0.661712,0.505051,0.75,0.75,0.592593,0.666667,0.0,0.848485,0.0
8,No log,0.673053,0.510779,0.772727,0.772727,0.601852,0.666667,0.0,0.865672,0.0
9,No log,0.683464,0.510779,0.772727,0.772727,0.601852,0.666667,0.0,0.865672,0.0


  tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, output_dict=True, zero_division=0.0)
  tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, output_dict=True, zero_division=0.0)
  tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, output_dict=True, zero_division=0.0)
  tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, output_dict=True, zero_division=0.0)
  tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(l

TrainOutput(global_step=126, training_loss=0.5250408233158173, metrics={'train_runtime': 26.7454, 'train_samples_per_second': 121.703, 'train_steps_per_second': 7.852, 'total_flos': 291420712744308.0, 'train_loss': 0.5250408233158173, 'epoch': 9.0})

In [101]:
## Evaluate the fine-tuned model on the held-out test set
results = trainer.evaluate(eval_dataset=dataset["test"], metric_key_prefix='test')
{k.removeprefix('test_'): v for k, v in results.items() if 'f1' in k}

  tmp = classification_report(label_position_gold, hypo_position_highest_prob, labels=np.sort(pd.factorize(label_text_alphabetical, sort=True)[0]), target_names=label_text_alphabetical, output_dict=True, zero_division=0.0)
early stopping required metric_for_best_model, but did not find eval_worst_class_f1 so early stopping is disabled


{'f1_macro': 0.6349363197189285,
 'f1_micro': 0.8409090909090909,
 'f1_positive': 0.7272727272727273,
 'f1_neutral': 0.25,
 'f1_negative': 0.927536231884058,
 'worst_class_f1': 0.25}

In [None]:
import shutil
shutil.rmtree(model_path)
trainer.model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('./../../models/mention_stance_nli/tokenizer_config.json',
 './../../models/mention_stance_nli/special_tokens_map.json',
 './../../models/mention_stance_nli/tokenizer.json')

In [None]:
trainer.model.to('cpu');
del trainer
clean_memory()