# Social group mention stance classification


In this notebook, we fine-tune a pre-trained sentence transformer model for multilabel classifiers using the `setfit` library to categorize into which attribute dimensions social group mentions belong.

notebook based on https://github.com/huggingface/setfit/blob/main/notebooks/text-classification_multilabel.ipynb

See also:

- https://huggingface.co/docs/setfit/en/how_to/multilabel
- https://github.com/huggingface/setfit/issues/413#issuecomment-1697751329

## Setup

In [1]:
import sys
sys.path.append('../../code/mention-classification')

In [2]:
import os
import numpy as np
import pandas as pd
import regex

import torch
import datasets
from sklearn.model_selection import train_test_split

from utils.setfit import get_class_weights, model_init, TrainerForSpanClassification

from transformers import AutoTokenizer, set_seed
from setfit import TrainingArguments, Trainer

from utils.metrics import *

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# pandas
# numpy
# regex
# torch
# accelerate
# tokenizers
# sentencepiece
# datasets
# transformers
# setfit[absa]
# scikit-learn

In [3]:
SEED = 42
set_seed(SEED)

In [4]:
model_path = '../../models'
# base_model = os.path.join(model_path, 'paraphrase-mpnet-base-v2-social-group-mention-attributes-embedding')
base_model = "sentence-transformers/paraphrase-mpnet-base-v2"

## Preparing the dataset

In [6]:
data_path = '../../data/annotations/group_mention_categorization'
fp = os.path.join(data_path, 'consolidated_annotations.tsv')
df = pd.read_csv(fp, sep='\t')

In [7]:
tmp = df[df.attribute=="stance"]
tmp = tmp[['mention_id', 'text', 'mention', 'label']].drop_duplicates()

In [8]:
tmp['label'].value_counts(dropna=False)

label
Positive    242
Negative     34
Neutral      23
Unsure        1
Name: count, dtype: int64

In [9]:
tmp = tmp[tmp.label != 'Unsure']

In [13]:
# tmp = tmp[tmp.label != 'Neutral']

In [10]:
id2label = dict(enumerate(tmp.label.unique()))
label2id = {l: i for i, l in id2label.items()}

In [11]:
tmp.loc[:,'labels'] = tmp.label.map(label2id)

In [12]:
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [13]:
# using concat strategy
tmp['input'] = tmp.text + tokenizer.sep_token + tmp.mention 
max_length_ = max(tokenizer(tmp.input.to_list(), truncation=False, padding=False, return_length=True).length)
cols = ['input', 'labels']
cols_mapping = {"input": "text", "labels": "label"}

### split the data

In [14]:
trn, tst = train_test_split(range(len(tmp)), test_size=0.25, random_state=SEED, stratify=tmp.label)

In [15]:
tmp_train = tmp.iloc[trn][cols]
# print(tmp_train.labels.value_counts(dropna=False))
# # downsample the training set
# tmp_train = tmp_train.groupby('labels').sample(50, random_state=SEED, replace=True).reset_index(drop=True)

In [16]:
dataset = datasets.DatasetDict({
    'train': datasets.Dataset.from_pandas(tmp_train, preserve_index=False),
    'test': datasets.Dataset.from_pandas(tmp.iloc[tst][cols], preserve_index=False)
})

## Prepare setfit fine-tuning

In [17]:
y_train = np.array(dataset['train']['labels'])
class_weights = get_class_weights(y_train)
class_weights = class_weights.astype(float)
class_weights

array([0.0537386 , 0.57215805, 0.37410334])

In [50]:
model_id = 'social-group-mention-stance-classifier'
model_dir = os.path.join(model_path, model_id)

training_args = TrainingArguments(
    output_dir=model_dir,
    batch_size=(32, 8),
    max_length=max_length_,
    num_epochs=(1, 15),
    max_steps=50,
    end_to_end=True,
    # loss=CosineSimilarityLoss,
    # samples_per_label=2,
    # use_amp=True,
    #report_to='none',
    eval_strategy='steps',
    eval_steps=25,
    #eval_strategy = 'epoch',
    #save_strategy = 'epoch',
    #save_total_limit=2,
    #load_best_model_at_end=True,
    ## metric_for_best_model='balanced_accuracy',
    seed=SEED
)

In [51]:
from utils.metrics import compute_metrics_multiclass

# trainer = TrainerForSpanClassification(
trainer = Trainer(
    model_init=lambda: model_init(
        model_name=base_model,
        id2label=id2label,
        # multitarget_strategy='one-vs-rest',
        class_weights=class_weights,
        use_span_embedding=False,#True,
        # device='mps'
    ),
    metric=lambda p, t: compute_metrics_multiclass(p, t, id2label),
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    column_mapping=cols_mapping
)

# for deterministic results
trainer._args.seed = SEED
trainer.st_trainer.args.seed = SEED
trainer.st_trainer.args.data_seed = SEED
trainer.st_trainer.args.full_determinism = True

Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset
Map: 100%|██████████| 224/224 [00:00<00:00, 31068.92 examples/s]


### Fine-tune

In [52]:
trainer.train()

***** Running training *****
  Num unique pairs = 1600
  Batch size = 32
  Num epochs = 1


Step,Training Loss,Validation Loss
25,0.2985,0.236249
50,0.2084,0.217134


Epoch: 100%|██████████| 15/15 [00:15<00:00,  1.03s/it]               


### Evaluate

In [55]:
metrics = trainer.evaluate()
metrics

***** Running evaluation *****


{'accuracy': 0.8533333333333334,
 'balanced_accuracy': 0.5669398907103825,
 'f1_macro': 0.5544619422572179,
 'precision_macro': 0.5429292929292929,
 'recall_macro': 0.5669398907103825,
 'precision_Positive': 0.8787878787878788,
 'recall_Positive': 0.9508196721311475,
 'f1_Positive': 0.9133858267716536,
 'precision_Neutral': 0.0,
 'recall_Neutral': 0.0,
 'f1_Neutral': 0.0,
 'precision_Negative': 0.75,
 'recall_Negative': 0.75,
 'f1_Negative': 0.75}

### Save the model

In [56]:
# import shutil
# shutil.rmtree(model_dir)
# trainer.model.save_pretrained(model_dir)
# tokenizer.save_pretrained(model_dir)
# trainer.model.to('cpu');
# del trainer

## Fine-tune with setfitABSA

### Prepare the data

In [57]:
tmp[['manifesto_id', 'sentence_nr', 'mention_nr']] = tmp.mention_id.str.split('-', expand=True)

In [58]:
import regex
tmp['span'] = tmp.apply(lambda x: regex.search(regex.escape(x.mention), x.text).span(), axis=1)

In [59]:
tmp_train = tmp.iloc[trn]
tmp_test = tmp.iloc[tst]

In [60]:
# rank spans within sentence
tmp_train.loc[:, 'ordinal'] = tmp_train.groupby(['manifesto_id', 'sentence_nr', 'mention']).cumcount() 
tmp_test.loc[:, 'ordinal'] = tmp_test.groupby(['manifesto_id', 'sentence_nr', 'mention']).cumcount() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_train.loc[:, 'ordinal'] = tmp_train.groupby(['manifesto_id', 'sentence_nr', 'mention']).cumcount()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_test.loc[:, 'ordinal'] = tmp_test.groupby(['manifesto_id', 'sentence_nr', 'mention']).cumcount()


In [61]:
cols = ['text', 'mention', 'label', 'ordinal']
dataset = datasets.DatasetDict({
    'train': datasets.Dataset.from_pandas(tmp_train[cols], preserve_index=False),
    'test': datasets.Dataset.from_pandas(tmp_test[cols], preserve_index=False)
})

In [58]:
# !pip install spacy==3.8.5
# !python -m spacy download en_core_web_lg==3.8.0

In [63]:
del trainer
torch.cuda.empty_cache()

In [62]:
from setfit import AbsaTrainer, TrainingArguments, AbsaModel

model = AbsaModel.from_pretrained(
    "sentence-transformers/paraphrase-mpnet-base-v2",
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [68]:
TrainingArguments?

[31mInit signature:[39m
TrainingArguments(
    output_dir: [33m'str'[39m = [33m'checkpoints'[39m,
    batch_size: [33m'Union[int, Tuple[int, int]]'[39m = ([32m16[39m, [32m2[39m),
    num_epochs: [33m'Union[int, Tuple[int, int]]'[39m = ([32m1[39m, [32m16[39m),
    max_steps: [33m'int'[39m = -[32m1[39m,
    sampling_strategy: [33m'str'[39m = [33m'oversampling'[39m,
    num_iterations: [33m'Optional[int]'[39m = [38;5;28;01mNone[39;00m,
    body_learning_rate: [33m'Union[float, Tuple[float, float]]'[39m = ([32m2e-05[39m, [32m1e-05[39m),
    head_learning_rate: [33m'float'[39m = [32m0.01[39m,
    loss: [33m'Callable'[39m = <[38;5;28;01mclass[39;00m [33m'sentence_transformers.losses.CosineSimilarityLoss.CosineSimilarityLoss'[39m>,
    distance_metric: [33m'Callable'[39m = <function BatchHardTripletLossDistanceFunction.cosine_distance at [32m0x7c571754c860[39m>,
    margin: [33m'float'[39m = [32m0.25[39m,
    end_to_end: [33m'bool'[39m =

In [69]:
args = TrainingArguments(
    num_epochs=1,
    max_steps=50,
    batch_size=4,
    num_iterations=20,
    save_strategy="no",
    report_to="none"
)

In [70]:
trainer = AbsaTrainer(
    model,
    args=args,
    train_dataset=dataset['train'],  # if you want to train over the entire train set change experiment_ds to train_ds
    column_mapping={
        "text": "text",
        "mention": "span",
        "label": "label",
        "ordinal": "ordinal",
    },
)

Map: 100%|██████████| 1094/1094 [00:00<00:00, 47094.60 examples/s]
Map: 100%|██████████| 221/221 [00:00<00:00, 22861.47 examples/s]


In [71]:
trainer.train()

***** Running training *****
  Num unique pairs = 43760
  Batch size = 4
  Num epochs = 1


Step,Training Loss


***** Running training *****
  Num unique pairs = 8840
  Batch size = 4
  Num epochs = 1


Step,Training Loss


In [75]:
ds.rename_column?

[31mSignature:[39m
ds.rename_column(
    original_column_name: str,
    new_column_name: str,
    new_fingerprint: Optional[str] = [38;5;28;01mNone[39;00m,
) -> [33m'Dataset'[39m
[31mDocstring:[39m
Rename a column in the dataset, and move the features associated to the original column under the new column
name.

Args:
    original_column_name (`str`):
        Name of the column to rename.
    new_column_name (`str`):
        New name for the column.
    new_fingerprint (`str`, *optional*):
        The new fingerprint of the dataset after transform.
        If `None`, the new fingerprint is computed using a hash of the previous fingerprint, and the transform arguments.

Returns:
    [`Dataset`]: A copy of the dataset with a renamed column.

Example:

```py
>>> from datasets import load_dataset
>>> ds = load_dataset("rotten_tomatoes", split="validation")
>>> ds = ds.rename_column('label', 'label_new')
Dataset({
    features: ['text', 'label_new'],
    num_rows: 1066
})
```
[31mF

In [None]:
ds = dataset['test'].remove_columns(['label'])
ds = ds.rename_column('mention', 'span')
output  = model.predict(ds) # a new column which holds the predicted polarity, "pred_polarity", is added to the dataset

Dataset({
    features: ['text', 'span', 'ordinal', 'pred_polarity'],
    num_rows: 75
})

In [80]:
y_pred = [label2id[l] for l in output['pred_polarity']]
y_test = [label2id[l] for l in dataset['test']['label']]
compute_metrics_multiclass(y_pred, y_test, id2label)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'accuracy': 0.8266666666666667,
 'balanced_accuracy': 0.5198087431693988,
 'f1_macro': 0.4867724867724868,
 'precision_macro': 0.45897435897435895,
 'recall_macro': 0.5198087431693988,
 'precision_Positive': 0.8769230769230769,
 'recall_Positive': 0.9344262295081968,
 'f1_Positive': 0.9047619047619048,
 'precision_Neutral': 0.0,
 'recall_Neutral': 0.0,
 'f1_Neutral': 0.0,
 'precision_Negative': 0.5,
 'recall_Negative': 0.625,
 'f1_Negative': 0.5555555555555556}