# Social group mention stance classification


In this notebook, we fine-tune a pre-trained sentence transformer model into a multiclass classifier using the `setfit` library to categorize the stance (positive, neutral, or negative) expressed towards social group mentions.

This notebook is based on https://github.com/huggingface/setfit/blob/main/notebooks/text-classification_multilabel.ipynb

See also:

- https://huggingface.co/docs/setfit/en/how_to/multilabel
- https://github.com/huggingface/setfit/issues/413#issuecomment-1697751329

## Setup

In [1]:
import sys
# extend the module search path with the project's code directory
# (presumably where the `utils` package imported below lives -- verify)
sys.path.append('../../code/mention-classification')

In [2]:
import os
import numpy as np
import pandas as pd
import regex

import torch
import datasets
from sklearn.model_selection import train_test_split

from utils.setfit import get_class_weights, model_init, TrainerForSpanClassification

from transformers import AutoTokenizer, set_seed
from setfit import TrainingArguments, Trainer

from utils.metrics import *

In [None]:
# pandas
# numpy
# regex
# torch
# accelerate
# tokenizers
# sentencepiece
# datasets
# transformers
# setfit[absa]
# scikit-learn

In [3]:
# single seed reused for the data split and the training arguments further down
SEED = 42
set_seed(SEED)

In [6]:
# root directory for models; the fine-tuned classifier is stored in a sub-folder of it
model_path = '../../models'
# alternative starting point: an embedding model stored locally under `model_path`
# base_model = os.path.join(model_path, 'paraphrase-mpnet-base-v2-social-group-mention-attributes-embedding')
# checkpoint used for both the tokenizer and the classifier body
base_model = "sentence-transformers/paraphrase-mpnet-base-v2"

## Preparing the dataset

In [9]:
# read the consolidated annotations (tab-separated) into a data frame
data_path = '../../data/annotations/group_mention_categorization'
fp = os.path.join(data_path, 'consolidated_annotations.tsv')
df = pd.read_csv(fp, sep='\t')

In [10]:
# keep only the stance annotations and de-duplicate on the columns we need
is_stance = df.attribute == "stance"
keep_cols = ['mention_id', 'text', 'mention', 'label']
tmp = df.loc[is_stance, keep_cols].drop_duplicates()

In [11]:
# label distribution, counting missing labels as well
tmp['label'].value_counts(dropna=False)

label
Positive    242
Negative     34
Neutral      23
Unsure        1
Name: count, dtype: int64

In [12]:
# discard mentions whose stance was annotated as 'Unsure'
tmp = tmp.loc[tmp.label.ne('Unsure')]

In [13]:
# tmp = tmp[tmp.label != 'Neutral']

In [14]:
# integer id <-> label name lookups; ids follow the order in which labels first appear
labels_in_order = tmp.label.unique()
id2label = {idx: name for idx, name in enumerate(labels_in_order)}
label2id = {name: idx for idx, name in id2label.items()}

In [15]:
# integer-encoded target column derived from the label names
tmp['labels'] = tmp['label'].map(label2id)

In [16]:
# tokenizer of the base model; used below for its separator token and to measure input lengths
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [17]:
# using concat strategy
tmp['input'] = tmp.text + tokenizer.sep_token + tmp.mention 
max_length_ = max(tokenizer(tmp.input.to_list(), truncation=False, padding=False, return_length=True).length)
cols = ['input', 'labels']
cols_mapping = {"input": "text", "labels": "label"}

### split the data

In [18]:
# stratified 75/25 split over row positions (so the result can be used with `.iloc`)
row_positions = range(len(tmp))
trn, tst = train_test_split(
    row_positions,
    test_size=0.25,
    random_state=SEED,
    stratify=tmp.label,
)

In [19]:
# training rows, restricted to the model input and the integer label
tmp_train = tmp.iloc[trn][cols]
# optional (disabled): inspect the class balance and resample 50 rows per class with replacement
# print(tmp_train.labels.value_counts(dropna=False))
# # downsample the training set
# tmp_train = tmp_train.groupby('labels').sample(50, random_state=SEED, replace=True).reset_index(drop=True)

In [20]:
# wrap both splits as Hugging Face datasets (the pandas index is not carried over)
train_split = datasets.Dataset.from_pandas(tmp_train, preserve_index=False)
test_split = datasets.Dataset.from_pandas(tmp.iloc[tst][cols], preserve_index=False)
dataset = datasets.DatasetDict({'train': train_split, 'test': test_split})

## Prepare setfit fine-tuning

In [21]:
# per-class weights computed from the training labels, cast to float
y_train = np.array(dataset['train']['labels'])
class_weights = get_class_weights(y_train).astype(float)
class_weights

array([0.0537386 , 0.57215805, 0.37410334])

In [22]:
# the fine-tuned classifier (and any checkpoints) go to <model_path>/<model_id>
model_id = 'social-group-mention-stance-classifier'
model_dir = os.path.join(model_path, model_id)

training_args = TrainingArguments(
    output_dir=model_dir,
    # tuples hold one value per training phase; the training log shows the first
    # phase running with batch size 32 / 0 epochs, followed by a 7-epoch phase
    batch_size=(32, 4),
    max_length=max_length_,
    num_epochs=(0, 7),
    max_steps=-1,
    end_to_end=True,
    # loss=CosineSimilarityLoss,
    # samples_per_label=2,
    # use_amp=True,
    #report_to='none',
    eval_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='balanced_accuracy',
    # balanced accuracy is a higher-is-better score; state the direction explicitly
    # because setfit's default (greater_is_better=False) is meant for losses
    greater_is_better=True,
    seed=SEED
)

In [23]:
from utils.metrics import compute_metrics_multiclass

# pick the best available accelerator instead of assuming an Apple-silicon (MPS) machine,
# so the notebook also runs on CUDA or CPU-only hosts
if torch.cuda.is_available():
    device = 'cuda'
elif getattr(torch.backends, 'mps', None) is not None and torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'

# NOTE(review): the test split is also passed as `eval_dataset`; if it is used for
# checkpoint selection, the metrics reported below are not from fully held-out data.
# trainer = TrainerForSpanClassification(
trainer = Trainer(
    model_init=lambda: model_init(
        model_name=base_model,
        id2label=id2label,
        # multitarget_strategy='one-vs-rest',
        class_weights=class_weights,
        use_span_embedding=False,#True,
        device=device
    ),
    metric=lambda p, t: compute_metrics_multiclass(p, t, id2label),
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    column_mapping=cols_mapping
)

# for deterministic results
trainer._args.seed = SEED
trainer.st_trainer.args.seed = SEED
trainer.st_trainer.args.data_seed = SEED
trainer.st_trainer.args.full_determinism = True

Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


README.md:   0%|          | 0.00/3.52k [00:00<?, ?B/s]

Map:   0%|          | 0/224 [00:00<?, ? examples/s]

### Fine-tune

In [24]:
# run fine-tuning with the arguments configured above
trainer.train()

***** Running training *****
  Num unique pairs = 33950
  Batch size = 32
  Num epochs = 0


0it [00:00, ?it/s]

{'train_runtime': 0.0079, 'train_samples_per_second': 0.0, 'train_steps_per_second': 0.0, 'train_loss': 0.0, 'epoch': 0}


Epoch:   0%|          | 0/7 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

### Evaluate

In [25]:
# score the fine-tuned model on the evaluation dataset given to the trainer
metrics = trainer.evaluate()

***** Running evaluation *****
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [27]:
# show the evaluation metrics (overall and per class)
metrics

{'accuracy': 0.8133333333333334,
 'balanced_accuracy': 0.4781420765027322,
 'f1_macro': 0.45607534352323603,
 'precision_macro': 0.436026936026936,
 'recall_macro': 0.4781420765027322,
 'precision_Positive': 0.8636363636363636,
 'recall_Positive': 0.9344262295081968,
 'f1_Positive': 0.8976377952755905,
 'precision_Neutral': 0.0,
 'recall_Neutral': 0.0,
 'f1_Neutral': 0.0,
 'precision_Negative': 0.4444444444444444,
 'recall_Negative': 0.5,
 'f1_Negative': 0.47058823529411764}

### Save the model

In [None]:
import shutil

# start from a clean output directory before saving the final model
# (removes whatever the training run wrote there); the existence check
# keeps the cell from raising FileNotFoundError when nothing was written
if os.path.isdir(model_dir):
    shutil.rmtree(model_dir)

In [None]:
# persist the fine-tuned model to `model_dir`
trainer.model.save_pretrained(model_dir)

In [None]:
# move the model to the CPU and drop the trainer before the next experiment
# (trailing `;` suppresses the cell output)
trainer.model.to('cpu');
del trainer

## Fine-tune with setfitABSA

### Prepare the data

In [64]:
# mention ids look like '<manifesto_id>-<sentence_nr>-<mention_nr>'; split them into three columns
# (requires every id to contain exactly two '-' separators, otherwise the assignment fails)
tmp[['manifesto_id', 'sentence_nr', 'mention_nr']] = tmp.mention_id.str.split('-', expand=True)

In [65]:
import regex


def _first_span(row):
    """Return the (start, end) character span of the first literal match of ``row.mention`` in ``row.text``."""
    # the mention is escaped so it is matched verbatim, not as a pattern
    match = regex.search(regex.escape(row.mention), row.text)
    return match.span()


tmp['span'] = tmp.apply(_first_span, axis=1)

In [66]:
# reuse the row positions of the earlier train/test split, this time keeping all columns
tmp_train = tmp.iloc[trn]
tmp_test = tmp.iloc[tst]

In [67]:
# rank spans within sentence
tmp_train.loc[:, 'ordinal'] = tmp_train.groupby(['manifesto_id', 'sentence_nr', 'mention']).cumcount() 
tmp_test.loc[:, 'ordinal'] = tmp_test.groupby(['manifesto_id', 'sentence_nr', 'mention']).cumcount() 

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_train.loc[:, 'ordinal'] = tmp_train.groupby(['manifesto_id', 'sentence_nr', 'mention']).cumcount()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_test.loc[:, 'ordinal'] = tmp_test.groupby(['manifesto_id', 'sentence_nr', 'mention']).cumcount()


In [68]:
# columns handed to the ABSA trainer
cols = ['text', 'mention', 'label', 'ordinal']
absa_frames = {'train': tmp_train, 'test': tmp_test}
dataset = datasets.DatasetDict({
    split: datasets.Dataset.from_pandas(frame[cols], preserve_index=False)
    for split, frame in absa_frames.items()
})

In [58]:
# !pip install spacy==3.8.5
# !python -m spacy download en_core_web_lg==3.8.0

In [59]:
from setfit import AbsaTrainer, TrainingArguments, AbsaModel

# start from the plain sentence-transformer checkpoint; the classification heads
# are freshly initialised (see the warning printed below) and must be trained
model = AbsaModel.from_pretrained(
    "sentence-transformers/paraphrase-mpnet-base-v2",
)

model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.
model_head.pkl not found on HuggingFace Hub, initialising classification head with random weights. You should TRAIN this model on a downstream task to use it for predictions and inference.


In [60]:
# training configuration for the ABSA run: no checkpoints saved, no experiment reporting
args = TrainingArguments(
    num_epochs=1,
    batch_size=4,
    num_iterations=20,
    save_strategy="no",
    report_to="none"
)

In [63]:
# spot check: show the training rows of one example sentence
tmp_train[tmp_train.text.str.startswith('In collaboration with a number')]

Unnamed: 0,mention_id,text,mention,label,labels,input,manifesto_id,sentence_nr,mention_nr,span,ordinal
5618,93712_199209-377225-5,In collaboration with a number of collectives ...,economists,Positive,0,In collaboration with a number of collectives ...,93712_199209,377225,5,"(101, 111)",0
5598,93712_199209-377225-3,In collaboration with a number of collectives ...,psychologists,Positive,0,In collaboration with a number of collectives ...,93712_199209,377225,3,"(77, 90)",1


In [69]:
# dataset column -> column name handed to AbsaTrainer through `column_mapping`
absa_column_mapping = {
    "text": "text",
    "mention": "span",
    "label": "label",
    "ordinal": "ordinal",
}

trainer = AbsaTrainer(
    model,
    args=args,
    train_dataset=dataset['train'],  # only the train split is passed; no eval dataset is given
    column_mapping=absa_column_mapping,
)

Map:   0%|          | 0/1095 [00:00<?, ? examples/s]

Map:   0%|          | 0/221 [00:00<?, ? examples/s]

In [70]:
# run the ABSA fine-tuning
trainer.train()

***** Running training *****
  Num unique pairs = 43800
  Batch size = 4
  Num epochs = 1


  0%|          | 0/10950 [00:00<?, ?it/s]

: 