# Social group mention attribute category and attributes multilabel text classification

We have collected human annotations that categorize mentions of social groups in party manifestos into the following (hierarchical) scheme of attribute dimensions and attribute classes:

- economic attributes
    - class membership
    - ecology of group
    - education level
    - employment status
    - income/wealth/economic status
    - occupation/profession
    - other
- non-economic attributes:
   - age
   - crime
   - ethnicity
   - family
   - gender/sexuality
   - health
   - nationality
   - other
   - place/location
   - religion
   - shared values/mentalities
- universal

In this notebook, we fine-tune a pre-trained sentence transformer model for multilabel classifiers using the `setfit` library to categorize into which attribute dimensions social group mentions belong.

notebook based on https://github.com/huggingface/setfit/blob/main/notebooks/text-classification_multilabel.ipynb

See also:

- https://huggingface.co/docs/setfit/en/how_to/multilabel
- https://github.com/huggingface/setfit/issues/413#issuecomment-1697751329

## Setup

In [1]:
import sys
sys.path.append('../../code/mention-classification')

In [3]:
import os
import numpy as np
import pandas as pd
import regex

import torch
import datasets
from sklearn.model_selection import train_test_split

from utils.setfit import get_class_weights, model_init, TrainerForSpanClassification

from transformers import AutoTokenizer, set_seed
from setfit import TrainingArguments, Trainer

from utils.metrics import *

In [4]:
SEED = 42
set_seed(SEED)

In [5]:
model_path = '../../models'
base_model = os.path.join(model_path, 'paraphrase-mpnet-base-v2-social-group-mention-attributes-embedding')

## Preparing the dataset

In [23]:
data_path = '../../data/annotations/group_mention_categorization'
fp = os.path.join(data_path, 'consolidated_annotations.tsv')
df = pd.read_csv(fp, sep='\t')

## Universal/econ/non-econ as three-way multilabel problem

### Prepare the data

In [7]:
# stack by category
tmp = pd.concat([
    df[df.q_id == 'universal_attributes'].drop(columns=['category']),
    df[df.q_id == 'economic_attributes'].groupby(['mention_id', 'text', 'mention', 'q_id']).agg({'label': lambda x: 'Yes' if (x=='Yes').any() else 'No'}).reset_index(),
    df[df.q_id == 'non-economic_attributes'].groupby(['mention_id', 'text', 'mention', 'q_id']).agg({'label': lambda x: 'Yes' if (x=='Yes').any() else 'No'}).reset_index()
])
tmp.reset_index(drop=True, inplace=True)

# get dimensions
tmp.q_id = tmp.q_id.str.removesuffix('_attributes')
features = tmp.q_id.unique().tolist()

# reshape to wide format
tmp = tmp.pivot(index=['mention_id', 'text', 'mention'], columns='q_id', values='label').reset_index()
tmp = tmp.rename_axis(None, axis=1)

# keep only fully gold-labeled examples
tmp = tmp[tmp[features].isna().sum(axis=1) == 0]

In [8]:
tmp[features].value_counts(dropna=False)

universal  economic  non-economic
No         No        Yes             192
           Yes       No              144
                     Yes              56
Yes        No        No               55
No         No        No                2
Name: count, dtype: int64

In [9]:
tmp = tmp[~(tmp[features]=='No').all(axis=1)]

In [10]:
label2id = {'No': 0, 'Yes': 1}
id2label = {0: 'No', 1: 'Yes'}
tmp.loc[:,features] = tmp.loc[:,features].apply(lambda x: x.map(label2id))

In [11]:
tmp['labels'] = tmp.loc[:,features].apply(list, axis=1)

In [12]:
tmp[features].mean(axis=0)
# strong label class imbalance

universal       0.123043
economic        0.447427
non-economic     0.55481
dtype: object

In [13]:
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [15]:
# using concat strategy
tmp['input'] = tmp.text + tokenizer.sep_token + tmp.mention 
max_length_ = max(tokenizer(tmp.input.to_list(), truncation=False, padding=False, return_length=True).length)
cols = ['input', 'labels']
cols_mapping = {"input": "text", "labels": "label"}

# # using span embedding strategy
# tmp['span'] = tmp.apply(lambda x: regex.search(regex.escape(x.mention), x.text).span(), axis=1)
# max_length_ = max(tokenizer(tmp.text.to_list(), truncation=False, padding=False, return_length=True).length)
# cols = ['text', 'span', 'labels']
# cols_mapping = {'text': 'text', 'span': 'span', 'labels': 'label'}

### split the data

In [16]:
trn, tst = train_test_split(range(len(tmp)), test_size=0.25, random_state=SEED)

In [17]:
dataset = datasets.DatasetDict({
    'train': datasets.Dataset.from_pandas(tmp.iloc[trn][cols], preserve_index=False),
    'test': datasets.Dataset.from_pandas(tmp.iloc[tst][cols], preserve_index=False)
})

### Prepare fine-tuning

In [18]:
feats = tmp.iloc[trn][features].to_numpy()
class_weights = get_class_weights(feats, multitarget=True)
class_weights = class_weights.astype(float)
class_weights

array([0.67261321, 0.18427759, 0.14310919])

In [19]:
id2label = {i: l for i, l in enumerate(features)}
label2id = {l: i for i, l in enumerate(features)}
id2label

{0: 'universal', 1: 'economic', 2: 'non-economic'}

In [65]:
model_id = 'social-group-mention-attribute-dimension-classifier'
model_dir = os.path.join(model_path, model_id)

training_args = TrainingArguments(
    output_dir=model_dir,
    batch_size=(32, 4),
    max_length=max_length_,
    num_epochs=(0, 15),
    max_steps=-1,
    end_to_end=True,
    # loss=CosineSimilarityLoss,
    # samples_per_label=2,
    # use_amp=True,
    report_to='none',
    seed=SEED
)

In [21]:
from utils.metrics import compute_metrics_multilabel

# trainer = TrainerForSpanClassification(
trainer = Trainer(
    model_init=lambda: model_init(
        model_name=base_model,
        id2label=id2label,
        multitarget_strategy='one-vs-rest',
        class_weights=class_weights,
        use_span_embedding=False,#True,
        device='mps'
    ),
    metric=lambda p, t: compute_metrics_multilabel(p, t, id2label),
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    column_mapping=cols_mapping
)

# for deterministic results
trainer._args.seed = SEED
trainer.st_trainer.args.seed = SEED
trainer.st_trainer.args.data_seed = SEED
trainer.st_trainer.args.full_determinism = True

Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Map: 100%|██████████| 335/335 [00:00<00:00, 26033.72 examples/s]


### Fine-tune

In [22]:
trainer.train()

***** Running training *****
  Num unique pairs = 75454
  Batch size = 32
  Num epochs = 0
0it [00:00, ?it/s]


{'train_runtime': 0.0021, 'train_samples_per_second': 0.0, 'train_steps_per_second': 0.0, 'train_loss': 0.0, 'epoch': 0}


Epoch: 100%|██████████| 15/15 [05:24<00:00, 21.61s/it]


### Evaluate

In [23]:
metrics = trainer.evaluate()

***** Running evaluation *****


In [24]:
res = pd.DataFrame(metrics, index=[0]).T.reset_index().rename(columns={'index': 'metric', 0: 'value'})
res[['metric', 'category']] = res.metric.str.split('_', expand=True)
res = res.pivot(index='category', columns='metric', values='value')
# remove index names
res.columns.name = None
res.index.name = None
res.loc[['macro']+features]

Unnamed: 0,f1,precision,recall,support
macro,0.903006,0.957418,0.859259,
universal,0.888889,1.0,0.8,15.0
economic,0.927273,0.910714,0.944444,54.0
non-economic,0.892857,0.961538,0.833333,60.0


## error analysis

In [25]:
probs = trainer.model.predict_proba(dataset['test']['input'], as_numpy=True)
preds = np.where(probs > 0.5, 1, 0)

In [27]:
# sanity check: any universal and other attributes? (not allowed)
idxs = np.where(np.logical_and(preds[:, 0]==1, preds[:, 1:].sum(axis=1)>0))[0]
len(idxs)
# okay!

0

In [40]:
def parse_input(x):
  # text, mention = x.split(tokenizer.sep_token)
  # span = regex.search(regex.escape(mention), text).span()
  return x.split(tokenizer.sep_token)

In [54]:
errors_df = [] 
for attribute, attribute_id in label2id.items():
    errors = preds != dataset['test']['labels']
    idxs = np.where(errors[:, attribute_id])[0]

    tmp = pd.DataFrame([parse_input(x) for x in dataset['test'].select(idxs)['input']], columns=['text', 'mention'])
    tmp['attribute'] = attribute
    tmp['label'] = np.array(dataset['test'].select(idxs)['labels'])[:, attribute_id]
    tmp['pred'] = preds[idxs, attribute_id]
    errors_df.append(tmp)

errors_df = pd.concat(errors_df)
errors_df

Unnamed: 0,text,mention,attribute,label,pred
0,Party for living people.,living people,universal,1,0
1,x Businesses should pay a normal share of thei...,society,universal,1,0
2,Young people need places where they can develo...,groups,universal,1,0
0,In a continuously technologically improving so...,a continuously technologically improving society,economic,0,1
1,A society where wealth is not measured in cons...,A society where wealth is not measured in cons...,economic,0,1
2,x Businesses should pay a normal share of thei...,society,economic,0,1
3,The Greens propose to experiment with new ways...,those who are undergoing profound democratic r...,economic,0,1
4,"Children with a migrant background, children f...",children with a dependent or mentally ill parent,economic,1,0
5,Elderly care and care should be of high qualit...,mentors,economic,1,0
6,Young people who don't want education now - bu...,Young people who don't want education now - bu...,economic,1,0


## Save the model

In [61]:
import shutil
shutil.rmtree(model_dir)

In [70]:
trainer.model.save_pretrained(model_dir)

In [71]:
trainer.model.to('cpu');
del trainer

## granular attribute classification problem

### Prepare the data

In [30]:
tmp = df[df.q_id!='stance']

tmp.loc[:, 'attribute_combination'] = tmp.attribute_combination.str.removesuffix(': ')

features = tmp.attribute_combination.unique().tolist()

# pivot labels for attribute_combination to columns using mention_id, text, and mention as id vars
tmp = tmp.pivot(index=['mention_id', 'text', 'mention'], columns='attribute_combination', values='label').reset_index()
tmp = tmp.rename_axis(None, axis=1)

# keep only fully gold-labeled examples
tmp = tmp[tmp[features].isna().sum(axis=1) == 0]
tmp = tmp[~(tmp[features]=='No').all(axis=1)]

['economic: class membership',
 'economic: ecology of group',
 'economic: education level',
 'economic: employment status',
 'economic: income/wealth/economic status',
 'economic: occupation/profession',
 'economic: other',
 'non-economic: age',
 'non-economic: crime',
 'non-economic: ethnicity',
 'non-economic: family',
 'non-economic: gender/sexuality',
 'non-economic: health',
 'non-economic: nationality',
 'non-economic: other',
 'non-economic: place/location',
 'non-economic: religion',
 'non-economic: shared values/mentalities',
 'universal']

In [34]:
# one-hot encode labels
label2id = {'No': 0, 'Yes': 1}
id2label = {0: 'No', 1: 'Yes'}
tmp.loc[:,features] = tmp.loc[:,features].apply(lambda x: x.map(label2id))

In [45]:
cnts = tmp[features].sum(axis=0)
drop_these = cnts[cnts <= 4].index.tolist()
for f in drop_these:
    features.remove(f)

tmp = tmp[['mention_id', 'text', 'mention'] + features]

In [47]:
tmp.loc[:, 'labels'] = tmp.loc[:,features].apply(list, axis=1)

In [48]:
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [52]:
# using concat strategy
tmp.loc[:, 'input'] = tmp.text + tokenizer.sep_token + tmp.mention 
max_length_ = max(tokenizer(tmp.input.to_list(), truncation=False, padding=False, return_length=True).length)
cols = ['input', 'labels']
cols_mapping = {"input": "text", "labels": "label"}

# # using span embedding strategy
# tmp['span'] = tmp.apply(lambda x: regex.search(regex.escape(x.mention), x.text).span(), axis=1)
# max_length_ = max(tokenizer(tmp.text.to_list(), truncation=False, padding=False, return_length=True).length)
# cols = ['text', 'span', 'labels']
# cols_mapping = {'text': 'text', 'span': 'span', 'labels': 'label'}

In [None]:
tmp['signature'] = tmp[features].apply(lambda r: '; '.join([f for f in features if r[f]==1]), axis=1)

### split the data

In [58]:
trn, tst = train_test_split(range(len(tmp)), test_size=0.25, random_state=SEED)

In [59]:
dataset = datasets.DatasetDict({
    'train': datasets.Dataset.from_pandas(tmp.iloc[trn][cols], preserve_index=False),
    'test': datasets.Dataset.from_pandas(tmp.iloc[tst][cols], preserve_index=False)
})

### Prepare fine-tuning

In [60]:
feats = tmp.iloc[trn][features].to_numpy()
class_weights = get_class_weights(feats, multitarget=True)
class_weights = class_weights.astype(float)
class_weights

array([0.03643279, 0.05725153, 0.07286558, 0.02428853, 0.02504754,
       0.01292776, 0.01512305, 0.08905793, 0.1335869 , 0.02671738,
       0.10019017, 0.05343476, 0.01541387, 0.10019017, 0.20038035,
       0.01705365, 0.02003803])

In [61]:
id2label = {i: l for i, l in enumerate(features)}
label2id = {l: i for i, l in enumerate(features)}
id2label

{0: 'economic: class membership',
 1: 'economic: ecology of group',
 2: 'economic: education level',
 3: 'economic: employment status',
 4: 'economic: income/wealth/economic status',
 5: 'economic: occupation/profession',
 6: 'non-economic: age',
 7: 'non-economic: crime',
 8: 'non-economic: ethnicity',
 9: 'non-economic: family',
 10: 'non-economic: gender/sexuality',
 11: 'non-economic: health',
 12: 'non-economic: nationality',
 13: 'non-economic: place/location',
 14: 'non-economic: religion',
 15: 'non-economic: shared values/mentalities',
 16: 'universal'}

In [62]:
model_id = 'social-group-mention-attributes-classifier'
model_dir = os.path.join(model_path, model_id)

training_args = TrainingArguments(
    output_dir=model_dir,
    batch_size=(32, 4),
    max_length=max_length_,
    num_epochs=(0, 15),
    max_steps=-1,
    end_to_end=True,
    # loss=CosineSimilarityLoss,
    # samples_per_label=2,
    # use_amp=True,
    report_to='none',
    seed=SEED
)

In [63]:
from utils.metrics import compute_metrics_multilabel

# trainer = TrainerForSpanClassification(
trainer = Trainer(
    model_init=lambda: model_init(
        model_name=base_model,
        id2label=id2label,
        multitarget_strategy='one-vs-rest',
        class_weights=class_weights,
        use_span_embedding=False,#True,
        device='mps'
    ),
    metric=lambda p, t: compute_metrics_multilabel(p, t, id2label),
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    column_mapping=cols_mapping
)

# for deterministic results
trainer._args.seed = SEED
trainer.st_trainer.args.seed = SEED
trainer.st_trainer.args.data_seed = SEED
trainer.st_trainer.args.full_determinism = True

Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Map: 100%|██████████| 335/335 [00:00<00:00, 13817.95 examples/s]


### Fine-tune

In [64]:
trainer.train()

***** Running training *****
  Num unique pairs = 105492
  Batch size = 32
  Num epochs = 0
0it [00:00, ?it/s]


{'train_runtime': 0.0027, 'train_samples_per_second': 0.0, 'train_steps_per_second': 0.0, 'train_loss': 0.0, 'epoch': 0}


Epoch: 100%|██████████| 15/15 [09:48<00:00, 39.25s/it]


### Evaluate

In [65]:
metrics = trainer.evaluate()

***** Running evaluation *****


In [66]:
res = pd.DataFrame(metrics, index=[0]).T.reset_index().rename(columns={'index': 'metric', 0: 'value'})
res[['metric', 'category']] = res.metric.str.split('_', expand=True)
res = res.pivot(index='category', columns='metric', values='value')
# remove index names
res.columns.name = None
res.index.name = None
res.loc[['macro']+features]

Unnamed: 0,f1,precision,recall,support
macro,0.093395,0.159804,0.079132,
economic: class membership,0.0,0.0,0.0,6.0
economic: ecology of group,0.0,0.0,0.0,3.0
economic: education level,0.0,0.0,0.0,3.0
economic: employment status,0.0,0.0,0.0,13.0
economic: income/wealth/economic status,0.25,1.0,0.142857,14.0
economic: occupation/profession,0.0,0.0,0.0,23.0
non-economic: age,0.916667,0.916667,0.916667,12.0
non-economic: crime,0.0,0.0,0.0,6.0
non-economic: ethnicity,0.0,0.0,0.0,1.0


In [71]:
y_true = np.array(dataset['test']['labels'])
y_pred = np.where(trainer.model.predict_proba(dataset['test']['input'], as_numpy=True) > 0.5, 1, 0)

In [72]:
#def compute_metrics_hierarchical_multilabel(y_pred, y_true, id2label, label_sep=': '):
# y_true, y_pred = np.array(y_true), np.array(y_pred)
# granularly
granular_scores = {}
for i, l in id2label.items():
    p, r, f1, _ = precision_recall_fscore_support(y_true[:, i], y_pred[:, i], average='binary', zero_division=0.0)
    granular_scores[l] = {'f1': f1, 'precision': p, 'recall': r, 'support': np.sum(y_true[:, i])}
macros = {m: np.mean([d[m] for d in granular_scores.values()]) for m in ['f1', 'precision', 'recall']}
granular_scores = {'macro_granular': macros} | granular_scores
granular_scores

{'macro_granular': {'f1': 0.09339525283797728,
  'precision': 0.15980392156862747,
  'recall': 0.07913165266106442},
 'economic: class membership': {'f1': 0.0,
  'precision': 0.0,
  'recall': 0.0,
  'support': 6},
 'economic: ecology of group': {'f1': 0.0,
  'precision': 0.0,
  'recall': 0.0,
  'support': 3},
 'economic: education level': {'f1': 0.0,
  'precision': 0.0,
  'recall': 0.0,
  'support': 3},
 'economic: employment status': {'f1': 0.0,
  'precision': 0.0,
  'recall': 0.0,
  'support': 13},
 'economic: income/wealth/economic status': {'f1': 0.25,
  'precision': 1.0,
  'recall': 0.14285714285714285,
  'support': 14},
 'economic: occupation/profession': {'f1': 0.0,
  'precision': 0.0,
  'recall': 0.0,
  'support': 23},
 'non-economic: age': {'f1': 0.9166666666666666,
  'precision': 0.9166666666666666,
  'recall': 0.9166666666666666,
  'support': 12},
 'non-economic: crime': {'f1': 0.0,
  'precision': 0.0,
  'recall': 0.0,
  'support': 6},
 'non-economic: ethnicity': {'f1': 0.0,

In [None]:
# coarse
granular2coarse = {i: l.split(label_sep)[0] for i, l in id2label.items()}
coarse_cats = set(granular2coarse.values())
y_true = [granular2coarse[i] for i in y_true]
y_pred = [granular2coarse[i] for i in y_pred]
p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, average=None, zero_division=0.0)
coarse_scores = {}
coarse_scores['macro_coarse'] = {'f1': np.mean(f1), 'precision': np.mean(p), 'recall': np.mean(r)}
for i, l in coarse_cats:
    coarse_scores[l] = {'f1': f1[i], 'precision': p[i], 'recall': r[i]}
# flatten
scores = coarse_scores | granular_scores
scores = {f'{m}_{l}': v for l, d in scores.items() for m, v in d.items()}
return scores