# Social group mention attribute category and attributes multilabel text classification

We have collected human annotations that categorize mentions of social groups in party manifestos into the following (hierarchical) scheme of attribute dimensions and attribute classes:

- economic attributes
    - class membership
    - ecology of group
    - education level
    - employment status
    - income/wealth/economic status
    - occupation/profession
    - other
- non-economic attributes:
   - age
   - crime
   - ethnicity
   - family
   - gender/sexuality
   - health
   - nationality
   - other
   - place/location
   - religion
   - shared values/mentalities
- universal

In this notebook, we fine-tune a pre-trained sentence transformer model for multilabel classifiers using the `setfit` library to categorize into which attribute dimensions social group mentions belong.

notebook based on https://github.com/huggingface/setfit/blob/main/notebooks/text-classification_multilabel.ipynb

See also:

- https://huggingface.co/docs/setfit/en/how_to/multilabel
- https://github.com/huggingface/setfit/issues/413#issuecomment-1697751329

## Setup

In [None]:
import os
import numpy as np
import pandas as pd
import regex

import torch
import datasets
from sklearn.model_selection import train_test_split

from mention_classification.utils.setfit import get_class_weights, model_init, TrainerForSpanClassification

from transformers import AutoTokenizer, set_seed
from setfit import TrainingArguments, Trainer

from mention_classification.utils.metrics import *

from pathlib import Path

data_path = Path('../../data/annotations/group_mention_categorization')
models_path = Path('../../models/')

In [5]:
SEED = 42
set_seed(SEED)

In [6]:
# model_path = '../../models'
# base_model = os.path.join(model_path, 'paraphrase-mpnet-base-v2-social-group-mention-attributes-embedding')

## Preparing the dataset

In [11]:
fp = data_path / 'final_annotations.tsv'
annotations = pd.read_csv(fp, sep='\t')
ignore = ['stance: ', 'universal: ']
annotations.query("attribute_combination not in @ignore", inplace=True)

In [12]:
# gather attribute combinations with label=='Yes' at the mention level
mentions_df = annotations.groupby(['mention_id', 'text', 'mention'])[['attribute_combination', 'label']].apply(lambda x: sorted(set(x.attribute_combination[x.label=='Yes']))).reset_index()
mentions_df.rename(columns={0: 'attributes'}, inplace=True)

## Universal/econ/non-econ as three-way multilabel problem

### Prepare the data

In [13]:
annotations.q_id.isna().sum()

0

In [23]:
# stack by category
df = pd.concat([
    annotations[annotations.q_id == 'economic_attributes'].groupby(['mention_id', 'text', 'mention', 'q_id']).agg({'label': lambda x: 'Yes' if (x=='Yes').any() else 'No'}).reset_index(),
    annotations[annotations.q_id == 'non-economic_attributes'].groupby(['mention_id', 'text', 'mention', 'q_id']).agg({'label': lambda x: 'Yes' if (x=='Yes').any() else 'No'}).reset_index()
])
df.reset_index(drop=True, inplace=True)

# get dimensions
df.q_id = df.q_id.str.removesuffix('_attributes')
features = df.q_id.unique().tolist()

In [24]:
# reshape to wide format
df = df.pivot_table(index=['mention_id', 'text', 'mention'], columns='q_id', values='label', aggfunc='last').reset_index()
df = df.rename_axis(None, axis=1)

In [25]:
# TODO: consider dropping this and using it implicitly (when classifier sees/predicts both dims as 'No')
df['universal'] = 'No'
df.loc[(df[features]=='No').all(axis=1), 'universal'] = 'Yes'
features.append('universal')

In [26]:
# keep only fully gold-labeled examples
df = df[df[features].isna().sum(axis=1) == 0]

In [27]:
df[features].value_counts(dropna=False)

economic  non-economic  universal
No        Yes           No           327
Yes       No            No           161
No        No            Yes           59
Yes       Yes           No            53
Name: count, dtype: int64

In [None]:
# # all-No examples
# # TODO: make this part of conolsitation 
# df[(df[features]=='No').all(axis=1)]

# # discard 
# tmp = tmp[~(tmp[features]=='No').all(axis=1)]

In [30]:
label2id = {'No': 0, 'Yes': 1}
id2label = {v: k for k, v in label2id.items()}
df.loc[:,features] = df.loc[:,features].apply(lambda x: x.map(label2id))

In [31]:
df['labels'] = df.loc[:,features].apply(list, axis=1)

In [32]:
df[features].mean(axis=0)
# strong label class imbalance

economic        0.356667
non-economic    0.633333
universal       0.098333
dtype: object

In [42]:
model_id = 'sentence-transformers/paraphrase-mpnet-base-v2'
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [35]:
# using concat strategy
sep_token = ': ' # tokenizer.sep_token
df['input'] = df.mention + sep_token + df.text
max_length_ = max(tokenizer(df.input.to_list(), truncation=False, padding=False, return_length=True).length)
cols = ['input', 'labels']
cols_mapping = {"input": "text", "labels": "label"}

# # using span embedding strategy
# df['span'] = df.apply(lambda x: regex.search(regex.escape(x.mention), x.text).span(), axis=1)
# max_length_ = max(tokenizer(df.text.to_list(), truncation=False, padding=False, return_length=True).length)
# cols = ['text', 'span', 'labels']
# cols_mapping = {'text': 'text', 'span': 'span', 'labels': 'label'}

### split the data

In [36]:
# NOTE: need more sophisticated strategy!
trn, tst = train_test_split(range(len(df)), test_size=0.25, random_state=SEED)

In [37]:
dataset = datasets.DatasetDict({
    'train': datasets.Dataset.from_pandas(df.iloc[trn][cols], preserve_index=False),
    'test': datasets.Dataset.from_pandas(df.iloc[tst][cols], preserve_index=False)
})

### Prepare fine-tuning

In [38]:
feats = df.iloc[trn][features].to_numpy()
class_weights = get_class_weights(feats, multitarget=True)
class_weights = class_weights.astype(float)
class_weights

array([0.17673716, 0.09818731, 0.72507553])

In [39]:
id2label = {i: l for i, l in enumerate(features)}
label2id = {l: i for i, l in enumerate(features)}
id2label

{0: 'economic', 1: 'non-economic', 2: 'universal'}

In [41]:
model_id = 'social-group-mention-attribute-dimension-classifier-v2'
model_dir = os.path.join(models_path, model_id)

training_args = TrainingArguments(
    output_dir=model_dir,
    batch_size=(32, 4),
    max_length=max_length_,
    num_epochs=(0, 15),
    max_steps=-1,
    end_to_end=True,
    # loss=CosineSimilarityLoss,
    # samples_per_label=2,
    # use_amp=True,
    report_to='none',
    seed=SEED
)

In [43]:
from utils.metrics import compute_metrics_multilabel

# trainer = TrainerForSpanClassification(
trainer = Trainer(
    model_init=lambda: model_init(
        model_name=model_id,
        id2label=id2label,
        multitarget_strategy='one-vs-rest',
        class_weights=class_weights,
        use_span_embedding=False,#True,
        device='mps'
    ),
    metric=lambda p, t: compute_metrics_multilabel(p, t, id2label),
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    column_mapping=cols_mapping
)

# for deterministic results
trainer._args.seed = SEED
trainer.st_trainer.args.seed = SEED
trainer.st_trainer.args.data_seed = SEED
trainer.st_trainer.args.full_determinism = True

Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


README.md: 0.00B [00:00, ?B/s]

Map:   0%|          | 0/450 [00:00<?, ? examples/s]

### Fine-tune

In [None]:
trainer.train()

***** Running training *****
  Num unique pairs = 121480
  Batch size = 32
  Num epochs = 0


Step,Training Loss


Epoch:   0%|          | 0/15 [00:00<?, ?it/s]



Iteration:   0%|          | 0/113 [00:00<?, ?it/s]

Iteration:   0%|          | 0/113 [00:00<?, ?it/s]

Iteration:   0%|          | 0/113 [00:00<?, ?it/s]

Iteration:   0%|          | 0/113 [00:00<?, ?it/s]

Iteration:   0%|          | 0/113 [00:00<?, ?it/s]

Iteration:   0%|          | 0/113 [00:00<?, ?it/s]

Iteration:   0%|          | 0/113 [00:00<?, ?it/s]

Iteration:   0%|          | 0/113 [00:00<?, ?it/s]

Iteration:   0%|          | 0/113 [00:00<?, ?it/s]

Iteration:   0%|          | 0/113 [00:00<?, ?it/s]

### Evaluate

In [28]:
metrics = trainer.evaluate()

***** Running evaluation *****


In [29]:
res = pd.DataFrame(metrics, index=[0]).T.reset_index().rename(columns={'index': 'metric', 0: 'value'})
res[['metric', 'category']] = res.metric.str.split('_', expand=True)
res = res.pivot(index='category', columns='metric', values='value')
# remove index names
res.columns.name = None
res.index.name = None
res.loc[['macro']+features]

Unnamed: 0,f1,precision,recall,support
macro,0.85568,0.939013,0.80568,
universal,0.75,1.0,0.6,10.0
economic,0.892308,0.892308,0.892308,65.0
non-economic,0.924731,0.924731,0.924731,93.0


## error analysis

In [30]:
probs = trainer.model.predict_proba(dataset['test']['input'], as_numpy=True)
preds = np.where(probs > 0.5, 1, 0)

In [31]:
# sanity check: any universal and other attributes? (not allowed)
idxs = np.where(np.logical_and(preds[:, 0]==1, preds[:, 1:].sum(axis=1)>0))[0]
len(idxs)
# okay!

0

In [32]:
def parse_input(x):
  # text, mention = x.split(tokenizer.sep_token)
  # span = regex.search(regex.escape(mention), text).span()
  return x.split(tokenizer.sep_token)

In [33]:
errors_df = [] 
for attribute, attribute_id in label2id.items():
    errors = preds != dataset['test']['labels']
    idxs = np.where(errors[:, attribute_id])[0]

    tmp = pd.DataFrame([parse_input(x) for x in dataset['test'].select(idxs)['input']], columns=['text', 'mention'])
    tmp['attribute'] = attribute
    tmp['label'] = np.array(dataset['test'].select(idxs)['labels'])[:, attribute_id]
    tmp['pred'] = preds[idxs, attribute_id]
    errors_df.append(tmp)

errors_df = pd.concat(errors_df)
errors_df

Unnamed: 0,text,mention,attribute,label,pred
0,a society for survival in prosperity and well-...,It is only within the ecological framework tha...,universal,1,0
1,a society of security and well-being for all,We want a society of security and well-being f...,universal,1,0
2,anyone,A society that gives up being good for everyon...,universal,1,0
3,a modern knowledge society,Education is central to a modern knowledge soc...,universal,1,0
0,Those who - like the Red Greens - fight econom...,Those who - like the Red Greens - fight econom...,economic,0,1
1,a few,"By putting children first, and choosing to ens...",economic,1,0
2,Children in homeless settlements,Children in homeless settlements and children ...,economic,1,0
3,children with a dependent or mentally ill parent,"Children with a migrant background, children f...",economic,1,0
4,a green society that puts people first,We want a green society that puts people first.,economic,1,0
5,A society where wealth is not measured in cons...,A society where wealth is not measured in cons...,economic,0,1


## Save the model

In [61]:
import shutil
shutil.rmtree(model_dir)

In [70]:
trainer.model.save_pretrained(model_dir)

In [71]:
trainer.model.to('cpu');
del trainer

## granular attribute classification problem

### Prepare the data

In [40]:
tmp = df[df.q_id!='stance']

tmp.loc[:, 'attribute_combination'] = tmp.attribute_combination.str.removesuffix(': ')

features = tmp.attribute_combination.unique().tolist()

# pivot labels for attribute_combination to columns using mention_id, text, and mention as id vars
tmp = tmp.pivot_table(index=['mention_id', 'text', 'mention'], columns='attribute_combination', values='label', aggfunc='last').reset_index()
tmp = tmp.rename_axis(None, axis=1)

# keep only fully gold-labeled examples
tmp = tmp[tmp[features].isna().sum(axis=1) == 0]
tmp = tmp[~(tmp[features]=='No').all(axis=1)]

In [41]:
tmp.mention_id.unique().shape[0]

596

In [42]:
# one-hot encode labels
label2id = {'No': 0, 'Yes': 1}
id2label = {0: 'No', 1: 'Yes'}
tmp.loc[:,features] = tmp.loc[:,features].apply(lambda x: x.map(label2id))

In [43]:
cnts = tmp[features].sum(axis=0)
cnts

economic: class membership                 33
economic: ecology of group                 27
economic: education level                  24
economic: employment status                48
economic: income/wealth/economic status    47
economic: occupation/profession            93
economic: other                             0
non-economic: age                          72
non-economic: crime                        30
non-economic: ethnicity                    34
non-economic: family                       53
non-economic: gender/sexuality             47
non-economic: health                       36
non-economic: nationality                  78
non-economic: other                         5
non-economic: place/location               14
non-economic: religion                     25
non-economic: shared values/mentalities    74
universal                                  55
dtype: object

In [44]:
drop_these = cnts[cnts < 10].index.tolist()
for f in drop_these:
    features.remove(f)

tmp = tmp[['mention_id', 'text', 'mention'] + features]

In [45]:
tmp.loc[:, 'labels'] = tmp.loc[:,features].apply(list, axis=1)

In [46]:
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [47]:
# using concat strategy
tmp.loc[:, 'input'] = tmp.text + tokenizer.sep_token + tmp.mention 
max_length_ = max(tokenizer(tmp.input.to_list(), truncation=False, padding=False, return_length=True).length)
cols = ['input', 'labels']
cols_mapping = {"input": "text", "labels": "label"}

# # using span embedding strategy
# tmp['span'] = tmp.apply(lambda x: regex.search(regex.escape(x.mention), x.text).span(), axis=1)
# max_length_ = max(tokenizer(tmp.text.to_list(), truncation=False, padding=False, return_length=True).length)
# cols = ['text', 'span', 'labels']
# cols_mapping = {'text': 'text', 'span': 'span', 'labels': 'label'}

In [48]:
tmp['signature'] = tmp[features].apply(lambda r: '; '.join([f for f in features if r[f]==1]), axis=1)

### split the data

In [50]:
trn, tst = train_test_split(range(len(tmp)), test_size=0.25, random_state=SEED)

In [51]:
dataset = datasets.DatasetDict({
    'train': datasets.Dataset.from_pandas(tmp.iloc[trn][cols], preserve_index=False),
    'test': datasets.Dataset.from_pandas(tmp.iloc[tst][cols], preserve_index=False)
})

### Prepare fine-tuning

In [52]:
feats = tmp.iloc[trn][features].to_numpy()
class_weights = get_class_weights(feats, multitarget=True)
class_weights = class_weights.astype(float)
class_weights

array([0.06288789, 0.06860498, 0.13720995, 0.04312313, 0.04573665,
       0.02156156, 0.02647911, 0.07187188, 0.06037238, 0.04192526,
       0.04716592, 0.05031032, 0.02647911, 0.15093095, 0.08385053,
       0.02795018, 0.03354021])

In [53]:
id2label = {i: l for i, l in enumerate(features)}
label2id = {l: i for i, l in enumerate(features)}
id2label

{0: 'economic: class membership',
 1: 'economic: ecology of group',
 2: 'economic: education level',
 3: 'economic: employment status',
 4: 'economic: income/wealth/economic status',
 5: 'economic: occupation/profession',
 6: 'non-economic: age',
 7: 'non-economic: crime',
 8: 'non-economic: ethnicity',
 9: 'non-economic: family',
 10: 'non-economic: gender/sexuality',
 11: 'non-economic: health',
 12: 'non-economic: nationality',
 13: 'non-economic: place/location',
 14: 'non-economic: religion',
 15: 'non-economic: shared values/mentalities',
 16: 'universal'}

In [54]:
model_id = 'social-group-mention-attributes-classifier'
model_dir = os.path.join(model_path, model_id)

training_args = TrainingArguments(
    output_dir=model_dir,
    batch_size=(32, 4),
    max_length=max_length_,
    num_epochs=(0, 15),
    max_steps=-1,
    end_to_end=True,
    # loss=CosineSimilarityLoss,
    # samples_per_label=2,
    # use_amp=True,
    report_to='none',
    seed=SEED
)

In [55]:
from utils.metrics import compute_metrics_multilabel

# trainer = TrainerForSpanClassification(
trainer = Trainer(
    model_init=lambda: model_init(
        model_name=base_model,
        id2label=id2label,
        multitarget_strategy='one-vs-rest',
        class_weights=class_weights,
        use_span_embedding=False,#True,
        device='mps'
    ),
    metric=lambda p, t: compute_metrics_multilabel(p, t, id2label),
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    column_mapping=cols_mapping
)

# for deterministic results
trainer._args.seed = SEED
trainer.st_trainer.args.seed = SEED
trainer.st_trainer.args.data_seed = SEED
trainer.st_trainer.args.full_determinism = True

Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset


Map:   0%|          | 0/447 [00:00<?, ? examples/s]

### Fine-tune

In [56]:
trainer.train()

***** Running training *****
  Num unique pairs = 189960
  Batch size = 32
  Num epochs = 0


0it [00:00, ?it/s]

{'train_runtime': 0.0055, 'train_samples_per_second': 0.0, 'train_steps_per_second': 0.0, 'train_loss': 0.0, 'epoch': 0}


Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Iteration:   0%|          | 0/112 [00:00<?, ?it/s]

Iteration:   0%|          | 0/112 [00:00<?, ?it/s]

Iteration:   0%|          | 0/112 [00:00<?, ?it/s]

Iteration:   0%|          | 0/112 [00:00<?, ?it/s]

Iteration:   0%|          | 0/112 [00:00<?, ?it/s]

Iteration:   0%|          | 0/112 [00:00<?, ?it/s]

Iteration:   0%|          | 0/112 [00:00<?, ?it/s]

Iteration:   0%|          | 0/112 [00:00<?, ?it/s]

Iteration:   0%|          | 0/112 [00:00<?, ?it/s]

Iteration:   0%|          | 0/112 [00:00<?, ?it/s]

Iteration:   0%|          | 0/112 [00:00<?, ?it/s]

Iteration:   0%|          | 0/112 [00:00<?, ?it/s]

Iteration:   0%|          | 0/112 [00:00<?, ?it/s]

Iteration:   0%|          | 0/112 [00:00<?, ?it/s]

Iteration:   0%|          | 0/112 [00:00<?, ?it/s]

### Evaluate

In [57]:
metrics = trainer.evaluate()

***** Running evaluation *****


In [58]:
res = pd.DataFrame(metrics, index=[0]).T.reset_index().rename(columns={'index': 'metric', 0: 'value'})
res[['metric', 'category']] = res.metric.str.split('_', expand=True)
res = res.pivot(index='category', columns='metric', values='value')
# remove index names
res.columns.name = None
res.index.name = None
res.loc[['macro']+features]

Unnamed: 0,f1,precision,recall,support
macro,0.608382,0.898506,0.488124,
economic: class membership,0.615385,1.0,0.444444,9.0
economic: ecology of group,0.6,0.6,0.6,5.0
economic: education level,0.470588,1.0,0.307692,13.0
economic: employment status,0.555556,1.0,0.384615,13.0
economic: income/wealth/economic status,0.695652,0.888889,0.571429,14.0
economic: occupation/profession,0.296296,1.0,0.173913,23.0
non-economic: age,0.888889,1.0,0.8,15.0
non-economic: crime,0.875,1.0,0.777778,9.0
non-economic: ethnicity,0.75,0.857143,0.666667,9.0


In [71]:
y_true = np.array(dataset['test']['labels'])
y_pred = np.where(trainer.model.predict_proba(dataset['test']['input'], as_numpy=True) > 0.5, 1, 0)

In [72]:
#def compute_metrics_hierarchical_multilabel(y_pred, y_true, id2label, label_sep=': '):
# y_true, y_pred = np.array(y_true), np.array(y_pred)
# granularly
granular_scores = {}
for i, l in id2label.items():
    p, r, f1, _ = precision_recall_fscore_support(y_true[:, i], y_pred[:, i], average='binary', zero_division=0.0)
    granular_scores[l] = {'f1': f1, 'precision': p, 'recall': r, 'support': np.sum(y_true[:, i])}
macros = {m: np.mean([d[m] for d in granular_scores.values()]) for m in ['f1', 'precision', 'recall']}
granular_scores = {'macro_granular': macros} | granular_scores
granular_scores

{'macro_granular': {'f1': 0.09339525283797728,
  'precision': 0.15980392156862747,
  'recall': 0.07913165266106442},
 'economic: class membership': {'f1': 0.0,
  'precision': 0.0,
  'recall': 0.0,
  'support': 6},
 'economic: ecology of group': {'f1': 0.0,
  'precision': 0.0,
  'recall': 0.0,
  'support': 3},
 'economic: education level': {'f1': 0.0,
  'precision': 0.0,
  'recall': 0.0,
  'support': 3},
 'economic: employment status': {'f1': 0.0,
  'precision': 0.0,
  'recall': 0.0,
  'support': 13},
 'economic: income/wealth/economic status': {'f1': 0.25,
  'precision': 1.0,
  'recall': 0.14285714285714285,
  'support': 14},
 'economic: occupation/profession': {'f1': 0.0,
  'precision': 0.0,
  'recall': 0.0,
  'support': 23},
 'non-economic: age': {'f1': 0.9166666666666666,
  'precision': 0.9166666666666666,
  'recall': 0.9166666666666666,
  'support': 12},
 'non-economic: crime': {'f1': 0.0,
  'precision': 0.0,
  'recall': 0.0,
  'support': 6},
 'non-economic: ethnicity': {'f1': 0.0,

In [None]:
# coarse
granular2coarse = {i: l.split(label_sep)[0] for i, l in id2label.items()}
coarse_cats = set(granular2coarse.values())
y_true = [granular2coarse[i] for i in y_true]
y_pred = [granular2coarse[i] for i in y_pred]
p, r, f1, _ = precision_recall_fscore_support(y_true, y_pred, average=None, zero_division=0.0)
coarse_scores = {}
coarse_scores['macro_coarse'] = {'f1': np.mean(f1), 'precision': np.mean(p), 'recall': np.mean(r)}
for i, l in coarse_cats:
    coarse_scores[l] = {'f1': f1[i], 'precision': p[i], 'recall': r[i]}
# flatten
scores = coarse_scores | granular_scores
scores = {f'{m}_{l}': v for l, d in scores.items() for m, v in d.items()}
return scores

## Stance

### Prepare the data

In [8]:
tmp = df[df.attribute=="stance"]
tmp = tmp[['mention_id', 'text', 'mention', 'label']].drop_duplicates()

In [9]:
tmp['label'].value_counts(dropna=False)

label
Positive    242
Negative     34
Neutral      23
Unsure        1
Name: count, dtype: int64

In [10]:
tmp = tmp[tmp.label != 'Unsure']

{0: 'Positive', 1: 'Neutral', 2: 'Negative'}

In [11]:
id2label = dict(enumerate(tmp.label.unique()))
label2id = {l: i for i, l in id2label.items()}

In [12]:
tmp.loc[:,'labels'] = tmp.label.map(label2id)

In [13]:
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [14]:
# using concat strategy
tmp['input'] = tmp.text + tokenizer.sep_token + tmp.mention 
max_length_ = max(tokenizer(tmp.input.to_list(), truncation=False, padding=False, return_length=True).length)
cols = ['input', 'labels']
cols_mapping = {"input": "text", "labels": "label"}

### split the data

In [15]:
trn, tst = train_test_split(range(len(tmp)), test_size=0.25, random_state=SEED, stratify=tmp.label)

In [16]:
dataset = datasets.DatasetDict({
    'train': datasets.Dataset.from_pandas(tmp.iloc[trn][cols], preserve_index=False),
    'test': datasets.Dataset.from_pandas(tmp.iloc[tst][cols], preserve_index=False)
})

### Prepare fine-tuning

In [17]:
y_train = np.array(dataset['train']['labels'])
class_weights = get_class_weights(y_train)
class_weights = class_weights.astype(float)
class_weights

array([0.0537386 , 0.57215805, 0.37410334])

In [21]:
model_id = 'social-group-mention-stance-classifier'
model_dir = os.path.join(model_path, model_id)

training_args = TrainingArguments(
    output_dir=model_dir,
    batch_size=(32, 4),
    max_length=max_length_,
    num_epochs=(0, 15),
    max_steps=-1,
    end_to_end=True,
    # loss=CosineSimilarityLoss,
    # samples_per_label=2,
    # use_amp=True,
    report_to='none',
    seed=SEED
)

In [22]:
from utils.metrics import compute_metrics_multiclass

# trainer = TrainerForSpanClassification(
trainer = Trainer(
    model_init=lambda: model_init(
        model_name=base_model,
        id2label=id2label,
        # multitarget_strategy='one-vs-rest',
        class_weights=class_weights,
        use_span_embedding=False,#True,
        device='mps'
    ),
    metric=lambda p, t: compute_metrics_multiclass(p, t, id2label),
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    column_mapping=cols_mapping
)

# for deterministic results
trainer._args.seed = SEED
trainer.st_trainer.args.seed = SEED
trainer.st_trainer.args.data_seed = SEED
trainer.st_trainer.args.full_determinism = True

Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset


Map:   0%|          | 0/224 [00:00<?, ? examples/s]

### Fine-tune

In [23]:
trainer.train()

***** Running training *****
  Num unique pairs = 33950
  Batch size = 32
  Num epochs = 0


0it [00:00, ?it/s]

{'train_runtime': 0.0053, 'train_samples_per_second': 0.0, 'train_steps_per_second': 0.0, 'train_loss': 0.0, 'epoch': 0}


Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

: 

### Evaluate

In [None]:
metrics = trainer.evaluate()

***** Running evaluation *****


In [None]:
res = pd.DataFrame(metrics, index=[0]).T.reset_index().rename(columns={'index': 'metric', 0: 'value'})
res[['metric', 'category']] = res.metric.str.split('_', expand=True)
res = res.pivot(index='category', columns='metric', values='value')
# remove index names
res.columns.name = None
res.index.name = None
res.loc[['macro']+features]

Unnamed: 0,f1,precision,recall,support
macro,0.903006,0.957418,0.859259,
universal,0.888889,1.0,0.8,15.0
economic,0.927273,0.910714,0.944444,54.0
non-economic,0.892857,0.961538,0.833333,60.0


## error analysis

In [None]:
probs = trainer.model.predict_proba(dataset['test']['input'], as_numpy=True)
preds = np.where(probs > 0.5, 1, 0)

In [None]:
# sanity check: any universal and other attributes? (not allowed)
idxs = np.where(np.logical_and(preds[:, 0]==1, preds[:, 1:].sum(axis=1)>0))[0]
len(idxs)
# okay!

0

In [None]:
def parse_input(x):
  # text, mention = x.split(tokenizer.sep_token)
  # span = regex.search(regex.escape(mention), text).span()
  return x.split(tokenizer.sep_token)

In [None]:
errors_df = [] 
for attribute, attribute_id in label2id.items():
    errors = preds != dataset['test']['labels']
    idxs = np.where(errors[:, attribute_id])[0]

    tmp = pd.DataFrame([parse_input(x) for x in dataset['test'].select(idxs)['input']], columns=['text', 'mention'])
    tmp['attribute'] = attribute
    tmp['label'] = np.array(dataset['test'].select(idxs)['labels'])[:, attribute_id]
    tmp['pred'] = preds[idxs, attribute_id]
    errors_df.append(tmp)

errors_df = pd.concat(errors_df)
errors_df

Unnamed: 0,text,mention,attribute,label,pred
0,Party for living people.,living people,universal,1,0
1,x Businesses should pay a normal share of thei...,society,universal,1,0
2,Young people need places where they can develo...,groups,universal,1,0
0,In a continuously technologically improving so...,a continuously technologically improving society,economic,0,1
1,A society where wealth is not measured in cons...,A society where wealth is not measured in cons...,economic,0,1
2,x Businesses should pay a normal share of thei...,society,economic,0,1
3,The Greens propose to experiment with new ways...,those who are undergoing profound democratic r...,economic,0,1
4,"Children with a migrant background, children f...",children with a dependent or mentally ill parent,economic,1,0
5,Elderly care and care should be of high qualit...,mentors,economic,1,0
6,Young people who don't want education now - bu...,Young people who don't want education now - bu...,economic,1,0


## Save the model

In [None]:
import shutil
shutil.rmtree(model_dir)

In [None]:
trainer.model.save_pretrained(model_dir)

In [None]:
trainer.model.to('cpu');
del trainer