# Social group mention attribute category and attributes multilabel text classification

We have collected human annotations that categorize mentions of social groups in party manifestos into the following (hierarchical) scheme of attribute dimensions and attribute classes:

- economic attributes
    - class membership
    - ecology of group
    - education level
    - employment status
    - income/wealth/economic status
    - occupation/profession
    - other
- non-economic attributes:
   - age
   - crime
   - ethnicity
   - family
   - gender/sexuality
   - health
   - nationality
   - other
   - place/location
   - religion
   - shared values/mentalities
- universal

In this notebook, we fine-tune a pre-trained sentence transformer model for multilabel classifiers using the `setfit` library to categorize into which attribute dimensions social group mentions belong.

notebook based on https://github.com/huggingface/setfit/blob/main/notebooks/text-classification_multilabel.ipynb

See also:

- https://huggingface.co/docs/setfit/en/how_to/multilabel
- https://github.com/huggingface/setfit/issues/413#issuecomment-1697751329

## Setup

In [28]:
import sys
sys.path.append('.')

In [29]:
import os
import numpy as np
import pandas as pd
import regex

import torch
import datasets
from sklearn.model_selection import train_test_split

from utils.setfit import get_class_weights, model_init, TrainerForSpanClassification

from transformers import AutoTokenizer, set_seed
from setfit import TrainingArguments, Trainer

from utils.metrics import *

from pathlib import Path

data_path = Path('../../data/annotations/group_mention_categorization')
model_path = Path('../../models/')

In [30]:
SEED = 42
set_seed(SEED)

In [31]:
model_id = 'sentence-transformers/paraphrase-mpnet-base-v2'

## Preparing the dataset

In [32]:
fp = data_path / 'final_annotations.tsv'
annotations = pd.read_csv(fp, sep='\t')
ignore = ['stance: ', 'universal: ']
annotations.query("attribute_combination not in @ignore", inplace=True)

In [33]:
# gather attribute combinations with label=='Yes' at the mention level
mentions_df = annotations.groupby(['mention_id', 'text', 'mention'])[['attribute_combination', 'label']].apply(lambda x: sorted(set(x.attribute_combination[x.label=='Yes']))).reset_index()
mentions_df.rename(columns={0: 'attributes'}, inplace=True)

## Universal/econ/non-econ as three-way multilabel problem

### Prepare the data

In [34]:
annotations.q_id.isna().sum()

0

In [35]:
# stack by category
df = pd.concat([
    annotations[annotations.q_id == 'economic_attributes'].groupby(['mention_id', 'text', 'mention', 'q_id']).agg({'label': lambda x: 'Yes' if (x=='Yes').any() else 'No'}).reset_index(),
    annotations[annotations.q_id == 'non-economic_attributes'].groupby(['mention_id', 'text', 'mention', 'q_id']).agg({'label': lambda x: 'Yes' if (x=='Yes').any() else 'No'}).reset_index()
])
df.reset_index(drop=True, inplace=True)

# get dimensions
df.q_id = df.q_id.str.removesuffix('_attributes')
features = df.q_id.unique().tolist()

In [36]:
# reshape to wide format
df = df.pivot_table(index=['mention_id', 'text', 'mention'], columns='q_id', values='label', aggfunc='last').reset_index()
df = df.rename_axis(None, axis=1)

In [37]:
# TODO: consider dropping this and using it implicitly (when classifier sees/predicts both dims as 'No')
df['universal'] = 'No'
df.loc[(df[features]=='No').all(axis=1), 'universal'] = 'Yes'
features.append('universal')

In [38]:
# keep only fully gold-labeled examples
df = df[df[features].isna().sum(axis=1) == 0]

In [39]:
df[features].value_counts(dropna=False)

economic  non-economic  universal
No        Yes           No           327
Yes       No            No           161
No        No            Yes           59
Yes       Yes           No            53
Name: count, dtype: int64

In [40]:
# # all-No examples
# # TODO: make this part of conolsitation 
# df[(df[features]=='No').all(axis=1)]

# # discard 
# tmp = tmp[~(tmp[features]=='No').all(axis=1)]

In [41]:
label2id = {'No': 0, 'Yes': 1}
id2label = {v: k for k, v in label2id.items()}
df.loc[:,features] = df.loc[:,features].apply(lambda x: x.map(label2id))

In [42]:
df['labels'] = df.loc[:,features].apply(list, axis=1)

In [43]:
df[features].mean(axis=0)
# strong label class imbalance

economic        0.356667
non-economic    0.633333
universal       0.098333
dtype: object

In [44]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [45]:
USE_SPAN_EMBEDDING = False
if not USE_SPAN_EMBEDDING:
    # using concat strategy
    sep_tok = tokenizer.sep_token 
    df['input'] = df.mention + sep_tok + df.text 
    max_length_ = max(tokenizer(df.input.to_list(), truncation=False, padding=False, return_length=True).length)
    cols = ['input', 'labels']
    cols_mapping = {"input": "text", "labels": "label"}
else:
    # using span embedding strategy
    df['span'] = df.apply(lambda x: regex.search(regex.escape(x.mention), x.text).span(), axis=1)
    max_length_ = max(tokenizer(df.text.to_list(), truncation=False, padding=False, return_length=True).length)
    cols = ['text', 'span', 'labels']
    cols_mapping = {'text': 'text', 'span': 'span', 'labels': 'label'}

### split the data

In [46]:
df['signature'] = df[features].apply(lambda r: '; '.join([f for f in features if r[f]==1]), axis=1)
# set signatures with < 10 obs to None
df['signature'] = df['signature'].where(df['signature'].isin(df['signature'].value_counts()[df['signature'].value_counts() >= 10].index), '_')

In [47]:
trn, tst = train_test_split(range(len(df)), test_size=0.25, random_state=SEED, stratify=df.signature)

In [48]:
dataset = datasets.DatasetDict({
    'train': datasets.Dataset.from_pandas(df.iloc[trn][cols], preserve_index=False),
    'test': datasets.Dataset.from_pandas(df.iloc[tst][cols], preserve_index=False)
})

### Prepare fine-tuning

In [49]:
feats = df.iloc[trn][features].to_numpy()
class_weights = get_class_weights(feats, multitarget=True)
class_weights = class_weights.astype(float)
class_weights

array([0.19142408, 0.10813781, 0.70043811])

In [50]:
id2label = {i: l for i, l in enumerate(features)}
label2id = {l: i for i, l in enumerate(features)}
id2label

{0: 'economic', 1: 'non-economic', 2: 'universal'}

In [51]:
from sentence_transformers.losses import ContrastiveLoss
model_name = 'social-group-mention-attribute-dimension-classifier-v2'
model_dir = os.path.join(model_path, model_name)

training_args = TrainingArguments(
    output_dir=model_dir,
    batch_size=(32, 8),
    max_length=max_length_,
    num_epochs=(1, 7),
    max_steps=150,
    end_to_end=True,
    loss=ContrastiveLoss,
    # samples_per_label=2,
    # use_amp=True,
    report_to='none',
    seed=SEED
)

In [52]:
from utils.metrics import compute_metrics_multilabel

trainer_class = TrainerForSpanClassification if USE_SPAN_EMBEDDING else Trainer
trainer = trainer_class(
    model_init=lambda: model_init(
        model_name=model_id,
        id2label=id2label,
        multitarget_strategy='one-vs-rest',
        class_weights=class_weights,
        use_span_embedding=USE_SPAN_EMBEDDING,
        # device=device
    ),
    metric=lambda p, t: compute_metrics_multilabel(p, t, id2label),
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    column_mapping=cols_mapping
)

# for deterministic results
trainer._args.seed = SEED
trainer.st_trainer.args.seed = SEED
trainer.st_trainer.args.data_seed = SEED
trainer.st_trainer.args.full_determinism = True

Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset
Map: 100%|██████████| 450/450 [00:00<00:00, 44909.03 examples/s]


### Fine-tune

In [53]:
trainer.train()

***** Running training *****
  Num unique pairs = 4800
  Batch size = 32
  Num epochs = 1


Step,Training Loss
1,0.1259
50,0.05
100,0.0283
150,0.0263


Epoch: 100%|██████████| 7/7 [00:15<00:00,  2.22s/it]


### Evaluate

In [54]:
metrics = trainer.evaluate()

***** Running evaluation *****


In [55]:
res = pd.DataFrame(metrics, index=[0]).T.reset_index().rename(columns={'index': 'metric', 0: 'value'})
res[['metric', 'category']] = res.metric.str.split('_', expand=True)
res = res.pivot(index='category', columns='metric', values='value')
# remove index names
res.columns.name = None
res.index.name = None
res.loc[['macro']+features]

Unnamed: 0,f1,precision,recall,support
macro,0.858621,0.901381,0.822928,
economic,0.846154,0.862745,0.830189,53.0
non-economic,0.914894,0.924731,0.905263,95.0
universal,0.814815,0.916667,0.733333,15.0


## error analysis

In [58]:
probs = trainer.model.predict_proba(dataset['test']['input'], as_numpy=True)
preds = np.where(probs > 0.5, 1, 0)
preds.shape

(150, 3)

In [59]:
# sanity check: any universal and other attributes? (not allowed)
idxs = np.where(np.logical_and(preds[:, 0]==1, preds[:, 1:].sum(axis=1)>0))[0]
len(idxs)

12

In [60]:
def parse_input(x):
  # text, mention = x.split(tokenizer.sep_token)
  # span = regex.search(regex.escape(mention), text).span()
  return x.split(sep_tok)

In [68]:
errors_df = [] 
for attribute, attribute_id in label2id.items():
    errors = preds != dataset['test']['labels']
    idxs = np.where(errors[:, attribute_id])[0]

    tmp = pd.DataFrame([parse_input(x) for x in dataset['test'].select(idxs)['input']], columns=['mention', 'text'])
    tmp['attribute'] = attribute
    tmp['label'] = np.array(dataset['test'].select(idxs)['labels'])[:, attribute_id]
    tmp['pred'] = preds[idxs, attribute_id]
    errors_df.append(tmp)

errors_df = pd.concat(errors_df)

In [74]:
highlight = lambda text, mention: text.replace(mention, '\u001B[30m\u001B[43m'+mention+'\033[0m')

for (a, t, p), subdf in errors_df.groupby(['attribute', 'label', 'pred']):
    error_type = 'false positives' if t==0 else 'false negatives'
    # print attribute name in bold
    print(f'\033[1m{a}\033[0m: "{error_type}"')
    for i, row in subdf.sample(n=min(4, len(subdf)), random_state=42    ).iterrows():
        print(f"  - {highlight(row['text'], row['mention'])}")
    print()

[1meconomic[0m: "false positives"
  - Tax reductions will be granted to [30m[43mpersons with disabilities of Group 1 and 2[0m.
  - The Greens demand that [30m[43mthe multinationals based in Switzerland[0m assume their responsibility for sustainable global development according to their size and do not go into their own pockets.
  - The separation on the joint energy companies of [30m[43mthe nationals[0m should not come to account.
  - The RPS has shown that it is the only political force that really exposes corruption and ties to [30m[43mthe mafia[0m at the high levels of power.

[1meconomic[0m: "false negatives"
  - Our aim will be to introduce a statutory duty on local authorities to provide nursery education, as soon as possible, for [30m[43mall pre-school children[0m whose parents wish it.
  - Companies which looked inwards to Whitehall are now listening to their customers and [30m[43mshareholders[0m.
  - Only [30m[43mthose who are transparently informed[0m

## Save the model

In [75]:
import shutil
shutil.rmtree(model_dir)

In [76]:
trainer.model.save_pretrained(model_dir)

In [77]:
trainer.model.to('cpu');
del trainer

## granular attribute classification problem

### Economic attributes

#### Prepare the data

In [96]:
df = annotations.query("attribute=='economic' and category!='other'")
features = df.category.unique().tolist()
features

['class membership',
 'ecology of group',
 'education level',
 'employment status',
 'income/wealth/economic status',
 'occupation/profession']

In [97]:
# pivot labels for attribute_combination to columns using mention_id, text, and mention as id vars
df = df.pivot_table(index=['mention_id', 'text', 'mention'], columns='category', values='label', aggfunc='last').reset_index()
df = df.rename_axis(None, axis=1)

# NOTE: only apply for multi-dim classification
# # keep only fully gold-labeled examples
# df = df[df[features].isna().sum(axis=1) == 0]
# df = df[~(df[features]=='No').all(axis=1)]

In [98]:
df.mention_id.unique().shape[0]

600

In [99]:
# one-hot encode labels
label2id = {'No': 0, 'Yes': 1}
id2label = {0: 'No', 1: 'Yes'}
df.loc[:,features] = df.loc[:,features].apply(lambda x: x.map(label2id))

In [100]:
cnts = df[features].sum(axis=0)
cnts

class membership                 23
ecology of group                 17
education level                  22
employment status                31
income/wealth/economic status    44
occupation/profession            98
dtype: object

In [101]:
drop_these = cnts[cnts < 10].index.tolist()
for f in drop_these:
    features.remove(f)
df = df[['mention_id', 'text', 'mention'] + features]

In [102]:
df.loc[:, 'labels'] = df.loc[:,features].apply(list, axis=1)

In [103]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [108]:
STRATEGY = 'concat'  # 'mention'
if STRATEGY == 'span':
    # using span embedding strategy
    df['span'] = df.apply(lambda x: regex.search(regex.escape(x.mention), x.text).span(), axis=1)
    max_length_ = max(tokenizer(df.text.to_list(), truncation=False, padding=False, return_length=True).length)
    cols = ['text', 'span', 'labels']
    cols_mapping = {'text': 'text', 'span': 'span', 'labels': 'label'}
else:
    if STRATEGY == 'concat':
        # using concat strategy
        sep_tok = tokenizer.sep_token
        df['input'] = df.mention + sep_tok + df.text
    elif STRATEGY == 'mention':
        # using concat strategy
        df.loc[:, 'input'] = df.mention
    else:
        raise ValueError(f"Unknown strategy: {STRATEGY}")
    max_length_ = max(tokenizer(df.input.to_list(), truncation=False, padding=False, return_length=True).length)
    cols = ['input', 'labels']
    cols_mapping = {"input": "text", "labels": "label"}    

#### split the data

In [109]:
df['signature'] = df[features].apply(lambda r: '; '.join([f for f in features if r[f]==1]), axis=1)
df['signature'] = df['signature'].where(df['signature'].isin(df['signature'].value_counts()[df['signature'].value_counts() >= 10].index), '_')

In [110]:
trn, tst = train_test_split(range(len(df)), test_size=0.25, random_state=SEED, stratify=df.signature)

In [111]:
dataset = datasets.DatasetDict({
    'train': datasets.Dataset.from_pandas(df.iloc[trn][cols], preserve_index=False),
    'test': datasets.Dataset.from_pandas(df.iloc[tst][cols], preserve_index=False)
})

#### Prepare fine-tuning

In [113]:
feats = df.iloc[trn][features].to_numpy()
class_weights = get_class_weights(feats, multitarget=True)
class_weights = class_weights.astype(float)
class_weights

array([0.20441921, 0.28959388, 0.20441921, 0.15109246, 0.1022096 ,
       0.04826565])

In [114]:
id2label = {i: l for i, l in enumerate(features)}
label2id = {l: i for i, l in enumerate(features)}
id2label

{0: 'class membership',
 1: 'ecology of group',
 2: 'education level',
 3: 'employment status',
 4: 'income/wealth/economic status',
 5: 'occupation/profession'}

In [115]:
model_name = 'social-group-mention-econ-attributes-classifier'
model_dir = os.path.join(model_path, model_name)

training_args = TrainingArguments(
    output_dir=model_dir,
    batch_size=(32, 8),
    max_length=max_length_,
    num_epochs=(1, 7),
    max_steps=150,
    # max_steps=-1,
    end_to_end=True,
    loss=ContrastiveLoss,
    # samples_per_label=2,
    # use_amp=True,
    report_to='none',
    seed=SEED
)

In [116]:
from utils.metrics import compute_metrics_multilabel

trainer_class = TrainerForSpanClassification if STRATEGY=='span' else Trainer
trainer = trainer_class(
    model_init=lambda: model_init(
        model_name=model_id,
        id2label=id2label,
        multitarget_strategy='one-vs-rest',
        class_weights=class_weights,
        use_span_embedding=STRATEGY=='span',
    ),
    metric=lambda p, t: compute_metrics_multilabel(p, t, id2label),
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    column_mapping=cols_mapping
)

# for deterministic results
trainer._args.seed = SEED
trainer.st_trainer.args.seed = SEED
trainer.st_trainer.args.data_seed = SEED
trainer.st_trainer.args.full_determinism = True

Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset
Map: 100%|██████████| 450/450 [00:00<00:00, 55356.55 examples/s]


#### Fine-tune

In [117]:
trainer.train()

***** Running training *****
  Num unique pairs = 4800
  Batch size = 32
  Num epochs = 1


Step,Training Loss
1,0.1438
50,0.0495
100,0.0248
150,0.0172


Epoch: 100%|██████████| 7/7 [00:15<00:00,  2.23s/it]


#### Evaluate

In [118]:
metrics = trainer.evaluate()

***** Running evaluation *****


In [119]:
res = pd.DataFrame(metrics, index=[0]).T.reset_index().rename(columns={'index': 'metric', 0: 'value'})
res[['metric', 'category']] = res.metric.str.split('_', expand=True)
res = res.pivot(index='category', columns='metric', values='value')
# remove index names
res.columns.name = None
res.index.name = None
res.loc[['macro']+features]

Unnamed: 0,f1,precision,recall,support
macro,0.684371,0.757428,0.646261,
class membership,0.444444,0.666667,0.333333,6.0
ecology of group,0.545455,0.5,0.6,5.0
education level,0.8,0.8,0.8,5.0
employment status,0.875,0.875,0.875,8.0
income/wealth/economic status,0.625,0.833333,0.5,10.0
occupation/profession,0.816327,0.869565,0.769231,26.0


In [132]:
if STRATEGY == 'span':
    parse_example = lambda x: (x['text'], tuple(x['span']))
    examples = list(map(parse_example, dataset['test'].select_columns(['text', 'span']).to_list()))
    preds = trainer.model.predict(examples, as_numpy=True)
    errors_df = pd.DataFrame(examples, columns=['text', 'span'])
    errors_df['mention'] = errors_df.apply(lambda x: x['text'][slice(*x['span'])], axis=1)
    del errors_df['span']
else:
    # probs = trainer.model.predict_proba(dataset['test']['input'], as_numpy=True)
    # preds = np.where(probs > 0.5, 1, 0)
    preds = trainer.model.predict(dataset['test']['input'], as_numpy=True)
    errors_df = pd.DataFrame(dataset['test']['input'], columns=['mention'])
    if STRATEGY == 'concat':
        errors_df[['mention', 'text']] = errors_df.mention.str.split(sep_tok, expand=True)
errors_df['label'] = dataset['test']['labels']
errors_df['pred'] = list(map(list, preds))
errors_df['category'] = [features]*len(errors_df)
errors_df = errors_df.explode(['label', 'pred', 'category'])
errors_df = errors_df.explode(['label', 'pred', 'category'])
errors_df = errors_df.query("label!=pred")

In [135]:
for (c, t, p), subdf in errors_df.groupby(['category', 'label', 'pred']):
    error_type = 'false positives' if t==0 else 'false negatives'
    # print attribute name in bold
    print(f'\033[1m{c}\033[0m: "{error_type}"')
    for i, row in subdf.sample(n=min(4, len(subdf)), random_state=42    ).iterrows():
        print(f"  - {highlight(row['text'], row['mention'])}")
    print()

[1mclass membership[0m: "false positives"
  - This tax would only affect [30m[43mthe wealthiest 6 per cent of New Zealanders[0m.

[1mclass membership[0m: "false negatives"
  - [30m[43mPowerful cliques behind and within the old parties[0m are heading towards the Great Coalition.
  - [30m[43mPeople who started with goods carried across the border in suitcases and from field beds on the streets, from small consulting firms and service workshops, gradually built shops, wholesalers, factories, corporations and today give jobs to dozens of people: workers[0m.
  - Too many [30m[43mordinary people[0m know this.
  - A real support for [30m[43mordinary people[0m.

[1mecology of group[0m: "false positives"
  - [30m[43mThe people who represent this ecological mindset[0m can be found in the environmental movement, among technological innovators, in organizations and political parties.
  - Work with provincial and territorial governments to ensure workplace accommodations, me

### non-economic attributes

#### Prepare the data

In [136]:
df = annotations.query("attribute=='non-economic' and category!='other'")
features = df.category.unique().tolist()
features

['age',
 'crime',
 'ethnicity',
 'family',
 'gender/sexuality',
 'health',
 'nationality',
 'place/location',
 'religion',
 'shared values/mentalities']

In [137]:
# pivot labels for attribute_combination to columns using mention_id, text, and mention as id vars
df = df.pivot_table(index=['mention_id', 'text', 'mention'], columns='category', values='label', aggfunc='last').reset_index()
df = df.rename_axis(None, axis=1)

# NOTE: only apply for multi-dim classification
# # keep only fully gold-labeled examples
# df = df[df[features].isna().sum(axis=1) == 0]
# df = df[~(df[features]=='No').all(axis=1)]

In [138]:
df.mention_id.unique().shape[0]

600

In [139]:
# one-hot encode labels
label2id = {'No': 0, 'Yes': 1}
id2label = {0: 'No', 1: 'Yes'}
df.loc[:,features] = df.loc[:,features].apply(lambda x: x.map(label2id))

In [140]:
cnts = df[features].sum(axis=0)
cnts

age                          65
crime                        30
ethnicity                    34
family                       50
gender/sexuality             48
health                       31
nationality                  74
place/location               17
religion                     25
shared values/mentalities    87
dtype: object

In [141]:
drop_these = cnts[cnts < 10].index.tolist()
for f in drop_these:
    features.remove(f)
df = df[['mention_id', 'text', 'mention'] + features]

In [142]:
df.loc[:, 'labels'] = df.loc[:,features].apply(list, axis=1)

In [143]:
tokenizer = AutoTokenizer.from_pretrained(model_id)

In [144]:
STRATEGY = 'concat'  # 'mention'
if STRATEGY == 'span':
    # using span embedding strategy
    df['span'] = df.apply(lambda x: regex.search(regex.escape(x.mention), x.text).span(), axis=1)
    max_length_ = max(tokenizer(df.text.to_list(), truncation=False, padding=False, return_length=True).length)
    cols = ['text', 'span', 'labels']
    cols_mapping = {'text': 'text', 'span': 'span', 'labels': 'label'}
else:
    if STRATEGY == 'concat':
        # using concat strategy
        sep_tok = tokenizer.sep_token
        df['input'] = df.mention + sep_tok + df.text
    elif STRATEGY == 'mention':
        # using concat strategy
        df.loc[:, 'input'] = df.mention
    else:
        raise ValueError(f"Unknown strategy: {STRATEGY}")
    max_length_ = max(tokenizer(df.input.to_list(), truncation=False, padding=False, return_length=True).length)
    cols = ['input', 'labels']
    cols_mapping = {"input": "text", "labels": "label"}    

#### split the data

In [145]:
df['signature'] = df[features].apply(lambda r: '; '.join([f for f in features if r[f]==1]), axis=1)
df['signature'] = df['signature'].where(df['signature'].isin(df['signature'].value_counts()[df['signature'].value_counts() >= 10].index), '_')

In [146]:
trn, tst = train_test_split(range(len(df)), test_size=0.25, random_state=SEED, stratify=df.signature)

In [147]:
dataset = datasets.DatasetDict({
    'train': datasets.Dataset.from_pandas(df.iloc[trn][cols], preserve_index=False),
    'test': datasets.Dataset.from_pandas(df.iloc[tst][cols], preserve_index=False)
})

#### Prepare fine-tuning

In [148]:
feats = df.iloc[trn][features].to_numpy()
class_weights = get_class_weights(feats, multitarget=True)
class_weights = class_weights.astype(float)
class_weights

array([0.0571124 , 0.12460887, 0.10153315, 0.06853488, 0.07614986,
       0.1096558 , 0.04809465, 0.21087655, 0.16125853, 0.04217531])

In [149]:
id2label = {i: l for i, l in enumerate(features)}
label2id = {l: i for i, l in enumerate(features)}
id2label

{0: 'age',
 1: 'crime',
 2: 'ethnicity',
 3: 'family',
 4: 'gender/sexuality',
 5: 'health',
 6: 'nationality',
 7: 'place/location',
 8: 'religion',
 9: 'shared values/mentalities'}

In [150]:
model_name = 'social-group-mention-nonecon-attributes-classifier'
model_dir = os.path.join(model_path, model_name)

training_args = TrainingArguments(
    output_dir=model_dir,
    batch_size=(32, 8),
    max_length=max_length_,
    num_epochs=(1, 7),
    max_steps=150,
    # max_steps=-1,
    end_to_end=True,
    loss=ContrastiveLoss,
    # samples_per_label=2,
    # use_amp=True,
    report_to='none',
    seed=SEED
)

In [151]:
from utils.metrics import compute_metrics_multilabel

trainer_class = TrainerForSpanClassification if STRATEGY=='span' else Trainer
trainer = trainer_class(
    model_init=lambda: model_init(
        model_name=model_id,
        id2label=id2label,
        multitarget_strategy='one-vs-rest',
        class_weights=class_weights,
        use_span_embedding=STRATEGY=='span',
    ),
    metric=lambda p, t: compute_metrics_multilabel(p, t, id2label),
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    column_mapping=cols_mapping
)

# for deterministic results
trainer._args.seed = SEED
trainer.st_trainer.args.seed = SEED
trainer.st_trainer.args.data_seed = SEED
trainer.st_trainer.args.full_determinism = True

Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset
Map: 100%|██████████| 450/450 [00:00<00:00, 58346.06 examples/s]


#### Fine-tune

In [152]:
trainer.train()

***** Running training *****
  Num unique pairs = 4800
  Batch size = 32


  Num epochs = 1


Step,Training Loss
1,0.1305
50,0.0479
100,0.0198
150,0.0145


Epoch: 100%|██████████| 7/7 [00:15<00:00,  2.23s/it]


#### Evaluate

In [153]:
metrics = trainer.evaluate()

***** Running evaluation *****


In [154]:
res = pd.DataFrame(metrics, index=[0]).T.reset_index().rename(columns={'index': 'metric', 0: 'value'})
res[['metric', 'category']] = res.metric.str.split('_', expand=True)
res = res.pivot(index='category', columns='metric', values='value')
# remove index names
res.columns.name = None
res.index.name = None
res.loc[['macro']+features]

Unnamed: 0,f1,precision,recall,support
macro,0.772882,0.944689,0.70623,
age,0.8,0.923077,0.705882,17.0
crime,0.8,0.857143,0.75,8.0
ethnicity,0.875,0.777778,1.0,7.0
family,0.823529,1.0,0.7,10.0
gender/sexuality,0.956522,1.0,0.916667,12.0
health,0.909091,1.0,0.833333,6.0
nationality,0.740741,1.0,0.588235,17.0
place/location,0.4,1.0,0.25,4.0
religion,0.941176,0.888889,1.0,8.0


In [155]:
if STRATEGY == 'span':
    parse_example = lambda x: (x['text'], tuple(x['span']))
    examples = list(map(parse_example, dataset['test'].select_columns(['text', 'span']).to_list()))
    preds = trainer.model.predict(examples, as_numpy=True)
    errors_df = pd.DataFrame(examples, columns=['text', 'span'])
    errors_df['mention'] = errors_df.apply(lambda x: x['text'][slice(*x['span'])], axis=1)
    del errors_df['span']
else:
    # probs = trainer.model.predict_proba(dataset['test']['input'], as_numpy=True)
    # preds = np.where(probs > 0.5, 1, 0)
    preds = trainer.model.predict(dataset['test']['input'], as_numpy=True)
    errors_df = pd.DataFrame(dataset['test']['input'], columns=['mention'])
    if STRATEGY == 'concat':
        errors_df[['mention', 'text']] = errors_df.mention.str.split(sep_tok, expand=True)
errors_df['label'] = dataset['test']['labels']
errors_df['pred'] = list(map(list, preds))
errors_df['category'] = [features]*len(errors_df)
errors_df = errors_df.explode(['label', 'pred', 'category'])
errors_df = errors_df.explode(['label', 'pred', 'category'])
errors_df = errors_df.query("label!=pred")

In [156]:
for (c, t, p), subdf in errors_df.groupby(['category', 'label', 'pred']):
    error_type = 'false positives' if t==0 else 'false negatives'
    # print attribute name in bold
    print(f'\033[1m{c}\033[0m: "{error_type}"')
    for i, row in subdf.sample(n=min(4, len(subdf)), random_state=42    ).iterrows():
        print(f"  - {highlight(row['text'], row['mention'])}")
    print()

[1mage[0m: "false positives"
  - [30m[43mToday’s society[0m is in a transitional phase between the industrial society and the knowledge society.

[1mage[0m: "false negatives"
  - It is also important that [30m[43meach child, as an individual[0m, can develop their life project according to their capabilities, skills and wishes.
  - [30m[43mYoung people who don't want education now - but maybe later[0m
  - A third of [30m[43myoung people in Slovakia[0m are unemployed after graduation.
  - The elderly are [30m[43mindependent, active and confident citizens who want to actively use, expand and pass on their experiences and knowledge[0m.

[1mcrime[0m: "false positives"
  - Do not submit to [30m[43mthe betrayers of your promises[0m.

[1mcrime[0m: "false negatives"
  - The Federal Republic must acknowledge its responsibility to the victims of fascism and recognize the compensation demands of all the victims of the Nazi, especially the Roma and Sinti, the Jewish people

## Stance

### Prepare the data

In [None]:
tmp = df[df.attribute=="stance"]
tmp = tmp[['mention_id', 'text', 'mention', 'label']].drop_duplicates()

In [None]:
tmp['label'].value_counts(dropna=False)

label
Positive    242
Negative     34
Neutral      23
Unsure        1
Name: count, dtype: int64

In [None]:
tmp = tmp[tmp.label != 'Unsure']

{0: 'Positive', 1: 'Neutral', 2: 'Negative'}

In [None]:
id2label = dict(enumerate(tmp.label.unique()))
label2id = {l: i for i, l in id2label.items()}

In [None]:
tmp.loc[:,'labels'] = tmp.label.map(label2id)

In [None]:
tokenizer = AutoTokenizer.from_pretrained(base_model)

In [None]:
# using concat strategy
tmp['input'] = tmp.text + tokenizer.sep_token + tmp.mention 
max_length_ = max(tokenizer(tmp.input.to_list(), truncation=False, padding=False, return_length=True).length)
cols = ['input', 'labels']
cols_mapping = {"input": "text", "labels": "label"}

### split the data

In [None]:
trn, tst = train_test_split(range(len(tmp)), test_size=0.25, random_state=SEED, stratify=tmp.label)

In [None]:
dataset = datasets.DatasetDict({
    'train': datasets.Dataset.from_pandas(tmp.iloc[trn][cols], preserve_index=False),
    'test': datasets.Dataset.from_pandas(tmp.iloc[tst][cols], preserve_index=False)
})

### Prepare fine-tuning

In [None]:
y_train = np.array(dataset['train']['labels'])
class_weights = get_class_weights(y_train)
class_weights = class_weights.astype(float)
class_weights

array([0.0537386 , 0.57215805, 0.37410334])

In [None]:
model_id = 'social-group-mention-stance-classifier'
model_dir = os.path.join(model_path, model_id)

training_args = TrainingArguments(
    output_dir=model_dir,
    batch_size=(32, 4),
    max_length=max_length_,
    num_epochs=(0, 15),
    max_steps=-1,
    end_to_end=True,
    # loss=CosineSimilarityLoss,
    # samples_per_label=2,
    # use_amp=True,
    report_to='none',
    seed=SEED
)

In [None]:
from utils.metrics import compute_metrics_multiclass

# trainer = TrainerForSpanClassification(
trainer = Trainer(
    model_init=lambda: model_init(
        model_name=base_model,
        id2label=id2label,
        # multitarget_strategy='one-vs-rest',
        class_weights=class_weights,
        use_span_embedding=False,#True,
        device='mps'
    ),
    metric=lambda p, t: compute_metrics_multiclass(p, t, id2label),
    args=training_args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    column_mapping=cols_mapping
)

# for deterministic results
trainer._args.seed = SEED
trainer.st_trainer.args.seed = SEED
trainer.st_trainer.args.data_seed = SEED
trainer.st_trainer.args.full_determinism = True

Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset


Map:   0%|          | 0/224 [00:00<?, ? examples/s]

### Fine-tune

In [None]:
trainer.train()

***** Running training *****
  Num unique pairs = 33950
  Batch size = 32
  Num epochs = 0


0it [00:00, ?it/s]

{'train_runtime': 0.0053, 'train_samples_per_second': 0.0, 'train_steps_per_second': 0.0, 'train_loss': 0.0, 'epoch': 0}


Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Iteration:   0%|          | 0/56 [00:00<?, ?it/s]

: 

### Evaluate

In [None]:
metrics = trainer.evaluate()

***** Running evaluation *****


In [None]:
res = pd.DataFrame(metrics, index=[0]).T.reset_index().rename(columns={'index': 'metric', 0: 'value'})
res[['metric', 'category']] = res.metric.str.split('_', expand=True)
res = res.pivot(index='category', columns='metric', values='value')
# remove index names
res.columns.name = None
res.index.name = None
res.loc[['macro']+features]

Unnamed: 0,f1,precision,recall,support
macro,0.903006,0.957418,0.859259,
universal,0.888889,1.0,0.8,15.0
economic,0.927273,0.910714,0.944444,54.0
non-economic,0.892857,0.961538,0.833333,60.0


## error analysis

In [None]:
probs = trainer.model.predict_proba(dataset['test']['input'], as_numpy=True)
preds = np.where(probs > 0.5, 1, 0)

In [None]:
# sanity check: any universal and other attributes? (not allowed)
idxs = np.where(np.logical_and(preds[:, 0]==1, preds[:, 1:].sum(axis=1)>0))[0]
len(idxs)
# okay!

0

In [None]:
def parse_input(x):
  # text, mention = x.split(tokenizer.sep_token)
  # span = regex.search(regex.escape(mention), text).span()
  return x.split(tokenizer.sep_token)

In [None]:
errors_df = [] 
for attribute, attribute_id in label2id.items():
    errors = preds != dataset['test']['labels']
    idxs = np.where(errors[:, attribute_id])[0]

    tmp = pd.DataFrame([parse_input(x) for x in dataset['test'].select(idxs)['input']], columns=['text', 'mention'])
    tmp['attribute'] = attribute
    tmp['label'] = np.array(dataset['test'].select(idxs)['labels'])[:, attribute_id]
    tmp['pred'] = preds[idxs, attribute_id]
    errors_df.append(tmp)

errors_df = pd.concat(errors_df)
errors_df

Unnamed: 0,text,mention,attribute,label,pred
0,Party for living people.,living people,universal,1,0
1,x Businesses should pay a normal share of thei...,society,universal,1,0
2,Young people need places where they can develo...,groups,universal,1,0
0,In a continuously technologically improving so...,a continuously technologically improving society,economic,0,1
1,A society where wealth is not measured in cons...,A society where wealth is not measured in cons...,economic,0,1
2,x Businesses should pay a normal share of thei...,society,economic,0,1
3,The Greens propose to experiment with new ways...,those who are undergoing profound democratic r...,economic,0,1
4,"Children with a migrant background, children f...",children with a dependent or mentally ill parent,economic,1,0
5,Elderly care and care should be of high qualit...,mentors,economic,1,0
6,Young people who don't want education now - bu...,Young people who don't want education now - bu...,economic,1,0


## Save the model

In [None]:
import shutil
shutil.rmtree(model_dir)

In [None]:
trainer.model.save_pretrained(model_dir)

In [None]:
trainer.model.to('cpu');
del trainer