# SetFit for Multilabel Text Classification

notebook based on https://github.com/huggingface/setfit/blob/main/notebooks/text-classification_multilabel.ipynb

See also:

- https://huggingface.co/docs/setfit/en/how_to/multilabel
- https://github.com/huggingface/setfit/issues/413#issuecomment-1697751329

## Setup

In [1]:
import sys

# Add the project's code folder to the import path (the `utils.*` imports below
# presumably resolve from here). Guarded so that re-running this cell does not
# keep appending duplicate entries to `sys.path`.
CODE_DIR = '../../../code/mention-classification'
if CODE_DIR not in sys.path:
    sys.path.append(CODE_DIR)

In [2]:
import numpy as np
import pandas as pd
import torch

import datasets
from sklearn.model_selection import train_test_split

from utils.setfit import get_class_weights, model_init

from transformers import AutoTokenizer
from setfit import TrainingArguments, Trainer
from sentence_transformers.losses import CosineSimilarityLoss

from utils.metrics import *
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
MODEL = "sentence-transformers/paraphrase-mpnet-base-v2"

In [4]:
# TODO:
#  - prepare dataset function
#  - prepare model function

## Preparing the dataset

In [4]:
import os

# Root folder of the group-mention categorization annotations; one sub-folder per round.
data_path = '../../../data/annotations/group_mention_categorization'
dirs = [
    'social-group-mention-categorization-coder-training',
    'social-group-mention-categorization-round02',
]

# Each round ships one consolidated TSV under `<round>/parsed/`.
fps = [
    os.path.join(data_path, round_dir, 'parsed', 'consolidated_annotations.tsv')
    for round_dir in dirs
]

# Stack all rounds into a single frame with a fresh 0..n-1 index.
round_frames = [pd.read_csv(path, sep='\t') for path in fps]
df_all = pd.concat(round_frames, axis=0, ignore_index=True)

# Which question types are present?
df_all.q_id.unique()

array(['economic_attributes', 'non-economic_attributes', 'stance',
       'universal_attributes'], dtype=object)

## Universal attributes

In [5]:
# NOTE: let's focus on universal attributes for now
# Take an explicit copy: later cells overwrite `label` and add `input` on `df`, which would
# otherwise be done on a slice of `df_all` (SettingWithCopyWarning, unclear write-through).
is_universal = df_all.q_id == 'universal_attributes'
df = df_all.loc[is_universal, ['text', 'mention', 'label']].copy()

In [6]:
df[['label']].value_counts(sort=False)
# NOTE: extreme label class imbalance 

label
No       394
Yes       56
Name: count, dtype: int64

In [7]:
# Binary label encoding; the reverse lookup is derived from it so the two always agree.
label2id = {'No': 0, 'Yes': 1}
id2label = {idx: name for name, idx in label2id.items()}

In [8]:
df.label = df.label.map(label2id)

### split the data

In [9]:
# TODO: consider increasing train size
trn, tst = train_test_split(range(len(df)), test_size=0.5, stratify=df.label, random_state=42)

In [10]:
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [11]:
df['input'] = df.text + tokenizer.sep_token + df.mention

In [12]:
cols = ['input', 'label']
# Wrap the train/test row positions (`trn` / `tst`) as Hugging Face datasets,
# keeping only the model input and the target, without the pandas index.
dataset = datasets.DatasetDict({
    split_name: datasets.Dataset.from_pandas(df.iloc[rows][cols], preserve_index=False)
    for split_name, rows in (('train', trn), ('test', tst))
})

### fine-tune

In [13]:
# Training configuration for the universal-attributes run.
# NOTE(review): tuple-valued arguments are presumably (embedding fine-tuning phase,
# classifier-head phase) as described in the SetFit docs — TODO confirm.
args = TrainingArguments(
    output_dir='setfit',
    batch_size=(32, 4),
    num_epochs=(1, 1),
    # hard cap on the number of training steps (the logged run below stops after 10 steps)
    max_steps=10,
    body_learning_rate=(2e-5, 1e-5),
    head_learning_rate=1e-2,
    end_to_end=True,
    samples_per_label=2, # default but can be increased for TripletLoss
    loss=CosineSimilarityLoss, # note: could use TripletLoss
    use_amp=True,
    # disable experiment-tracker integrations
    report_to='none'
)

In [14]:
# Class weights estimated from the training labels only (project helper from `utils.setfit`).
train_labels = np.array(dataset['train']['label'])
class_weights = get_class_weights(train_labels)
# show the weights next to their label names
{name: weight for name, weight in zip(id2label.values(), class_weights)}

{'No': 0.12444444444444443, 'Yes': 0.8755555555555555}

In [15]:
# Pick the best available accelerator instead of hard-coding Apple's 'mps', so the cell
# also runs on CUDA or CPU-only machines (same expression as used further down the notebook).
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

# `model_init` is passed as a factory (a zero-argument lambda) rather than a model instance.
trainer = Trainer(
    model_init=lambda: model_init(
        model_name=MODEL,
        id2label=id2label,
        # class_weights=class_weights,
        device=device
    ),
    args=args,
    metric=compute_metrics_binary,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    # the trainer expects the columns to be called 'text' and 'label'
    column_mapping={'input': 'text', 'label': 'label'},
)

Applying column mapping to the training dataset
Applying column mapping to the evaluation dataset
Map: 100%|██████████| 225/225 [00:00<00:00, 30144.97 examples/s]


In [16]:
trainer.train()

***** Running training *****
  Num unique pairs = 320
  Batch size = 32
  Num epochs = 1
 10%|█         | 1/10 [00:02<00:19,  2.15s/it]

{'embedding_loss': 0.2752, 'grad_norm': 1.1571369171142578, 'learning_rate': 2e-05, 'epoch': 0.1}


100%|██████████| 10/10 [00:17<00:00,  1.79s/it]
The `max_length` is `None`. Using the maximum acceptable length according to the current model body: 512.


{'train_runtime': 17.9318, 'train_samples_per_second': 17.845, 'train_steps_per_second': 0.558, 'train_loss': 0.2602002501487732, 'epoch': 1.0}


Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

: 

In [None]:
import transformers
transformers.__version__

### evaluate

In [None]:
# Gold labels and predicted class ids for the held-out split, both as NumPy arrays.
test_split = dataset['test']
y_true = np.array(test_split['label'])
pred_tensor = trainer.model.predict(test_split['input'], use_labels=False)
y_pred = pred_tensor.cpu().numpy()

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the test split, with readable class names.
report = classification_report(y_true, y_pred, target_names=id2label.values())
print(report)

In [66]:
# inputs = trainer.model.model_body.tokenizer(dataset['test']['input'][:16], return_tensors='pt', padding=True)
# with torch.no_grad():
#     embeddings = trainer.model.model_body(inputs.to('mps'))
#     # outputs = trainer.model.model_head.linear(embeddings['sentence_embedding']).cpu().numpy()
#     logits, probs = trainer.model.model_head(embeddings['sentence_embedding'], temperature=1.0)
#     logits = logits.cpu().numpy()
#     probs = probs.cpu().numpy()

# probs.round(3) # overconfidence, need to apply early stopping?

In [None]:
probs = trainer.model.predict_proba(dataset['test']['input'], as_numpy=True)
probs.round(3)

In [None]:
labs = np.array(dataset['test']['label'])
# probability the model assigned to the *true* class of each test example
pred_probs = probs[range(len(labs)), labs]

# plot boxplot of predicted probabilities by true label
fig, ax = plt.subplots()
ax.boxplot([pred_probs[labs==0], pred_probs[labs==1]])
# Tick labels are set explicitly: the `labels=` keyword of `boxplot` is deprecated in
# recent Matplotlib releases; boxes sit at x positions 1..N by default.
ax.set_xticks([1, 2])
ax.set_xticklabels(['No', 'Yes'])
ax.set_xlabel('true label')
ax.set_ylabel('predicted probability of true label')
ax.set_ylim(0, 1)
# draw a line at 0.5
ax.axhline(0.5, color='r', linestyle='--')
plt.show()

## Universal/econ/non-econ as three-way multilabel problem

In [None]:
# Start again from the combined annotations loaded at the top of the notebook.
# The previous `pd.read_csv(fp, ...)` relied on `fp`, which only exists inside a list
# comprehension and is therefore undefined when the notebook is run on a fresh kernel.
df = df_all.copy()
df.columns

In [25]:
def _any_yes(labels):
    """Collapse the answers of one mention: 'Yes' if any of them is 'Yes', otherwise 'No'."""
    return 'Yes' if (labels == 'Yes').any() else 'No'


mention_keys = ['mention_id', 'text', 'mention', 'q_id']

# stack by category: universal rows are used as-is, (non-)economic rows are collapsed
# to a single Yes/No per mention across their categories
parts = [df[df.q_id == 'universal_attributes'].drop(columns=['category'])]
for question in ('economic_attributes', 'non-economic_attributes'):
    collapsed = df[df.q_id == question].groupby(mention_keys).agg({'label': _any_yes}).reset_index()
    parts.append(collapsed)
tmp = pd.concat(parts).reset_index(drop=True)

# get dimensions
tmp['q_id'] = tmp.q_id.str.removesuffix('_attributes')
features = tmp.q_id.unique().tolist()

# reshape to wide format: one row per mention, one column per dimension
tmp = (
    tmp
    .pivot(index=['mention_id', 'text', 'mention'], columns='q_id', values='label')
    .reset_index()
    .rename_axis(None, axis=1)
)

# keep only fully gold-labeled examples
tmp = tmp[tmp[features].notna().all(axis=1)]

In [None]:
tmp[features].value_counts(dropna=False)

In [28]:
label2id = {'No': 0, 'Yes': 1}
id2label = {0: 'No', 1: 'Yes'}
# Replace the columns outright instead of `tmp.loc[:, features] = ...`: the latter writes the
# encoded values into the existing object-dtype columns, so downstream arrays come out as
# dtype=object. `.copy()` detaches `tmp` from the filtered frame it was sliced from.
# `astype(int)` also fails loudly if any value was not 'No'/'Yes' (e.g. when re-running the cell).
tmp = tmp.copy()
tmp[features] = tmp[features].apply(lambda col: col.map(label2id)).astype(int)

In [29]:
tmp['labels'] = tmp.loc[:,features].apply(list, axis=1)

In [None]:
tmp[features].mean(axis=0)
# strong label class imbalance

## split the data

In [31]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL)

In [32]:
tmp['input'] = tmp.text + tokenizer.sep_token + tmp.mention 

In [33]:
max_length_ = max(tokenizer(tmp.input.to_list(), truncation=False, padding=False, return_length=True).length)

In [34]:
trn, tst = train_test_split(range(len(tmp)), test_size=0.5, random_state=42)

In [35]:
cols = ['input', 'labels']
# Wrap the train/test row positions as Hugging Face datasets (model input + label vector only).
dataset = datasets.DatasetDict({
    split_name: datasets.Dataset.from_pandas(tmp.iloc[rows][cols], preserve_index=False)
    for split_name, rows in (('train', trn), ('test', tst))
})

In [None]:
# Class weights must be estimated on the *training* rows only. Using the test rows (`tst`)
# leaks the evaluation split's label frequencies into training, and it is inconsistent with
# the single-label sections, which compute the weights from dataset['train'].
feats = tmp.iloc[trn][features].to_numpy()
class_weights = get_class_weights(feats, multitarget=True)
class_weights = class_weights.astype(float)
class_weights

## Fine-tuning the model

In [None]:
# Position <-> name lookups for the label dimensions, in the order given by `features`.
id2label = dict(enumerate(features))
label2id = {name: pos for pos, name in id2label.items()}
id2label

In [51]:
# import torch
# from setfit import SetFitModel
# 
# device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'
# def model_init():
#     return SetFitModel.from_pretrained(
#         model_name, 
#         use_differentiable_head=True, 
#         head_params={"out_features": len(features)},
#         multi_target_strategy='one-vs-rest',
#         labels=features,
#         id2label=id2label
#     ).to(device)

In [40]:
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import TrainingArguments, Trainer

# Training configuration for the three-way multilabel run.
# NOTE(review): tuple-valued arguments are presumably (embedding fine-tuning phase,
# classifier-head phase) as described in the SetFit docs — TODO confirm.
args = TrainingArguments(
    output_dir='setfit',
    batch_size=(32, 4),
    # longest tokenized input in `tmp`, computed in an earlier cell
    max_length=max_length_,
    num_epochs=(1, 8),
    max_steps=100,
    end_to_end=False,
    samples_per_label=2,
    loss=CosineSimilarityLoss,
    use_amp=True,
    # disable experiment-tracker integrations
    report_to='none'
)

In [None]:
from utils.metrics import compute_metrics_multilabel

# Pick the best available accelerator instead of hard-coding Apple's 'mps', so the cell
# also runs on CUDA or CPU-only machines (same expression as used further down the notebook).
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

trainer = Trainer(
    model_init=lambda: model_init(
        model_name=MODEL,
        id2label=id2label,
        multitarget_strategy='one-vs-rest',
        class_weights=class_weights,
        device=device
    ),
    # bind the label names so the metric helper can report scores per dimension
    metric=lambda p, t: compute_metrics_multilabel(p, t, id2label),
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    # the trainer expects the columns to be called 'text' and 'label'
    column_mapping={"input": "text", "labels": "label"},
)

In [None]:
trainer.train()

In [None]:
metrics = trainer.evaluate()

In [None]:
# Turn the flat metrics dict into a long table: one row per entry, with the dict key in
# `metric` and its number in `value`.
res = pd.DataFrame(metrics, index=[0]).T.reset_index().rename(columns={'index': 'metric', 0: 'value'})
# assumes every key has the form '<metric>_<category>' with exactly one underscore —
# TODO confirm against compute_metrics_multilabel
res[['metric', 'category']] = res.metric.str.split('_', expand=True)
# one row per category, one column per metric
res = res.pivot(index='category', columns='metric', values='value')
# remove index names
res.columns.name = None
res.index.name = None
# show the 'macro' row first, followed by the individual label dimensions
res.loc[['macro']+features]

In [46]:
probs = trainer.model.predict_proba(dataset['test']['input'], as_numpy=True)

In [None]:
probs_df = pd.DataFrame(probs, columns=['pred: '+f for f in features]).round(3)
# compute loss: summed absolute difference between gold labels and predicted probabilities
losses = np.array(dataset['test']['labels']) - probs
probs_df['loss'] = np.abs(losses).sum(axis=1).round(3)
# gold labels of the same test rows, re-indexed 0..n-1 to align with `probs_df`
probs_df[features] = tmp.iloc[tst, :][features].reset_index(drop=True)
# Recover text and mention from the model input. Split on the literal separator
# (`regex=False`: multi-character patterns are otherwise treated as regular expressions,
# which breaks for separators such as '[SEP]') and only at its first occurrence (`n=1`),
# so the result always has exactly two columns.
probs_df[['text', 'mention']] = pd.Series(dataset['test']['input']).str.split(
    tokenizer.sep_token, n=1, expand=True, regex=False
)
# worst-predicted examples first
probs_df.sort_values('loss', ascending=False).head(20)

In [None]:
# IDEA: measure uncertainty by computing closeness to classification threshold
threshold = np.full(probs.shape, 0.5)
cuts = probs - threshold
# per example: distance of the label probability that lies closest to the threshold
vals = np.abs(cuts).min(axis=1)
# ascending order, i.e. most uncertain examples first
idxs = np.argsort(vals)

probs_df.iloc[idxs, :].head(16)#.text.to_list()
# TODO: compute share of misclassification as indicator of informativeness of ranking criterion

In [None]:
# inference

# NOTE(review): every other path in this notebook is relative to '../../../' — confirm
# that '../../' is correct here.
text_data_file = '../../data/intermediate/social_group_mentions_ranked.tsv'
texts = pd.read_csv(text_data_file, sep='\t', nrows=32*200) # has 13748 rows

# Build the inputs exactly as during training (text + separator + mention). Predicting on
# the bare text would feed the model a different input format than it was fine-tuned on.
inference_inputs = (texts.text + tokenizer.sep_token + texts.mention).to_list()
probs = trainer.model.predict_proba(inference_inputs, as_numpy=True, show_progress_bar=True)

In [None]:
# distance of each example's most borderline label probability from the 0.5 threshold
threshold = np.full(probs.shape, 0.5)
cuts = probs - threshold
vals = np.abs(cuts).min(axis=1)
# plot histogram of vals
fig, ax = plt.subplots()
ax.hist(vals, bins=20)
plt.show()
# NOTE: shows overall high "confidence" (maybe overconfidence)

In [None]:
n_ = 100
# positions of the n_ examples closest to the classification threshold
idxs = vals.argsort()[:n_]

# their text/mention next to the rounded per-label probabilities
uncertain_texts = texts.iloc[idxs, :][['text', 'mention']].reset_index(drop=True)
uncertain_probs = pd.DataFrame(probs[idxs, :].round(3), columns=features)
pd.concat([uncertain_texts, uncertain_probs], axis=1)

## Non-econ attributes

In [3]:
# NOTE: let's focus on non-economic attributes for now
# Reuse the combined annotations loaded at the top of the notebook: the previous
# `pd.read_csv(fp, ...)` relied on `fp`, which only exists inside a list comprehension
# and is undefined on a fresh kernel.
df = df_all[df_all.q_id == 'non-economic_attributes']
# drop rows without a category; copy so later column assignments act on an independent frame
df = df[~df.category.isna()].copy()

In [None]:
df[['category', 'label']].value_counts(sort=False)
# NOTE: extreme label class imbalance 

In [None]:
# keep only categories that received at least one 'Yes' annotation
keep_cats = df.loc[df.label == 'Yes', 'category'].unique().tolist()

df = df[df.category.isin(keep_cats)]
df[['category', 'label']].value_counts(sort=False)


In [None]:
# remove any instances where some are Unsure
# (a mention is discarded entirely as soon as one of its category labels is 'Unsure')
has_unsure = (df.label == 'Unsure').groupby(df.mention_id).any()
discard = has_unsure[has_unsure].index.to_list()

df = df[~df.mention_id.isin(discard)]

df[['category', 'label']].value_counts(sort=False)


In [None]:
# pivot wider: one row per (text, mention), one column per category
df = (
    df[['text', 'mention', 'category', 'label']]
    .pivot(index=['text', 'mention'], columns='category', values='label')
    .reset_index()
)

# everything after the two key columns is a category (label) column
features = list(df.columns[2:])

features

In [8]:
# Keep only rows that are labelled for *every* category. The label columns are selected by
# name: the previous positional `iloc[:, 3:]` started one column too late (label columns
# begin at position 2, see `features = df.columns[2:]`), so missing values in the first
# category went unnoticed. `.copy()` lets later cells add columns without warnings.
df = df[df[features].notna().all(axis=1)].copy()

In [9]:
label2id = {'No': 0, 'Yes': 1}
id2label = {idx: name for name, idx in label2id.items()}
# encode every category column: 'No' -> 0, 'Yes' -> 1
encoded = df[features].apply(lambda col: col.map(label2id))
df.loc[:, features] = encoded

In [10]:
df['labels'] = df.loc[:,features].apply(list, axis=1)

In [None]:
df[features].mean(axis=0)
# still crazy label class imbalance

### split the data

In [None]:
df[features].reset_index(drop=True)

In [13]:
# from skmultilearn.model_selection import IterativeStratification
# 
# X = np.zeros((len(df), 1))# df[['text', 'mention', 'labels']]
# y = df[features].reset_index(drop=True)
# test_size = 0.4
# stratifier = IterativeStratification(n_splits=2, order=2, sample_distribution_per_fold=[test_size, 1.0-test_size])
# stratifier = IterativeStratification(n_splits=2, order=2, sample_distribution_per_fold=[test_size, 1.0-test_size])
# train_indexes, test_indexes = next(stratifier.split(X, y))


In [14]:
from sklearn.model_selection import train_test_split

trn, tst = train_test_split(range(len(df)), test_size=0.5, random_state=42)

In [15]:
model_name = "sentence-transformers/paraphrase-mpnet-base-v2"
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [16]:
df['input'] = df.text + tokenizer.sep_token + df.mention 

In [17]:
cols = ['input', 'labels']
# Wrap the train/test row positions as Hugging Face datasets (model input + label vector only).
dataset = datasets.DatasetDict({
    split_name: datasets.Dataset.from_pandas(df.iloc[rows][cols], preserve_index=False)
    for split_name, rows in (('train', trn), ('test', tst))
})

### Fine-tuning the model

In [18]:
from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer(model_name, device='cpu')

To train a SetFit model, the first thing to do is download a pretrained checkpoint from the Hub. We can do so by using the `from_pretrained()` method associated with the `SetFitModel` class.

**Note that the `multi_target_strategy` parameter here signals to both the model and the trainer to expect a multi-labelled dataset.**

In [19]:
# Position <-> name lookups for the category columns, in the order given by `features`.
id2label = dict(enumerate(features))
label2id = {name: pos for pos, name in id2label.items()}

### Non-differentiable head (logistic regression)

In [None]:
from setfit import SetFitModel

# Load the pretrained checkpoint named by `model_name` as a SetFit model with a
# one-vs-rest multi-target head; `features` supplies the label names.
# NOTE(review): `model` is not referenced again in the visible notebook — the trainer
# further down builds its model through `model_init` instead.
model = SetFitModel.from_pretrained(
    model_name, 
    multi_target_strategy='one-vs-rest',
    labels=features
)

An alternative is to initialize the model explicitly (see [here](https://github.com/huggingface/setfit/issues/413#issuecomment-1697751329)):

```python
from sentence_transformers import SentenceTransformer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
embedder = SentenceTransformer('all-MiniLM-L6-v2')
model = SetFitModel(
    model_body=embedder, 
    model_head=OneVsRestClassifier(LogisticRegression(class_weight="balanced")),
    multi_target_strategy="one-vs-rest"
)
```

In [112]:
import torch

# prefer CUDA, then Apple's MPS backend, and fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
elif torch.backends.mps.is_available():
    device = 'mps'
else:
    device = 'cpu'


def model_init():
    """Build a fresh SetFit model with a differentiable one-vs-rest head.

    The head gets one output per entry of the notebook-level `features` list, and the
    model is moved to the notebook-level `device` before it is returned.

    NOTE(review): this definition shadows the `model_init` imported from `utils.setfit`
    at the top of the notebook; cells that call `model_init(model_name=..., ...)` after
    this one will hit this zero-argument version unless the import is repeated.
    """
    setfit_model = SetFitModel.from_pretrained(
        model_name,
        use_differentiable_head=True,
        head_params={"out_features": len(features)},
        multi_target_strategy='one-vs-rest',
        labels=features,
    )
    return setfit_model.to(device)

In [113]:
from sentence_transformers.losses import CosineSimilarityLoss
from setfit import TrainingArguments, Trainer

args = TrainingArguments(
    output_dir='setfit',
    batch_size=(16, 4),
    num_epochs=(2, 8),
    samples_per_label=2, # default but can be increased for TripletLoss
    loss=CosineSimilarityLoss, # note: could use TripletLoss
    use_amp=True,
    end_to_end=False,
    report_to='none'
)

In [None]:
trainer = Trainer(
    model_init=model_init,
    args=args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    column_mapping={"input": "text", "labels": "label"},
)

In [None]:
trainer.train()

The final step is to compute the model's performance using the `evaluate()` method. The default metric is 'subset accuracy', i.e. the fraction of samples for which all labels (one per category in `features`) are predicted correctly.

In [None]:
metrics = trainer.evaluate()
metrics

In [117]:
y_true = np.array(dataset['test']["labels"])
y_pred = trainer.model.predict(dataset['test']["input"], use_labels=False).cpu().numpy()

In [99]:
# import torch
# inputs = tokenizer(['hello'], return_tensors='pt')
# 
# with torch.no_grad():
#     embeddings = trainer.model.model_body(inputs.to('mps'))
#     logits, probs = trainer.model.model_head(embeddings['sentence_embedding'], temperature=1.0)

In [None]:
from collections import Counter

# per category: number of training examples that carry a positive (1) label
Counter(
    id2label[pos]
    for label_vector in dataset['train']["labels"]
    for pos, flag in enumerate(label_vector)
    if flag == 1
)

In [None]:
from sklearn.metrics import precision_recall_fscore_support


def _binary_scores(truth, pred):
    """Precision/recall/F1 of one label column, plus its number of positive gold labels."""
    p, r, f1, _ = precision_recall_fscore_support(truth, pred, average='binary', zero_division=0.0)
    return {'f1': f1, 'precision': p, 'recall': r, 'support': np.sum(truth)}


# one entry per label: the columns of y_true / y_pred are scored independently
scores = {
    name: _binary_scores(truth, pred)
    for name, truth, pred in zip(trainer.model.labels, y_true.T, y_pred.T)
}
pd.DataFrame(scores).T

# {f'{m}_{l}': v for l, s in scores.items() for m, v in s.items()} 

In [151]:
# Class probabilities for the whole test split, so they line up row-for-row with
# `y_pred` / `y_true` (both computed on the full test set). Previously `probs` was never
# assigned in this cell — only the unused raw head `outputs` for 16 rows, on a hard-coded
# 'mps' device — so the lines below silently used a stale `probs` from an earlier section.
probs = trainer.model.predict_proba(dataset['test']['input'], as_numpy=True)

# test example whose predicted probabilities are, on average, closest to the 0.5 threshold
threshold = np.ones(probs.shape)/2
cuts = probs - threshold
idx = np.abs(cuts).mean(axis=1).argmin()

probs[idx], y_pred[idx], y_true[idx], dataset['test']['input'][idx]

In [None]:
# IDEA: focus sentence embedding model on mention
import torch

inputs = embedder.tokenizer(dataset['test']['input'][:1], return_tensors='pt', padding=True)

# The transformer output is kept in `token_features`, NOT in `features`: `features` holds the
# list of label names used throughout this section and must not be overwritten here.
with torch.no_grad():
    token_features = embedder[0](features=inputs)

token_embeddings = token_features["token_embeddings"]
attention_mask = (
    token_features["attention_mask"]
    if "attention_mask" in token_features
    else torch.ones(token_embeddings.shape[:-1], device=token_embeddings.device, dtype=torch.int64)
)

# Select the tokens from the first separator up to (not including) the second one,
# i.e. the span that follows the text part of the input.
# NOTE(review): the first separator token itself is part of the selection — confirm intended.
mask = token_features['input_ids'] == embedder.tokenizer.sep_token_id
mask = mask.cumsum(dim=1) == 1
# convert mask to same type as attention_mask
mask = mask.to(attention_mask.dtype)
attention_mask = mask

# note: this is what happens in the SentenceTransformer model under the hood
# (mean pooling: masked sum of the token embeddings divided by the number of selected tokens)
input_mask_expanded = (
    attention_mask.unsqueeze(-1).expand(token_embeddings.size()).to(token_embeddings.dtype)
)
sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)

sum_mask = input_mask_expanded.sum(1)

# guard against division by zero when no token is selected
sum_mask = torch.clamp(sum_mask, min=1e-9)

output_vector = sum_embeddings / sum_mask

output_vector.shape

## Stance

In [50]:
# NOTE: this section focuses on the stance question
# Reuse the combined annotations loaded at the top of the notebook: the previous
# `pd.read_csv(fp, ...)` relied on `fp`, which only exists inside a list comprehension
# and is undefined on a fresh kernel. Copy so later column assignments act on an
# independent frame.
df = df_all[df_all.q_id == 'stance'].copy()

In [None]:
df[['label']].value_counts(sort=False)
# NOTE: extreme label class imbalance 

In [52]:
df = df[['text', 'mention', 'label']]

In [53]:
# Three-class stance encoding; class ids follow the order of this list.
label2id = {name: idx for idx, name in enumerate(['Neutral', 'Positive', 'Negative'])}
id2label = dict(zip(label2id.values(), label2id.keys()))

In [54]:
df.label = df.label.map(label2id)

### split the data

In [55]:
trn, tst = train_test_split(range(len(df)), test_size=0.5, stratify=df.label, random_state=42)

In [56]:
model_name = "sentence-transformers/paraphrase-mpnet-base-v2"
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [57]:
df['input'] = df.text + tokenizer.sep_token + df.mention

In [None]:
max_length_ = max(tokenizer(df.input.to_list(), truncation=False, padding=False, return_length=True).length)

In [58]:
cols = ['input', 'label']
# Wrap the train/test row positions as Hugging Face datasets (model input + target only).
dataset = datasets.DatasetDict({
    split_name: datasets.Dataset.from_pandas(df.iloc[rows][cols], preserve_index=False)
    for split_name, rows in (('train', trn), ('test', tst))
})

### Fine-tuning the model

In [66]:
args = TrainingArguments(
    output_dir='setfit',
    batch_size=(32, 4),
    max_length=max_length_,
    num_epochs=(1, 8),
    max_steps=10,
    body_learning_rate=(2e-5, 1e-5),
    head_learning_rate=1e-2,
    end_to_end=False,
    samples_per_label=2, # default but can be increased for TripletLoss
    loss=CosineSimilarityLoss, # note: could use TripletLoss
    use_amp=True,
    report_to='none'
)

In [None]:
class_weights = get_class_weights(np.array(dataset['train']['label']))
dict(zip(id2label.values(), class_weights))

In [None]:
# Re-import the project's model factory: the non-econ section defines its own zero-argument
# `model_init`, which shadows this one on a top-to-bottom run and would reject the keyword
# arguments passed below.
from utils.setfit import model_init

# Pick the best available accelerator instead of hard-coding Apple's 'mps'.
device = 'cuda' if torch.cuda.is_available() else 'mps' if torch.backends.mps.is_available() else 'cpu'

trainer = Trainer(
    model_init=lambda: model_init(
        model_name=MODEL,
        id2label=id2label,
        class_weights=class_weights,
        device=device
    ),
    args=args,
    # bind the label names so the metric helper can report scores per class
    metric=lambda p, t: compute_metrics_multiclass(p, t, id2label),
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    # the trainer expects the columns to be called 'text' and 'label'
    column_mapping={'input': 'text', 'label': 'label'},
)

In [None]:
trainer.train()

In [None]:
metrics = trainer.evaluate()
metrics

In [None]:
# Manual forward pass for the first 16 test inputs: tokenizer -> model body -> model head.
# Run on whatever device the trained model body lives on instead of assuming 'mps'.
body_device = trainer.model.model_body.device
inputs = trainer.model.model_body.tokenizer(dataset['test']['input'][:16], return_tensors='pt', padding=True)
with torch.no_grad():
    embeddings = trainer.model.model_body(inputs.to(body_device))
    # outputs = trainer.model.model_head.linear(embeddings['sentence_embedding']).cpu().numpy()
    logits, probs = trainer.model.model_head(embeddings['sentence_embedding'], temperature=1.0)
    logits = logits.cpu().numpy()
    probs = probs.cpu().numpy()

In [None]:
probs = trainer.model.predict_proba(dataset['test']['input'], as_numpy=True)
labs = np.array(dataset['test']['label'])
# probability the model assigned to the *true* class of each test example
pred_probs = probs[range(len(labs)), labs]

# plot boxplot of predicted probabilities by true label
fig, ax = plt.subplots()
ax.boxplot([pred_probs[labs==i] for i in id2label.keys()])
# Tick labels are set explicitly: the `labels=` keyword of `boxplot` is deprecated in
# recent Matplotlib releases; boxes sit at x positions 1..N by default.
ax.set_xticks(list(range(1, len(id2label) + 1)))
ax.set_xticklabels(list(id2label.values()))
ax.set_xlabel('true label')
ax.set_ylabel('predicted probability of true label')
ax.set_ylim(0, 1)
# draw a line at 0.5
ax.axhline(0.5, color='r', linestyle='--')
plt.show()