In [33]:
from types import SimpleNamespace

# Run configuration. The active values produce the second uncertainty-based
# sample (annotation batch 3); the values used for the first uncertainty-based
# sample (annotation batch 2) are noted in the comments.
args = SimpleNamespace(
    input_path='./../../data/manifestos/all_manifesto_sentences_translated.tsv',
    id_col='sentence_id',
    text_col='text_mt_m2m_100_1.2b',
    group_by_document=True,
    # files listing the IDs of sentences that must not be sampled again
    exclude_ids_in_files=[
        # excluded for the first uncertainty-based sample (i.e., batch 2)
        '../../data/annotations/group-menion-gold-examples/sample.tsv',
        '../../data/annotations/group-menion-coder-training/sample_round1.tsv',
        '../../data/annotations/group-menion-coder-training/sample_round2.tsv',
        '../../data/annotations/group-mention-annotation-batch-01/sample.tsv',
        # additionally excluded for the second uncertainty-based sample (i.e., batch 3)
        '../../data/annotations/group-mention-annotation-batch-02/sample.tsv',
    ],
    # batch 2 used './../../results/classifiers/group-mention-detection_batch-01/best_model'
    model_path='./../../results/classifiers/group-mention-detection_batch-02/best_model',
    seed=1234,
    # no focal category for either uncertainty-based sample
    focal_category=None,
    # batch 2 used a sample size of 2500
    sample_size=1000,
)

In [39]:
import os

# Destination of the sample drawn in this run (annotation batch 3); the previous run
# wrote to '../../data/annotations/group-mention-annotation-batch-02/sample.tsv'.
args.output_path = '../../data/annotations/group-mention-annotation-batch-03/sample.tsv'

# make sure the target folder exists before anything is written to it
output_dir = os.path.dirname(args.output_path)
os.makedirs(output_dir, exist_ok=True)

In [3]:
import pandas as pd
import torch
from datasets import Dataset
from transformers import set_seed, AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
# fix the random seed (via transformers' `set_seed`) so that the run is reproducible
set_seed(args.seed)

In [18]:
from scipy.stats import entropy
import numpy as np
from scipy import special

def weighted_entropy(
    pk: np.typing.ArrayLike,
    w: np.typing.ArrayLike | None = None,
    base: float | None = None,
    axis: int = 0
) -> np.number | np.ndarray:
    """
    Calculate the weighted entropy [1] of given distribution(s).

    If only probabilities `pk` are given, the Shannon entropy [2] is calculated as
    ``H = -sum(pk * log(pk))``.

    If probabilities `pk` and `w` are given, the weighted (Shannon) entropy is calculated as
    ``H = -sum(pk * log(pk))``.


    Parameters
    ----------
    pk : array_like
        Defines the distribution. Along each axis-slice 
        of ``pk``, element ``i`` is the probability of 
        event ``i``.
    w : array_like, optional
        Defines the weight attributed to event ``i`` 
    base : float, optional
        The logarithmic base to use, defaults to ``e`` (natural logarithm).
    axis : int, optional
        The axis along which the weighted entropy is calculated. Default is 0.

    Returns
    -------
    S : {float, array_like}
        The calculated entropy.

    Notes
    -----
    Based on [1] Guiaşu ([1971](https://doi.org/10.1016/0034-4877(71)90002-4)) and adapted from scipy.stats.entropy


    The relative entropy, ``D(pk|qk)``, quantifies the increase in the average
    number of units of information needed per symbol if the encoding is
    optimized for the probability distribution `qk` instead of the true
    distribution `pk`. Informally, the relative entropy quantifies the expected
    excess in surprise experienced if one believes the true distribution is
    `qk` when it is actually `pk`.

    A related quantity, the cross entropy ``CE(pk, qk)``, satisfies the
    equation ``CE(pk, qk) = H(pk) + D(pk|qk)`` and can also be calculated with
    the formula ``CE = -sum(pk * log(qk))``. It gives the average
    number of units of information needed per symbol if an encoding is
    optimized for the probability distribution `qk` when the true distribution
    is `pk`. It is not computed directly by `entropy`, but it can be computed
    using two calls to the function (see Examples).

    See [2]_ for more information.

    References
    ----------
    .. [1] Guiaşu, S. (1971), Weighted Entropy. Reports
           on Mathematical Physics, 2(3): 165-179.
           https://doi.org/10.1016/0034-4877(71)90002-4
    .. [2] Shannon, C.E. (1948), A Mathematical Theory of Communication.
           Bell System Technical Journal, 27: 379-423.
           https://doi.org/10.1002/j.1538-7305.1948.tb01338.x

    """
    if base is not None and base <= 0:
        raise ValueError("`base` must be a positive number or `None`.")
    if base is None:
        base = np.e

    pk = np.asarray(pk)
    # normalize
    with np.errstate(invalid='ignore'):
        pk = 1.0*pk / np.sum(pk, axis=axis, keepdims=True)

    # construct or check weights
    if w is None:
        w = np.ones(pk.shape[axis])
    else:
        assert len(w) == pk.shape[axis], f"Expected {pk.shape[axis]} weights, got {len(w)}"
        w = np.asarray(w)

    # compute inner
    s = np.multiply(pk, w)*np.emath.logn(base, pk)

    # compute outer
    S = -s.sum(axis=axis)

    return S

# # test
# probs = [
#     [0.2, 0.8], 
#     [0.9, 0.1], 
#     [0.7, 0.3]
# ]
# weights = [1, 1]
# weighted_entropy(probs, w=weights, axis=1), entropy(probs, axis=1)

In [19]:
import numpy as np
from scipy.stats import entropy
from scipy.special import softmax
from typing import Union


def _compute_prediction_entropy(predictions, dataset, weights: Union[None, np.typing.NDArray]=None):
    """
    Score each example by the maximum (weighted) entropy of its token-level predictions.

    `predictions` holds one array of logits per example (one row per token, one column
    per class). For every example, the rows are cut to the number of tokens given by the
    row sums of ``dataset['attention_mask']``, turned into per-token class probabilities
    with a softmax, and the weighted entropy of each token's distribution is computed.
    The example's score is the largest token entropy
    (see https://aclanthology.org/2024.lrec-main.30.pdf).

    `weights` optionally gives one weight per class; all classes are weighted 1 if omitted.

    Returns a 1-D numpy array with one score per example.

    Inspired by small-text's PredictionEntropy confidence-based active learning query strategy,
    see https://github.com/webis-de/small-text/blob/c78459e1b60269da1aeaa270e954961cc36d77cb/small_text/query_strategies/strategies.py#L180
    """
    if weights is None:
        # default: every class counts equally
        weights = [1] * predictions[0].shape[1]
    else:
        assert predictions[0].shape[1] == len(weights), f"Expected {len(weights)} classes, got {predictions[0].shape[1]}"

    # number of tokens per example according to the attention mask
    token_counts = dataset['attention_mask'].sum(dim=1)

    max_token_entropies = [
        weighted_entropy(
            softmax(example_logits[:n_tokens].numpy(), axis=1),
            w=weights,
            axis=1,
        ).max()
        for example_logits, n_tokens in zip(predictions, token_counts)
    ]
    return np.array(max_token_entropies)

In [5]:
# theoretical entropy limits for a 5-class problem:
# the uniform distribution gives the maximum, a one-hot distribution the minimum
n_classes = 5
uniform_dist = [1/n_classes] * n_classes
one_hot_dist = [1.0] + [0.0]*(n_classes-1)
entropy(uniform_dist), entropy(one_hot_dist)

(1.6094379124341005, 0.0)

In [6]:
# toy examples of token-level predicted class probabilities ((# tokens, # classes)-shaped arrays)
a = np.array([[0.2, 0.8], [0.9, 0.1]])  # <= high certainty/low entropy
b = np.array([[0.4, 0.6], [0.6, 0.4]])  # <= low certainty/high entropy

# maximum token-level entropy of each example
tuple(entropy(token_probs, axis=1).max() for token_probs in (a, b))

(0.5004024235381879, 0.6730116670092565)

In [20]:
from tqdm.auto import tqdm

def compute_prediction_entropies(
    dataset: Dataset,
    model: AutoModelForTokenClassification,
    tokenizer: AutoTokenizer,
    batch_size: int = 32,
    **kwargs
) -> np.ndarray:
    """
    Compute one prediction-entropy score per example in `dataset`.

    The dataset is iterated in batches of `batch_size` that are collated with a
    `DataCollatorForTokenClassification` built from `tokenizer`. Each batch is moved to
    the model's device, the model is run without gradient tracking, and the logits are
    passed (on the CPU) together with the batch to `_compute_prediction_entropy`.
    Additional keyword arguments (e.g., `weights`) are forwarded to that function.

    Returns the per-batch scores concatenated into a single array, in dataset order.
    """
    loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        collate_fn=DataCollatorForTokenClassification(tokenizer),
    )

    per_batch_scores = []
    with torch.no_grad():
        for batch in tqdm(loader):
            logits = model(**batch.to(model.device)).logits
            per_batch_scores.append(_compute_prediction_entropy(logits.cpu(), batch, **kwargs))

    return np.concatenate(per_batch_scores)

## Load and prepare the data

In [26]:
# read only the ID and (translated) text columns, expose the text under the
# generic name 'text', and drop sentences without text
df = (
    pd.read_csv(args.input_path, sep='\t', usecols=[args.id_col, args.text_col])
    .rename(columns={args.text_col: 'text'})
    .dropna(subset=['text'])
)

### Remove previously seen/annotated examples

In [27]:
# collect the IDs listed in each of the exclusion files
id_columns = []
for fp in args.exclude_ids_in_files:
    previous_sample = pd.read_csv(fp, sep='\t')
    if args.id_col not in previous_sample.columns:
        # no column with the expected name: treat the first column as the ID column
        previous_sample = previous_sample.rename(columns={previous_sample.columns[0]: args.id_col})
    id_columns.append(previous_sample[args.id_col])
exclude = pd.concat(id_columns, ignore_index=True, sort=False).to_list()

# keep only sentences whose ID is not among the excluded ones
already_sampled = df[args.id_col].isin(exclude)
df = df[~already_sampled]

In [11]:
# number of rows left in `df` (the candidate sentences for sampling)
len(df)

429548

In [21]:
# df = df.sample(100, random_state=args.seed)

In [28]:
dataset = Dataset.from_pandas(df, preserve_index=False)

# Load the tokenizer that belongs to the classifier.
# Fixes the misspelled `use_fasr` keyword (-> `use_fast`) and no longer passes
# `truncation=True` here: truncation is an argument of the tokenization call
# (see `tokenize` below), not of `from_pretrained`.
tokenizer = AutoTokenizer.from_pretrained(args.model_path, use_fast=True)


def tokenize(examples):
    """Tokenize a batch of examples with truncation but without padding.

    Padding is left to the data collator that is applied per batch at prediction time.
    """
    return tokenizer(examples['text'], padding=False, truncation=True)


dataset = dataset.map(tokenize, batched=True)
# keep only the token IDs; all other columns (including the raw text) are dropped
dataset = dataset.remove_columns([c for c in dataset.column_names if c not in ['input_ids']])
dataset.set_format(type='torch')

Map:   0%|          | 0/429548 [00:00<?, ? examples/s]

## Estimate labeling uncertainty

In [14]:
# load the token classifier stored at `args.model_path`; `device_map='auto'` leaves device placement to the library
model = AutoModelForTokenClassification.from_pretrained(args.model_path, device_map='auto')

In [29]:
# show the mapping from class indices to label names
model.config.id2label

{0: 'O',
 1: 'I-social group',
 2: 'I-organizational group',
 3: 'B-social group',
 4: 'B-organizational group'}

In [30]:
# Per-class weights for the weighted entropy, listed in the order of the class indices
# in `model.config.id2label`.
# For the second uncertainty-based sample, the 'organizational group' type is upweighted;
# use [1]*5 for equal weights.
w = [
    1,  # 0: O
    1,  # 1: I-social group
    2,  # 2: I-organizational group
    1,  # 3: B-social group
    2,  # 4: B-organizational group
]

In [31]:
# score every sentence by its maximum token-level weighted prediction entropy (class weights `w`)
entropies = compute_prediction_entropies(dataset, model, tokenizer, batch_size=64, weights=w)

  0%|          | 0/6712 [00:00<?, ?it/s]

In [32]:
# attach the scores to the sentences; relies on `dataset` having been built from `df`
# and on the scores being returned in the same row order
df['entropy'] = entropies

## Sample sentences from manifestos

In [36]:
# Derive the manifesto (document) ID from the leading "<digits>_<digits>" part of the
# sentence ID. `expand=False` yields a Series rather than a one-column DataFrame.
df['manifesto_id'] = df[args.id_col].str.extract(r'^(\d+_\d+)', expand=False)

# rank all candidate sentences from most to least uncertain
ranked = df.sort_values('entropy', ascending=False)

# Honor the `group_by_document` setting (defaults to grouping if the setting is absent).
if getattr(args, 'group_by_document', True):
    # get the number of groups in the df
    n_groups = df.manifesto_id.nunique()
    if n_groups == 0:
        raise ValueError("Cannot sample by document: no manifesto IDs could be extracted from the sentence IDs.")

    # compute the sample size per group
    per_group_sample_size = args.sample_size // n_groups
    if per_group_sample_size == 0:
        # fail loudly instead of silently writing an empty sample
        raise ValueError(
            f"The sample size {args.sample_size} is smaller than the number of groups ({n_groups}); "
            "cannot sample at least one sentence per group."
        )

    if per_group_sample_size*n_groups != args.sample_size:
        print(
            f"Warning: The sample size {args.sample_size} is not divisible by the number of groups ({n_groups}).",
            f"Setting the sample size to {per_group_sample_size*n_groups} ({per_group_sample_size} samples per group)."
        )

    # get the `per_group_sample_size` with the highest entropy within group
    # (groups with fewer sentences contribute all of their sentences)
    sample = ranked.groupby('manifesto_id').head(per_group_sample_size)
else:
    # no grouping: take the sentences with the highest entropy overall
    sample = ranked.head(args.sample_size)

# reshuffle the sample
sample = sample.sample(frac=1, random_state=args.seed).reset_index(drop=True)



In [37]:
# correlate the entropy with the length of the text (number of characters)
# to check whether the uncertainty score depends on sentence length
from scipy.stats import pearsonr
pearsonr(df['entropy'], df['text'].str.len())

PearsonRResult(statistic=-0.3066503503678775, pvalue=0.0)

## Write sample to disk

In [40]:
# write the sampled sentences (manifesto ID, sentence ID, text) as a tab-separated file without the index
sample[['manifesto_id', 'sentence_id', 'text']].to_csv(args.output_path, sep='\t', index=False)

In [41]:
# Initialize an empty label list for every sampled sentence.
# A comprehension creates a separate list per row; `[[]] * n` would make all rows
# share one and the same list object.
sample['label'] = [[] for _ in range(len(sample))]

In [42]:
from utils.io import write_jsonlines

# one record (sentence ID, text, empty label list) per sampled sentence
lines = sample[['sentence_id', 'text', 'label']].to_dict(orient='records')

# Swap only the file extension for '.manifest'. `str.replace('.tsv', ...)` would also
# rewrite a '.tsv' occurring elsewhere in the path and would leave the path unchanged
# (overwriting the TSV sample) if the output path did not contain '.tsv'.
manifest_path = os.path.splitext(args.output_path)[0] + '.manifest'
write_jsonlines(lines, manifest_path)