In [7]:
import os

base_path = os.path.join('..', '..')
data_path = os.path.join(base_path, 'data', 'annotations')

In [12]:
jobs = [nm for nm in os.listdir(data_path) if nm.startswith('group-mention-annotation-batch-')]

for job in jobs:
    loc = os.path.join(data_path, job)
    print(job, os.listdir(loc))

group-mention-annotation-batch-01 ['annotations.jsonl', '.DS_Store', 'review_annotations.jsonl', 'reviewed.jsonl', 'annotations', 'e34e6c94-0d40-47d7-8993-ef3abf27ecc2.zip', 'sample.tsv', 'sample.manifest', 'annotations.zip', 'review_cases.jsonl']
group-mention-annotation-batch-02 ['annotations.jsonl', '.DS_Store', 'review_annotations.jsonl', 'sample.jsonl', '326d68e0-b620-4ea7-8ef4-bfdbba9275c1.zip', '35dc19b0-d92c-4698-8057-e3a87514511b.zip', 'reviewed.jsonl', 'annotations', 'sample.tsv', 'review', 'sample.manifest', 'review_cases.jsonl']
group-mention-annotation-batch-03 ['.DS_Store', 'review_annotations.jsonl', 'reviewed.jsonl', 'annotations', 'sample.tsv', 'sample.manifest', 'review_cases.jsonl', 'c72fb31d-3809-4f0c-919b-76b329ec475a.zip']


In [29]:
from utils.io import read_jsonlines


fps = [os.path.join(data_path, job, 'review_annotations.jsonl') for job in jobs]

parse_entry = lambda x: {k: x[k] for k in ['id', 'text', 'label']}
data = [parse_entry(line) for fp in fps for line in read_jsonlines(fp)]

In [33]:
def parse_annotation(text, annotation, keep_text: bool):
    out = {
        'start': annotation[0],
        'end': annotation[1],
        'type': annotation[2],
        'mention': text[annotation[0]:annotation[1]]
    }
    if keep_text:
        out['text'] = text
    return out
    

def unnest_sequence_annotations(data, **kwargs):
    return [
        {'text_id': line['id'], 'mention_nr': i+1} | parse_annotation(line['text'], lab, **kwargs)
        for line in data 
        for i, lab in enumerate(line['label'])
    ]

In [35]:
import pandas as pd
df = pd.DataFrame(unnest_sequence_annotations(data, keep_text=True))
df

Unnamed: 0,text_id,mention_nr,start,end,type,mention,text
0,11110_198809-390636,1,5,12,social group,parents,Give parents the right to become municipal day...
1,11110_199109-390960,1,44,51,social group,society,"Therefore, we oppose the despolitisation of so..."
2,11110_199109-390960,2,55,81,organizational group,multinational corporations,"Therefore, we oppose the despolitisation of so..."
3,11110_199109-390960,3,200,238,social group,party leaders or officials in Brussels,"Therefore, we oppose the despolitisation of so..."
4,11110_199109-390940,1,62,113,social group,a society for survival in prosperity and well-...,It is only within the ecological framework tha...
...,...,...,...,...,...,...,...
6468,13230_199409-186966,1,0,10,organizational group,Businesses,Businesses also need good conditions.
6469,51320_197006-218939,1,0,89,organizational group,Firms wishing to build new factories and offic...,Firms wishing to build new factories and offic...
6470,42110_199010-05630,1,28,121,organizational group,those interest organizations that have the rig...,This is especially true for those interest org...
6471,12951_199709-333505,1,74,108,organizational group,small and medium-sized enterprises,The Progress Party will make it easier to star...


In [38]:
"Name a social group mentioned in the following sentence '''%s'''. Answer: '''%s'''" % ('Hello', 'H')

"Name a social group mentioned in the following sentence '''Hello'''. Answer: '''H'''"

In [1]:
ON_COLAB = False
from types import SimpleNamespace

args = SimpleNamespace()
args.model_path = './../../results/classifiers/group-mention-detection_batch-01/best_model'
args.seed = 1234

In [3]:
texts = [
    "Youth unemployment will also influence their parents, teachers, the poor.",
    "We are fighting for the rights of the people.",
    "Too much weight is esting on those who have the least.",
    "Very short sentence."
]

In [5]:
# from transformers import pipeline
# 
# classifier = pipeline(task='ner', model=args.model_path, aggregation_strategy='simple', device_map='cpu')
# 
# preds = classifier(texts)
# 
# mentions = [[span['word'].strip() for span in spans] for spans in preds]
# mentions

[[], [], [], []]

In [3]:
import torch
from datasets import Dataset
from transformers import set_seed, AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification
set_seed(args.seed)

In [4]:
import numpy as np
from scipy.stats import entropy
from scipy.special import softmax

def _compute_prediction_entropy(predictions, dataset):
    """
    Compute the entropy of the predictions for each example in the dataset.

    inspired by small-text's PredictionEntropy confidence-based active learning query startegy 
     see https://github.com/webis-de/small-text/blob/c78459e1b60269da1aeaa270e954961cc36d77cb/small_text/query_strategies/strategies.py#L180
    """
    entropies = []
    lengths = dataset['attention_mask'].sum(dim=1)
    for p, l in zip(predictions, lengths):
        logits = p[:l].numpy()
        e = entropy(softmax(logits, axis=0), axis=None)
        entropies.append(e)
    return np.array(entropies)

In [5]:
model = AutoModelForTokenClassification.from_pretrained(args.model_path)
tokenizer = AutoTokenizer.from_pretrained(args.model_path, use_fasr=True)

In [6]:
dataset = Dataset.from_dict({'text': texts})
tokenize = lambda examples: tokenizer(examples['text'], padding=False, truncation=True)
dataset = dataset.map(tokenize, batched=True)
dataset = dataset.remove_columns(['text'])
dataset.set_format(type='torch')

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [9]:
from tqdm.auto import tqdm

def compute_prediction_entropies(
    dataset: Dataset,
    model: AutoModelForTokenClassification,
    tokenizer: AutoTokenizer,
    batch_size: int = 32
) -> np.ndarray:
    data_collator = DataCollatorForTokenClassification(tokenizer)   
    entropies = []
    for batch in tqdm(torch.utils.data.DataLoader(dataset, batch_size=batch_size, collate_fn=data_collator)):
        with torch.no_grad():
                outputs = model(**batch)
        entropies.append(_compute_prediction_entropy(outputs.logits, batch))
    entropies = np.concatenate(entropies)
    return entropies

In [10]:
compute_prediction_entropies(dataset, model, tokenizer, batch_size=2)

  0%|          | 0/2 [00:00<?, ?it/s]

array([3.1428676, 2.8410678, 3.1937726, 3.0467315], dtype=float32)

In [110]:
a = np.array([[0.1, 0.9], [0.9, 0.1]])
b = np.array([[0.4, 0.6], [0.6, 0.4]])

entropy(a, axis=None), entropy(b, axis=None)

(1.0182301539513936, 1.366158847569202)