In [1]:
import os
import sys
sys.path.append(os.path.abspath('../../code'))

In [None]:
import os
import json
from pathlib import Path

import regex
from collections import Counter

import numpy as np
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
import pandas as pd
pd.set_option('display.max_columns', 15)
pd.set_option('display.width', 320)

from utils.io import read_label_config
from utils.corpus import JsonlinesAnnotationsCorpus
from utils.bsc_model import BSCModel

In [45]:
from types import SimpleNamespace

args = SimpleNamespace()
args.label_config_file = '../../data/annotation/doccano_label_config.json'
args.input_file = '../../data/annotation/parsed/uk-commons_annotations.jsonl'

args.data_path = '../../data/annotation/annotations/'
args.data_folder_pattern = 'uk-commons'
args.data_file_format = 'csv'

args.output_file = '../../data/annotation/labeled/uk-commons_all_labeled.jsonl'
args.overwrite_output = False

args.verbose = True

## Read the annotations

In [7]:
fps = [str(fp) for fp in Path(args.data_path).glob(f'*{args.data_folder_pattern}*/*.{args.data_file_format}')]
# read metadata
metadata = pd.concat({fp.split('/')[-1].replace('_ids.csv', ''): pd.read_csv(fp) for fp in fps}, axis=0)
metadata['job'] = metadata.index.get_level_values(0)
metadata.reset_index(drop=True, inplace=True)
print(metadata.job.value_counts())
jobs = metadata.job.unique().tolist()
jobs.sort()

job
group-mentions-annotation-uk-commons-round-01    1575
Name: count, dtype: int64


In [10]:
cat2code = read_label_config(args.label_config_file)
acorp = JsonlinesAnnotationsCorpus(cat2code)

acorp.load_from_jsonlines(args.input_file, verbose=args.verbose)

100%|██████████| 1573/1573 [00:02<00:00, 653.69it/s]


In [11]:
n_docs = acorp.ndocs
n_gold_labeled = len([1 for doc in acorp.docs if doc.n_labels > 0])
print('# docs = ', n_docs)
print('# gold items =', n_gold_labeled)
acorp.label_map

# docs =  1573
# gold items = 121


{'O': 0,
 'I-social group': 1,
 'B-social group': 7,
 'I-political group': 2,
 'B-political group': 8,
 'I-political institution': 3,
 'B-political institution': 9,
 'I-organization, public institution, or collective actor': 4,
 'B-organization, public institution, or collective actor': 10,
 'I-implicit social group reference': 5,
 'B-implicit social group reference': 11,
 'I-unsure': 6,
 'B-unsure': 12}

In [12]:
# ensure that 'unsure' never in GOLD annotations
for doc in acorp.docs:
    if doc.n_labels > 0 and cat2code['B-unsure'] in doc.labels['GOLD']:
        print(doc.id)

b3d0d7b0926cb70b858ced44aab87880
2044106345e3988ce759190853dc980c


In [13]:
for i in range(acorp.ndocs):
    if acorp.docs[i].n_labels > 0:
        break

In [14]:
acorp.docs[i].labels

{'GOLD': array([ 0,  0,  0,  0,  0,  0,  0, 10,  4,  4,  4,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  7,  0,  7,  0])}

In [15]:
# drop 'unsure' labels
def recode_labels(codes):
    out = codes.copy()
    # set "unsure" to outside
    out[codes ==  6] = 0
    out[codes == 12] = 0
    # reset all others
    out[out>6] -= 1
    return(out)

for i in range(acorp.ndocs):
    for j in range(acorp.docs[i].n_annotations):
        acorp.docs[i].annotations[ acorp.docs[i].annotators[j] ] = recode_labels(acorp.docs[i].annotations[ acorp.docs[i].annotators[j] ])
        for k in acorp.docs[i].labels.keys():
            acorp.docs[i].labels[k] = recode_labels(acorp.docs[i].labels[k])
    if acorp.docs[i].n_labels > 0:
        acorp.docs[i].labels['GOLD'] = recode_labels(acorp.docs[i].labels['GOLD'])


idx = acorp.label_map.pop('B-unsure')
acorp.label_map_inv.pop(idx)
idx = acorp.label_map.pop('I-unsure')
acorp.label_map_inv.pop(idx)

# [print(c, 't', l) for c, l in acorp.label_map_inv.items()]
for c in range(7,12): acorp.label_map[acorp.label_map_inv[c]] -= 1
# [print(c, 't', l) for l, c in acorp.label_map.items()]

acorp.label_map_inv = {c: l for l, c in acorp.label_map.items()}
# [print(c, 't', l) for c, l in acorp.label_map_inv.items()]

len(acorp.label_map) == len(acorp.label_map_inv)

acorp.inside_labels = list(range(1, 6))
acorp.beginning_labels = list(range(6, 11))

acorp.ndocs = len(acorp.docs)
acorp.doc_ids = [doc.id for doc in acorp.docs]
acorp.doc_id2idx = {doc.id: i for i, doc in enumerate(acorp.docs)}
acorp.doc_idx2id = {i: doc.id for i, doc in enumerate(acorp.docs)}

acorp.annotator_label_counts = acorp._count_annotator_labels()

In [16]:
# count label types in GOlD
# ensure that 'unsure' never in GOLD annotations
gold_types = Counter()
for doc in acorp.docs:
    if 'GOLD' in doc.labels:
        gold_types.update(doc.labels['GOLD'].tolist())
gold_types

Counter({0: 2166, 1: 250, 3: 136, 4: 95, 6: 68, 2: 41, 7: 36, 5: 29, 8: 12})

In [17]:
annotations = acorp.annotator_label_counts
gold = Counter()
for labs in [doc.labels['GOLD'].tolist() for doc in acorp.docs if 'GOLD' in doc.labels]:
    gold.update(labs)
pd.DataFrame(annotations).join(pd.DataFrame({'gold': dict(gold)})).loc[acorp.label_map.values()]

Unnamed: 0,emarie,sjasmin,gold
0,21798,22242,2166.0
1,802,682,250.0
6,291,326,68.0
2,193,97,41.0
7,97,62,36.0
3,599,561,136.0
8,394,394,12.0
4,440,301,95.0
9,213,207,
5,94,49,29.0


In [18]:
# define custom token cleaning function
def clean_tokens(x):

    # insert special characters
    if regex.match(r"^\p{Sc}", x):
        return("<MONEY>")

    if regex.match(r"^\d+([.,/-]\d+)*\w*$", x):
        return("<DIGITS>")

    return(x)

# clean docs' tokens (in place) and collect results in Counter object
cleaned_vocab = Counter()
for i in range(acorp.ndocs):
    cleaned_vocab.update(acorp.docs[i].clean_tokens(fun=clean_tokens))

# reduces vocab size somewhat!
print(len(cleaned_vocab)/len(acorp.vocab))

0.9757688723205965


## Annotation aggregation

### Prepare the Baysian sequene combination (BSC) sequence annotation model

In [21]:
amodel = BSCModel(acorp, max_iter=30, gold_labels='GOLD', verbose=True)

Parallel can run 10 jobs simultaneously, with 10 cores


In [22]:
print('# label classes:', amodel.num_classes)
print('# annotators:   ', amodel.num_annotators)
print('# docs:', amodel.num_docs)
print('# tokens:', amodel.num_tokens)
print('# docs with gold labels:', (amodel.gold[amodel.doc_start == 1] > -1).sum())
print('# tokens with gold labels:', (amodel.gold > 0).sum())

# label classes: 11
# annotators:    2
# docs: 1573
# tokens: 37500
# docs with gold labels: 121
# tokens with gold labels: 667


In [23]:
# inspect the alpha prior
alpha0 = amodel.model.A.alpha0.copy()
# note: default prior belief is that annotators assign correct label 2 out of 3 times
alpha0[0,0]/alpha0[0,1:].sum()

2.0

In [24]:
# inspect transitions prior
new_beta0 = amodel.model.LM.beta0.copy()
cats = [k for k, v in sorted(acorp.label_map.items(), key=lambda item: item[1])]
tmp = pd.DataFrame(new_beta0.round(2), index = cats, columns = ['=>'+c[:5]  for c in cats])
# note: read this by rows and columns indicate label categories 0...12 and the cell (i, j) indicates the prior from
#       probability that the label in column j follows the label in row i
# for examples:
# after the "outside" label, only itself and B-* labels are allowed
tmp.iloc[[0]]
# after the "I-social group" label, only itself, O, or the other B-* labels are allowed
tmp.iloc[[1]]
new_beta0[1, 6] = 1e-12
# after the "B-social group" label, O, "I-social group", or any of the other B-* labels are allowed
tmp.iloc[[6]]
new_beta0[6, 6] = 1e-12

# apply logic to each type
for c in range(2, 6):
    new_beta0[c, c+5] = 1e-12
    new_beta0[c+5, c+5] = 1e-12

# verify
print(pd.DataFrame(new_beta0.round(2), index = cats, columns = ['=>'+c[:5]  for c in cats]))

# reset
amodel.reset_label_transitions_prior(new_beta0)

                                                    =>O  =>I-soc  =>I-pol  =>I-pol  =>I-org  =>I-imp  =>B-soc  =>B-pol  =>B-pol  =>B-org  =>B-imp
O                                                   6.0      0.0      0.0      0.0      0.0      0.0      1.0      1.0      1.0      1.0      1.0
I-social group                                      1.0      1.0      0.0      0.0      0.0      0.0      0.0      1.0      1.0      1.0      1.0
I-political group                                   1.0      0.0      1.0      0.0      0.0      0.0      1.0      0.0      1.0      1.0      1.0
I-political institution                             1.0      0.0      0.0      1.0      0.0      0.0      1.0      1.0      0.0      1.0      1.0
I-organization, public institution, or collecti...  1.0      0.0      0.0      0.0      1.0      0.0      1.0      1.0      1.0      0.0      1.0
I-implicit social group reference                   1.0      0.0      0.0      0.0      0.0      1.0      1.0      1.0      

### Fit the BSC model

In [25]:
# fit model
amodel.fit_predict()

BSC: run() called with annotation matrix with shape = (37500, 2)
BC iteration 0 in progress
BC iteration 0: computed label probabilities
BC iteration 0: updated label model
BC iteration 0: updated worker models
BC iteration 1 in progress
BAC iteration 1: completed forward pass
BAC iteration 1: completed backward pass
BC iteration 1: computed label probabilities
BC iteration 1: updated label model
BC iteration 1: updated worker models
Computing LB=-759947.2036: label model and labels=-76513.0000, annotator model=-457.2500, features=-682976.9301
BSC: max. difference at iteration 2: inf
BC iteration 2 in progress
BAC iteration 2: completed forward pass
BAC iteration 2: completed backward pass
BC iteration 2: computed label probabilities
BC iteration 2: updated label model
BC iteration 2: updated worker models
Computing LB=-741708.9618: label model and labels=-59333.7656, annotator model=-329.5000, features=-682045.7118
BSC: max. difference at iteration 3: 18238.24182
BC iteration 3 in pro



BAC iteration 3: completed forward pass
BAC iteration 3: completed backward pass
BC iteration 3: computed label probabilities
BC iteration 3: updated label model
BC iteration 3: updated worker models
Computing LB=-739811.8993: label model and labels=-57484.5938, annotator model=-303.3750, features=-682023.9696
BSC: max. difference at iteration 4: 1897.06244
BC iteration 4 in progress
BAC iteration 4: completed forward pass
BAC iteration 4: completed backward pass
BC iteration 4: computed label probabilities
BC iteration 4: updated label model
BC iteration 4: updated worker models
Computing LB=-739492.9847: label model and labels=-57199.0859, annotator model=-296.8750, features=-681996.9613
BSC: max. difference at iteration 5: 318.91457
BC iteration 5 in progress
BAC iteration 5: completed forward pass
BAC iteration 5: completed backward pass
BC iteration 5: computed label probabilities
BC iteration 5: updated label model
BC iteration 5: updated worker models
Computing LB=-739332.2476: 

In [26]:
print(amodel.runtime_)
amodel.model.convergence_history

214.11958279204555


[inf,
 18238.241818634793,
 1897.0624387806747,
 318.91457307501696,
 160.73717962857336,
 124.69076562579721,
 93.46654740266968,
 50.287831012159586,
 23.218291213270277,
 22.058464309899136,
 32.072725362260826,
 15.641622876748443,
 9.657274924917147,
 14.174832683289424,
 9.232539518037811,
 5.777763915481046,
 4.9175397456856444,
 4.149025749065913,
 3.865889688488096,
 3.96450487524271,
 4.027864253614098,
 5.041564604500309,
 5.326535994186997,
 5.2524078383576125,
 3.2502073486102745,
 1.7336729498347268,
 1.116782711003907,
 0.5563615111168474]

### add sentence metadata to posterior label estimates

In [27]:
# add metadata
metadata.drop(['split_', 'job'], axis=1, inplace=True)
metadata.set_index('uid', drop=True, verify_integrity=True, inplace=True)

In [28]:
for doc in amodel.corpus.docs:
    doc.metadata = {k: v.item() if isinstance(v, np.int64) else v for k, v in dict(metadata.loc[doc.id]).items() if not pd.isna(v)}

In [29]:
np.unique(np.array([hasattr(doc, 'metadata') for doc in amodel.corpus.docs]), return_counts=True)

(array([ True]), array([1573]))

## Write to disk

In [38]:
def _jsonify(self, fields=['id', 'text', 'tokens', 'annotations', 'labels', 'metadata']):
    out = {k: self.__dict__[k] for k in fields if hasattr(self, k)}
    if 'annotations' in out.keys():
        out['annotations'] = {k: v.tolist() for k, v in out['annotations'].items()}
    if 'labels' in out.keys():
        out['labels'] = {k: v.tolist() for k, v in out['labels'].items()}
        if len(out['labels']) == 0:
            out['labels'] = None
    if 'metadata' in out.keys():
        if len(out['metadata']) == 0:
            out['metadata'] = None
    return json.dumps(out)

In [None]:
if not os.path.exists(args.output_file) or args.overwrite_output:
    with open(args.output_file, mode='w', encoding='utf-8') as writer:
        for doc in amodel.corpus.docs:
            writer.write(_jsonify(doc)+'\n')