In [1]:
import os
import sys
sys.path.append(os.path.abspath('../../code'))

In [2]:
import os
import json
from pathlib import Path

import regex
from collections import Counter

import numpy as np
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
import pandas as pd
pd.set_option('display.max_columns', 15)
pd.set_option('display.width', 320)

from utils.io import read_label_config
from utils.corpus import JsonlinesAnnotationsCorpus
from utils.bsc_model import BSCModel

In [3]:
from types import SimpleNamespace

args = SimpleNamespace()
args.label_config_file = '../../data/annotation/doccano_label_config.json'
args.input_file = '../../data/annotation/parsed/de-manifestos_annotations.jsonl'

args.data_path = '../../data/annotation/annotations/'
args.data_folder_pattern = 'de-manifestos'
args.data_file_format = 'csv'

args.output_file = '../../data/annotation/labeled/de-manifestos_all_labeled.jsonl'
args.overwrite_output = False

args.verbose = True

In [4]:
fps = [str(fp) for fp in Path(args.data_path).glob(f'*{args.data_folder_pattern}*/*.{args.data_file_format}')]
# read metadata
metadata = pd.concat({fp.split('/')[-1].replace('_ids.csv', ''): pd.read_csv(fp) for fp in fps}, axis=0)
metadata['job'] = metadata.index.get_level_values(0)
metadata.reset_index(drop=True, inplace=True)
print(metadata.job.value_counts())
jobs = metadata.job.unique().tolist()
jobs.sort()

job
group-mentions-annotation-de-manifestos-round-01    1500
group-mentions-annotation-de-manifestos-round-02    1427
Name: count, dtype: int64


## Read the annotations

In [5]:
cat2code = read_label_config(args.label_config_file)
acorp = JsonlinesAnnotationsCorpus(cat2code)
acorp.load_from_jsonlines(args.input_file, verbose=args.verbose)

100%|██████████| 2923/2923 [00:05<00:00, 488.74it/s] 


In [6]:
n_docs = acorp.ndocs
n_gold_labeled = len([1 for doc in acorp.docs if doc.n_labels > 0])
print('# docs = ', n_docs)
print('# gold items =', n_gold_labeled)
acorp.label_map

# docs =  2923
# gold items = 105


{'O': 0,
 'I-social group': 1,
 'B-social group': 7,
 'I-political group': 2,
 'B-political group': 8,
 'I-political institution': 3,
 'B-political institution': 9,
 'I-organization, public institution, or collective actor': 4,
 'B-organization, public institution, or collective actor': 10,
 'I-implicit social group reference': 5,
 'B-implicit social group reference': 11,
 'I-unsure': 6,
 'B-unsure': 12}

In [7]:
# ensure that 'unsure' never in GOLD annotations
for doc in acorp.docs:
    if doc.n_labels > 0 and cat2code['B-unsure'] in doc.labels['GOLD']:
        print(doc.id)

In [8]:
# count label types in GOlD
# ensure that 'unsure' never in GOLD annotations
gold_types = Counter()
for doc in acorp.docs:
    if 'GOLD' in doc.labels:
        gold_types.update(doc.labels['GOLD'].tolist())
gold_types

Counter({0: 1278,
         1: 83,
         4: 54,
         5: 42,
         10: 39,
         7: 38,
         3: 36,
         9: 29,
         11: 24,
         2: 21,
         8: 12})

In [9]:
# drop 'unsure' labels
def recode_labels(codes):
    out = codes.copy()
    # set "unsure" to outside
    out[codes ==  6] = 0
    out[codes == 12] = 0
    # reset all others
    out[out>6] -= 1
    return(out)

for i in range(acorp.ndocs):
    for j in range(acorp.docs[i].n_annotations):
        acorp.docs[i].annotations[ acorp.docs[i].annotators[j] ] = recode_labels(acorp.docs[i].annotations[ acorp.docs[i].annotators[j] ])
        for k in acorp.docs[i].labels.keys():
            acorp.docs[i].labels[k] = recode_labels(acorp.docs[i].labels[k])

idx = acorp.label_map.pop('B-unsure')
acorp.label_map_inv.pop(idx)
idx = acorp.label_map.pop('I-unsure')
acorp.label_map_inv.pop(idx)

# [print(c, 't', l) for c, l in acorp.label_map_inv.items()]
for c in range(7,12): acorp.label_map[acorp.label_map_inv[c]] -= 1
# [print(c, 't', l) for l, c in acorp.label_map.items()]

acorp.label_map_inv = {c: l for l, c in acorp.label_map.items()}
# [print(c, 't', l) for c, l in acorp.label_map_inv.items()]

len(acorp.label_map) == len(acorp.label_map_inv)

acorp.inside_labels = list(range(1, 6))
acorp.beginning_labels = list(range(6, 11))

acorp.ndocs = len(acorp.docs)
acorp.doc_ids = [doc.id for doc in acorp.docs]
acorp.doc_id2idx = {doc.id: i for i, doc in enumerate(acorp.docs)}
acorp.doc_idx2id = {i: doc.id for i, doc in enumerate(acorp.docs)}

acorp.annotator_label_counts = acorp._count_annotator_labels()

In [10]:
annotations = acorp.annotator_label_counts
gold = Counter()
for labs in [doc.labels['GOLD'].tolist() for doc in acorp.docs if 'GOLD' in doc.labels]:
    gold.update(labs)

In [11]:
pd.DataFrame(annotations).join(pd.DataFrame({'gold': dict(gold)})).loc[acorp.label_map.values()]

Unnamed: 0,emarie,sjasmin,gold
0,25194,25688,1316.0
1,523,516,83.0
6,449,442,12.0
2,118,71,21.0
7,125,101,29.0
3,210,182,36.0
8,245,228,39.0
4,334,214,54.0
9,304,264,24.0
5,236,125,42.0


In [12]:
# define custom token cleaning function
def clean_tokens(x):

    # insert special characters
    if regex.match(r"^\p{Sc}", x):
        return("<MONEY>")

    if regex.match(r"^\d+([.,/-]\d+)*\w*$", x):
        return("<DIGITS>")

    return(x)

# clean docs' tokens (in place) and collect results in Counter object
cleaned_vocab = Counter()
for i in range(acorp.ndocs):
    cleaned_vocab.update(acorp.docs[i].clean_tokens(fun=clean_tokens))

# reduces vocab size somewhat!
print(len(cleaned_vocab)/len(acorp.vocab))

0.9913457377758547


## Annotation aggregation

### Prepare the Baysian sequene combination (BSC) sequence annotation model

In [13]:
amodel = BSCModel(acorp, max_iter=30, gold_labels='GOLD', verbose=True)

Parallel can run 10 jobs simultaneously, with 10 cores


In [14]:
print('# label classes:', amodel.num_classes)
print('# annotators:   ', amodel.num_annotators)
print('# docs:', amodel.num_docs)
print('# tokens:', amodel.num_tokens)
print('# docs with gold labels:', (amodel.gold[amodel.doc_start == 1] > -1).sum())
print('# tokens with gold labels:', (amodel.gold > 0).sum())

# label classes: 11
# annotators:    2
# docs: 2923
# tokens: 46449
# docs with gold labels: 105
# tokens with gold labels: 340


In [15]:
# inspect the alpha prior
alpha0 = amodel.model.A.alpha0.copy()
# note: default prior belief is that annotators assign correct label 2 out of 3 times
alpha0[0,0]/alpha0[0,1:].sum()

2.0

In [16]:
# inspect transitions prior
new_beta0 = amodel.model.LM.beta0.copy()
cats = [k for k, v in sorted(acorp.label_map.items(), key=lambda item: item[1])]
tmp = pd.DataFrame(new_beta0.round(2), index = cats, columns = ['=>'+c[:5]  for c in cats])
# note: read this by rows and columns indicate label categories 0...12 and the cell (i, j) indicates the prior from
#       probability that the label in column j follows the label in row i
# for examples:
# after the "outside" label, only itself and B-* labels are allowed
tmp.iloc[[0]]
# after the "I-social group" label, only itself, O, or the other B-* labels are allowed
tmp.iloc[[1]]
new_beta0[1, 6] = 1e-12
# after the "B-social group" label, O, "I-social group", or any of the other B-* labels are allowed
tmp.iloc[[6]]
new_beta0[6, 6] = 1e-12

# apply logic to each type
for c in range(2, 6):
    new_beta0[c, c+5] = 1e-12
    new_beta0[c+5, c+5] = 1e-12

# verify
print(pd.DataFrame(new_beta0.round(2), index = cats, columns = ['=>'+c[:5]  for c in cats]))

# reset
amodel.reset_label_transitions_prior(new_beta0)

                                                    =>O  =>I-soc  =>I-pol  =>I-pol  =>I-org  =>I-imp  =>B-soc  =>B-pol  =>B-pol  =>B-org  =>B-imp
O                                                   6.0      0.0      0.0      0.0      0.0      0.0      1.0      1.0      1.0      1.0      1.0
I-social group                                      1.0      1.0      0.0      0.0      0.0      0.0      0.0      1.0      1.0      1.0      1.0
I-political group                                   1.0      0.0      1.0      0.0      0.0      0.0      1.0      0.0      1.0      1.0      1.0
I-political institution                             1.0      0.0      0.0      1.0      0.0      0.0      1.0      1.0      0.0      1.0      1.0
I-organization, public institution, or collecti...  1.0      0.0      0.0      0.0      1.0      0.0      1.0      1.0      1.0      0.0      1.0
I-implicit social group reference                   1.0      0.0      0.0      0.0      0.0      1.0      1.0      1.0      

### Fit the BSC model

In [17]:
# fit model
amodel.fit_predict()

BSC: run() called with annotation matrix with shape = (46449, 2)
BC iteration 0 in progress
BC iteration 0: computed label probabilities
BC iteration 0: updated label model
BC iteration 0: updated worker models
BC iteration 1 in progress
BAC iteration 1: completed forward pass
BAC iteration 1: completed backward pass
BC iteration 1: computed label probabilities
BC iteration 1: updated label model
BC iteration 1: updated worker models
Computing LB=-1246644.8807: label model and labels=-96209.6953, annotator model=-544.8750, features=-1149890.2635
BSC: max. difference at iteration 2: inf
BC iteration 2 in progress
BAC iteration 2: completed forward pass
BAC iteration 2: completed backward pass
BC iteration 2: computed label probabilities
BC iteration 2: updated label model
BC iteration 2: updated worker models
Computing LB=-1220044.3454: label model and labels=-71582.8906, annotator model=-317.5000, features=-1148143.9782
BSC: max. difference at iteration 3: 26600.53532
BC iteration 3 in



BAC iteration 11: completed backward pass
BC iteration 11: computed label probabilities
BC iteration 11: updated label model
BC iteration 11: updated worker models
Computing LB=-1215963.3792: label model and labels=-67871.2188, annotator model=-246.0000, features=-1147846.2073
BSC: max. difference at iteration 12: 56.46706
BC iteration 12 in progress
BAC iteration 12: completed forward pass
BAC iteration 12: completed backward pass
BC iteration 12: computed label probabilities
BC iteration 12: updated label model
BC iteration 12: updated worker models
Computing LB=-1215918.0748: label model and labels=-67839.4766, annotator model=-242.8750, features=-1147835.6920
BSC: max. difference at iteration 13: 45.30440
BC iteration 13 in progress
BAC iteration 13: completed forward pass
BAC iteration 13: completed backward pass
BC iteration 13: computed label probabilities
BC iteration 13: updated label model
BC iteration 13: updated worker models
Computing LB=-1215878.5683: label model and labe

In [19]:
print(amodel.runtime_)
amodel.model.convergence_history

283.32099145802204


[inf,
 26600.535315625835,
 2888.479642565595,
 444.6887653134763,
 139.04616380715743,
 93.00714982766658,
 124.05376527016051,
 131.82451964681968,
 116.86061852844432,
 86.53850379167125,
 56.4670614216011,
 45.304399782791734,
 39.50643207714893,
 25.464724241755903,
 15.99461836181581,
 14.113870717585087,
 18.30966693442315,
 9.12986496440135,
 7.838402014225721,
 5.871683999197558,
 3.4760293611325324,
 2.016621259506792,
 1.628119511064142,
 2.002525717020035,
 2.2893328964710236,
 1.6911360225640237,
 1.3550263370852917,
 0.8900373282376677]

### add sentence metadata to posterior label estimates

In [20]:
metadata.set_index('uid', inplace=True)

In [21]:
for doc in amodel.corpus.docs:
    doc.metadata = {k: v for k, v in dict(metadata.loc[doc.id]).items() if not pd.isna(v)}

In [22]:
np.unique(np.array([hasattr(doc, 'metadata') for doc in amodel.corpus.docs]), return_counts=True)

(array([ True]), array([2923]))

## Write to disk

In [23]:
def _jsonify(self, fields=['id', 'text', 'tokens', 'annotations', 'labels', 'metadata']):
    out = {k: self.__dict__[k] for k in fields if hasattr(self, k)}
    if 'annotations' in out.keys():
        out['annotations'] = {k: v.tolist() for k, v in out['annotations'].items()}
    if 'labels' in out.keys():
        out['labels'] = {k: v.tolist() for k, v in out['labels'].items()}
        if len(out['labels']) == 0:
            out['labels'] = None
    if 'metadata' in out.keys():
        if len(out['metadata']) == 0:
            out['metadata'] = None
    return json.dumps(out)

In [None]:
if not os.path.exists(args.output_file) or args.overwrite_output:
    with open(args.output_file, mode='w', encoding='utf-8') as writer:
        for doc in amodel.corpus.docs:
            writer.write(_jsonify(doc)+'\n')