In [1]:
import os
import sys
sys.path.append(os.path.abspath('../../code'))

In [None]:
import os
import json
from pathlib import Path

import regex
from collections import Counter

import numpy as np
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
import pandas as pd
pd.set_option('display.max_columns', 15)
pd.set_option('display.width', 320)

from utils.io import read_label_config
from utils.corpus import JsonlinesAnnotationsCorpus
from utils.bsc_model import BSCModel

In [None]:
from types import SimpleNamespace

args = SimpleNamespace()
args.label_config_file = '../../data/annotation/doccano_label_config.json'
args.input_file = '../../data/annotation/parsed/uk-manifestos_annotations.jsonl'

args.data_path = '../../data/annotation/annotations/'
args.data_folder_pattern = 'uk-manifestos'
args.data_file_format = 'csv'

args.output_file = '../../data/annotation/labeled/uk-manifestos_all_labeled.jsonl'
args.overwrite_output = False

args.verbose = True

## Read the annotations

In [5]:
fps = [str(fp) for fp in Path(args.data_path).glob(f'*{args.data_folder_pattern}*/*.{args.data_file_format}')]

# read metadata
metadata = pd.concat({fp.split('/')[-1].replace('_ids.csv', ''): pd.read_csv(fp) for fp in fps}, axis=0)
metadata['job'] = metadata.index.get_level_values(0)
metadata.reset_index(drop=True, inplace=True)
print(metadata.job.value_counts())
jobs = metadata.job.unique().tolist()
jobs.sort()

job
group-mentions-annotation-uk-manifestos-round-02         3377
group-mentions-annotation-uk-manifestos-round-03         1557
group-mentions-annotation-uk-manifestos-round-01         1400
group-mentions-annotation-uk-manifestos-other-parties    1362
group-mentions-annotation-uk-manifestos-2017+19           900
Name: count, dtype: int64


In [8]:
cat2code = read_label_config(args.label_config_file)

acorp = JsonlinesAnnotationsCorpus(cat2code)
acorp.load_from_jsonlines(args.input_file, verbose=args.verbose)
# NOTE: sorry that this function is soo slow

100%|██████████| 8576/8576 [01:08<00:00, 124.38it/s]


In [10]:
n_docs = acorp.ndocs
n_gold_labeled = len([1 for doc in acorp.docs if doc.n_labels > 0])
print('# docs = ', n_docs)
print('# gold items =', n_gold_labeled)
acorp.label_map

# docs =  8576
# gold items = 610


{'O': 0,
 'I-social group': 1,
 'B-social group': 7,
 'I-political group': 2,
 'B-political group': 8,
 'I-political institution': 3,
 'B-political institution': 9,
 'I-organization, public institution, or collective actor': 4,
 'B-organization, public institution, or collective actor': 10,
 'I-implicit social group reference': 5,
 'B-implicit social group reference': 11,
 'I-unsure': 6,
 'B-unsure': 12}

In [11]:
# ensure that 'unsure' never in GOLD annotations
for doc in acorp.docs:
    if doc.n_labels > 0 and cat2code['B-unsure'] in doc.labels['GOLD']:
        print(doc.id)

In [12]:
# count label types in GOlD
# ensure that 'unsure' never in GOLD annotations
gold_types = Counter()
for doc in acorp.docs:
    if 'GOLD' in doc.labels:
        gold_types.update(doc.labels['GOLD'].tolist())
gold_types

Counter({0: 9974,
         1: 894,
         4: 468,
         3: 346,
         7: 326,
         5: 311,
         10: 242,
         9: 185,
         11: 150,
         2: 147,
         8: 110})

In [13]:
# drop 'unsure' labels
def recode_labels(codes):
    out = codes.copy()
    # set "unsure" to outside
    out[codes ==  6] = 0
    out[codes == 12] = 0
    # reset all others
    out[out>6] -= 1
    return(out)

for i in range(acorp.ndocs):
    for j in range(acorp.docs[i].n_annotations):
        acorp.docs[i].annotations[ acorp.docs[i].annotators[j] ] = recode_labels(acorp.docs[i].annotations[ acorp.docs[i].annotators[j] ])
        for k in acorp.docs[i].labels.keys():
            acorp.docs[i].labels[k] = recode_labels(acorp.docs[i].labels[k])


idx = acorp.label_map.pop('B-unsure')
acorp.label_map_inv.pop(idx)
idx = acorp.label_map.pop('I-unsure')
acorp.label_map_inv.pop(idx)

# [print(c, 't', l) for c, l in acorp.label_map_inv.items()]
for c in range(7,12): acorp.label_map[acorp.label_map_inv[c]] -= 1
# [print(c, 't', l) for l, c in acorp.label_map.items()]

acorp.label_map_inv = {c: l for l, c in acorp.label_map.items()}
# [print(c, 't', l) for c, l in acorp.label_map_inv.items()]

len(acorp.label_map) == len(acorp.label_map_inv)

acorp.inside_labels = list(range(1, 6))
acorp.beginning_labels = list(range(6, 11))

acorp.ndocs = len(acorp.docs)
acorp.doc_ids = [doc.id for doc in acorp.docs]
acorp.doc_id2idx = {doc.id: i for i, doc in enumerate(acorp.docs)}
acorp.doc_idx2id = {i: doc.id for i, doc in enumerate(acorp.docs)}

acorp.annotator_label_counts = acorp._count_annotator_labels()

In [14]:
annotations = acorp.annotator_label_counts
gold = Counter()
for labs in [doc.labels['GOLD'].tolist() for doc in acorp.docs if 'GOLD' in doc.labels]:
    gold.update(labs)
pd.DataFrame(annotations).join(pd.DataFrame({'gold': dict(gold)})).loc[acorp.label_map.values()]

Unnamed: 0,emarie,sjasmin,gold
0,104345,107456,10300.0
1,4073,3188,894.0
6,2126,1977,110.0
2,777,643,147.0
7,942,896,185.0
3,1886,1724,346.0
8,1281,1439,242.0
4,2122,1766,468.0
9,1333,1370,150.0
5,913,833,311.0


In [15]:
# define custom token cleaning function
def clean_tokens(x):

    # insert special characters
    if regex.match(r"^\p{Sc}", x):
        return("<MONEY>")

    if regex.match(r"^\d+([.,/-]\d+)*\w*$", x):
        return("<DIGITS>")

    return(x)

# clean docs' tokens (in place) and collect results in Counter object
cleaned_vocab = Counter()
for i in range(acorp.ndocs):
    cleaned_vocab.update(acorp.docs[i].clean_tokens(fun=clean_tokens))

# reduces vocab size somewhat!
print(len(cleaned_vocab)/len(acorp.vocab))

0.9692015895953757


## Annotation aggregation

### Prepare the Baysian sequene combination (BSC) sequence annotation model

In [16]:
amodel = BSCModel(acorp, max_iter = 30, gold_labels='GOLD', verbose=True)

Parallel can run 10 jobs simultaneously, with 10 cores


In [17]:
print('# label classes:', amodel.num_classes)
print('# annotators:   ', amodel.num_annotators)
print('# docs:', amodel.num_docs)
print('# tokens:', amodel.num_tokens)
print('# docs with gold labels:', (amodel.gold[amodel.doc_start == 1] > -1).sum())
print('# tokens with gold labels:', (amodel.gold > 0).sum())

# label classes: 11
# annotators:    2
# docs: 8576
# tokens: 179641
# docs with gold labels: 610
# tokens with gold labels: 2853


In [18]:
# inspect the alpha prior
alpha0 = amodel.model.A.alpha0.copy()
# note: default prior belief is that annotators assign correct label 2 out of 3 times
alpha0[0,0]/alpha0[0,1:].sum()

2.0

In [19]:
# inspect transitions prior
new_beta0 = amodel.model.LM.beta0.copy()
cats = [k for k, v in sorted(acorp.label_map.items(), key=lambda item: item[1])]
tmp = pd.DataFrame(new_beta0.round(2), index = cats, columns = ['=>'+c[:5]  for c in cats])
# note: read this by rows and columns indicate label categories 0...12 and the cell (i, j) indicates the prior from
#       probability that the label in column j follows the label in row i
# for examples:
# after the "outside" label, only itself and B-* labels are allowed
tmp.iloc[[0]]
# after the "I-social group" label, only itself, O, or the other B-* labels are allowed
tmp.iloc[[1]]
new_beta0[1, 6] = 1e-12
# after the "B-social group" label, O, "I-social group", or any of the other B-* labels are allowed
tmp.iloc[[6]]
new_beta0[6, 6] = 1e-12

# apply logic to each type
for c in range(2, 6):
    new_beta0[c, c+5] = 1e-12
    new_beta0[c+5, c+5] = 1e-12

# verify
print(pd.DataFrame(new_beta0.round(2), index = cats, columns = ['=>'+c[:5]  for c in cats]))

# reset
amodel.reset_label_transitions_prior(new_beta0)

                                                    =>O  =>I-soc  =>I-pol  =>I-pol  =>I-org  =>I-imp  =>B-soc  =>B-pol  =>B-pol  =>B-org  =>B-imp
O                                                   6.0      0.0      0.0      0.0      0.0      0.0      1.0      1.0      1.0      1.0      1.0
I-social group                                      1.0      1.0      0.0      0.0      0.0      0.0      0.0      1.0      1.0      1.0      1.0
I-political group                                   1.0      0.0      1.0      0.0      0.0      0.0      1.0      0.0      1.0      1.0      1.0
I-political institution                             1.0      0.0      0.0      1.0      0.0      0.0      1.0      1.0      0.0      1.0      1.0
I-organization, public institution, or collecti...  1.0      0.0      0.0      0.0      1.0      0.0      1.0      1.0      1.0      0.0      1.0
I-implicit social group reference                   1.0      0.0      0.0      0.0      0.0      1.0      1.0      1.0      

### Fit the BSC model

In [20]:
# fit model
amodel.fit_predict()

BSC: run() called with annotation matrix with shape = (179641, 2)
BC iteration 0 in progress
BC iteration 0: computed label probabilities
BC iteration 0: updated label model
BC iteration 0: updated worker models
BC iteration 1 in progress




BAC iteration 1: completed forward pass
BAC iteration 1: completed backward pass
BC iteration 1: computed label probabilities
BC iteration 1: updated label model
BC iteration 1: updated worker models
Computing LB=-2565719.4905: label model and labels=-392970.9609, annotator model=-1004.0000, features=-2171744.4983
BSC: max. difference at iteration 2: inf
BC iteration 2 in progress
BAC iteration 2: completed forward pass


KeyboardInterrupt: 

In [20]:
print(amodel.runtime_)
amodel.model.convergence_history

59246.04437354207


[inf,
 100322.22447962966,
 16926.870542862453,
 2475.791492738761,
 587.673656987492,
 241.77287979098037,
 202.29510506708175,
 151.08837817702442,
 134.41390146687627,
 147.75591368647292,
 141.03665407607332,
 164.30731191439554,
 143.51948086591437,
 135.67917372472584,
 112.49956936296076,
 78.37210077652708,
 49.19128704071045,
 30.435816916637123,
 23.220850703306496,
 17.607850742526352,
 13.628571960143745,
 11.153746377211064,
 9.183980046771467,
 8.113782392349094,
 6.74266731319949,
 6.388222127687186,
 6.535607845056802,
 6.46002184599638]

### add sentence metadata to posterior label estimates

In [24]:
# add metadata

# Use str.extract() to extract values from the sentence_id column
pattern = r'^([a-z0-9]+)[_-]((\d{4})(-?\d{2})?)-(\d+-\d+)$'
metadata[['party', 'tmp1', 'year', 'month', 'tmp2']] = metadata['sentence_id'].str.extract(pattern)
metadata[['paragraph_nr', 'sentence_nr']] = metadata['tmp2'].str.split('-', n=1, expand=True)
metadata.drop(['tmp1', 'tmp2'], axis=1, inplace=True)
metadata.set_index('uid', drop=True, verify_integrity=True, inplace=True)

In [25]:
for doc in amodel.corpus.docs:
    doc.metadata = {k: v for k, v in dict(metadata.loc[doc.id]).items() if not pd.isna(v)}

In [26]:
np.unique(np.array([hasattr(doc, 'metadata') for doc in amodel.corpus.docs]), return_counts=True)

(array([ True]), array([8576]))

## Write to disk

In [27]:
def _jsonify(self, fields=['id', 'text', 'tokens', 'annotations', 'labels', 'metadata']):
    out = {k: self.__dict__[k] for k in fields if hasattr(self, k)}
    if 'annotations' in out.keys():
        out['annotations'] = {k: v.tolist() for k, v in out['annotations'].items()}
    if 'labels' in out.keys():
        out['labels'] = {k: v.tolist() for k, v in out['labels'].items()}
        if len(out['labels']) == 0:
            out['labels'] = None
    if 'metadata' in out.keys():
        if len(out['metadata']) == 0:
            out['metadata'] = None
    return json.dumps(out)

In [None]:
if not os.path.exists(args.output_file) or args.overwrite_output:
    with open(args.output_file, mode='w', encoding='utf-8') as writer:
        for doc in amodel.corpus.docs:
            writer.write(_jsonify(doc)+'\n')