In [1]:
import os
import sys
sys.path.append(os.path.abspath('../../code'))

In [4]:
import os
import regex
import json
from collections import Counter

import numpy as np
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
import pandas as pd
pd.set_option('display.max_columns', 15)
pd.set_option('display.width', 320)

from utils.io import read_label_config
from utils.corpus import DoccanoAnnotationsCorpus

In [None]:
from types import SimpleNamespace

# All notebook settings live in one namespace and are read as `args.<name>` by the cells below.
args = SimpleNamespace(
    # --- input: annotation exports ---
    # root folder that is scanned for annotation project folders
    data_path='../../data/annotation/annotations/',
    # only folders whose name starts with this prefix are used
    data_folder_pattern='uk-commons',
    # sub-folder (inside each project folder) holding one file per annotator
    data_annotations_folder='annotations',
    # file extension of the per-annotator files (without the dot)
    data_file_format='jsonl',
    # comma-separated list of annotators whose files are read
    keep_annotator='emarie,sjasmin',

    # --- label configuration ---
    label_config_file='../../data/annotation/doccano_label_config.json',
    label_review_config_file='../../data/annotation/doccano_labels_review_config.json',

    # --- output ---
    output_file='../../data/annotation/parsed/uk-commons_annotations.jsonl',
    # an existing output file is only replaced when this is True
    overwrite_output=False,

    # print progress information and intermediate results
    verbose=True,
)

In [12]:
# annotations folder of every entry in the data directory whose name starts with the configured prefix
subdirs = [
    os.path.join(args.data_path, entry, args.data_annotations_folder)
    for entry in os.listdir(args.data_path)
    if entry.startswith(args.data_folder_pattern)
]

# comma-separated annotator names -> list without surrounding whitespace
annotators = [name.strip() for name in args.keep_annotator.split(',')]

# one file path per (annotator, folder) combination, in sorted order
fps = sorted(
    os.path.join(folder, '.'.join((name, args.data_file_format)))
    for name in annotators
    for folder in subdirs
)

In [14]:
# read the label config
# `cat2code` is handed to every DoccanoAnnotationsCorpus created for the annotators' files below
# (presumably a mapping from label category to integer code, judging by the name -- TODO confirm)
cat2code = read_label_config(args.label_config_file)

In [15]:
# read first (we merge the rest to this one)
if not fps:
    # fail with a readable message instead of a bare IndexError on fps[0]
    raise FileNotFoundError(
        f"No annotation files found below '{args.data_path}' "
        f"for folder pattern '{args.data_folder_pattern}'"
    )

acorp = DoccanoAnnotationsCorpus(cat2code)

# The annotator ID is the file name without its extension. The extension is taken
# from the configuration (the same value the file paths were built with) instead
# of a hard-coded '.jsonl', so changing `args.data_file_format` keeps working.
file_suffix = '.' + args.data_file_format
file_name = os.path.basename(fps[0])
annotator = file_name[:-len(file_suffix)] if file_name.endswith(file_suffix) else file_name

acorp.load_from_jsonlines(fp=fps[0], annotator_id=annotator, verbose=args.verbose)

In [16]:
# merge remaining ones
# Annotator IDs are the file names without extension; use the configured
# extension (the one the paths in `fps` were built with) rather than a
# hard-coded '.jsonl' so the two cannot drift apart.
file_suffix = '.' + args.data_file_format

for fp in fps[1:]:
    if args.verbose: print(f'Reading annotations from file \'{fp}\'')
    # load this annotator's file into its own corpus ...
    tmp = DoccanoAnnotationsCorpus(cat2code)
    file_name = os.path.basename(fp)
    annotator = file_name[:-len(file_suffix)] if file_name.endswith(file_suffix) else file_name
    tmp.load_from_jsonlines(fp=fp, annotator_id=annotator, verbose=args.verbose)
    # ... and fold it into the corpus that was read first
    acorp.merge_annotated_corpus(tmp)

Reading annotations from file '../../data/annotation/annotations/uk-commons/annotations/sjasmin.jsonl'


### merge gold labels

In [17]:
# path to the export of the review project, whose labels are loaded as 'GOLD' annotations below
# NOTE: `fp` is reused here; it no longer refers to an annotator file from the merge loop above
fp = os.path.join(args.data_path, 'ra-annotation-uk-commons-review', 'annotations', 'all.jsonl')

In [18]:
# build the label -> code mapping for the gold data by hand (it is not read from a config file)
base_types = ['SG', 'PG', 'PI', 'ORG', 'ISG', 'unsure']

# prefix every base type with 'I-' and 'B-': all 'I-' labels first, then all 'B-' labels
cats = []
for prefix in ['I-', 'B-']:
    cats.extend(prefix + base_type for base_type in base_types)

# the review labels carry an extra '-a' or '-z' suffix; both variants of a label share one code (1..12)
# NOTE(review): the meaning of the '-a'/'-z' suffixes is not visible here -- confirm against the review label config
cat2code_gold = {}
for suffix in ['-a', '-z']:
    for code, label in enumerate(cats, start=1):
        cat2code_gold[label + suffix] = code

# the outside label uses the same code as in the annotations corpus
cat2code_gold['O'] = acorp.outside_label

In [19]:
# load the reviewed annotations into a corpus of their own; all labels are stored under the annotator ID 'GOLD'
# NOTE(review): n_types=6 presumably corresponds to the six base categories behind `cat2code_gold` -- confirm
gold_corp = DoccanoAnnotationsCorpus(cat2code_gold, n_types=6)
gold_corp.load_from_jsonlines(fp=fp, annotator_id='GOLD', verbose=args.verbose)

In [21]:
# sanity check: no document ID in the gold data may be missing from the annotations corpus
not any(doc_id not in acorp.doc_ids for doc_id in gold_corp.doc_ids)

True

In [22]:
# merge the gold labels to the annotations corpus
# (documents with gold labels are then printed with an extra '[GOLD]' row below the annotators' rows)
acorp.merge_gold_corpus(gold_corp)

In [23]:
# show one document that has gold labels (the second one in the gold corpus) as it looks after merging
example_id = gold_corp.doc_ids[1]
example_idx = acorp.doc_id2idx[example_id]
print(acorp.docs[example_idx])

[1m00b92b011f1a6c13fcaa47cbf1a3915e[0m
'I know he will share the House ' s concern about the number of young people coming forward who have been victims of horrendous abuse .'
                      [44m   [49m [44m     [49m                                 [44m     [49m [44m      [49m [44m      [49m [44m       [49m [44m   [49m [44m    [49m [44m    [49m [44m       [49m [44m  [49m [44m          [49m [44m     [49m  	(emarie)
                                                                [44m     [49m [44m      [49m                              [44m       [49m [44m  [49m [44m          [49m [44m     [49m  	(sjasmin)
                      [42m   [49m [42m     [49m                                 [42m     [49m [42m      [49m [42m      [49m [42m       [49m [42m   [49m [42m    [49m [42m    [49m [42m       [49m [42m  [49m [42m          [49m [42m     [49m  	[GOLD]


In [24]:
print('No. docs:', acorp.ndocs)
# how many documents were annotated once, twice, ...? -> (distinct counts, number of documents per count)
n_annotations_per_doc = np.asarray([doc.n_annotations for doc in acorp.docs])
print(np.unique(n_annotations_per_doc, return_counts=True))

No. docs: 1574
(array([1, 2]), array([1050,  524]))


In [25]:
# identify duplicate texts (if any): count how many documents share each text
texts = Counter(doc.text for doc in acorp.docs)

In [30]:
# (distinct frequencies, number of texts with that frequency)
text_frequencies = np.asarray(list(texts.values()))
print(np.unique(text_frequencies, return_counts=True))
# 1 sentence is a verbatim duplicate (possible because we sampled based on within-manifesto sentence IDs)

(array([1, 2]), array([1572,    1]))


In [31]:
# collect the texts that occur in more than one document (most frequent first)
duplicated = []
for text, count in texts.most_common():
    if count > 1:
        duplicated.append(text)

In [32]:
# map each duplicated text to the IDs of all documents that have it (in corpus order)
duplicates_ids = {}
for doc in acorp.docs:
    if doc.text not in duplicated:
        continue
    duplicates_ids.setdefault(doc.text, []).append(doc.id)

In [33]:
# print every group of documents that share the same text, separated by a ruler
if args.verbose:
    ruler = '-'*100
    for group in duplicates_ids.values():
        print('\n', ruler, sep='')
        for doc_id in group:
            print(acorp.docs[acorp.doc_id2idx[doc_id]])


----------------------------------------------------------------------------------------------------
[1ma530fc5e897883be9be95e6ff301842e[0m
'What recent assessment she has made of the effect of the UK leaving the EU on the progress of talks on restoring devolution in Northern Ireland .'
                                                                     [44m   [49m [44m  [49m                                                                       	(sjasmin)
[1m07bfd6c84afd3bb57f6131029ab6db73[0m
'What recent assessment she has made of the effect of the UK leaving the EU on the progress of talks on restoring devolution in Northern Ireland .'
                                                                     [44m   [49m [44m  [49m                                                                       	(sjasmin)


In [34]:
# note: duplicates are disambiguated manually (no problem because the annotations are identical)
# maps the ID of the document to keep -> IDs of ALL documents that share its text
# (the list previously repeated the kept ID instead of naming the actual duplicate)
disambigute_duplicates = {
    'a530fc5e897883be9be95e6ff301842e': ['a530fc5e897883be9be95e6ff301842e', '07bfd6c84afd3bb57f6131029ab6db73']
}

In [35]:
# resolve duplicates: keep exactly one document per duplicated text
#
# NOTE(review): documents are looked up through `acorp.doc_id2idx` after earlier
# removals; if `remove_documents` does not refresh that index, lookups for a
# second duplicate group could hit the wrong document -- confirm.

# All IDs covered by a manual rule. The dict values are lists of IDs, so they are
# flattened first; testing `doc_id in disambigute_duplicates.values()` compares a
# string against lists and is therefore never true.
manually_resolved = {doc_id for group in disambigute_duplicates.values() for doc_id in group}

for ids in duplicates_ids.values():

    # 1) manual disambiguation: a rule names the document to keep for this group
    if all(doc_id in manually_resolved for doc_id in ids):
        keepers = [keep for keep, group in disambigute_duplicates.items() if all(doc_id in group for doc_id in ids)]
        if keepers:
            for doc_id in disambigute_duplicates[keepers[0]]:
                if doc_id not in keepers:
                    acorp.remove_documents([doc_id])
            # the other documents of this group are gone; nothing left for the automatic pass
            continue

    # 2) automatic resolution: the first document is the 'original', all others are folded into it
    original_id = ids[0]
    for doc_id in ids[1:]:
        # iterate over a copy: annotations are removed inside the loop, and mutating
        # the sequence that is being iterated would skip annotators
        for annotator in list(acorp.docs[acorp.doc_id2idx[doc_id]].annotators):
            # drop annotations by annotators who already annotated the 'original'
            if annotator in acorp.docs[acorp.doc_id2idx[original_id]].annotators:
                acorp.docs[acorp.doc_id2idx[doc_id]].remove_annotation(annotator)
        if acorp.docs[acorp.doc_id2idx[doc_id]].n_annotations > 0:
            # annotations by other annotators remain: move them over to the 'original'
            acorp.merge_annotations([original_id, doc_id])
        else:
            # nothing left on the duplicate: drop it
            acorp.remove_documents([doc_id])

In [36]:
# verify: after resolving duplicates every text should occur exactly once
texts = Counter(doc.text for doc in acorp.docs)
remaining_frequencies = np.asarray(list(texts.values()))
print(np.unique(remaining_frequencies, return_counts=True))

(array([1]), array([1573]))


In [37]:
# rebuild the corpus attributes that depend on which documents are in the corpus
# (documents were removed above): ID <-> position lookups and the label counts
id_to_position = {}
position_to_id = {}
for position, document in enumerate(acorp.docs):
    id_to_position[document.id] = position
    position_to_id[position] = document.id

acorp.doc_id2idx = id_to_position
acorp.doc_idx2id = position_to_id
acorp.annotator_label_counts = acorp._count_annotator_labels()

## clean tokens

In [38]:
# collect the distinct tokens and count every character that occurs in any token
toks = set()
all_chars = Counter()
for doc in acorp.docs:
    for tok in doc.tokens:
        toks.add(tok)
        for char in tok:
            all_chars[char] += 1

In [None]:
from utils.unicode import CATEGORIES

# Report, per Unicode general category, which characters of that category occur in the tokens.
# Lowercase letters (Ll), uppercase letters (Lu) and decimal digits (Nd) are not of interest.
#
# The categories are filtered into a NEW dict instead of `del`-ing keys from the
# imported mapping: the imported module stays cached, so deleting in place changes
# it for the whole session and makes this cell raise a KeyError when it is re-run.
skip_cats = {'Ll', 'Lu', 'Nd'}
char_cats = {k: v for k, v in CATEGORIES.items() if k not in skip_cats}

for k, v in char_cats.items():
    # \p{..} matches any character with the given Unicode property (`regex` module syntax)
    regx = r'\p{'+k+'}'
    m = [c for c in all_chars.keys() if regex.match(regx, c)]
    if len(m) > 0:
        # category code <tab> category name <tab> matching characters
        print(k, v, m, sep='\t')
# NOTE: no need to clean

Pd	Dash Punctuation	['-']
Pe	Close Punctuation	[')']
Pf	Final Punctuation	['”']
Pi	Initial Punctuation	['“']
Po	Other Punctuation	['.', ';', ',', '?', "'", '&', '%', ':', '!', '/']
Ps	Open Punctuation	['(']
Sc	Currency Symbol	['£']


# Write to disk

In [None]:
# make sure the target folder exists
# (dirname is '' for a bare file name, and os.makedirs('') raises, so guard against that)
output_dir = os.path.dirname(args.output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# never replace an existing file unless explicitly requested
if args.overwrite_output or not os.path.exists(args.output_file):
    acorp.save_as_jsonlines(args.output_file, encoding='utf-8')
else:
    # say so instead of skipping silently: otherwise a stale file looks like a fresh result
    print(f"Output file '{args.output_file}' already exists and was NOT overwritten (set args.overwrite_output = True to replace it)")