In [1]:
import os
import sys
# Put the project's code directory (two levels up from the working directory) on the
# import path; presumably this is where the `utils` package imported below lives.
code_dir = os.path.abspath('../../code')
sys.path.append(code_dir)

In [2]:
import os
import re
import regex

import numpy as np
import pandas as pd

from utils.io import read_label_config
from utils.corpus import DoccanoAnnotationsCorpus

from collections import Counter

from tqdm.auto import tqdm

# Display settings: print floats in numpy arrays with three decimals and
# let pandas show up to 15 columns on lines up to 320 characters wide.
np.set_printoptions(formatter={'float': "{0:0.3f}".format})
for option_name, option_value in (('display.max_columns', 15), ('display.width', 320)):
    pd.set_option(option_name, option_value)

In [None]:
from types import SimpleNamespace

# Notebook configuration, collected in one namespace.
args = SimpleNamespace(
    # input: folders in `data_path` whose name starts with `data_folder_pattern` are read;
    # each is expected to hold `<data_annotations_folder>/<annotator>.<data_file_format>` files
    data_path='../../data/annotation/annotations/',
    data_folder_pattern='uk-manifestos',
    data_annotations_folder='annotations',
    data_file_format='jsonl',
    # comma-separated names of the annotators whose files are read
    keep_annotator='emarie,sjasmin',
    # label configuration file
    label_config_file='../../data/annotation/doccano_label_config.json',
    # output: the file is only (re)written if it does not exist yet or `overwrite_output` is True
    output_file='../../data/annotation/parsed/uk-manifestos_annotations.jsonl',
    overwrite_output=False,
    # print progress information and intermediate inspection output
    verbose=True,
)

In [None]:
# Collect the annotation files to read: one file per kept annotator in the annotations
# subfolder of every data folder whose name starts with `args.data_folder_pattern`.
# (The original list comprehension was missing the closing parenthesis of `startswith(...)`.)
subdirs = [
    os.path.join(args.data_path, d, args.data_annotations_folder)
    for d in os.listdir(args.data_path)
    if d.startswith(args.data_folder_pattern)
]
# annotator names are given as one comma-separated string
annotators = [a.strip() for a in args.keep_annotator.split(',')]
fps = [os.path.join(d, a+'.'+args.data_file_format) for a in annotators for d in subdirs]
# `os.listdir` returns entries in arbitrary order; sort so the read/merge order is reproducible
fps.sort()

In [None]:
# Read the label configuration from the file named in `args.label_config_file`.
# `cat2code` is handed to every `DoccanoAnnotationsCorpus` created for the annotators' files
# (presumably a mapping from label category to integer code — TODO confirm in `utils.io`).
cat2code = read_label_config(args.label_config_file)

In [11]:
# Read the first file into the main corpus `acorp` (the remaining files are merged into it).
acorp = DoccanoAnnotationsCorpus(cat2code)
# The annotator ID is the file name without its extension. The extension comes from the
# configuration (the file names were built as `<annotator>.<args.data_file_format>`)
# instead of being hard-coded as '.jsonl'.
file_suffix = '.' + args.data_file_format
annotator = os.path.basename(fps[0])
if annotator.endswith(file_suffix):
    annotator = annotator[:-len(file_suffix)]
acorp.load_from_jsonlines(fp=fps[0], annotator_id=annotator, verbose=args.verbose)

In [12]:
# Merge the remaining files: each one is loaded into a temporary corpus
# that is then merged into the main corpus `acorp`.
file_suffix = '.' + args.data_file_format
for fp in fps[1:]:
    if args.verbose:
        print(f"Reading annotations from file '{fp}'")
    tmp = DoccanoAnnotationsCorpus(cat2code)
    # annotator ID = file name without the configured extension (not a hard-coded '.jsonl')
    annotator = os.path.basename(fp)
    if annotator.endswith(file_suffix):
        annotator = annotator[:-len(file_suffix)]
    tmp.load_from_jsonlines(fp=fp, annotator_id=annotator, verbose=args.verbose)
    acorp.merge_annotated_corpus(tmp)

Reading annotations from file '../../data/annotation/annotations/uk-manifestos-other-parties/annotations/sjasmin.jsonl'
Reading annotations from file '../../data/annotation/annotations/uk-manifestos-round-01/annotations/emarie.jsonl'
Reading annotations from file '../../data/annotation/annotations/uk-manifestos-round-01/annotations/sjasmin.jsonl'
Reading annotations from file '../../data/annotation/annotations/uk-manifestos-round-02/annotations/emarie.jsonl'
Reading annotations from file '../../data/annotation/annotations/uk-manifestos-round-02/annotations/sjasmin.jsonl'
Reading annotations from file '../../data/annotation/annotations/uk-manifestos-round-03/annotations/emarie.jsonl'
Reading annotations from file '../../data/annotation/annotations/uk-manifestos-round-03/annotations/sjasmin.jsonl'
Reading annotations from file '../../data/annotation/annotations/uk-manifestos-round-2017+19/annotations/emarie.jsonl'
Reading annotations from file '../../data/annotation/annotations/uk-manife

In [13]:
# Inspect the document for which a warning was raised while reading the annotations:
# look up its position in `acorp.docs` through the ID-to-index map and print it.
print(acorp.docs[acorp.doc_id2idx['2f6771e692f900c77477e074d4d6deb9']])

[1m2f6771e692f900c77477e074d4d6deb9[0m
'Only Labour will offer the choice of remaining in the EU , or leaving with a sensible deal .'
      [44m      [49m                                       [44m   [49m [44m  [49m                                    	(sjasmin)


## Merge gold labels

In [14]:
# Path of the file with the gold labels (read with annotator ID 'GOLD' below).
# Note: folder and file names are hard-coded here rather than taken from `args`.
fp = os.path.join(args.data_path, 'ra-annotation-uk-manifestos-review', 'annotations', 'all.jsonl')

In [15]:
# Build the label-to-code mapping for the gold annotations.
# Every category gets an 'I-' and a 'B-' variant (all 'I-' labels first, then all 'B-' labels) ...
entity_types = ['SG', 'PG', 'PI', 'ORG', 'ISG', 'unsure']
cats = [f'{prefix}{entity_type}' for prefix in ('I-', 'B-') for entity_type in entity_types]
# ... and each of these labels occurs with the suffixes '-a' and '-z', which share one code
# (codes start at 1, in the order of `cats`).
cat2code_gold = {}
for suffix in ('-a', '-z'):
    cat2code_gold.update({f'{cat}{suffix}': code for code, cat in enumerate(cats, start=1)})
# the outside label gets the same code as in the annotations corpus
cat2code_gold['O'] = acorp.outside_label

In [None]:
# Read the gold labels into their own corpus, using the gold label mapping and 'GOLD' as annotator ID.
# NOTE(review): `cats` holds the 'I-'/'B-' prefixed labels at this point, so `n_types` is
#  twice the number of base categories — confirm this is what `n_types` expects.
gold_corp = DoccanoAnnotationsCorpus(cat2code_gold, n_types=len(cats))
gold_corp.load_from_jsonlines(fp=fp, annotator_id='GOLD', verbose=args.verbose)

In [18]:
# Sanity check: every document ID of the gold corpus must also exist in the annotations corpus
# (the result is shown as the cell output).
all(doc_id in acorp.doc_ids for doc_id in gold_corp.doc_ids)

True

In [19]:
# Merge the gold labels into the annotations corpus
# (in the printed example below they show up as an additional '[GOLD]' row).
acorp.merge_gold_corpus(gold_corp)

In [20]:
# Print one example document that has gold labels (the gold corpus' document at index 99,
# an arbitrary pick) to eyeball the merged result.
print(acorp.docs[acorp.doc_id2idx[gold_corp.doc_ids[99]]])

[1m30426f95b0aeaf60be5831dce2e28490[0m
'It also means that parents , mainly mothers , who take a full year are also losing out on pension provision .'
                    [44m       [49m [44m [49m [44m      [49m [44m       [49m [44m [49m [44m   [49m [44m    [49m [44m [49m [44m    [49m [44m    [49m                                           	(emarie)
                    [44m       [49m          [44m       [49m                                                                  	(sjasmin)
                    [42m       [49m [42m [49m [42m      [49m [42m       [49m [42m [49m [42m   [49m [42m    [49m [42m [49m [42m    [49m [42m    [49m                                           	[GOLD]


In [21]:
# Corpus size, followed by the distribution of the number of annotations per document
# (unique values and how many documents have each value).
print('No. docs:', acorp.ndocs)
annotations_per_doc = np.asarray([doc.n_annotations for doc in acorp.docs])
print(np.unique(annotations_per_doc, return_counts=True))

No. docs: 8596
(array([1, 2]), array([5609, 2987]))


In [22]:
# Count how often each text occurs in the corpus (to identify duplicated texts, if any).
texts = Counter(doc.text for doc in acorp.docs)

In [23]:
# Distribution of text frequencies: how many texts occur once, twice, ...
# Finding: 20 sentences are verbatim duplicates (possible because we sampled based on
# within-manifesto sentence IDs).
text_frequencies = np.asarray(list(texts.values()))
print(np.unique(text_frequencies, return_counts=True))

(array([1, 2]), array([8556,   20]))


In [24]:
# Collect the texts that occur in more than one document (most frequent first).
duplicated = [text for text, n_docs in texts.most_common() if n_docs > 1]

In [25]:
# For every duplicated text, collect the IDs of the documents that contain it
# (in corpus order).
duplicates_ids = dict()
for doc in acorp.docs:
    if doc.text in duplicated:
        duplicates_ids.setdefault(doc.text, []).append(doc.id)

In [26]:
# In verbose mode, print each group of documents that share the same text,
# with a separator line before every group.
if args.verbose:
    separator = '-'*100
    for dup_ids in duplicates_ids.values():
        print('\n', separator, sep='')
        for dup_id in dup_ids:
            print(acorp.docs[acorp.doc_id2idx[dup_id]])


----------------------------------------------------------------------------------------------------
[1m0ccd795eb82b198ee3ef7ac9712ebc13[0m
'Move towards introducing ‘ safe standing ’ at football clubs , requiring the Sports Grounds Safety Authority to prepare guidance for implementing this change .'
                                               [44m        [49m [44m     [49m                 [44m      [49m [44m       [49m [44m      [49m [44m         [49m                                                   	(emarie)
                                                                          [44m   [49m [44m      [49m [44m       [49m [44m      [49m [44m         [49m                                                   	(sjasmin)
                                               [42m        [49m [42m     [49m             [42m   [49m [42m      [49m [42m       [49m [42m      [49m [42m         [49m                                                   	[GOLD]
[1m3b

In [27]:
# Note: I've manually checked the documents with duplicated texts and their annotations.
#  In most cases, the annotations from the same annotator for the same text (though diff. 'docs') are identical.
#  But in the few cases where this does not hold, I manually disambiguate:
#  each key is a doc ID that maps to the pair of doc IDs (the key itself and its duplicate) sharing one text.
#  Presumably the key is the document to keep — the resolution code removes the pair members that are not keys.
disambigute_duplicates = {
    'b8fcbc767a1d9c51ff8b79a45c46bcb8': ['b8fcbc767a1d9c51ff8b79a45c46bcb8', '760b6392c10197cada53339542796332'],
    '54c25996cbee486c56a6b0fdfd2f7c79': ['54c25996cbee486c56a6b0fdfd2f7c79', '2a89d4f048351fe43b0102ccdab00859'],
    'ec49785ff7550ab699a28fa560cc7178': ['ec49785ff7550ab699a28fa560cc7178', '3e3eef0abdbfd4ff0301995486b06ae5'],
    '3d30f73ceaecd66835bf90e0970948cd': ['3d30f73ceaecd66835bf90e0970948cd', 'c0055bf852b96fcfcaaa6b51556ade5f']
}

In [28]:
# Resolve duplicates: for every group of documents that share the same text,
# fold the later documents into the first one of the group (the 'original').
for ids in duplicates_ids.values():
    # Manually disambiguated cases.
    # NOTE(review): `disambigute_duplicates.values()` yields the ID *lists*, so `id in ....values()`
    #  compares a string against lists and looks like it can never be true; as written, this
    #  branch appears to be dead code and the manual disambiguation is never applied.
    #  Confirm the intended check (probably membership in one of the ID pairs) and what should
    #  happen to the generic loop below once a document of the group has been removed here.
    if all([id in disambigute_duplicates.values() for id in ids]):
        # key(s) of the manual pair(s) that contain all IDs of this group
        this = [this for this, pair in disambigute_duplicates.items() if all([_ in pair for _ in ids])]
        # remove the documents of that pair that are not the key
        for _ in disambigute_duplicates[this[0]]:
            if _ not in this:
                acorp.remove_documents([_])
    # generic case: go through all but the first doc (the 'original')
    for id in ids[1:]:
        # for each annotator of the duplicate
        # NOTE(review): annotations are removed while iterating over `.annotators` —
        #  confirm that `annotators` is not a live view of the annotations being modified.
        for annotator in acorp.docs[acorp.doc_id2idx[id]].annotators:
            # whether the annotator already in the 'original'
            if annotator in acorp.docs[acorp.doc_id2idx[ids[0]]].annotators:
                # if so remove annotation
                acorp.docs[acorp.doc_id2idx[id]].remove_annotation(annotator)
        # annotations by other annotators are left: merge them into the 'original' ...
        if acorp.docs[acorp.doc_id2idx[id]].n_annotations > 0:
            acorp.merge_annotations([ids[0], id])
        # ... otherwise the duplicate carries no additional information and is dropped
        else:
            acorp.remove_documents([id])

In [29]:
# Verify the result: recount the texts and print the distribution of their frequencies
# (every text should now occur exactly once).
texts = Counter(doc.text for doc in acorp.docs)
text_frequencies = np.asarray(list(texts.values()))
print(np.unique(text_frequencies, return_counts=True))

(array([1]), array([8576]))


In [30]:
# Rebuild the corpus attributes that depend on the (changed) list of documents:
# the ID <-> index lookups and the per-annotator label counts.
id_to_idx, idx_to_id = {}, {}
for idx, doc in enumerate(acorp.docs):
    id_to_idx[doc.id] = idx
    idx_to_id[idx] = doc.id
acorp.doc_id2idx = id_to_idx
acorp.doc_idx2id = idx_to_id
acorp.annotator_label_counts = acorp._count_annotator_labels()

## Clean tokens

In [31]:
# Collect the set of distinct tokens and count every character occurring in the tokens.
toks = set()
all_chars = Counter()
for doc in acorp.docs:
    toks.update(doc.tokens)
    all_chars.update(char for tok in doc.tokens for char in tok)

In [32]:
from utils.unicode import CATEGORIES

# Unicode categories to report: all except lowercase letters (Ll), uppercase letters (Lu)
# and decimal digits (Nd). A filtered copy is built instead of deleting keys from the
# imported dict: deleting mutates the module-level object, so re-running the cell
# raised a KeyError.
skip_cats = {'Ll', 'Lu', 'Nd'}
char_cats = {k: v for k, v in CATEGORIES.items() if k not in skip_cats}

# For each remaining category, list the characters from the tokens that belong to it
# (category code, category name, characters; tab-separated).
for k, v in char_cats.items():
    regx = r'\p{'+k+'}'
    m = [c for c in all_chars.keys() if regex.match(regx, c)]
    if len(m) > 0:
        print(k, v, m, sep='\t')

Cc	Control	['\x91', '\x92']
Co	Private Use	['\uf02f']
No	Other Number	['½']
Pd	Dash Punctuation	['-', '–']
Pe	Close Punctuation	[')', ']']
Pf	Final Punctuation	['’', '”']
Pi	Initial Punctuation	['‘', '“']
Po	Other Punctuation	[',', '.', ':', '?', '%', ';', '&', '/', "'", '"', '·']
Ps	Open Punctuation	['(']
Sc	Currency Symbol	['£', '€']
Sk	Modifier Symbol	['^']
Sm	Math Symbol	['+', '¬', '<', '>']


In [33]:
# List the documents that contain a bare '<' token (ID and tokens).
docs_with_lt = [doc for doc in acorp.docs if '<' in doc.tokens]
for doc in docs_with_lt:
    print(doc.id, doc.tokens)

c60c93194a1f4b2d3b8a725c1ea05734 ['We', 'will', 'reverse', 'Tory', 'policies', 'on', 'the', 'privatisation', 'of', 'local', 'authority', 'services', '.', '<', '/', 'p>']


In [34]:
# Print the document found above, whose tokens end with the markup remnant '<', '/', 'p>'.
print(acorp.docs[acorp.doc_id2idx['c60c93194a1f4b2d3b8a725c1ea05734']])

[1mc60c93194a1f4b2d3b8a725c1ea05734[0m
'We will reverse Tory policies on the privatisation of local authority services . < / p>'
                 [44m    [49m                                  [44m     [49m [44m         [49m                  	(emarie)
                 [44m    [49m                                  [44m     [49m [44m         [49m [44m        [49m         	(sjasmin)


In [35]:
# Drop the markup remnant ('<', '/', 'p>') at the end of this document:
# cut the last three tokens and the corresponding entries of all annotations and labels.
doc = acorp.docs[acorp.doc_id2idx['c60c93194a1f4b2d3b8a725c1ea05734']]
markup_tokens = ['<', '/', 'p>']
# Only truncate while the remnant is actually there, so that re-running the cell
# does not cut off three more (real) tokens.
if list(doc.tokens[-len(markup_tokens):]) == markup_tokens:
    doc.tokens = doc.tokens[:-len(markup_tokens)]
    for annotator in doc.annotations.keys():
        doc.annotations[annotator] = doc.annotations[annotator][:-len(markup_tokens)]
    for label_key in doc.labels.keys():
        doc.labels[label_key] = doc.labels[label_key][:-len(markup_tokens)]
# NOTE(review): `doc.text` is not touched here and still ends with '< / p>' — confirm
#  whether the text should be truncated as well.

In [36]:
# Characters to replace in texts and tokens. Keys are regular-expression patterns
# (grouped by the Unicode category of the character), values are the replacements.
replace_chars = {
    # Cc
    # NOTE(review): in Windows-1252, 0x91/0x92 are the left/right *single* quotation marks —
    #  confirm that mapping them to a double quote is intended.
    '\x91': '"',
    '\x92': '"',
    # Co
    u'\uf02f': '',
    # No
    '½': '1/2',
    # Po
    '·': '',
    # Sk
    # raw string: '\^' in a normal string literal is an invalid escape sequence
    r'\^': ' ',
    # Sm
    '¬': '-'
}

# Apply every replacement to the text and to each token of every document.
# NOTE(review): replacing inside tokens can yield empty tokens or tokens containing a space
#  (e.g. a token that is just '·' or '^') — confirm that downstream code copes with that.
for p, r in replace_chars.items():
    pattern = re.compile(p, re.UNICODE)
    for i in range(acorp.ndocs):
        acorp.docs[i].text = pattern.sub(r, acorp.docs[i].text)
        acorp.docs[i].tokens = [pattern.sub(r, tok) for tok in acorp.docs[i].tokens]

## Write to disk

In [None]:
# Write the cleaned corpus to `args.output_file` as JSON lines.
output_dir = os.path.dirname(args.output_file)
# `os.makedirs('')` fails, so only create the folder if the path has a directory part
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
# an existing file is only replaced if `args.overwrite_output` is set
if args.overwrite_output or not os.path.exists(args.output_file):
    acorp.save_as_jsonlines(args.output_file, encoding='utf-8')
elif args.verbose:
    # make the skip visible instead of silently keeping a possibly outdated file
    print(f"Output file '{args.output_file}' already exists and was not overwritten (set args.overwrite_output = True to replace it)")