In [1]:
import os
import sys
# Make the project's code directory importable (presumably where the `utils` package lives).
code_dir = os.path.abspath('../../code')
# Only add it once, so re-running this cell does not pile up duplicate entries in sys.path.
if code_dir not in sys.path:
    sys.path.append(code_dir)

In [2]:
import os
import regex
import json
from collections import Counter

import numpy as np
# Print floats inside numpy arrays with exactly three decimal places.
np.set_printoptions(formatter={'float': "{0:0.3f}".format})

from utils.io import read_label_config
from utils.corpus import DoccanoAnnotationsCorpus

In [None]:
from types import SimpleNamespace

# Notebook configuration, collected in a single argparse-like namespace.
args = SimpleNamespace(
    # input: root folder, prefix of the data folders to read, the annotations
    # sub-folder inside each of them, the file extension, and the
    # comma-separated annotators whose files are kept
    data_path='../../data/annotation/annotations/',
    data_folder_pattern='de-manifestos',
    data_annotations_folder='annotations',
    data_file_format='jsonl',
    keep_annotator='emarie,sjasmin',
    # label scheme
    label_config_file='../../data/annotation/doccano_label_config.json',
    # output: target file, and whether an existing file may be replaced
    output_file='../../data/annotation/parsed/de-manifestos_annotations.jsonl',
    overwrite_output=False,
    # print progress information
    verbose=True,
)

In [6]:
# one annotations folder per data folder whose name starts with the configured pattern
subdirs = []
for entry in os.listdir(args.data_path):
    if entry.startswith(args.data_folder_pattern):
        subdirs.append(os.path.join(args.data_path, entry, args.data_annotations_folder))

# annotators to keep (comma-separated in the config; surrounding whitespace is ignored)
annotators = [name.strip() for name in args.keep_annotator.split(',')]

# one file path per (annotator, folder) combination, in sorted order
file_suffix = '.' + args.data_file_format
fps = sorted(os.path.join(subdir, name + file_suffix) for name in annotators for subdir in subdirs)
fps

['../../data/annotation/annotations/de-manifestos-round-01/annotations/emarie.jsonl',
 '../../data/annotation/annotations/de-manifestos-round-01/annotations/sjasmin.jsonl',
 '../../data/annotation/annotations/de-manifestos-round-02/annotations/emarie.jsonl',
 '../../data/annotation/annotations/de-manifestos-round-02/annotations/sjasmin.jsonl']

In [7]:
# read the label config from `args.label_config_file`
# (presumably a mapping from label categories to codes, judging by the name -- confirm in utils.io)
cat2code = read_label_config(args.label_config_file)

In [8]:
# read the first file (the remaining files are merged into this corpus afterwards)
if not fps:
    # fail with a clear message instead of a bare IndexError on fps[0]
    raise FileNotFoundError(
        f'no folder starting with {args.data_folder_pattern!r} found in {args.data_path!r}'
    )
acorp = DoccanoAnnotationsCorpus(cat2code)
# the annotator ID is the file name without its extension; splitext() keeps this
# correct for whatever extension `args.data_file_format` configures
annotator = os.path.splitext(os.path.basename(fps[0]))[0]
acorp.load_from_jsonlines(fp=fps[0], annotator_id=annotator, verbose=args.verbose)



In [9]:
# merge the remaining files into `acorp`, one temporary corpus per file
for fp in fps[1:]:
    if args.verbose:
        print(f"Reading annotations from file '{fp}'")
    file_corp = DoccanoAnnotationsCorpus(cat2code)
    # the annotator ID is the file name without its extension; splitext() keeps this
    # correct for whatever extension `args.data_file_format` configures
    annotator = os.path.splitext(os.path.basename(fp))[0]
    file_corp.load_from_jsonlines(fp=fp, annotator_id=annotator, verbose=args.verbose)
    acorp.merge_annotated_corpus(file_corp)

Reading annotations from file '../../data/annotation/annotations/de-manifestos-round-01/annotations/sjasmin.jsonl'
Reading annotations from file '../../data/annotation/annotations/de-manifestos-round-02/annotations/emarie.jsonl'
Reading annotations from file '../../data/annotation/annotations/de-manifestos-round-02/annotations/sjasmin.jsonl'


### merge gold labels

In [10]:
# path of the file holding the gold labels (inside the `ra-annotation-de-manifestos-review` folder)
gold_path_parts = (args.data_path, 'ra-annotation-de-manifestos-review', 'annotations', 'all.jsonl')
fp = os.path.join(*gold_path_parts)

In [11]:
# build the label-to-code mapping for the gold data
entity_types = ['SG', 'PG', 'PI', 'ORG', 'ISG', 'unsure']
# every entity type gets an 'I-' and a 'B-' variant (all 'I-' labels first, then all 'B-' labels)
cats = [prefix + entity_type for prefix in ('I-', 'B-') for entity_type in entity_types]
# gold labels carry an additional '-a' or '-z' suffix; both suffixes of a label share one code,
# and codes start at 1 in the order of `cats`
cat2code_gold = {}
for suffix in ('-a', '-z'):
    for code, label in enumerate(cats, start=1):
        cat2code_gold[label + suffix] = code
# the outside label uses the same code as in the annotations corpus
cat2code_gold['O'] = acorp.outside_label

In [14]:
# load the gold labels into a corpus of their own, using the gold label mapping;
# `n_types` is set to the number of entries in `cats`, and all gold labels are
# stored under the annotator ID 'GOLD'
gold_corp = DoccanoAnnotationsCorpus(cat2code_gold, n_types=len(cats))
gold_corp.load_from_jsonlines(fp=fp, annotator_id='GOLD', verbose=args.verbose)

In [16]:
# every doc ID in the gold data must also occur in the annotations corpus (expected: True)
set(gold_corp.doc_ids).issubset(acorp.doc_ids)

True

In [17]:
# merge the gold labels into the annotations corpus
# (all gold doc IDs are expected to exist in `acorp` already, see the doc ID check)
acorp.merge_gold_corpus(gold_corp)

In [18]:
# show one example document from the gold data as it now appears in the merged corpus
example_doc_id = gold_corp.doc_ids[99]
example_idx = acorp.doc_id2idx[example_doc_id]
print(acorp.docs[example_idx])

[1mf5c4848a1a16a11751deada5456fc2f7[0m
'Die AfD setzt sich für gentechnikfrei erzeugte Lebensmittel aus der deutschen Landwirtschaft ein .'
 [44m   [49m [44m   [49m                                                             [44m         [49m [44m              [49m      	(emarie)
 [44m   [49m [44m   [49m                                                                                           	(sjasmin)
 [42m   [49m [42m   [49m                                                             [42m         [49m [42m              [49m      	[GOLD]


### ensure data integrity

In [19]:
print('No. docs:', acorp.ndocs)
# distribution of the number of annotations per document (singly vs. multiply annotated)
annotations_per_doc = np.asarray([doc.n_annotations for doc in acorp.docs])
print(np.unique(annotations_per_doc, return_counts=True))

No. docs: 2927
(array([1, 2]), array([2328,  599]))


In [20]:
# count how often each text occurs in the corpus (a count above 1 means a duplicate)
texts = Counter(doc.text for doc in acorp.docs)

In [21]:
# 4 sentences are verbatim duplicates (possible because we sampled based on within-manifesto sentence IDs)
occurrences_per_text = np.asarray(list(texts.values()))
print(np.unique(occurrences_per_text, return_counts=True))

(array([1, 2]), array([2919,    4]))


In [22]:
# collect the texts that occur more than once, most frequent first
duplicated = [text for text, n_occurrences in texts.most_common() if n_occurrences > 1]

In [23]:
# map each duplicated text to the IDs of the documents that contain it (in corpus order)
duplicated_texts = set(duplicated)
duplicates_ids = {}
for doc in acorp.docs:
    if doc.text in duplicated_texts:
        duplicates_ids.setdefault(doc.text, []).append(doc.id)

In [24]:
# print the documents of each duplicate group, with a separator line between groups
if args.verbose:
    separator = '-' * 100
    for group_ids in duplicates_ids.values():
        print('\n', separator, sep='')
        for doc_id in group_ids:
            doc_idx = acorp.doc_id2idx[doc_id]
            print(acorp.docs[doc_idx])


----------------------------------------------------------------------------------------------------
[1m2eff1ee7b7f8c9eb6ec562cb31ba2d53[0m
'Ein Anspruch auf Teilzeit soll daher nur bestehen , wenn ein Kind betreut oder ein naher Angehöriger gepflegt wird .'
                                                              [44m    [49m                  [44m     [49m [44m           [49m                	(emarie)
[1mce42c6df986b4c553898192a47712969[0m
'Ein Anspruch auf Teilzeit soll daher nur bestehen , wenn ein Kind betreut oder ein naher Angehöriger gepflegt wird .'
                                                              [44m    [49m                  [44m     [49m [44m           [49m                	(emarie)

----------------------------------------------------------------------------------------------------
[1ma979d2805a1561dc03903d3c55e74aaf[0m
'Der Selbstbehalt bei der Berechnung von Arbeitslosengeld II ist sanktionsfrei zu erhöhen .'
                            

In [25]:
# note: I've manually checked the cases where there were duplicate annotations.
#  In most cases, the annotations from the same annotator for the same text (though diff. 'docs') are identical.
#  But in the few cases where this does not hold, I manually disambiguate:
#  each key is the ID of the document to keep, each value lists the IDs of the
#  documents in that duplicate group (including the key itself).
disambigute_duplicates = {
    'e7b3f8d69a3b2fb6db83b9ea2817d90d': ['e7b3f8d69a3b2fb6db83b9ea2817d90d', 'de426127ca7c4af939382b58c0b63dc9'],
    '62ae8599698ad61f0e6ba84dce622864': ['62ae8599698ad61f0e6ba84dce622864', '10fa6bc89c26d903481f191605b21f48']
}

In [26]:
# resolve duplicates: collapse all documents that share the same text into one document
def _get_doc(doc_id):
    """Return the document with ID `doc_id` from the current `acorp.docs`.

    The documents are scanned instead of going through `acorp.doc_id2idx`, so the
    lookup stays valid after documents were removed earlier in the loop below.
    NOTE(review): whether `remove_documents`/`merge_annotations` refresh
    `doc_id2idx` is not visible here -- confirm in utils.corpus.

    Raises:
        KeyError: if no document with this ID is (still) in the corpus.
    """
    for doc in acorp.docs:
        if doc.id == doc_id:
            return doc
    raise KeyError(doc_id)


for ids in duplicates_ids.values():
    # manually disambiguated groups: keep the document named by the dict key and drop
    # the other documents of the group.
    # (The previous test `id in disambigute_duplicates.values()` compared an ID string
    #  against lists of IDs, so it was never true and this branch never ran.)
    keep = [kept for kept, group in disambigute_duplicates.items() if all(i in group for i in ids)]
    if keep:
        for doc_id in disambigute_duplicates[keep[0]]:
            if doc_id != keep[0]:
                acorp.remove_documents([doc_id])
        # the dropped documents are gone, so there is nothing left to merge for this text
        continue

    # all other groups: the first document is the 'original', the others are merged into it
    for doc_id in ids[1:]:
        duplicate = _get_doc(doc_id)
        # iterate over a copy, because annotations are removed inside the loop
        for annotator in list(duplicate.annotators):
            # drop annotations by annotators that already annotated the 'original'
            if annotator in _get_doc(ids[0]).annotators:
                duplicate.remove_annotation(annotator)
        if duplicate.n_annotations > 0:
            # annotations by other annotators remain: move them to the 'original'
            acorp.merge_annotations([ids[0], doc_id])
        else:
            # nothing left to contribute: drop the duplicate document
            acorp.remove_documents([doc_id])

In [27]:
# verify: after resolving duplicates, every text should occur exactly once
texts = Counter(doc.text for doc in acorp.docs)
occurrences_per_text = np.asarray(list(texts.values()))
print(np.unique(occurrences_per_text, return_counts=True))

(array([1]), array([2923]))


In [28]:
# rebuild the ID <-> position lookups and the label counts from the current documents
doc_ids_in_order = [doc.id for doc in acorp.docs]
acorp.doc_id2idx = {doc_id: idx for idx, doc_id in enumerate(doc_ids_in_order)}
acorp.doc_idx2id = dict(enumerate(doc_ids_in_order))
acorp.annotator_label_counts = acorp._count_annotator_labels()

## clean tokens

In [34]:
# collect the distinct tokens and count every character occurring in any token
toks, all_chars = set(), Counter()
for doc in acorp.docs:
    for token in doc.tokens:
        toks.add(token)
        # Counter.update() counts the individual characters of the token
        all_chars.update(token)

In [None]:
from utils.unicode import CATEGORIES as char_cats

# for each Unicode category, list the corpus characters that belong to it
for category, description in char_cats.items():
    category_pattern = regex.compile(r'\p{' + category + '}')
    members = [char for char in all_chars if category_pattern.match(char)]
    if members:
        # tab-separated: category code, its description, matching characters
        print(category, description, members, sep='\t')
# NOTE: no cleaning needed

Pd	Dash Punctuation	['–', '-']
Pe	Close Punctuation	[')']
Pf	Final Punctuation	['»', '’']
Pi	Initial Punctuation	['“', '«']
Po	Other Punctuation	['.', ',', ':', '!', '"', '*', '/', '%', ';', '?', '…']
Ps	Open Punctuation	['„', '(']
Sc	Currency Symbol	['€']
Sm	Math Symbol	['+']


# Write to disk

In [None]:
# make sure the output folder exists
output_dir = os.path.dirname(args.output_file)
os.makedirs(output_dir, exist_ok=True)

# write the corpus, unless the file already exists and overwriting is disabled
may_write = args.overwrite_output or not os.path.exists(args.output_file)
if may_write:
    acorp.save_as_jsonlines(args.output_file, encoding='utf-8')