In [4]:
import os
import sys

# Add the code directory (resolved against the current working directory) to the
# module search path -- presumably so the `utils` package imported further down
# can be found.
code_dir = os.path.abspath('../../../code')
sys.path.append(code_dir)

In [None]:
from types import SimpleNamespace

# Notebook parameters, collected in a single namespace.
args = SimpleNamespace(
    # span annotations to read
    input_file='../../../data/exdata/thau2019/thau2019_spans_matched_to_manifesto_texts.jsonl',
    # where the processed annotations are written
    output_file='../../../data/annotation/exdata/uk-manifestos_thau2019_annotations.jsonl',
    # if False, an existing output file is left untouched
    overwrite_output=False,
    # print extra information while processing
    verbose=True,
)

In [None]:
import os
from collections import Counter

import numpy as np
import pandas as pd

from utils.corpus import DoccanoAnnotationsCorpus

# Display options: print floats in numpy arrays with three decimals and let pandas
# show up to 15 columns on lines up to 320 characters wide.
np.set_printoptions(formatter={'float': "{0:0.3f}".format})
pd.set_option('display.max_columns', 15)
pd.set_option('display.width', 320)

Build the label map: one integer code for `O` and for each `I-<type>` / `B-<type>` tag.

In [7]:
# Label types. Each type gets two tags: 'I-<type>' and 'B-<type>'.
types = [
    'Age/generation',
    'Economic class',
    'Ethnicity/race',
    'Gender',
    'Geography',
    'Health',
    'Nationality',
    'Religion',
    'Other',
    'none',
]

# Tag -> integer code: 'O' is 0, the 'I-' tags take 1..len(types) and the
# 'B-' tags take len(types)+1..2*len(types), both in the order of `types`.
n_types = len(types)
cat2code = {'O': 0}
for code, label in enumerate(types, start=1):
    cat2code['I-' + label] = code
    cat2code['B-' + label] = code + n_types
cat2code

{'O': 0,
 'I-Age/generation': 1,
 'B-Age/generation': 11,
 'I-Economic class': 2,
 'B-Economic class': 12,
 'I-Ethnicity/race': 3,
 'B-Ethnicity/race': 13,
 'I-Gender': 4,
 'B-Gender': 14,
 'I-Geography': 5,
 'B-Geography': 15,
 'I-Health': 6,
 'B-Health': 16,
 'I-Nationality': 7,
 'B-Nationality': 17,
 'I-Religion': 8,
 'B-Religion': 18,
 'I-Other': 9,
 'B-Other': 19,
 'I-none': 10,
 'B-none': 20}

In [None]:
# Create the corpus with the label map and load the input file into it,
# registering its annotations under the annotator ID 'thau2019'.
acorp = DoccanoAnnotationsCorpus(cat2code)
acorp.load_from_jsonlines(
    fp=args.input_file,
    annotator_id='thau2019',
    verbose=args.verbose,
)



In [None]:
# Inspect a document for which a warning was raised:
# show the tokens with a non-zero label code next to those codes.
flagged_doc = acorp.docs[acorp.doc_id2idx['conservatives-1974-02-191-2']]
labels = flagged_doc.annotations['thau2019']
is_labeled = labels > 0
[flagged_doc.tokens[pos] for pos in np.nonzero(is_labeled)[0]], labels[is_labeled]
# looks fine

(['the', 'oldest'], array([11,  1]))

In [16]:
print('No. docs:', acorp.ndocs)
# Distribution of the number of annotations per document
# (i.e. how many documents are singly vs. multiply annotated).
annotations_per_doc = np.asarray([doc.n_annotations for doc in acorp.docs])
print(np.unique(annotations_per_doc, return_counts=True))

No. docs: 4070
(array([1]), array([4070]))


In [17]:
# Count how often each distinct text occurs in the corpus
# (a count above 1 means several documents share the same text).
texts = Counter(doc.text for doc in acorp.docs)

In [18]:
# Distribution of occurrence counts: (distinct counts, number of texts with that count).
occurrences = np.asarray(list(texts.values()))
print(np.unique(occurrences, return_counts=True))
# 22 sentences are verbatim duplicates (possible because we sampled based on within-manifesto sentence IDs)

(array([1, 2]), array([4026,   22]))


In [19]:
# Collect the texts (not the document IDs) that occur more than once,
# most frequent first.
duplicated = [text for text, n_docs in texts.most_common() if n_docs >= 2]

In [20]:
# Map each duplicated text to the IDs of all documents that carry it,
# in corpus order.
duplicates_ids = dict()
for doc in acorp.docs:
    if doc.text not in duplicated:
        continue
    duplicates_ids.setdefault(doc.text, []).append(doc.id)

In [21]:
# Print every group of documents that share the same text, one group per
# separator line, so the annotations of the copies can be compared by eye.
# The loop variable is `doc_id` rather than `id`: a top-level loop over `id`
# would rebind the builtin `id()` for the rest of the kernel session.
if args.verbose:
    for ids in duplicates_ids.values():
        print('\n', '-'*100, sep='')
        for doc_id in ids:
            print(acorp.docs[acorp.doc_id2idx[doc_id]])


----------------------------------------------------------------------------------------------------
[1mconservatives-1966-141-1[0m
'legislate to allow ground leaseholders to buy or rent their houses on fair terms except where the property is to be redeveloped .'
                    [44m      [49m [44m            [49m                                                                                           	(thau2019)
[1mconservatives-1966-188-1[0m
'legislate to allow ground leaseholders to buy or rent their houses on fair terms except where the property is to be redeveloped .'
                    [44m      [49m [44m            [49m                                                                                           	(thau2019)

----------------------------------------------------------------------------------------------------
[1mconservatives-1970-209-1[0m
'we are publishing separate manifestos for scotland and wales .'
                                            

In [22]:
# Note: I've manually checked the cases where there were duplicate annotations.
#  In most cases, the annotations from the same annotator for the same text
#  (though different 'docs') are identical.
#  In the few cases where this does not hold, I disambiguate manually:
#  the key is the ID of the document to keep, the value lists the IDs of the
#  documents sharing that text.
disambigute_duplicates = {
    'conservatives-1974-02-288-1': [
        'conservatives-1970-209-1',
        'conservatives-1974-02-288-1',
    ],
}

In [23]:
# Resolve duplicates so that every duplicated text is kept exactly once.
#
#  - Groups covered by `disambigute_duplicates`: keep the manually chosen document
#    (the dict key) and remove the other documents listed for it.
#  - All other groups: the first document is the 'original'. From every later copy,
#    drop the annotations of annotators that already annotated the original; then
#    merge what remains into the original, or remove the copy if nothing remains.

def _doc_by_id(corpus, doc_id):
    """Return the document in `corpus.docs` whose `id` equals `doc_id`.

    Scans `corpus.docs` instead of going through `corpus.doc_id2idx`.
    NOTE(review): a later cell rebuilds `doc_id2idx` from `acorp.docs`, which
    suggests `remove_documents()` may leave that index stale -- TODO confirm.
    Scanning by ID returns the right document either way.
    """
    return next(doc for doc in corpus.docs if doc.id == doc_id)


for ids in duplicates_ids.values():
    # Manual override. The previous check tested `id in disambigute_duplicates.values()`,
    # i.e. a doc-ID string against the dict's *list* values; that is never true, so the
    # manual choice was silently ignored and the first document was always kept.
    keep_id = next(
        (keep for keep, group in disambigute_duplicates.items() if set(ids) <= set(group)),
        None,
    )
    if keep_id is not None:
        for doc_id in disambigute_duplicates[keep_id]:
            if doc_id != keep_id:
                acorp.remove_documents([doc_id])
        # This group is settled; the generic logic below would try to look up
        # documents that have just been removed.
        continue

    # Generic case: compare every later copy against the first document.
    for doc_id in ids[1:]:
        original = _doc_by_id(acorp, ids[0])
        duplicate = _doc_by_id(acorp, doc_id)
        # Iterate over a snapshot: remove_annotation() may modify the collection
        # behind `annotators` (assumption -- TODO confirm).
        for annotator in list(duplicate.annotators):
            # the original already has this annotator's annotation: drop the copy's
            if annotator in original.annotators:
                duplicate.remove_annotation(annotator)
        if duplicate.n_annotations > 0:
            # annotations by other annotators remain: move them to the original
            acorp.merge_annotations([ids[0], doc_id])
        else:
            # nothing left on the copy: drop it
            acorp.remove_documents([doc_id])

In [24]:
# Verify: recount the texts and print the distribution of occurrence counts;
# after de-duplication the only count should be 1.
texts = Counter(doc.text for doc in acorp.docs)
occurrences = np.asarray(list(texts.values()))
print(np.unique(occurrences, return_counts=True))

(array([1]), array([4048]))


In [25]:
# Rebuild the corpus attributes that depend on the current document list:
# the ID <-> position maps and the per-annotator label counts.
doc_ids = [doc.id for doc in acorp.docs]
acorp.doc_id2idx = {doc_id: idx for idx, doc_id in enumerate(doc_ids)}
acorp.doc_idx2id = dict(enumerate(doc_ids))
acorp.annotator_label_counts = acorp._count_annotator_labels()

# Write to disk

In [None]:
# Write the corpus as JSON lines, unless the output file already exists and
# overwriting has not been requested.
if args.overwrite_output or not os.path.exists(args.output_file):
    output_dir = os.path.dirname(args.output_file)
    # dirname is '' for a bare file name, and os.makedirs('') raises
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    acorp.save_as_jsonlines(args.output_file, encoding='utf-8')
elif args.verbose:
    # do not skip silently: a file left over from an earlier run would go unnoticed
    print('Output file exists, not overwriting:', args.output_file)