In [2]:
redirects = FileObjectStore('../data/mmaps/redirects')
DOCSTART_MARKER = '-DOCSTART-'
RE_WIKI_ENT = re.compile(r'.*wiki\/(.*)')

In [3]:
data_path = '../data/'
dict_names = ['ent_dict', 'word_dict', 'redirects', 'str_prior', 'str_cond', 'disamb', 'str_necounts']
file_stores = {}
for dict_name in dict_names:
    file_stores[dict_name] = FileObjectStore(join(data_path, f'mmaps/{dict_name}'))

In [4]:
def is_training_doc(doc_id):
    return 'test' not in doc_id


def is_test_doc(doc_id):
    return 'testb' in doc_id


def is_dev_doc(doc_id):
    return 'testa' in doc_id


def doc_tag_for_id(doc_id):
    if 'testa' in doc_id:
        return 'dev'
    elif 'testb' in doc_id:
        return 'test'
    return 'train'

In [5]:
def iter_docs(path, split, redirects={}):
    if split == 'train':
        doc_id_predicate = is_training_doc
    elif split == 'dev':
        doc_id_predicate = is_dev_doc
    elif split == 'test':
        doc_id_predicate = is_test_doc
    else:
        print('wrong split, exiting')
        
    with codecs.open(path, 'r', 'utf-8') as f:
        doc_id = None
        doc_tokens = None
        doc_mentions = None

        for line in f:
            parts = line.split('\t')
            if len(parts) > 0:
                token = parts[0].strip()

                # if this line contains a mention
                if len(parts) >= 4 and parts[1] == 'B':

                    if parts[3].strip() != '' and not parts[3].startswith('--'):
                        try:
                            entity = RE_WIKI_ENT.match(parts[4]).group(1)
                        except AttributeError:
                            print(parts[4])
                        entity = redirects.get(entity, entity)
                        begin = sum(len(t)+1 for t in doc_tokens)

                        dodgy_tokenisation_bs_offset = 1 if re.search('[A-Za-z],',parts[2]) else 0

                        position = (begin, begin + len(parts[2]) + dodgy_tokenisation_bs_offset)
                        doc_mentions.append((entity, position))

                if token.startswith(DOCSTART_MARKER):
                    if doc_id is not None and doc_id_predicate(doc_id):
                        yield (' '.join(doc_tokens), doc_mentions, doc_id)

                    doc_id = token[len(DOCSTART_MARKER) + 2:-1]
                    #print(doc_id)
                    
                    ## TODO: FIX THIS HACK
                    if split == 'train' and doc_id[:3] == '618' and len(doc_tokens) == 510:
                        yield (' '.join(doc_tokens), doc_mentions, doc_id)
                    doc_tokens = []
                    doc_mentions = []
                elif doc_id is not None:
                    doc_tokens.append(token)

        if doc_id is not None and doc_id_predicate(doc_id):
            yield (' '.join(doc_tokens), doc_mentions, doc_id)

In [37]:
splits = ['train', 'dev', 'test']
docid2context = {}
all_examples = {split: [] for split in splits}

for split in splits:
    for context, mentions, doc_id in iter_docs('../data/Conll/AIDA-YAGO2-dataset.tsv', 
                                                split,
                                                redirects=redirects):
        docid2context[doc_id] = context
        all_examples[split].append([(doc_id, context[begin:end], (begin, end), ent_str) for ent_str, (begin, end) in mentions])

In [38]:
coref_resolver = HeuresticCorefResolver()
detector = SpacyDetector()
candidate_generator = NelCandidateGenerator(max_cands=256,
                                            disamb=file_stores['disamb'],
                                            redirects=file_stores['redirects'],
                                            str_necounts=file_stores['str_necounts'])

In [39]:
full_training_examples = {split: [] for split in splits}

for split, doc_examples in all_examples.items():
    for examples in doc_examples:
        text_spans = [(text, span) for _, text, span, _ in examples]
        try:
            doc_id = examples[0][0]
        except:
            print(doc_id, examples)
        doc = Doc(docid2context[doc_id],
                  file_stores=file_stores,
                  detector=detector,
                  candidate_generator=candidate_generator,
                  coref_resolver=coref_resolver,
                  doc_id=doc_id,
                  text_spans=text_spans)
        doc.gen_cands()
        
        for idx, (doc_id, text, span, ent_str) in enumerate(examples):
            mention = doc.mentions[idx]
            assert mention.text == text, (mention.text, text)
            assert (mention.begin, mention.end) == span
            full_training_examples[split].append('||'.join((doc_id, text, ent_str, '||'.join(mention.cands))))

62 ONE []
215 State []
515 London []
734 Dole []
1307testb Kansas []


In [41]:
for split in splits:
    f_store = FileObjectStore(f'/home/rohitalyosha/Student_Job/mannheim-nel/data/training_files/mmaps/{split}')
    split_examples = full_training_examples[split]
    f_store.save_many(zip(range(len(split_examples)), split_examples))
f_store = FileObjectStore(f'/home/rohitalyosha/Student_Job/mannheim-nel/data/training_files/mmaps/id2context')
f_store.save_many(docid2context.items())



In [42]:
full_training_examples['dev'][0]

'947testa CRICKET||LEICESTERSHIRE||Leicestershire_County_Cricket_Club||Leicestershire_County_Cricket_Club||Leicestershire||Leicestershire_(UK_Parliament_constituency)||High_Sheriff_of_Leicestershire||Leicestershire_Police||Leicestershire_and_Rutland_County_Football_Association||Leicestershire_Rugby_Union||Leicester_Town_Rifles||Leicestershire_Yeomanry||Leicestershire_Cricket_Board||Arriva_Fox_County||Royal_Leicestershire_Regiment||Leicestershire_and_Rutland_Cricket_Club||Leicestershire_Royal_Horse_Artillery||BBC_Radio_Leicester||Leicestershire_County_Council||Lord_Lieutenant_of_Leicestershire||Leicestershire_coalfield'

In [43]:
f_store = FileObjectStore(f'/home/rohitalyosha/Student_Job/mannheim-nel/data/training_files/mmaps/dev')

In [45]:
f_store[0]

'947testa CRICKET||LEICESTERSHIRE||Leicestershire_County_Cricket_Club||Leicestershire_County_Cricket_Club||Leicestershire||Leicestershire_(UK_Parliament_constituency)||High_Sheriff_of_Leicestershire||Leicestershire_Police||Leicestershire_and_Rutland_County_Football_Association||Leicestershire_Rugby_Union||Leicester_Town_Rifles||Leicestershire_Yeomanry||Leicestershire_Cricket_Board||Arriva_Fox_County||Royal_Leicestershire_Regiment||Leicestershire_and_Rutland_Cricket_Club||Leicestershire_Royal_Horse_Artillery||BBC_Radio_Leicester||Leicestershire_County_Council||Lord_Lieutenant_of_Leicestershire||Leicestershire_coalfield'