In [3]:
import pickle
import pathlib
from collections import defaultdict

import attr
import nltk
import elasticsearch as es
from tabulate import tabulate
from bs4 import BeautifulSoup as bs
from tqdm import tqdm_notebook as tqdm

In [4]:
# Index / doc-type coordinates passed to the Elasticsearch lookups below.
CTX = dict(index='articles', doc_type='doc')

# Client for the local node; the trailing expression displays the ping result.
e = es.Elasticsearch([dict(host='localhost', port=9200)])
e.ping()

True

# Topics

In [5]:
# Folder holding the CLEF files read below (topics XML and relevance judgements).
datasets = pathlib.Path('../../ungol-data/CLEF/truth/')
if not datasets.exists():
    # Fail fast and name the resolved location, so a wrong working directory
    # is obvious instead of surfacing as an empty, message-less exception.
    raise FileNotFoundError(
        'CLEF ground truth folder not found: {}'.format(datasets.resolve()))

In [6]:
@attr.s
class Topic:
    """A single retrieval topic: identifier, title, description and narrative."""

    top_id: str = attr.ib()
    title: str = attr.ib()
    description: str = attr.ib()
    narrative: str = attr.ib()

    def __str__(self):
        """Render the topic as a short multi-line, human-readable summary."""
        template = "[{}] {}\n- Description: {}\n- Narrative: {}\n"
        fields = (self.top_id, self.title, self.description, self.narrative)
        return template.format(*fields)

# Registry of parsed topics, keyed by topic identifier (filled further below).
topics = dict()

# Slurp the whole topics XML file; parsing happens in a later step.
with open(datasets / 'CLEF2003_ah-mono-de_topics.xml', mode='r', encoding='utf-8') as f:
    topics_raw = f.read()

def soup2topic(topic_node):
    """Build a Topic from one parsed <topic> node.

    The `.string` of the node's `identifier`, `title`, `description` and
    `narrative` children is read in that order and handed to `Topic`.
    """
    def text_of(tag_name):
        # First matching child tag; its `.string` is passed on unchanged.
        return topic_node.find(tag_name).string

    return Topic(
        text_of('identifier'),
        text_of('title'),
        text_of('description'),
        text_of('narrative'),
    )

# NOTE(review): `fmt_path` is not referenced by any cell visible here — confirm before removing.
fmt_path = 'truth/CLEF{}_ah-mono-de_topics.xml'

# Parse the raw XML and register every <topic> element under its identifier.
soup = bs(topics_raw, 'xml')
topics.update(
    (parsed.top_id, parsed)
    for parsed in map(soup2topic, soup.find_all('topic'))
)

print('  imported {} topics'.format(len(topics)))

# Show one topic as a sanity check of the parsed fields.
print('\nexample:')
print(topics['150-AH'], '\n')

  imported 56 topics

example:
[150-AH] AI gegen Todesstrafe
- Description: Finde Berichte über direkte Aktionen von Amnesty International gegen die Todesstrafe.
- Narrative: Amnesty International widmet sich der weltweiten Abschaffung der Todesstrafe. Relevante Dokumente müssen spezifische Aktionen von AI gegen die Todesstrafe beschreiben.
 



In [10]:
# Print every registered topic using Topic.__str__, in insertion order.
for top_id in topics:
    print(topics[top_id])

[141-AH] Briefbombe für Kiesbauer
- Description: Finde Informationen über die Explosion einer Briefbombe im Studio der Moderatorin Arabella Kiesbauer beim Fernsehsender PRO7.
- Narrative: Eine Briefbombe von rechtsorientierten Radikalen, die an die farbige Fernsehmoderatorin Arabella Kiesbauer gesandt wurde, explodierte am 9. Juni 1995 in einem Studio des Fernsehsenders PRO7. Eine Assistentin wurde verletzt. Alle Berichte über die Explosion und die polizeilichen Ermittlungen nach der Tat sind relevant. Andere Berichte über Briefbombenattentate sind nicht von Interesse.

[142-AH] Christo verhüllt den Deutschen Reichstag
- Description: Finde Berichte über die Verhüllung des Deutschen Reichstages in Berlin durch den Künstler Christo.
- Narrative: Der Verhüllungskünstler Christo benötigte im Juni 1995 zwei Wochen, um den Deutschen Reichstag in Berlin komplett mit Material zu verhüllen. Finde Berichte über dieses Kunstereignis. Jede Information über die Vorbereitung oder die Durchführung is

# Ground Truth

In [8]:
# Read the relevance judgement file, dropping lines that are blank after stripping.
with (datasets / 'CLEF2003_ah-mono-de.txt').open(mode='r', encoding='ascii') as f:
    truth_raw = [line for line in f if line.strip()]

print('read {} ground truth items'.format(len(truth_raw)))

def read_truth(raw, known_topics=None):
    """Parse relevance judgement lines into a nested mapping.

    Each line must consist of four whitespace-separated fields:
    topic id, an ignored field, document id and a `0`/`1` flag.

    Args:
        raw: iterable of judgement lines.
        known_topics: container of valid topic ids used to validate each
            line. Defaults to the notebook-level `topics` mapping, which
            keeps existing `read_truth(raw)` calls working unchanged.

    Returns:
        defaultdict mapping topic id -> {document id -> bool}, where the
        bool is True when the flag field is `1`.

    Raises:
        ValueError: if a line does not have exactly four fields.
        AssertionError: if the flag is not `0`/`1`, the topic id is unknown,
            or a (topic, document) pair occurs more than once.
    """
    if known_topics is None:
        # Fall back to the topics parsed earlier in the notebook.
        known_topics = topics

    truth = defaultdict(dict)

    for line_no, line in enumerate(raw, start=1):
        fields = line.split()
        if len(fields) != 4:
            raise ValueError(
                'line {}: expected 4 fields, got {}: {!r}'.format(
                    line_no, len(fields), line))

        top_id, _, doc_id, val = fields

        assert val in ('0', '1'), \
            'line {}: unexpected relevance flag {!r}'.format(line_no, val)
        assert top_id in known_topics, \
            'line {}: unknown topic {!r}'.format(line_no, top_id)
        # Every (topic, document) pair may be judged only once; checking here
        # pinpoints the offending line instead of a bare count mismatch.
        assert doc_id not in truth[top_id], \
            'line {}: duplicate judgement for {} / {}'.format(
                line_no, top_id, doc_id)

        truth[top_id][doc_id] = val == '1'

    return truth

truth = read_truth(truth_raw)
print('  imported {} ground truth topics'.format(len(truth)))

# One row per topic: (topic id, documents flagged True, documents flagged False).
tab_data = []
for top_id, judged in truth.items():
    n_true = sum(judged.values())
    n_false = len(judged) - n_true
    tab_data.append((top_id, n_true, n_false))

print(tabulate(tab_data, headers=('topic', 'true', 'false')))

read 21534 ground truth items
  imported 56 ground truth topics
topic      true    false
-------  ------  -------
141-AH        8      403
142-AH       65       43
143-AH       63       89
145-AH       10      564
147-AH       29      603
148-AH       12      176
149-AH       12      293
150-AH       45      214
151-AH       10      608
152-AH       56      627
153-AH       26      348
154-AH       19      316
155-AH       24      457
156-AH       29      634
157-AH       49      295
158-AH        3      334
159-AH       36      293
160-AH        1      586
161-AH        1      703
162-AH       24      179
163-AH       47      180
164-AH       72      359
165-AH        4      292
166-AH        2      410
167-AH        4      335
168-AH       17      291
169-AH       10      489
171-AH        8      329
172-AH        1      538
173-AH       10      414
174-AH       36      432
175-AH       11      575
176-AH       74        6
177-AH        7      610
178-AH       21      612
179-AH     

In [15]:
# Document ids flagged True for topic 174-AH; displayed as the cell result.
positives = [doc_id for doc_id, relevant in truth['174-AH'].items() if relevant]
positives

['SDA.950810.0099',
 'SDA.950810.0170',
 'SDA.950811.0023',
 'SDA.950822.0087',
 'SDA.950907.0249',
 'SDA.950908.0073',
 'SDA.950910.0111',
 'SDA.950911.0127',
 'SDA.950912.0115',
 'SDA.950921.0075',
 'SDA.950922.0111',
 'SDA.950923.0035',
 'SDA.950923.0072',
 'SDA.950924.0028',
 'SDA.951002.0038',
 'SDA.951013.0032',
 'SDA.951213.0056',
 'SPIEGEL9495-011260',
 'SPIEGEL9495-011263',
 'SPIEGEL9495-011395',
 'SPIEGEL9495-011396',
 'SPIEGEL9495-011409',
 'SPIEGEL9495-011498',
 'SPIEGEL9495-011503',
 'SPIEGEL9495-011508',
 'SPIEGEL9495-011522',
 'SPIEGEL9495-011523',
 'SPIEGEL9495-011524',
 'SPIEGEL9495-011937',
 'SPIEGEL9495-011946',
 'SPIEGEL9495-012199',
 'SPIEGEL9495-012211',
 'SPIEGEL9495-012225',
 'SPIEGEL9495-012909',
 'SPIEGEL9495-013181',
 'SPIEGEL9495-013587']

## Write the judgement pools to `../opt/raw`

In [None]:
# Target layout: <basepath>/<topic id>/{topic.txt, truth.pickle, truth.txt,
# titlemap.pickle, text/<doc id>.txt}
basepath = pathlib.Path('../opt/raw')

# topic id -> document ids that could not be fetched from Elasticsearch
notfound = defaultdict(list)
# one row per topic: [topic id, topic title, number of documents written]
tab_data = []

for top_id in tqdm(truth):

    folder = basepath / top_id
    folder_text = folder / 'text'
    # parents=True also creates `basepath` and the topic folder when missing,
    # so the cell works on a fresh checkout; created once per topic instead
    # of once per document.
    folder_text.mkdir(parents=True, exist_ok=True)

    topic = e.get(id=top_id, **{'index': 'topics', 'doc_type': 'doc'})['_source']

    # write topic and truth
    # (explicit UTF-8 so the output does not depend on the platform locale)

    with (folder / 'topic.txt').open('w', encoding='utf-8') as fd:
        fd.write('\n\n'.join((topic['title'], topic['description'], topic['narrative'])))

    with (folder / 'truth.pickle').open('wb') as fd:
        pickle.dump(truth[top_id], fd)

    with (folder / 'truth.txt').open('w', encoding='utf-8') as fd:
        for doc_id, relevant in truth[top_id].items():
            fd.write(doc_id + ' ' + ('1' if relevant else '0') + '\n')

    # write documents

    written = 0
    title_mapper = {}
    for doc_id in tqdm(truth[top_id], position=1, leave=False, desc=top_id):
        # Only the lookup is guarded: a missing document is recorded and
        # skipped, while file-system errors still surface.
        try:
            item = e.get(id=doc_id, **CTX)['_source']
        except es.NotFoundError:
            notfound[top_id].append(doc_id)
            continue

        # Same fallback title as the retrieval cells further down use.
        title = item.get('title', '<kein titel>')
        fname = doc_id + '.txt'

        with (folder_text / fname).open('w', encoding='utf-8') as fd:
            fd.write('\n\n'.join((title, item['content'])))

        written += 1
        title_mapper[fname] = title

    tab_data.append([top_id, topic['title'], written])

    # write fname -> title mapping

    with (folder / 'titlemap.pickle').open('wb') as fd:
        pickle.dump(title_mapper, fd)


print(tabulate(tab_data))

for top_id, missing in notfound.items():
    print('not found in topic {}'.format(top_id))
    for doc_id in missing:
        print('  - ', doc_id)

# Retrieve documents from Elasticsearch

In [None]:
# Topic whose narrative is used as the full-text query.
TOPIC = '150-AH'

# Ids of retrieved documents that the ground truth flags True for TOPIC.
positives = set()

print('\nsearching for documents containing the topic query:')
query = {'query': {'match': {'content': topics[TOPIC].narrative}}, 'size': 20}
# Scope the search to the article index, like the `e.get(..., **CTX)` lookups,
# instead of querying every index of the cluster.
res = e.search(index=CTX['index'], body=query)

judgements = truth[TOPIC]
for rank, hit in enumerate(res['hits']['hits'], start=1):

    doc = hit['_source']
    score = hit['_score']

    doc_id = hit['_id']
    title = doc.get('title', '<kein titel>')

    # Documents without a judgement count as not relevant.
    is_relevant = bool(judgements.get(doc_id, False))
    correct = 'correct' if is_relevant else 'wrong'
    if is_relevant:
        positives.add(doc_id)

    print()
    fmt = '[{}] {:2.5}: {}\n{} - {}\n\n{}...'
    print(fmt.format(rank, score, correct, doc_id, title, doc['content'][:300]))
    print('\n', '-' * 120)

# Other relevant articles (not found)

In [None]:
# Relevant documents for TOPIC, excluding ids that start with 'SDA'.
desired = [doc_id for doc_id in truth[TOPIC] if truth[TOPIC][doc_id] and not doc_id.startswith('SDA')]

# `positives` may also contain SDA ids, so restrict it to `desired` before
# counting; otherwise the two numbers are not comparable.
found = [doc_id for doc_id in desired if doc_id in positives]
print('found {} of {} articles'.format(len(found), len(desired)))

for wanted_id in desired:
    if wanted_id in positives:
        continue

    # A judged document may be absent from the index (the pool-writing cell
    # handles the same error), so report it instead of aborting the listing.
    try:
        hit = e.get(id=wanted_id, **CTX)
    except es.NotFoundError:
        print()
        print('{} - not found in index'.format(wanted_id))
        print('\n', '-' * 120)
        continue

    doc = hit['_source']
    doc_id = hit['_id']
    title = doc.get('title', '<kein titel>')

    print()
    fmt = '{} - {}\n\n{}...'
    print(fmt.format(doc_id, title, doc['content'][:300]))
    print('\n', '-' * 120)