In [148]:
from bs4 import BeautifulSoup as bs
import attr
import datetime
from collections import defaultdict
from typing import List
from typing import Dict
from typing import Set

## Imports and Initialization
 
Each of the three inputs uses a different encoding: the documents are ISO-8859-1, the ground truth is ASCII, and the topics are UTF-8.

In [149]:
# NOTE: disabled scratch code, kept as a bare string literal so it is never
# executed. It shows one example of each input kind (documents, ground truth,
# topics) together with the encoding used to open it.
"""
with open('examples/E0008/samples/19940103', mode='r', encoding='iso-8859-1') as f:
    docs_raw = f.read()
    
with open('truth/CLEF2003_ah-mono-de.txt', mode='r', encoding='ascii') as f:
    truth_raw = [line for line in f.readlines() if len(line.strip()) > 0]
    
with open('truth/CLEF2003_ah-mono-de_topics.xml', mode='r', encoding='utf-8') as f:
    topics_raw = f.read()
"""    

@attr.s
class Doc:
    """One corpus document: an id, its text and a list of titles.

    ``title`` defaults to a fresh empty list for every instance.
    """
    doc_id: str = attr.ib()
    text: str = attr.ib()
    title: List[str] = attr.ib(default=attr.Factory(list))

    def __str__(self):
        """Return id and title list, then the first 300 characters of the text."""
        preview = self.text[:300]
        return '{} - {}\n{}...'.format(self.doc_id, self.title, preview)
        
@attr.s
class Topic:
    """One search topic: id, title, description and narrative."""
    top_id: str = attr.ib()
    title: str = attr.ib()
    description: str = attr.ib()
    narrative: str = attr.ib()

    def __str__(self):
        """Return the bracketed id with the title, then the description.

        The narrative is not part of the string form.
        """
        header = "[{}] {}".format(self.top_id, self.title)
        return "{}\n{}\n".format(header, self.description)
    
@attr.s
class Truth:
    """Record with two string fields, ``file`` and ``raw``.

    NOTE(review): not instantiated in any visible cell; presumably meant to
    hold a ground-truth file name and its raw text - confirm or remove.
    """
    file: str = attr.ib()
    raw: str = attr.ib()
        
@attr.s
class Dataset:
    """Everything known about one evaluation year.

    Only ``year`` is required; each container starts out empty and is
    independent per instance.
    """
    year: str = attr.ib()
    # topic id -> document id -> relevance judgement
    truth: Dict[str, Dict[str, bool]] = attr.ib(default=attr.Factory(dict))
    # document id -> Doc
    docs: Dict[str, Doc] = attr.ib(default=attr.Factory(dict))
    # topic id -> Topic
    topics: Dict[str, Topic] = attr.ib(default=attr.Factory(dict))

    # Ids of the topics that matter for the example corpus: based on the
    # examples, only some topics have related documents in it.
    relevant: Set[str] = attr.ib(default=attr.Factory(set))
        
# One empty Dataset per evaluation year; docs, topics and truth start out empty.
datasets = []
for year in map(str, range(2000, 2004)):  # '2000' .. '2003'
    datasets.append(Dataset(year=year))


## Read Documents

In [150]:
def soup2doc(doc_node) -> Doc:
    """Build a Doc from one parsed document node.

    Args:
        doc_node: A BeautifulSoup tag with a ``docid`` child, a ``text`` child
            and any number of ``title`` descendants.

    Returns:
        A Doc whose text has every whitespace run collapsed to a single space
        and whose title list holds each stripped ``title`` text, in the order
        ``find_all`` yields them.

    Raises:
        AttributeError: If the node has no ``docid`` or no ``text`` child.
    """
    # get_text() rather than .string: .string is None as soon as a tag has
    # more than one child (e.g. nested markup), which made .strip() crash.
    text = ' '.join(doc_node.find('text').get_text().split())
    doc_id = doc_node.find('docid').get_text().strip()
    titles = [title.get_text().strip() for title in doc_node.find_all('title')]
    return Doc(doc_id=doc_id, text=text, title=titles)


def read_docs(soup) -> Dict[str, Doc]:
    """Collect the documents found among the direct children of ``soup``.

    Args:
        soup: An iterable of parsed nodes, e.g. a BeautifulSoup object whose
            top-level children are the document tags.

    Returns:
        A dict mapping document id to Doc. Text nodes, tags with at most one
        child and tags without a ``text`` descendant are skipped. If an id
        occurs more than once, the last document wins.
    """
    docs = {}
    for doc_node in soup:
        # Text between documents arrives as a NavigableString, i.e. a str
        # subclass. Its .find() is str.find(), which returns an int and never
        # None, so such nodes slipped through the check below and crashed in
        # soup2doc. Reject them explicitly.
        if isinstance(doc_node, str):
            continue
        if len(doc_node) <= 1 or doc_node.find('text') is None:
            continue

        doc = soup2doc(doc_node)
        docs[doc.doc_id] = doc

    return docs


# Corpus sample files, given relative to the examples/ directory.
doc_files = (
    'E0008/samples/19940103',
    'E0038/samples/19951211',
    'E0036/samples/19940228_Sample',
    'E0038/samples/19940627',
)

print()
docs = {}
for file in doc_files:
    fname = 'examples/{}'.format(file)
    # Corpus files are opened as ISO-8859-1; only the read needs the handle.
    with open(fname, mode='r', encoding='iso-8859-1') as f:
        soup = bs(f.read(), 'html.parser')
    new = read_docs(soup)

    # Report id clashes with everything read so far before merging.
    print('reading {} yielded {} duplicates ({} current, {} new)'.format(
        fname, len(docs.keys() & new.keys()), len(docs), len(new)))

    # On an id clash the newly read document replaces the earlier one.
    docs.update(new)

print('\ntotal documents:', len(docs))
print('only SPIEGEL:', all(doc_id.startswith('SPIEGEL') for doc_id in docs))

print('\n', docs['SPIEGEL9495-000007'])

# Every dataset shares the same corpus dict.
for dataset in datasets:
    dataset.docs = docs
print()


reading examples/E0008/samples/19940103 yielded 0 duplicates (0 current, 125 new)
reading examples/E0038/samples/19951211 yielded 0 duplicates (125 current, 136 new)
reading examples/E0036/samples/19940228_Sample yielded 0 duplicates (261 current, 26 new)
reading examples/E0038/samples/19940627 yielded 0 duplicates (287 current, 138 new)

total documents: 425
only SPIEGEL: True

 SPIEGEL9495-000007 - ['Heuchlerisches Vertuschen']
Man rät schwulen Pastoren, ihre Beziehungen zu verheimlichen, nicht aber diese abzubrechen. Es geht also nicht um scheinbar "christliche" Werte, sondern nur um die Aufrechterhaltung des Scheins. Dieses heuchlerische Vertuschen und Mißachten der Realität macht die Kirche unglaubwürdig und entfernt si...



## Read Topics

In [151]:
def soup2topic(topic_node):
    """Create a Topic from one parsed ``topic`` node.

    Reads the ``.string`` of the node's ``identifier``, ``title``,
    ``description`` and ``narrative`` children and passes them to Topic
    positionally, in that order.
    """
    top_id = topic_node.find('identifier').string
    title = topic_node.find('title').string
    description = topic_node.find('description').string
    narrative = topic_node.find('narrative').string
    return Topic(top_id, title, description, narrative)

# Path template of the per-year topic files; '{}' is filled with dataset.year.
fmt_path = 'truth/CLEF{}_ah-mono-de_topics.xml'
for dataset in datasets:
    print('\nlooking at year {}'.format(dataset.year))
    
    # Topic files are read as UTF-8 text.
    with open(fmt_path.format(dataset.year), mode='r', encoding='utf-8') as f:
        raw = f.read()
    
    # Parse with the 'xml' parser and register every <topic> element under its id.
    soup = bs(raw, 'xml')
    for topic_node in soup.find_all('topic'):
        topic = soup2topic(topic_node)
        dataset.topics[topic.top_id] = topic
        
    print('  imported {} topics'.format(len(dataset.topics)))

# Print one topic of the last dataset as a sanity check.
print('\nexample:')
print(datasets[-1].topics['150-AH'], '\n')


looking at year 2000
  imported 37 topics

looking at year 2001
  imported 49 topics

looking at year 2002
  imported 50 topics

looking at year 2003
  imported 56 topics

example:
[150-AH] AI gegen Todesstrafe
Finde Berichte über direkte Aktionen von Amnesty International gegen die Todesstrafe.
 



In [152]:
def read_truth(raw, dataset):
    """Parse relevance judgements and flag the topics that touch the corpus.

    Args:
        raw: Sequence of lines, each with four whitespace-separated fields:
            topic id, an ignored field, document id and '0' or '1'.
        dataset: Object providing ``topics`` and ``docs`` (used for membership
            tests) and a ``relevant`` set. The set is updated in place with
            every topic id that has a judged document present in ``docs``.

    Returns:
        A defaultdict mapping topic id -> {document id: bool}; the bool is
        True for judgement '1'.

    Raises:
        AssertionError: On a judgement other than '0'/'1', on a topic id that
            is missing from ``dataset.topics``, or when the number of stored
            judgements differs from the number of lines (e.g. a repeated
            topic/document pair).
        ValueError: If a line does not split into exactly four fields.
    """
    expected_total = len(raw)
    judgements = defaultdict(dict)

    for record in raw:
        top_id, _, doc_id, val = record.split()

        assert val in ('1', '0')
        assert top_id in dataset.topics

        judgements[top_id][doc_id] = (val == '1')

        # A topic counts as relevant once any judged document is in the corpus,
        # regardless of whether the judgement is positive.
        if doc_id in dataset.docs:
            dataset.relevant.add(top_id)

    stored_total = sum(len(per_topic) for per_topic in judgements.values())
    assert expected_total == stored_total
    return judgements


# Path template of the per-year ground-truth files; '{}' is filled with dataset.year.
fmt_path = 'truth/CLEF{}_ah-mono-de.txt'
for dataset in datasets:
    print('\nlooking at year {}'.format(dataset.year))
    with open(fmt_path.format(dataset.year), mode='r', encoding='ascii') as f:
        # Keep only non-blank lines; each one is a single judgement.
        raw = [line for line in f if line.strip()]

    print('  read {} ground truth samples'.format(len(raw)))
    dataset.truth = read_truth(raw, dataset)
    print('  imported {} ground truth topics'.format(len(dataset.truth)))
    print('  of those, only {} are relevant'.format(len(dataset.relevant)))

print('\nsubsets?')
for d1, d2 in zip(datasets, datasets[1:]):
    # Compare the years explicitly. `d1 <= d2` only worked through the
    # ordering methods attrs generates, which compare all fields in order and
    # would move on to comparing the dict fields if two years were equal.
    assert d1.year <= d2.year
    # Is every topic id judged in the earlier year also judged in the later one?
    subset = d1.truth.keys() <= d2.truth.keys()
    print('  {} subset of {}? {}'.format(d1.year, d2.year, subset))


looking at year 2000
  read 11335 ground truth samples
  imported 37 ground truth topics
  of those, only 30 are relevant

looking at year 2001
  read 16726 ground truth samples
  imported 49 ground truth topics
  of those, only 33 are relevant

looking at year 2002
  read 19394 ground truth samples
  imported 50 ground truth topics
  of those, only 32 are relevant

looking at year 2003
  read 21534 ground truth samples
  imported 56 ground truth topics
  of those, only 29 are relevant

subsets?
  2000 subset of 2001? False
  2001 subset of 2002? False
  2002 subset of 2003? False


## Analyze relevant ground truth

In [153]:
# For each relevant topic: the number of positive judgements, then how many
# of its judged documents are in the loaded corpus out of all judged documents.
for dataset in datasets:
    print('\nlooking at year {}'.format(dataset.year))
    for top_id in dataset.relevant:
        dic = dataset.truth[top_id]
        print('  topic: {:>7} - {:3d} true, {}/{} in corpus'.format(
            top_id,
            sum(1 for judged in dic.values() if judged),
            sum(1 for doc_id in dic if doc_id in docs),
            len(dic)))


looking at year 2000
  topic:   25-AH -   4 true, 3/287 in corpus
  topic:   16-AH -   5 true, 2/307 in corpus
  topic:   39-AH -  16 true, 1/314 in corpus
  topic:   35-AH -   1 true, 1/343 in corpus
  topic:   27-AH -  46 true, 1/315 in corpus
  topic:   24-AH -   2 true, 1/167 in corpus
  topic:   12-AH -  23 true, 2/256 in corpus
  topic:   38-AH -   8 true, 3/348 in corpus
  topic:   10-AH -  21 true, 5/355 in corpus
  topic:    5-AH - 101 true, 2/306 in corpus
  topic:   29-AH -   4 true, 3/275 in corpus
  topic:   13-AH -  48 true, 3/199 in corpus
  topic:    8-AH -   4 true, 7/302 in corpus
  topic:   37-AH -  56 true, 1/165 in corpus
  topic:    9-AH -   2 true, 2/393 in corpus
  topic:   26-AH -  48 true, 1/421 in corpus
  topic:   33-AH -  21 true, 2/238 in corpus
  topic:   15-AH -  14 true, 5/304 in corpus
  topic:   20-AH -  29 true, 5/309 in corpus
  topic:    4-AH -  13 true, 1/353 in corpus
  topic:   18-AH -  12 true, 2/421 in corpus
  topic:   11-AH -   4 true, 2/30

In [154]:
# year -> topic id -> ids of corpus documents judged relevant for that topic
true_examples = defaultdict(dict)

for dataset in datasets:
    print('\nlooking at year {}'.format(dataset.year))

    for topic_id in dataset.relevant:
        samples = dataset.truth[topic_id].items()
        # Keep only positively judged documents that are present in the corpus.
        relevant_docs = [doc_id for doc_id, val in samples if val and doc_id in docs]

        # Plain truthiness asks "is the list non-empty?". any() would test the
        # truthiness of the individual id strings, which is not the intent.
        if relevant_docs:
            print('  found {} with relevant examples'.format(topic_id))
            true_examples[dataset.year][topic_id] = relevant_docs


looking at year 2000
  found 5-AH with relevant examples
  found 15-AH with relevant examples
  found 32-AH with relevant examples

looking at year 2001
  found 48-AH with relevant examples
  found 85-AH with relevant examples
  found 46-AH with relevant examples

looking at year 2002
  found 125-AH with relevant examples
  found 124-AH with relevant examples
  found 98-AH with relevant examples
  found 94-AH with relevant examples
  found 93-AH with relevant examples
  found 103-AH with relevant examples

looking at year 2003
  found 142-AH with relevant examples
  found 156-AH with relevant examples
  found 174-AH with relevant examples
  found 181-AH with relevant examples
  found 185-AH with relevant examples


In [155]:
# Print every topic that has relevant corpus documents, followed by those documents.
for dataset in datasets:
    print('\nlooking at year {}'.format(dataset.year))
    # The loop variable is doc_ids, not docs: at cell top level a loop variable
    # called docs rebinds the notebook-wide corpus dict to a list of ids, so
    # re-running the earlier cells that test `... in docs` would misbehave.
    for i, (topic_id, doc_ids) in enumerate(true_examples[dataset.year].items()):
        print('\n  EXAMPLE {}\n'.format(i))
        print(dataset.topics[topic_id])
        print('  found {} relevant documents\n'.format(len(doc_ids)))
        for doc_id in doc_ids:
            print(dataset.docs[doc_id])


looking at year 2000

  EXAMPLE 0

[5-AH] Mitgliedschaft in der Europäischen Union
Die Haltung von Nicht-Mitgliedstaaten zum Beitritt zur Europäischen Gemeinschaft oder Europäischen Union soll ermittelt werden.

  found 1 relevant documents

SPIEGEL9495-000074 - ['Skandinavien', 'Arktischer Winter']
Mit ihren 20 Kühen im Stall zählen Riitta und Pentti Neitola zu den ganz großen Milchbauern im finnischen Lappland. Steiniger Boden, endlose Kiefern- und Birkenwälder, lange dunkle Winter, die sich schon im September mit Frost und Schnee ankündigen und erst im Mai enden - unter solch unwirtlichen Be...

  EXAMPLE 1

[15-AH] Wettbewerbsfähigkeit der europäischen Industrie
Welche Faktoren beeinträchtigen die Wettbewerbsfähigkeit der europäischen Industrie auf den Weltmärkten?

  found 1 relevant documents

SPIEGEL9495-000026 - ['Ein Europa für die Zukunft']
Der europäische Einigungsprozeß ist ins Stocken geraten, der Vertrag von Maastricht markiert ein Ende, nicht einen Neubeginn. Er hat die