# Analyze Corpora

In [None]:
import dumpr.common as dc

import nltk
from tqdm import tqdm_notebook as tqdm

import pickle
import pathlib
import multiprocessing as mp
from collections import defaultdict

from typing import Tuple
from typing import Generator

## CLEF

In [None]:
# Directory that the relative paths in `files` are resolved against.
basepath = pathlib.Path('/mnt/maehre/dump/text/de/')

# XML dumps read by gen_docs(), given relative to `basepath`.
files = [
    'frankfurter-rundschau-9495/frankfurter-rundschau-9495.full.xml',
    'sda-94/sda-94.full.xml',
    'sda-95/sda-95.full.xml',
    'spiegel-9495/spiegel-9495.full.xml',
]


def gen_docs() -> Generator[str, None, None]:
    """Yield the ``content`` of every document in the configured XML dumps.

    The dumps listed in ``files`` are opened one after another (relative to
    ``basepath``) with ``dc.BatchReader``; each reader is closed again once
    all of its documents have been yielded.
    """
    for relative_name in files:
        xml_path = basepath / relative_name
        # BatchReader is handed a plain string, not a Path object.
        with dc.BatchReader(str(xml_path)) as reader:
            yield from (doc.content for doc in reader)


def process(content: str) -> Tuple[Tuple[str, int], ...]:
    """Count the lower-cased tokens of a single document.

    Args:
        content: Text of one document.

    Returns:
        A tuple of ``(token, count)`` pairs, one pair per distinct
        lower-cased token produced by ``nltk.word_tokenize``. The tuple is
        empty when the tokenizer yields no tokens.
    """
    vocab = defaultdict(int)
    for token in nltk.word_tokenize(content):
        vocab[token.lower()] += 1

    return tuple(vocab.items())
     

def run():
    """Count token frequencies over all documents and persist the result.

    Every document from ``gen_docs`` is handed to ``process`` in a pool of
    three worker processes, and the per-document counts are merged into a
    single vocabulary. The vocabulary is then written, most frequent word
    first, to ``../opt/clef.vocab.txt`` (one ``word count`` pair per line)
    and pickled as a ``dict`` to ``../opt/clef.vocab.pickle``.
    """
    # Only used to size the two progress bars. Presumably the per-file
    # document counts of the four dumps -- verify against the corpora.
    total = 13781 + 69438 + 71677 + 139715

    vocab = defaultdict(int)
    merge_bar = tqdm(total=total, position=1, desc='merged')

    def merge(result):
        """Fold one document's (word, count) pairs into the vocabulary."""
        for word, count in result:
            vocab[word] += count
        merge_bar.update(1)

    with mp.Pool(3) as pool:
        results = []

        for content in tqdm(gen_docs(), total=total, desc='read'):
            res = pool.apply_async(process, (content, ), callback=merge)
            results.append(res)

        # Block until every task has finished before leaving the `with`
        # block, because exiting it terminates the pool.
        for res in results:
            res.wait()

    # Sort the (word, count) pairs by count, highest first. Sorting the items
    # directly keeps every word; inverting the dict to {count: word} would
    # silently drop all but one word for each distinct count.
    mapping = sorted(vocab.items(), key=lambda item: item[1], reverse=True)

    print('writing text file')
    # Explicit encoding so non-ASCII tokens do not depend on the locale.
    with open('../opt/clef.vocab.txt', mode='w', encoding='utf-8') as fd:
        for tup in mapping:
            fd.write('{} {}\n'.format(*tup))

    print('dumping dict')
    with open('../opt/clef.vocab.pickle', mode='wb') as fd:
        pickle.dump(dict(mapping), fd)

    merge_bar.close()

# Build the vocabulary and write the text and pickle output files.
run()
print('\ndone.')

In [None]:
# Reload the pickled vocabulary dict from the path that run() writes to.
with open('../opt/clef.vocab.pickle', mode='rb') as fd:
    vocab = pickle.load(fd)

In [None]:
# Number of entries in the loaded vocabulary.
print('vocabulary size', len(vocab))
# Sum of the counts stored for all entries.
print('total word count', sum(vocab.values()))