### The Culture of International Relations - Corpus statistics

#### About this notebook
spaCy tutorial: https://github.com/NirantK/nlp-python-deep-learning/blob/master/Part-03%20NLP%20with%20spaCy%20and%20Textacy.ipynb


In [28]:
# Setup
%load_ext autoreload
%autoreload 2

import sys
import os
import nltk
import re
import typing.re
import collections

sys.path = list(set(['.', '..']) - set(sys.path)) + sys.path

import textacy, spacy 
import pandas as pd
import ipywidgets as widgets
import zipfile
import common.utility as utility
import common.widgets_config as widgets_config
import common.config as config
import common.utility as utility
import common.treaty_utility as treaty_utility
import common.treaty_state as treaty_repository
import treaty_corpus

logger = utility.getLogger('corpus_text_analysis')

PERIOD_GROUP = 'years_1945-1972'
DATA_FOLDER = '../data'
CORPUS_PATH = os.path.join(DATA_FOLDER, 'treaty_text_corpora_20181115.zip')
PATTERN = '*.txt'
LANGUAGE = 'en'
LANGUAGE_MAP = { 'en': 'english', 'fr': 'french', 'it': 'other', 'de': 'other' }
LANGUAGE_MODEL_MAP = { 'en': 'en_core_web_sm', 'fr': 'fr_core_web_sm', 'it': 'it_core_web_sm', 'de': 'de_core_web_sm' }

WTI_INDEX = treaty_repository.load_wti_index(data_folder=DATA_FOLDER)

# sudo python3 -m spacy download en_core_web_lg
# nlp = spacy.load('en_core_web_lg')

def get_filenames(zip_filename, extension='.txt'):
    with zipfile.ZipFile(zip_filename, mode='r') as zf:
        return [ x for x in zf.namelist() if x.endswith(extension) ]
    
def get_text(zip_filename, filename):
    with zipfile.ZipFile(zip_filename, mode='r') as zf:
        return zf.read(filename).decode(encoding='utf-8')
    

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


2018-11-15 11:13:19,538 : INFO : Data loaded!


In [29]:
# Treaty data utilities
def get_treaties(wti_index, language, period_group='years_1945-1972', treaty_filter='is_cultural', parties=None):
    period_group = config.PERIOD_GROUPS_ID_MAP[period_group]
    treaties = wti_index.get_treaties_within_division(
        period_group=period_group,
        treaty_filter=treaty_filter,
        recode_is_cultural=False,
        parties=parties
    )
    treaties = treaties[treaties[LANGUAGE_MAP[language]]==language]
    return treaties


In [41]:
# %writefile: spacy_utility
from spacy.language import Language
from textacy.spacier.utils import merge_spans

HYPHEN_REGEXP = re.compile(r'\b(\w+)-\s*\r?\n\s*(\w+)\b', re.UNICODE)

def preprocess_text(source_filename, target_filename):
    filenames = get_filenames(source_filename)
    texts = ( (filename, get_text(source_filename, filename)) for filename in filenames )
    with zipfile.ZipFile(target_filename, 'w', zipfile.ZIP_DEFLATED) as zf:
        for filename, text in texts:
            logger.info('Processing ' + filename)
            text = re.sub(HYPHEN_REGEXP, r"\1\2\n", text)
            text = textacy.preprocess.normalize_whitespace(text)   
            text = textacy.preprocess.fix_bad_unicode(text)   
            text = textacy.preprocess.replace_currency_symbols(text)
            text = textacy.preprocess.unpack_contractions(text)
            #text = textacy.preprocess.replace_urls(text)
            #text = textacy.preprocess.replace_emails(text)
            #text = textacy.preprocess.replace_phone_numbers(text)
            text = textacy.preprocess.remove_accents(text)
            zf.writestr(filename, text)

def get_document_stream(corpus_path, lang, wti_index, period_group, treaty_filter='is_cultural', parties=None)

    treaties = get_treaties(wti_index, language=lang, period_group=period_group, treaty_filter=treaty_filter, parties=parties)
    treaties['treaty_id'] = treaties.index
    treaty_ids = list(treaties.index)
    
    documents = treaty_corpus.TreatyCompressedFileReader(corpus_path, lang, treaty_ids)

    for treaty_id, language, filename, text in documents:
        assert language == lang
        metadata = treaties.loc[treaty_id]
        yield filename, text, metadata
        
def create_textacy_corpus(documents, nlp, preprocess_args=None):
    corpus = textacy.Corpus(nlp)
    for filename, text, metadata in documents:
        if not preprocess_args is None:
            text = re.sub(HYPHEN_REGEXP, r"\1\2\n", text)
            text = textacy.preprocess.preprocess_text(text, **preprocess_args)
        corpus.add_text(text, utility.extend(dict(filename=filename), metadata))
    return corpus

def get_textacy_corpus_filename(source_path, language, nlp_args=None, preprocess_args=None):
    nlp_args = nlp_args or {}
    preprocess_args = preprocess_args or {}
    disabled_pipes = nlp_args.get('disable', [])
    suffix = '_{}_{}{}'.format(
        language,
        '_'.join([ k for k in preprocess_args if preprocess_args[k] ]),
        '_disable({})'.format(','.join(disabled_pipes)) if len(disabled_pipes) > 0 else ''
    )
    return path_add_suffix(source_path, suffix, new_extension='.pkl')

def setup_nlp_language_model(language, **nlp_args):
    
    def remove_whitespace_entities(doc):
        doc.ents = [ e for e in doc.ents if not e.text.isspace() ]
        return doc

    logger.info('Loading model: %s...', language)
    
    Language.factories['remove_whitespace_entities'] = lambda nlp, **cfg: remove_whitespace_entities
    
    nlp = textacy.load_spacy(LANGUAGE_MODEL_MAP[language], **nlp_args)
    pipeline = lambda: [ x[0] for x in nlp.pipeline ]
    
    logger.info('Using pipeline: ' + ' '.join(pipeline()))
    
    return nlp

def generate_textacy_corpus(source_path, language, options):
    
    nlp = setup_nlp_language_model(language, options.get('nlp_args', {}))
    
    if force or not os.path.isfile(textacy_corpus_filename):
        logger.info('Working: Computing new corpus ' + textacy_corpus_filename + '...')
        corpus = create_textacy_corpus(document_stream, nlp, preprocess_args)
        corpus.save(textacy_corpus_filename)
    else:
        logger.info('Working: Loading corpus ' + textacy_corpus_filename + '...')
        corpus = textacy.Corpus.load(textacy_corpus_filename)
        
    if kwargs.get('merge_named_entities', False):
        logger.info('Working: Merging named entities...')
        for doc in corpus:
            named_entities = textacy.extract.named_entities(doc)
            merge_spans(named_entities, doc.spacy_doc)
    else:
        logger.info('Note: named entities not merged')
        
    logger.info('Done!')
    
    return textacy_corpus_filename, corpus

def propagate_document_attributes(corpus):
    for doc in corpus:
        doc.spacy_doc.user_data['title'] = doc.metadata['treaty_id']
        doc.spacy_doc.user_data['treaty_id'] = doc.metadata['treaty_id']
    
def get_corpus_documents(corpus):
    df = pd.DataFrame([
        (document_id, doc.metadata['treaty_id'], doc.metadata['filename'])
                for document_id, doc in enumerate(corpus) ], columns=['treaty_id', 'title', 'filename']
    ).set_index('treaty_id')
    return df

CORPUS_OPTIONS = {
    'language': LANGUAGE,
    PREPPED_CORPUS_PATH = utility.path_add_suffix(CORPUS_PATH, '_preprocessed')
}

PREPPED_CORPUS_PATH = utility.path_add_suffix(CORPUS_PATH, '_preprocessed')
if not os.path.isfile(PREPPED_CORPUS_PATH):
    preprocess_text(CORPUS_PATH, PREPPED_CORPUS_PATH)

def corpus_stream():
    return get_document_stream(PREPPED_CORPUS_PATH, LANGUAGE, WTI_INDEX, PERIOD_GROUP, treaty_filter='is_cultural', parties=None)

target_name = get_textacy_corpus_filename(source_path, language, corpus_args, preprocess_args):

TEXTACY_CORPUS_FILENAME, CORPUS = generate_textacy_corpus(
    PREPPED_CORPUS_PATH,
    LANGUAGE,
    preprocess_args=dict(),
    merge_named_entities=True,
    force=False,
    target_folder=DATA_FOLDER
)

2018-11-15 13:34:34,171 : INFO : Loading model: english...
2018-11-15 13:34:34,172 : INFO : Using pipeline: tagger parser ner
2018-11-15 13:34:34,173 : INFO : Working: Computing new corpus ../data/corpus_en__disable().pkl...


TypeError: create_textacy_corpus() missing 1 required positional argument: 'period_group'

In [3]:
# %writefile: spacy_utility

import spacy
import pandas as pd

def doc_to_dataframe(spacy_doc, ignore_tags=None, ignore_pos=None):
    ignore_tags = ignore_tags or ['_SP']
    ignore_pos = ignore_pos or []
    df_source = ({
        "text": w.text.lower(),
        "lemma": w.lemma_.lower(),
        "pos": w.pos_,
        "tag": w.tag_,
        "dep": w.dep_,
        "is_alpha": w.is_alpha,
        "is_stop": w.is_stop
    } for w in spacy_doc
        if w.tag_ not in ignore_tags
        and w.pos_ not in ignore_pos)
    return pd.DataFrame(df_source)

def document_statistics(spacy_doc):
    
    spacy_sentences = list(spacy_doc.sents)
    token_length_histogram = collections.Counter((len(x) for x in spacy_doc))
    
    return {
        'word_count': len(spacy_doc),
        'sentence_count': len(spacy_sentences),
        'avg_words_per_sentence': len(spacy_doc) / len(spacy_sentences),
        'token_lengths': dict(token_length_histogram)
    }

def examplary_pipeline_component(doc):
    '''Logs length of  document
    
    '''
    logger.info("Doc has %s tokens." % len(doc))
    return doc

class ExamplaryPipelineClass(object):
    ''' A custom pipeline component can also be a class. A custom class compontent must be added to the "Language.factories" so spaCy know how to create them.
    
    from spacy.language import Language
    Language.factories['entity_matcher'] = lambda nlp, **cfg: ExamplaryPipelineClass(nlp, **cfg)
    
    In this example the instance is initialised with the nlp object,
    a terminology list and an entity label. Using the PhraseMatcher, it then matches the terms in the Doc and adds them to the existing entities.
    '''
    name = 'entity_matcher'

    def __init__(self, nlp, terms, label):
        patterns = [nlp(text) for text in terms]
        self.matcher = PhraseMatcher(nlp.vocab)
        self.matcher.add(label, None, *patterns)

    def __call__(self, doc):
        matches = self.matcher(doc)
        for match_id, start, end in matches:
            span = Span(doc, start, end, label=match_id)
            doc.ents = list(doc.ents) + [span]
        return doc
    

### Create and Store a TextaCy Corpus

In [4]:
# Create a textacy corpus

def create_textacy_corpus(wti_index, corpus_path, language, period_group, treaty_filter='is_cultural', parties=None):
    
    treaties = get_treaties(wti_index, language=language, period_group=period_group, treaty_filter=treaty_filter, parties=parties)
    treaties['treaty_id'] = treaties.index

    treaty_ids = list(treaties.index)
    metadata_stream = ( treaties.loc[treaty_id].to_dict() for treaty_id in treaty_ids )
    text_stream = ( content for _, _, _, content in treaty_corpus.TreatyCompressedFileReader(corpus_path, language, treaty_ids))

    corpus = textacy.Corpus(language, texts=text_stream, metadatas=metadata_stream)
    
    return corpus

period_group = 'years_1945-1972'
language = 'en'

corpus = create_textacy_corpus(wti_index, CORPUS_PATH, language, period_group=period_group, treaty_filter='is_cultural', parties=None)

corpus_tag = 'test1'
corpus_name = os.path.join('./data', 'corpus_textacy_{}_{}_{}.pkl'.format(language, period_group, corpus_tag))
corpus.save(corpus_name)


KeyboardInterrupt: 

In [None]:
#corpus = create_textacy_corpus(wti_index, CORPUS_PATH, LANGUAGE, period_group='years_1945-1972', treaty_filter='is_cultural', parties=None)

CORPUS_TAG = 'test1'
corpus_name = os.path.join('./data', 'corpus_textacy_{}_{}_{}.pkl'.format(LANGUAGE, PERIOD_GROUP, CORPUS_TAG))
corpus = textacy.Corpus.load(corpus_name)


In [177]:
corpus[0][400:500].text


') of Czechoslovak professors of higher education, who will work in French universities, institutes and laboratories, in order to mitigate the effects of the interruption of their scientific and professional training owing to the closing of Czechoslovak higher educational establishments by the Hitlers regime.\r\nThe Czechoslovak Government and the French Government will take the necessary measures to ensure that the exchange of work and study posts in specialised institutes of the two States should be encouraged as far as possible by savants and professors on a reciprocal basis.\r\nThe Czechoslovak Government and the'

In [None]:

corpus1960 = textacy.Corpus(nlp, texts=[d.text for d in corpus.get(lambda x: bool(x.metadata['signed_year'] == 1960))])

#corpus1960 = textacy.Corpus('en')
#docs = corpus.get(lambda x: bool(x.metadata['signed_year'] == 1960))
#for doc in docs:
#    corpus1960.add_doc(doc)
    
for doc in corpus1960:
    print([x.lemma_ for x in textacy.extract.words(doc, include_pos=['NOUN'], min_freq=2) ])
    #print([x for x in textacy.extract.acronyms_and_definitions(doc) ])
    #print(set([x.lemma_ for x,y,z in textacy.extract.subject_verb_object_triples(doc) ]))
    #print([x for x in doc.spacy_doc.noun_chunks]) #  x for x in textacy.extract.noun_chunks(doc, drop_determiners=True, min_freq=1)])
    

In [207]:
corpus[0].spacy_doc.count_by(attr_id=spacy.attrs.IS_CURRENCY)

{}

In [45]:
#ts = textacy.TextStats(corpus[0])
#ts.basic_counts

import spacy
from spacy.attrs import LOWER, POS, ENT_TYPE, IS_ALPHA
from spacy.tokens import Doc
import numpy

def remove_tokens_on_match(doc):
    indexes = []
    
    for index, token in enumerate(doc):
        if (token.pos_  in ('PUNCT', 'NUM', 'SYM')):
            indexes.append(index)
            
    np_array = doc.to_array([LOWER, POS, ENT_TYPE, IS_ALPHA])
    np_array = numpy.delete(np_array, indexes, axis = 0)
    
    doc2 = Doc(doc.vocab, words=[t.text for i, t in enumerate(doc) if i not in indexes])
    doc2.from_array([LOWER, POS, ENT_TYPE, IS_ALPHA], np_array)
    
    return doc2

meta_columns = [ 'treaty_id', 'is_cultural', 'signed_year', 'signed', 'topic' ]

def f(d):
    return not d.spacy_doc.is_stopword

documents = ( d for d in corpus if f(d) )

df = pd.DataFrame([ utility.extend({ k: x.metadata[k] for k in meta_columns }, textacy.TextStats(x).basic_counts) for x in documents ])
df

AttributeError: 'spacy.tokens.doc.Doc' object has no attribute 'is_stopword'

### Create DataFrame Corpus from spaCy Documents

In [None]:

def create_dataframe_corpus(treaties, corpus_path, language, ticker=utility.noop):
    
    nlp = spacy.load(language)
    treaty_ids = list(treaties.index)
    text_stream = ( content for _, _, _, content in treaty_corpus.TreatyCompressedFileReader(corpus_path, language, treaty_ids))

    df_corpus = None
    try:
        for (_, treaty_id, _, content) in stream:
            spacy_doc = spacy_model(content)
            df = spacy_doc_2_df(spacy_doc)
            df['treaty_id'] = treaty_id
            if df_corpus is None:
                df_corpus = df
            else:
                df_corpus = df_corpus.append(df, ignore_index=True)
            tick()
    finally:
        tick(0)
    return df_corpus

progress_widget = widgets.IntProgress(min=0,max=treaties.shape[0],step=1, layout=widgets.Layout(width='95%'))
display(progress_widget)

def ticker(w):
    def tick(x=None):
        w.value = w.value + 1 if x is None else x
    return tick

tick = ticker(progress_widget)
df_corpus = create_dataframe_corpus(treaties, corpus_path=CORPUS_PATH, language=LANGUAGE, ticker=tick)
df_corpus.to_csv('spaCy_PoS_corpus.csv', sep='\t', encoding='utf-8') # , compression='zip')


In [11]:
df_corpus2 = pd.read_csv('spaCy_PoS_corpus.csv', index_col=0, sep='\t', encoding='utf-8')

In [60]:
df_treaty_pos_tokens = df_corpus[~df_corpus.Stop&~df_corpus.POS.isin(['SPACE', 'PUNCT'])].groupby(['treaty_id', 'Lemma', 'POS']).size()
df_treaty_pos_tokens = df_treaty_pos_tokens.reset_index().rename(columns={'Lemma': 'lemma', 'POS': 'pos', 0: 'count'})
df_treaty_pos_tokens = df_treaty_pos_tokens.set_index(['treaty_id'])
df_treaty_pos_tokens = df_treaty_pos_tokens.merge(pd.DataFrame(treaties['signed_year']), how='inner', left_index=True, right_index=True)

In [None]:

#df_treaty_pos_tokens.head()

df_year_pos_tokens = df_treaty_pos_tokens.groupby(['signed_year', 'lemma', 'pos']).sum()
df_year_pos_tokens = df_year_pos_tokens.reset_index()
df_year_pos_tokens[df_year_pos_tokens.pos.isin(['NOUN'])].nlargest(50, 'count')


In [1]:
df_token_pos = df_corpus[~df_corpus.POS.isin(['SPACE', 'PUNCT'])].groupby(['Lemma', 'POS']).size()

NameError: name 'df_corpus' is not defined

## Text Corpus vs WTI Index

In [251]:
corpus_documents = corpus.documents.set_index(['treaty_id', 'language'])
treaty_text_languages = wti_index.get_treaty_text_languages().set_index(['treaty_id', 'language'])

treaties_in_corpus_not_in_wti = corpus_documents.index.difference(treaty_text_languages.index).get_values()
treaties_in_wti_not_in_corpus = treaty_text_languages.index.difference(corpus_documents.index).get_values()

print(  'Found in corpus, but not in WTI: ' +
        ', '.join([ '{}/{}'.format(x,y) for x,y in treaties_in_corpus_not_in_wti ]))

print(  'Found in WTI, but not in corpus: ' +
        ', '.join([ '{}/{}'.format(x,y) for x,y in treaties_in_wti_not_in_corpus ]))

#corpus_documents.loc[corpus_text_not_in_wti]
#treaty_text_languages.loc[wti_not_in_corpus]

#wti_not_in_corpus

# Duplicates:
#corpus_documents.index.get_duplicates()
#treaty_text_languages.corpus_documents.index.get_duplicates()


Found in corpus, but not in WTI: 304127/en, 400039/de, 415293/fr, XXX042/de, XXX046/de
Found in WTI, but not in corpus: XXX010/de


### Task: Basic Corpus Statistics
See https://www.nltk.org/book/ch01.html

* Size of treaties over time
* Unique word, unique words per word class
* Lexical diversity
* Frequency distribution
* Average word length, sentence length


```python
 	
>>> len(texts) / count(docs)
0.06230453042623537
>>>

>>> len(set(text3)) / len(text3)
0.06230453042623537
>>>

>>> > def lexical_diversity(text): [1]
...     return len(set(text)) / len(text) [2]
...
>>> def percentage(count, total): [3]
...     return 100 * count / total

# Most common words
fdist1 = FreqDist(text1)
fdist1.most_common(50)

# Word length frequencies
>>> fdist = FreqDist(len(w) for w in text1)  [2]
>>> print(fdist)

```


In [10]:
# Code 

corpus = None
def display_token_toplist_interact(source_folder):
    global corpus
    progress_widget = None
    
    def display_token_toplist(source_folder, language, statistics='', remove_stopwords=False):
        global corpus

        try:

            progress_widget.value = 1

            corpus = TreatyCorpusSaveLoad(source_folder=source_folder, lang=language[0]).load_mm_corpus()

            progress_widget.value = 2
            service = MmCorpusStatisticsService(corpus, dictionary=corpus.dictionary, language=language)

            print("Corpus consists of {} documents, {} words in total and a vocabulary size of {} tokens."\
                      .format(len(corpus), corpus.dictionary.num_pos, len(corpus.dictionary)))

            progress_widget.value = 3
            if statistics == 'word_freqs':
                display(service.compute_word_frequencies(remove_stopwords))
            elif statistics == 'documents':
                display(service.compute_document_stats())
            elif statistics == 'word_count':
                display(service.compute_word_stats())
            else:
                print('Unknown: ' + statistics)

        except Exception as ex:
            logger.error(ex)

        progress_widget.value = 5
        progress_widget.value = 0
        return corpus
    
    language_widget=widgets.Dropdown(
        options={
            'English': ('en', 'english'),
            'French': ('fr', 'french'),
            'German': ('de', 'german'),
            'Italian': ('it', 'italian')
        },
        value=('en', 'english'),
        description='Language:', **dict(layout=widgets.Layout(width='260px'))
    )
    
    statistics_widget=widgets.Dropdown(
        options={
            'Word freqs': 'word_freqs',
            'Documents': 'documents',
            'Word count': 'word_count'
        },
        value='word_count',
        description='Statistics:', **dict(layout=widgets.Layout(width='260px'))
    )
    
    remove_stopwords_widget=widgets.ToggleButton(
        description='Remove stopwords', value=True,
        tooltip='Do not include stopwords in token toplist'
    )
    
    progress_widget=widgets.IntProgress(min=0, max=5, step=1, value=0) #, layout=widgets.Layout(width='100%')),

    wi = widgets.interactive(
        display_token_toplist,
        source_folder=source_folder,
        language=language_widget,
        statistics=statistics_widget,
        remove_stopwords=remove_stopwords_widget
    )

    boxes = widgets.HBox(
        [
            language_widget, statistics_widget, remove_stopwords_widget, progress_widget
        ]
    )
    display(widgets.VBox([boxes, wi.children[-1]]))
    wi.update()

display_token_toplist_interact('../data')

VBox(children=(HBox(children=(Dropdown(description='Language:', index=1, layout=Layout(width='260px'), options…

In [16]:
filename_predicate = None
filename_predicate = filename_predicate or (lambda x: True)

### <span style='color: red'>WORK IN PROGRESS</span> Task: Treaty Keyword Extraction (using TF-IDF weighing)
- [ML Wiki.org](http://mlwiki.org/index.php/TF-IDF)
- [Wikipedia](https://en.wikipedia.org/wiki/Tf%E2%80%93idf)
- Spärck Jones, K. (1972). "A Statistical Interpretation of Term Specificity and Its Application in Retrieval".
- Manning, C.D.; Raghavan, P.; Schutze, H. (2008). "Scoring, term weighting, and the vector space model". ([PDF](http://nlp.stanford.edu/IR-book/pdf/06vect.pdf))
- https://markroxor.github.io/blog/tfidf-pivoted_norm/
$\frac{tf-idf}{\sqrt(rowSums( tf-idf^2 ) )}$
- https://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html

Neural Network Methods in Natural Language Processing, Yoav Goldberg:
![image.png](attachment:image.png)

In [18]:
# Code
from scipy.sparse import csr_matrix
%timeit

    
def get_top_tfidf_words(data, n_top=5):
    top_list = data.groupby(['treaty_id'])\
        .apply(lambda x: x.nlargest(n_top, 'score'))\
        .reset_index(level=0, drop=True)
    return top_list

def compute_tfidf_scores(corpus, dictionary, smartirs='ntc'):
    #model = gensim.models.logentropy_model.LogEntropyModel(corpus, normalize=True)
    model = gensim.models.tfidfmodel.TfidfModel(corpus, dictionary=dictionary, normalize=True) #, smartirs=smartirs)
    rows, cols, scores = [], [], []
    for r, document in enumerate(corpus): 
        vector = model[document]
        c, v = zip(*vector)
        rows += (len(c) * [ int(r) ])
        cols += c
        scores += v
        
    return csr_matrix((scores, (rows, cols)))
    
if True: #'tfidf_cache' not in globals():
    tfidf_cache = {
    }
    
def display_tfidf_scores(source_folder, language, period, n_top=5, threshold=0.001):
    
    global state, tfw, tfidf_cache
    
    try:
        treaties = state.treaties

        tfw.progress.value = 0
        tfw.progress.value += 1
        if language[0] not in tfidf_cache.keys():
            corpus = TreatyCorpusSaveLoad(source_folder=source_folder, lang=language[0])\
                .load_mm_corpus(normalize_by_D=True)
            document_names = corpus.document_names
            dictionary = corpus.dictionary
            _ = dictionary[0]

            tfw.progress.value += 1
            A = compute_tfidf_scores(corpus, dictionary)

            tfw.progress.value += 1
            scores = pd.DataFrame(
                [ (i, j, dictionary.id2token[j], A[i, j]) for i, j in zip(*A.nonzero())],
                columns=['document_id', 'token_id', 'token', 'score']
            )
            tfw.progress.value += 1
            scores = scores.merge(document_names, how='inner', left_on='document_id', right_index=True)\
                .drop(['document_id', 'token_id', 'document_name'], axis=1)

            scores = scores[['treaty_id', 'token', 'score']]\
                .sort_values(['treaty_id', 'score'], ascending=[True, False])

            tfidf_cache[language[0]] = scores

        scores = tfidf_cache[language[0]]
        if threshold > 0:
            scores = scores.loc[scores.score >= threshold]

        tfw.progress.value += 1

        #scores = get_top_tfidf_words(scores, n_top=5)
        #scores = scores.groupby(['treaty_id']).sum() 

        scores = scores.groupby(['treaty_id'])\
            .apply(lambda x: x.nlargest(n_top, 'score'))\
            .reset_index(level=0, drop=True)\
            .set_index('treaty_id')

        if period is not None:
            periods = state.treaties[period]
            scores = scores.merge(periods.to_frame(), left_index=True, right_index=True, how='inner')\
                .groupby([period, 'token']).score.agg([np.mean])\
                .reset_index().rename(columns={0:'score'}) #.sort_values('token')

        #['token'].apply(' '.join)

        display(scores)
    except Exception as ex:
        logger.error(ex)
        
    tfw.progress.value = 0

#if 'tfidf_scores' not in globals():
#    tfidf_scores = compute_document_tfidf(corpus, corpus.dictionary, state.treaties)
#    tfidf_scores = tfidf_scores.sort_values(['treaty_id', 'score'], ascending=[True, False])

tfw = BaseWidgetUtility(
    language=widgets.Dropdown(
        options={
            'English': ('en', 'english'),
            'French': ('fr', 'french'),
            'German': ('de', 'german'),
            'Italian': ('it', 'italian')
        },
        value=('en', 'english'),
        description='Language:', **drop_style
    ),
    remove_stopwords=widgets.ToggleButton(
        description='Remove stopwords', value=True,
        tooltip='Do not include stopwords in token toplist', **toggle_style
    ),    
    n_top=widgets.IntSlider(
        value=5, min=1, max=25, step=1,
        description='Top #:',
        continuous_update=False
    ),
    threshold=widgets.FloatSlider(
        value=0.001, min=0.0, max=0.5, step=0.01,
        description='Threshold:',
        tooltip='Word having a TF-IDF score below this value is filtered out',
        continuous_update=False,
        readout_format='.3f',
    ), 
    period=widgets.Dropdown(
        options={
            '': None,
            'Year': 'signed_year',
            'Default division': 'signed_period',
            'Alt. division': 'signed_period_alt'
        },
        value='signed_period',
        description='Period:', **drop_style
    ),
    output=widgets.Dropdown(
        options={
            '': None,
            'Year': 'signed_year',
            'Default division': 'signed_period',
            'Alt. division': 'signed_period_alt'
        },
        value='signed_period',
        description='Output:', **drop_style
    ),
    progress=widgets.IntProgress(min=0, max=5, step=1, value=0) #, layout=widgets.Layout(width='100%')),
)

itfw = widgets.interactive(
    display_tfidf_scores,
    source_folder='./data',
    language=tfw.language,
    n_top=tfw.n_top,
    threshold=tfw.threshold,
    period=tfw.period
)

boxes = widgets.HBox(
    [
        widgets.VBox([tfw.language, tfw.period]),
        widgets.VBox([tfw.n_top, tfw.threshold]),
        widgets.VBox([tfw.progress, tfw.output])
    ]
)

display(widgets.VBox([boxes, itfw.children[-1]]))
itfw.update()
