## The Culture of International Relations - Text Analysis
### <span style='color: green'>SETUP </span> Prepare and Set Up Notebook <span style='float: right; color: red'>MANDATORY</span>

In [None]:

import ipywidgets as widgets
import types
import pandas as pd

from common import treaty_state
from common import config
from common.corpus import textacy_corpus_utility as textacy_utility
from IPython.display import display
from loguru import logger

# Load the WTI index from the configured data folder.
# NOTE(review): judging by its name the helper also displays a GUI while loading —
# confirm in common.treaty_state.
treaty_state.load_wti_index_with_gui(data_folder=config.DATA_FOLDER)

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

def current_corpus_container():
    """Return whatever ``textacy_utility.CorpusContainer.container()`` currently yields."""
    return textacy_utility.CorpusContainer.container()


def current_corpus():
    """Return whatever ``textacy_utility.CorpusContainer.corpus()`` currently yields."""
    return textacy_utility.CorpusContainer.corpus()

from common.gui import textacy_corpus_gui

# Show the corpus load GUI for the current corpus container.
# Any exception is logged and swallowed, so a failure here only produces a log line
# instead of a traceback in the cell output.
try:
    # `container` is kept as a notebook-level name (it is not local to this block).
    container: textacy_utility.CorpusContainer = current_corpus_container()
    textacy_corpus_gui.display_corpus_load_gui(config.DATA_FOLDER, treaty_state.current_wti_index(), container)
except Exception as ex:
    logger.error(ex)


## Compute per-year document frequencies for a word

N.B. *All* documents in the selected text corpus are used in this computation (WTI index selection above has no effect in this notebook)!


In [None]:

def compute_doc_frequencies(corpus, token_to_find, normalize='lemma'):
    """Compute, per signing year, the percentage of documents containing a word.

    Args:
        corpus: Sized iterable of documents. Each document must expose
            ``doc._.meta['signed_year']`` and iterate over tokens that have
            ``lemma_``, ``lower_`` and ``orth_`` attributes.
        token_to_find: The word to look for. It is lower-cased unless
            ``normalize`` is ``'orth'``.
        normalize: Which token attribute to compare against: ``'lemma'``,
            ``'lower'`` or ``'orth'`` (exact text, case-sensitive).

    Returns:
        A DataFrame indexed by ``signed_year`` with the columns
        ``total_count`` (documents that year), ``match_count`` (documents
        containing the word) and ``doc_frequency`` (match share in percent).
        An empty corpus yields an empty DataFrame with the same columns.

    Raises:
        KeyError: If ``normalize`` is not a supported option, or a document's
            meta data has no ``'signed_year'`` entry.
    """
    logger.info("Current corpus has {} documents".format(len(corpus)))

    normalizers = {
        # The search term is lower-cased for 'lemma', so the lemma must be
        # lower-cased as well; otherwise lemmas that keep their capitalisation
        # (e.g. proper nouns, depending on the spaCy version) can never match.
        'lemma': lambda token: token.lemma_.lower(),
        'lower': lambda token: token.lower_,
        'orth': lambda token: token.orth_,
    }
    normalizer = normalizers[normalize]

    if normalize != 'orth':
        token_to_find = token_to_find.lower()

    counts_by_year = {}
    for doc in corpus:
        signed_year = doc._.meta['signed_year']
        year_counts = counts_by_year.setdefault(
            signed_year,
            {'signed_year': signed_year, 'match_count': 0, 'total_count': 0},
        )
        year_counts['total_count'] += 1
        # any() stops at the first hit instead of normalising every token.
        if any(normalizer(token) == token_to_find for token in doc):
            year_counts['match_count'] += 1

    columns = ['total_count', 'match_count', 'doc_frequency']

    if not counts_by_year:
        # Without rows there is no 'signed_year' column to index on, so build
        # the empty result explicitly rather than failing in set_index().
        return pd.DataFrame(columns=columns, index=pd.Index([], name='signed_year'))

    df = pd.DataFrame(list(counts_by_year.values())).set_index('signed_year')
    df['doc_frequency'] = (df.match_count / df.total_count) * 100
    return df[columns]

def word_doc_frequencies_gui(corpus):
    """Display a small GUI for computing per-year document frequencies of a word.

    The GUI consists of a normalisation dropdown, a text field for the word,
    a "Compute" button and an output area. Clicking the button runs
    ``compute_doc_frequencies`` on ``corpus`` and displays the resulting table
    in the output area.

    Args:
        corpus: The corpus passed on unchanged to ``compute_doc_frequencies``.
            It is captured when this function is called; the click handler
            always uses that same object.
    """
    normalize_options = {'Text': 'orth', 'Lemma': 'lemma', 'Lower': 'lower'}

    gui = types.SimpleNamespace(
        normalize=widgets.Dropdown(description='Normalize', options=normalize_options, value='lemma', layout=widgets.Layout(width='200px')),
        compute=widgets.Button(description='Compute', button_style='Success', layout=widgets.Layout(width='120px')),
        token=widgets.Text(description='Word'),
        output=widgets.Output(layout={'border': '1px solid black'})
    )

    boxes = widgets.VBox([
        widgets.HBox([
            gui.normalize,
            gui.token,
            gui.compute
        ]),
        gui.output
    ])

    display(boxes)

    def compute_callback_handler(*_args):
        """Run the computation for the current widget values and show the result."""
        gui.output.clear_output()
        with gui.output:
            # Surrounding whitespace from the text field would otherwise make
            # the word silently match nothing.
            token = gui.token.value.strip()
            if not token:
                # Nothing to search for; skip the scan over the whole corpus.
                logger.warning("Please enter a word to search for")
                return

            # Disable the button while computing to prevent overlapping runs.
            gui.compute.disabled = True
            try:
                df_counts = compute_doc_frequencies(
                    corpus=corpus,
                    token_to_find=token,
                    normalize=gui.normalize.value
                )
                display(df_counts)
            except Exception as ex:
                logger.error(ex)
                raise
            finally:
                gui.compute.disabled = False

    gui.compute.on_click(compute_callback_handler)

# Build and display the word frequency GUI for the corpus that is current at the
# time this cell runs. Any exception is logged and swallowed rather than raised.
try:
    word_doc_frequencies_gui(current_corpus())
except Exception as ex:
    logger.error(ex)