## The Culture of International Relations - Text Analysis
### <span style='color: green'>SETUP </span> Prepare and Setup Notebook <span style='float: right; color: red'>MANDATORY</span>

In [1]:
# Setup
%load_ext autoreload
%autoreload 2

import sys
import ipywidgets as widgets

# Prepend the project folders that are not already on sys.path. A comprehension
# over an ordered tuple (instead of set arithmetic) keeps the result
# deterministic: '..' is always searched before '../3_text_analysis'.
sys.path = [p for p in ('..', '../3_text_analysis') if p not in sys.path] + sys.path

import matplotlib.pyplot as plt
import common.utility as utility
import common.treaty_utility as treaty_utility
import common.treaty_state as treaty_repository
import treaty_corpus
import textacy_corpus_utility as textacy_utility
from beakerx.object import beakerx
from beakerx import *
from IPython.display import display, set_matplotlib_formats

# Logger used by the cells of this notebook to report errors.
logger = utility.getLogger('corpus_text_analysis')

# Data folder (relative path) handed to the loaders and the corpus-load GUI.
DATA_FOLDER = '../data'
# WTI index loaded once from DATA_FOLDER; passed to the corpus-load GUI below.
WTI_INDEX = treaty_repository.load_wti_index(data_folder=DATA_FOLDER)

%matplotlib inline

def current_corpus_container():
    """Return the result of `textacy_utility.CorpusContainer.container()`."""
    return textacy_utility.CorpusContainer.container()


def current_corpus():
    """Return the result of `textacy_utility.CorpusContainer.corpus()`."""
    return textacy_utility.CorpusContainer.corpus()

import textacy_corpus_gui

# Show the corpus-load GUI for the current container. Any failure is logged
# rather than raised, so the cell itself never aborts.
try:
    container = current_corpus_container()
    textacy_corpus_gui.display_corpus_load_gui(DATA_FOLDER, WTI_INDEX, container)
except Exception as ex:
    # NOTE(review): only the exception message is logged, the traceback is lost — confirm this is intended.
    logger.error(ex)


2019-09-27 14:40:08,150 : INFO : Reading file: Treaties_Master_List_Treaties.csv...
2019-09-27 14:40:08,605 : INFO : Reading file: country_continent.csv...
2019-09-27 14:40:08,612 : INFO : Reading file: parties_curated_parties.csv...
2019-09-27 14:40:08,622 : INFO : Reading file: parties_curated_continent.csv...
2019-09-27 14:40:08,628 : INFO : Reading file: parties_curated_group.csv...
2019-09-27 14:40:08,694 : INFO : WTI index loaded!


VBox(children=(IntProgress(value=0, layout=Layout(width='90%'), max=5), HBox(children=(Dropdown(description='C…

## Compute per-year document frequencies of a word

In [3]:

import pandas as pd

def compute_doc_frequencies(corpus, token_to_find, normalize='lemma'):
    """Compute, per signing year, the share of documents that contain a word.

    Parameters
    ----------
    corpus : iterable
        Documents to scan. Each document must be iterable over tokens that
        expose ``lemma_``, ``lower_`` and ``orth_``, and must provide the
        signing year as ``doc._.meta['signed_year']``.
    token_to_find : str
        The word to look for.
    normalize : {'lemma', 'lower', 'orth'}, default 'lemma'
        Token attribute the word is compared against. ``'orth'`` compares the
        text as-is (case-sensitive); ``'lemma'`` and ``'lower'`` compare
        case-insensitively.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``signed_year`` (in order of first appearance in ``corpus``)
        with the columns ``total_count`` (documents that year),
        ``match_count`` (documents containing the word) and ``doc_frequency``
        (``match_count / total_count`` as a percentage). An empty corpus
        yields an empty frame with the same columns and index name.

    Raises
    ------
    KeyError
        If ``normalize`` is not one of the supported keys, or a document's
        meta has no ``'signed_year'`` entry.
    """
    extractors = {
        # The query is lower-cased for 'lemma', so the lemma must be lower-cased
        # too; otherwise lemmas that keep their casing could never match.
        'lemma': lambda token: token.lemma_.lower(),
        'lower': lambda token: token.lower_,
        'orth': lambda token: token.orth_,
    }
    extract = extractors[normalize]
    needle = token_to_find if normalize == 'orth' else token_to_find.lower()

    columns = ['total_count', 'match_count', 'doc_frequency']
    per_year = {}
    for doc in corpus:
        signed_year = doc._.meta['signed_year']
        entry = per_year.setdefault(
            signed_year,
            {'signed_year': signed_year, 'match_count': 0, 'total_count': 0},
        )
        entry['total_count'] += 1
        # any() stops at the first hit instead of materialising every token.
        if any(extract(token) == needle for token in doc):
            entry['match_count'] += 1

    if not per_year:
        # No documents: return a well-formed empty frame instead of failing in
        # set_index() on a frame that has no 'signed_year' column.
        return pd.DataFrame(columns=columns, index=pd.Index([], name='signed_year'))

    df = pd.DataFrame(list(per_year.values())).set_index('signed_year')
    # total_count is at least 1 for every year present, so no division by zero.
    df['doc_frequency'] = (df.match_count / df.total_count) * 100
    return df[columns]

def word_doc_frequencies_gui(corpus):
    """Display a small widget GUI that tabulates per-year document frequencies of a word.

    The GUI consists of a normalisation dropdown, a text field for the word,
    a "Compute" button and an output area. Clicking the button calls
    ``compute_doc_frequencies`` on ``corpus`` with the current widget values
    and displays the resulting frame in the output area.

    Parameters
    ----------
    corpus
        The corpus handed unchanged to ``compute_doc_frequencies``. It is
        captured when the GUI is built, so later changes to which corpus is
        "current" are not picked up until this function is called again.
    """
    # Imported locally: the notebook never imports `types` explicitly, so the
    # name would otherwise only exist if a star import happened to provide it.
    import types

    normalize_options = {'Text': 'orth', 'Lemma': 'lemma', 'Lower': 'lower'}

    gui = types.SimpleNamespace(
        normalize=widgets.Dropdown(description='Normalize', options=normalize_options, value='lemma', layout=widgets.Layout(width='200px')),
        compute=widgets.Button(description='Compute', button_style='Success', layout=widgets.Layout(width='120px')),
        token=widgets.Text(description='Word'),
        output=widgets.Output(layout={'border': '1px solid black'})
    )

    boxes = widgets.VBox([
        widgets.HBox([
            gui.normalize,
            gui.token,
            gui.compute
        ]),
        gui.output
    ])

    display(boxes)

    def compute_callback_handler(*_args):
        """Recompute and show the frequency table for the current widget values."""
        gui.output.clear_output()
        with gui.output:
            try:
                # Disable the button while computing to avoid overlapping runs.
                gui.compute.disabled = True
                df_counts = compute_doc_frequencies(
                    corpus=corpus,
                    # Surrounding whitespace typed into the field would make
                    # every comparison fail silently, so drop it.
                    token_to_find=gui.token.value.strip(),
                    normalize=gui.normalize.value
                )
                display(df_counts)
            except Exception as ex:
                logger.error(ex)
                raise
            finally:
                gui.compute.disabled = False

    gui.compute.on_click(compute_callback_handler)

# Build the word-frequency GUI for the corpus that is current when this cell
# runs. Failures are logged rather than raised, so the cell never aborts.
try:
    word_doc_frequencies_gui(current_corpus())
except Exception as ex:
    # NOTE(review): only the exception message is logged, the traceback is lost — confirm this is intended.
    logger.error(ex)

VBox(children=(HBox(children=(Dropdown(description='Normalize', index=2, layout=Layout(width='200px'), options…