## The Culture of International Relations - Text Analysis
### <span style='color: green'>SETUP </span> Prepare and Set Up Notebook <span style='float: right; color: red'>MANDATORY</span>

In [1]:
# Setup
%load_ext autoreload
%autoreload 2

import sys, os, collections, zipfile
import re, typing.re
import nltk, textacy, spacy 
import pandas as pd
import ipywidgets as widgets

sys.path = list(set(['..', '../3_text_analysis']) - set(sys.path)) + sys.path

#import bokeh, bokeh.plotting, bokeh.models, 
import matplotlib.pyplot as plt
import common.utility as utility
import common.widgets_utility as widgets_utility
import common.widgets_config as widgets_config
import common.config as config
import common.utility as utility
import common.treaty_utility as treaty_utility
import common.treaty_state as treaty_repository
import treaty_corpus
#import types, glob
import textacy.keyterms

from beakerx.object import beakerx
from beakerx import *
from IPython.display import display, set_matplotlib_formats

# Notebook-wide logger obtained from the project's utility module.
logger = utility.getLogger('corpus_text_analysis')

# Apply the project's default pandas display settings.
utility.setup_default_pd_display(pd)

# Root folder for input data, relative to the notebook's working directory.
DATA_FOLDER = '../data'
# NOTE(review): PATTERN and PERIOD_GROUP are not referenced in the visible cells;
# presumably used further down or by removed cells - confirm before deleting.
PATTERN = '*.txt'
PERIOD_GROUP = 'years_1945-1972'
# Tag set table: tab-separated file, missing cells replaced by empty strings.
DF_TAGSET = pd.read_csv('../data/tagset.csv', sep='\t').fillna('')
# Treaty index loaded from DATA_FOLDER; handed to the GUIs in later cells.
WTI_INDEX = treaty_repository.load_wti_index(data_folder=DATA_FOLDER)
# Time groupings supplied by the index; later cells read each entry's 'title' and 'fx' keys.
TREATY_TIME_GROUPINGS = WTI_INDEX.get_treaty_time_groupings()

%matplotlib inline
# set_matplotlib_formats('svg')   
#bokeh.plotting.output_notebook()

def current_corpus_container():
    """Return the container exposed by ``textacy_utility.CorpusContainer``.

    ``textacy_utility`` is resolved when the function is called, not when it is
    defined, so the module only has to be imported before the first call.
    """
    return textacy_utility.CorpusContainer.container()


def current_corpus():
    """Return the corpus exposed by ``textacy_utility.CorpusContainer``.

    ``textacy_utility`` is resolved when the function is called, not when it is
    defined, so the module only has to be imported before the first call.
    """
    return textacy_utility.CorpusContainer.corpus()


2019-08-30 16:09:44,844 : INFO : WTI index loaded!


## <span style='color: green'>PREPARE </span> Load and Prepare Corpus <span style='float: right; color: red'>MANDATORY</span>


In [2]:
import textacy_corpus_utility as textacy_utility
import textacy_corpus_gui

# Show the corpus loading GUI for the shared corpus container.
# Any failure is logged (message only) instead of raised, so this cell itself
# never stops a "Run All"; later cells will fail if no corpus was loaded.
try:
    container = current_corpus_container()
    textacy_corpus_gui.display_corpus_load_gui(DATA_FOLDER, WTI_INDEX, container)
except Exception as ex:
    logger.error(ex)


VBox(children=(IntProgress(value=0, layout=Layout(width='90%'), max=5), HBox(children=(Dropdown(description='C…

### <span style='color: green;'>DESCRIBE</span> List of Most Frequent Words<span style='color: blue; float: right'>OPTIONAL</span>

In [6]:
import spacy
import collections
from spacy import attrs

def textacy_doc_to_bow(doc, target='lemma', weighting='count', as_strings=False, include=None):
    """Build a bag-of-words for a single document.

    Parameters
    ----------
    doc :
        Document exposing ``count_by(attr_id, exclude=...)``, ``vocab.strings``
        and, for ``weighting='freq'``, ``_.n_tokens``.
    target : str
        Token attribute to count: ``'lemma'``, ``'lower'`` or ``'orth'``.
    weighting : str
        ``'count'`` for occurrence counts, ``'freq'`` for counts divided by
        ``doc._.n_tokens``.
    as_strings : bool
        If true, keys are strings looked up in ``doc.vocab.strings``;
        otherwise keys are the attribute ids returned by ``doc.count_by``.
    include : callable, optional
        Extra predicate ``token -> bool``; tokens for which it returns a falsy
        value are excluded. Stop words, punctuation and whitespace tokens are
        always excluded.

    Returns
    -------
    dict or collections.Counter
        Mapping from term (id or string) to weight. With ``as_strings=True``
        and ``target='lemma'`` the keys are lower-cased and the weights of
        keys that collide after lower-casing are summed (returned as a Counter).

    Raises
    ------
    AssertionError
        If ``weighting`` or ``target`` is not one of the supported values.
    """
    weighting_keys = {'count', 'freq'}
    target_keys = {'lemma': attrs.LEMMA, 'lower': attrs.LOWER, 'orth': attrs.ORTH}

    assert weighting in weighting_keys, 'unknown weighting: {!r}'.format(weighting)
    assert target in target_keys, 'unknown target: {!r}'.format(target)

    def exclude(token):
        # Built-in exclusions always apply; the caller's predicate can only narrow further.
        if token.is_stop or token.is_punct or token.is_space:
            return True
        return include is not None and not include(token)

    target_weights = doc.count_by(target_keys[target], exclude=exclude)

    # Only the 'freq' weighting needs the document length.
    divisor = doc._.n_tokens if weighting == 'freq' else 1

    if not as_strings:
        if weighting == 'count':
            return target_weights
        # Previously the raw counts were returned here even for weighting='freq'.
        return {word_id: count / divisor for word_id, count in target_weights.items()}

    strings = doc.vocab.strings
    # True division is kept for 'count' as well, so string-keyed weights are always floats.
    bow = {strings[word_id]: count / divisor for word_id, count in target_weights.items()}

    if target == 'lemma':
        # Lemmas may differ only by case; merge them under the lower-cased form.
        merged = collections.Counter()
        for term, weight in bow.items():
            merged[term.lower()] += weight
        bow = merged

    return bow

def compute_list_of_most_frequent_words(
    corpus,
    gui,
    group_by_column='signed_year',
    parties=None,
    target='lemma',
    weighting='count',
    include_pos=None,
    stop_words=None,
    display_score=False
):
    """Compute per-group term scores and ranks for the documents in ``corpus``.

    Parameters
    ----------
    corpus :
        Sized iterable of documents; each document must expose
        ``_.meta`` with ``treaty_id``, ``signed_year``, ``party1`` and ``party2``.
    gui :
        Object with a ``progress`` widget whose ``max`` and ``value`` are updated.
    group_by_column : str
        Column to group by: ``'signed_year'`` or a key of ``TREATY_TIME_GROUPINGS``.
    parties : iterable, optional
        If non-empty, only documents where ``party1`` or ``party2`` is in this
        collection are processed.
    target, weighting :
        Forwarded to ``textacy_doc_to_bow``.
    include_pos : collection, optional
        If given, only tokens whose ``pos_`` is in this collection are counted.
    stop_words : collection, optional
        Lemmas to leave out.
    display_score : bool
        If ``True``, ``'*' + score`` is appended to each term.

    Returns
    -------
    pandas.DataFrame
        Columns ``group_by_column``, ``'term'``, ``'score'`` (summed per group
        and term) and ``'position'`` (1-based rank by descending score within
        each group).
    """
    stop_words = stop_words or set()
    parties_set = set(parties or [])

    def include(token):
        if include_pos is not None and token.pos_ not in include_pos:
            return False
        return token.lemma_ not in stop_words

    def involves_selected_party(doc):
        return bool({doc._.meta['party1'], doc._.meta['party2']} & parties_set)

    docs = corpus if not parties_set else (doc for doc in corpus if involves_selected_party(doc))

    gui.progress.max = len(corpus)

    try:
        # Collect one frame per document and concatenate once: growing a frame
        # with DataFrame.append in a loop is quadratic and append was removed
        # in pandas 2.0.
        frames = []
        for doc in docs:
            doc_freqs = textacy_doc_to_bow(doc, target=target, weighting=weighting, as_strings=True, include=include)
            frames.append(pd.DataFrame({
                'treaty_id': doc._.meta['treaty_id'],
                'signed_year': int(doc._.meta['signed_year']),
                'token': list(doc_freqs.keys()),
                'score': list(doc_freqs.values())
            }))
            gui.progress.value = gui.progress.value + 1

        if frames:
            df_freqs = pd.concat(frames, ignore_index=True)
        else:
            # No matching documents: keep the expected columns so the steps below still work.
            df_freqs = pd.DataFrame({'treaty_id': [], 'signed_year': [], 'token': [], 'score': []})

        df_freqs['signed_year'] = df_freqs.signed_year.astype(int)

        # Add every time-grouping column that is not already present.
        for key, group in TREATY_TIME_GROUPINGS.items():
            if key not in df_freqs.columns:
                df_freqs[key] = group['fx'](df_freqs)

        df_freqs['term'] = df_freqs.token

        # Sum only the score column; the other columns are not needed in the result.
        df_freqs = df_freqs.groupby([group_by_column, 'term'])['score'].sum().reset_index()

        if display_score is True:
            if weighting == 'freq':
                score_text = df_freqs.score.apply('{:,.3f}'.format)
            else:
                score_text = df_freqs.score.astype(str)
            df_freqs['term'] = df_freqs.term + '*' + score_text

        # cumcount on the sorted frame is aligned back to df_freqs by index.
        ranked = df_freqs.sort_values(by=[group_by_column, 'score'], ascending=False)
        df_freqs['position'] = ranked.groupby([group_by_column]).cumcount() + 1

        return df_freqs
    finally:
        # Reset the progress bar even if the computation failed part-way.
        gui.progress.value = 0
    
def display_list_of_most_frequent_words(gui, df):
    """Render ``df`` according to the output type selected in ``gui``.

    * ``'table'`` - display the frame as is.
    * ``'rank'``  - keep rows with ``position <= gui.n_tokens.value`` and display
      the terms unstacked, one row per group and one column per position.
    * anything else - write the frame to ``../data/word_trend_data.xlsx``.
    """
    output_type = gui.output_type.value

    if output_type == 'table':
        display(df)
        return

    if output_type == 'rank':
        group_column = gui.group_by_column.value
        top_ranked = df[df.position <= gui.n_tokens.value]
        ranked_terms = top_ranked[[group_column, 'position', 'term']]
        display(ranked_terms.set_index([group_column, 'position']).unstack())
        return

    filename = '../data/word_trend_data.xlsx'
    df.to_excel(filename)
    print('Excel written: ' + filename)
        
def word_frequency_gui(wti_index, corpus, compute_callback, display_callback):
    """Build and display the word frequency GUI and wire up its callbacks.

    Parameters
    ----------
    wti_index :
        Treaty index; must provide ``get_party_preset_options()`` and
        ``get_countries_list()``.
    corpus :
        Corpus passed to ``textacy_utility.get_most_frequent_words`` and to
        ``compute_callback``.
    compute_callback : callable
        Called with keyword arguments ``corpus``, ``gui``, ``target``,
        ``group_by_column``, ``parties``, ``weighting``, ``include_pos``,
        ``stop_words`` and ``display_score``; its result is handed to
        ``display_callback``.
    display_callback : callable
        Called as ``display_callback(gui, result)`` inside the output widget.

    Returns
    -------
    types.SimpleNamespace
        Namespace holding all widgets of the GUI.
    """
    # Local import: the notebook's setup cell does not import ``types`` explicitly.
    import types

    lw = lambda w: widgets.Layout(width=w)

    include_pos_tags = ['ADJ', 'VERB', 'NUM', 'ADV', 'NOUN', 'PROPN']
    weighting_options = {'Count': 'count', 'Frequency': 'freq'}
    # NOTE(review): the '' option yields False, which is passed on as `target`
    # to compute_callback - confirm the callback accepts it.
    normalize_options = {'': False, 'Lemma': 'lemma', 'Lower': 'lower'}
    pos_options = include_pos_tags

    default_include_pos = ['NOUN', 'PROPN']
    # Initial candidates for the stop word list: the 100 most frequent words.
    frequent_words = [
        x[0] for x in textacy_utility.get_most_frequent_words(corpus, 100, include_pos=default_include_pos)
    ]

    group_by_options = {TREATY_TIME_GROUPINGS[k]['title']: k for k in TREATY_TIME_GROUPINGS}
    output_type_options = [('List', 'table'), ('Rank', 'rank'), ('Excel', 'excel')]
    ngrams_options = {'-': None, '1': [1], '1,2': [1, 2], '1,2,3': [1, 2, 3]}
    party_preset_options = wti_index.get_party_preset_options()
    parties_options = [x for x in wti_index.get_countries_list() if x != 'ALL OTHER']

    gui = types.SimpleNamespace(
        progress=widgets.IntProgress(value=0, min=0, max=5, step=1, description='', layout=lw('98%')),
        parties=widgets.SelectMultiple(description='Parties', options=parties_options, value=[], rows=7, layout=lw('200px')),
        party_preset=widgets_config.dropdown('Presets', party_preset_options, None, layout=lw('200px')),
        ngrams=widgets.Dropdown(description='n-grams', options=ngrams_options, value=None, layout=lw('200px')),
        min_word=widgets.Dropdown(description='Min length', options=[1, 2, 3, 4], value=1, layout=lw('200px')),
        normalize=widgets.Dropdown(description='Normalize', options=normalize_options, value='lemma', layout=lw('200px')),
        weighting=widgets.Dropdown(description='Weighting', options=weighting_options, value='freq', layout=lw('200px')),
        include_pos=widgets.SelectMultiple(description='POS', options=pos_options, value=default_include_pos, rows=7, layout=lw('150px')),
        stop_words=widgets.SelectMultiple(description='STOP', options=frequent_words, value=list([]), rows=7, layout=lw('200px')),
        group_by_column=widgets.Dropdown(description='Group by', value='signed_year', options=group_by_options, layout=lw('200px')),
        output_type=widgets.Dropdown(description='Output', value='rank', options=output_type_options, layout=lw('200px')),
        n_tokens=widgets.IntSlider(description='#tokens', value=25, min=3, max=500, layout=lw('250px')),
        compute=widgets.Button(description='Compute', button_style='Success', layout=lw('120px')),
        display_score=widgets.ToggleButton(description='Display score', icon='check', value=False, layout=lw('120px')),
        output=widgets.Output(layout={'border': '1px solid black'})
    )

    boxes = widgets.VBox([
        gui.progress,
        widgets.HBox([
            widgets.VBox([
                gui.normalize,
                gui.ngrams,
                gui.weighting,
                gui.group_by_column,
                gui.output_type,
            ]),
            widgets.VBox([
                gui.parties,
                gui.party_preset,
            ]),
            gui.include_pos,
            gui.stop_words,
            widgets.VBox([
                gui.n_tokens,
                gui.display_score,
                gui.compute,
            ], layout=widgets.Layout(align_items='flex-end')),
        ]),
        gui.output
    ])

    display(boxes)

    def on_party_preset_change(change):  # pylint: disable=W0613
        """Copy the chosen preset into the parties selection ('ALL' selects every option)."""
        if gui.party_preset.value is None:
            return
        gui.parties.value = gui.parties.options if 'ALL' in gui.party_preset.value else gui.party_preset.value

    gui.party_preset.observe(on_party_preset_change, names='value')

    def pos_change_handler(*args):
        """Refresh the stop word candidates, keeping the still-valid selection."""
        with gui.output:
            gui.compute.disabled = True
            try:
                selected = set(gui.stop_words.value)
                frequent_words = [
                    x[0] for x in textacy_utility.get_most_frequent_words(
                        corpus,
                        100,
                        normalize=gui.normalize.value,
                        include_pos=gui.include_pos.value,
                        weighting=gui.weighting.value
                    )
                ]
                gui.stop_words.options = frequent_words
                selected = selected & set(gui.stop_words.options)
                gui.stop_words.value = list(selected)
            finally:
                # Never leave the button disabled if refreshing the list failed.
                gui.compute.disabled = False

    gui.include_pos.observe(pos_change_handler, 'value')
    gui.weighting.observe(pos_change_handler, 'value')

    def compute_callback_handler(*_args):
        """Run the computation and display its result inside the output widget."""
        gui.output.clear_output()
        with gui.output:
            gui.compute.disabled = True
            try:
                df_freqs = compute_callback(
                    corpus=corpus,
                    gui=gui,
                    target=gui.normalize.value,
                    group_by_column=gui.group_by_column.value,
                    parties=gui.parties.value,
                    weighting=gui.weighting.value,
                    include_pos=gui.include_pos.value,
                    stop_words=set(gui.stop_words.value),
                    display_score=gui.display_score.value
                )
                display_callback(gui, df_freqs)
            finally:
                # Re-enable the button; it used to stay disabled after the first click.
                gui.compute.disabled = False

    gui.compute.on_click(compute_callback_handler)
    return gui
                
# Build and show the word frequency GUI for the currently loaded corpus.
# Failures are logged and then re-raised, so the error stays visible in the cell.
try:
    word_frequency_gui(
        WTI_INDEX,
        current_corpus(),
        compute_callback=compute_list_of_most_frequent_words,
        display_callback=display_list_of_most_frequent_words
    )
except Exception as ex:
    logger.error(ex)
    raise
    


VBox(children=(IntProgress(value=0, layout=Layout(width='98%'), max=5), HBox(children=(VBox(children=(Dropdown…