## The Culture of International Relations - Text Analysis
### <span style='color: green'>SETUP </span> Prepare and Setup Notebook <span style='float: right; color: red'>MANDATORY</span>

In [3]:
# Setup
%load_ext autoreload
%autoreload 2

import sys, os, collections, zipfile
import re, typing.re

sys.path = list(set(['.', '..', '../..', '../../3_text_analysis']) - set(sys.path)) + sys.path

import nltk, textacy, spacy 
import pandas as pd
import ipywidgets as widgets
import bokeh, bokeh.plotting, bokeh.models, matplotlib.pyplot as plt
import common.utility as utility
import common.widgets_utility as widgets_utility
import common.widgets_config as widgets_config
import common.config as config
import common.utility as utility
import common.treaty_utility as treaty_utility
import common.treaty_state as treaty_repository
import treaty_corpus
import types, glob

from beakerx.object import beakerx
from beakerx import *
from IPython.display import display, set_matplotlib_formats

# Notebook-wide logger, obtained from the project's utility module.
logger = utility.getLogger('corpus_text_analysis')

# pandas display options: no limit on rendered columns/rows, left-justified headers.
# NOTE(review): max_rows = None renders any displayed frame in full; prefer .head() in cells.
pd.options.display.max_columns = None
pd.options.display.max_rows = None
pd.options.display.colheader_justify = 'left'

# Notebook constants.
# NOTE(review): PATTERN, PERIOD_GROUP, DF_TAGSET and GPE_FILENAME are not referenced by any
# cell visible here — presumably consumed by cells or modules outside this view; confirm.
PATTERN = '*.txt'
PERIOD_GROUP = 'years_1945-1972'
# Tab-separated tag set table; missing values are replaced by empty strings.
DF_TAGSET = pd.read_csv(os.path.join(config.DATA_FOLDER, 'tagset.csv'), sep='\t').fillna('')
# Presumably loads the WTI treaty index (with a GUI) so that
# treaty_repository.current_wti_index() is available later — TODO confirm.
treaty_repository.load_wti_index_with_gui(data_folder=config.DATA_FOLDER)
GPE_FILENAME = os.path.join(config.DATA_FOLDER, 'gpe_substitutions.txt')

# Render matplotlib figures inline and send bokeh output to the notebook.
%matplotlib inline
bokeh.plotting.output_notebook()

# NOTE(review): `textacy_utility` is not imported by any cell visible in this notebook;
# both accessors below raise NameError when called unless it is bound elsewhere — confirm.
def current_corpus_container():
    """Return whatever `textacy_utility.CorpusContainer.container()` currently yields."""
    return textacy_utility.CorpusContainer.container()


def current_corpus():
    """Return whatever `textacy_utility.CorpusContainer.corpus()` currently yields."""
    return textacy_utility.CorpusContainer.corpus()


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


ImportError: No module named 'py4j'

## <span style='color: green'>PREPARE </span> Load and Prepare Corpus <span style='float: right; color: red'>MANDATORY</span>


In [None]:
# NOTE(review): imported here rather than in the setup cell's import block.
import textacy_corpus_gui
try:
    # Presumably renders the corpus-loading GUI; it receives the data folder, the current
    # WTI index and the shared corpus container.
    textacy_corpus_gui.display_corpus_load_gui(config.DATA_FOLDER, treaty_repository.current_wti_index(), current_corpus_container())
except Exception as ex:
    # Best-effort: the failure is logged (message only, no traceback) and the cell continues.
    logger.error(ex)

## <span style='color: green'>EXPLORE </span> Document Similarity <span style='float: right; color: red'>WORK IN PROGRESS</span>


In [None]:
import scipy

def sumorial(n):
    """Return the n-th triangular number, 1 + 2 + ... + n = n * (n + 1) / 2.

    Integer inputs are computed with exact integer arithmetic, so the result
    stays correct beyond the 2**53 precision limit of float division.
    Non-integer inputs keep the original behaviour: the float result is
    truncated towards zero by `int`.
    """
    product = n * (n + 1)
    if isinstance(product, int):
        # n * (n + 1) is always even for integers, so floor division is exact.
        return product // 2
    return int(product / 2)

def compute_similarity(corpus, metric, extract_token_args, tick=utility.noop):
    """Score every unordered pair of documents in `corpus` with `metric`.

    Parameters
    ----------
    corpus
        Passed unchanged to `textacy_utility.extract_corpus_terms`.
    metric
        Object exposing `score(tokens_a, tokens_b)`; called once per pair.
    extract_token_args
        Passed unchanged to `textacy_utility.extract_corpus_terms`.
    tick
        Progress callback. Called as `tick(0)` before and after the run and
        as `tick()` once per document.

    Returns
    -------
    scipy.sparse.coo_matrix
        Matrix of shape (n_docs, n_docs) holding one stored entry per pair
        (i, j) with i < j, where i and j are document positions in the order
        produced by `extract_corpus_terms`. The data dtype is inferred from
        the scores, so fractional scores are not truncated.
    """
    # Imported locally: the notebook's import cell binds neither `np` nor the
    # `scipy.sparse` submodule explicitly.
    import numpy as np
    import scipy.sparse

    # NOTE(review): `textacy_utility` is not imported in the visible notebook — confirm.
    document_tokens = [ list(x) for x in textacy_utility.extract_corpus_terms(corpus, extract_token_args) ]
    n_docs = len(document_tokens)

    tick(0)
    rows, cols, scores = [], [], []
    # Iterate over documents (not over the number of pairs) and start at 0 so
    # that the first document takes part in the comparison.
    for i in range(n_docs):
        tick()
        for j in range(i + 1, n_docs):
            scores.append(metric.score(document_tokens[i], document_tokens[j]))
            rows.append(i)
            cols.append(j)

    # Only the pairs actually scored are stored; no padding triplets at (0, 0).
    m = scipy.sparse.coo_matrix(
        (np.array(scores), (np.array(rows, dtype=int), np.array(cols, dtype=int))),
        shape=(n_docs, n_docs)
    )
    tick(0)
    return m

# Options handed to compute_similarity, which forwards them to
# textacy_utility.extract_corpus_terms.
extract_token_args = dict(
    args=dict(
        ngrams=[ 1 ],
        named_entities=False,
        normalize='lemma',
        as_strings=True
    ),
    kwargs=dict(
        min_freq=2,
        include_pos=['NOUN', 'PROPN'],
        filter_stops=True,
        filter_punct=True
    ),
    mask_gpe=True,
    min_freq=2, # tokens below this threshold are added to extra_stop_words
    max_doc_freq=100,
    extra_stop_words=set([]),
    min_length=2
)

# Use the accessor defined in the setup cell; `get_current_corpus` is not
# defined anywhere in this notebook.
corpus = current_corpus()

# NOTE(review): `Hirschberg` is neither defined nor imported in the visible
# notebook; it must be brought into scope before this cell can run.
metric = Hirschberg()

# The progress bar is sized from the corpus itself rather than from
# `document_tokens`, which no earlier cell defines. compute_similarity ticks
# once per document (assumes one token list per corpus document — confirm).
gui = types.SimpleNamespace(
    progress=widgets.IntProgress(min=0, max=len(corpus), value=0)
)

display(gui.progress)

def tick(n=None):
    """Set the progress bar to `n`, or advance it by one when `n` is None."""
    gui.progress.value = n if n is not None else gui.progress.value + 1

m = compute_similarity(corpus, metric, extract_token_args, tick=tick)

# Maps each document's treaty id to its position in the corpus.
treaty_index = { doc.metadata['treaty_id']: i for i, doc in enumerate(corpus) }

#df.to_excel('hirschberg_scores_lemma_noun.xlsx')


In [None]:
# One row per stored entry of the sparse score matrix `m` (its row, column and value arrays).
# NOTE(review): despite the column names, m.row / m.col hold the loop indices stored by
# compute_similarity, i.e. document positions, not treaty ids — confirm before joining on them.
df = pd.DataFrame({ 'treaty_i': list(m.row), 'treaty_j': list(m.col), 'score': list(m.data) })
df.head()

In [None]:
def _attach_treaty_info(scores, key, suffix):
    """Inner-join treaty metadata onto `scores` for the treaty referenced by column `key`.

    `scores` is indexed by `key` and joined index-to-index with the
    'signed_year', 'party1' and 'party2' columns of the current WTI index's
    `treaties` table. The joined columns get `suffix` appended, and a
    resulting column named 'index' is renamed back to `key`.

    NOTE(review): this assumes the values in `key` match the index of
    `treaties`; verify that they are treaty ids rather than document positions.
    """
    treaty_columns = ['signed_year', 'party1', 'party2']
    # `WTI_INDEX` is never defined in this notebook; fetch the index the same
    # way the corpus-load cell does.
    treaties = treaty_repository.current_wti_index().treaties[treaty_columns]
    renamed = { 'index': key }
    renamed.update({ name: name + suffix for name in treaty_columns })
    return (
        scores.set_index(key)
        .merge(treaties, how='inner', left_index=True, right_index=True)
        .reset_index()
        .rename(columns=renamed)
    )

df_i = _attach_treaty_info(df, 'treaty_i', '_i')
df_ij = _attach_treaty_info(df_i, 'treaty_j', '_j')

# NOTE(review): `textacy_utility` is not imported in the visible notebook — confirm.
document_tokens = [ list(x) for x in textacy_utility.extract_corpus_terms(corpus, extract_token_args) ]

#treaty_index
# NOTE(review): the two output areas are displayed but nothing is written to them yet.
output_left = widgets.Output()
output_right = widgets.Output()

display(widgets.HBox([output_left, output_right]))
df_ij.sort_values('score', ascending=False).head()


### Textacy Similarity

```
textacy.similarity.word_movers(doc1, doc2, metric='cosine')
Measure the semantic similarity between two documents using Word Movers Distance.

Parameters:	
doc1 (textacy.Doc or spacy.Doc) –
doc2 (textacy.Doc or spacy.Doc) –
metric ({'cosine', 'euclidean', 'l1', 'l2', 'manhattan'}) –
Returns:	
Similarity between doc1 and doc2 in the interval [0.0, 1.0], where larger values correspond to more similar documents.

Return type:	
float
```


In [None]:
# NOTE(review): imported here rather than in the setup cell's import block.
import textacy.similarity

# Current corpus via the accessor defined in the setup cell.
corpus = current_corpus()

# Hard-coded pair of documents to compare: positions 0 and 3 of the corpus.
doc1 = corpus[0]
doc2 = corpus[3]

# (label, value) pairs for the metric dropdown; the values are the metric names listed in
# the word_movers documentation quoted above this cell.
metric_options = [ ('Cosine', 'cosine'), ('Euclidean', 'euclidean'), ('L1', 'l1'), ('L2', 'l2'), ('Manhattan', 'manhattan') ] 

# NOTE(review): this rebinds `gui`, the name tick() in the similarity cell reads
# `gui.progress` from; the new namespace has no `progress` attribute.
gui = types.SimpleNamespace(
    metric=widgets.Dropdown(description='Metric', options=metric_options, value=metric_options[0][1]),
    index=widgets.IntSlider(description='Document', min=0,max=len(corpus)-1, step=1, value=0)
)

display(gui.metric)
display(gui.index)

# NOTE(review): the widgets above are displayed but never read; the score always uses
# doc1/doc2 and metric='cosine'.
score = textacy.similarity.word_movers(doc1, doc2, metric='cosine')

score
