## The Culture of International Relations - Text Analysis
### <span style='color: green'>SETUP </span> Prepare and Setup Notebook <span style='float: right; color: red'>MANDATORY</span>

In [1]:
# Setup
%load_ext autoreload
%autoreload 2

import sys, os, collections, zipfile
import re, typing.re

sys.path = list(set(['.', '..']) - set(sys.path)) + sys.path

import nltk, textacy, spacy 
import pandas as pd
import ipywidgets as widgets
import bokeh, bokeh.plotting, bokeh.models, matplotlib.pyplot as plt
import common.utility as utility
import common.widgets_utility as widgets_utility
import common.widgets_config as widgets_config
import common.config as config
import common.utility as utility
import common.treaty_utility as treaty_utility
import common.treaty_state as treaty_repository
import treaty_corpus
import types, glob

from beakerx.object import beakerx
from beakerx import *
from IPython.display import display, set_matplotlib_formats

logger = utility.getLogger('corpus_text_analysis')

pd.options.display.max_columns = None
pd.options.display.max_rows = None
#pd.options.display.max_colwidth = -1
pd.options.display.colheader_justify = 'left'
#pd.options.display.precision = 4

DATA_FOLDER = '../data'
PATTERN = '*.txt'
PERIOD_GROUP = 'years_1945-1972'
DF_TAGSET = pd.read_csv('../data/tagset.csv', sep='\t').fillna('')
WTI_INDEX = treaty_repository.load_wti_index(data_folder=DATA_FOLDER)

%matplotlib inline
# set_matplotlib_formats('svg')   
bokeh.plotting.output_notebook()

class CorpusNotLoaded(Exception):
    """Raised when no corpus has been loaded or computed yet."""

def get_current_corpus():
    """Return the notebook-global CURRENT_CORPUS container once its corpus is set.

    Raises:
        CorpusNotLoaded: if CURRENT_CORPUS is not defined, or its
            ``textacy_corpus`` attribute is None.
    """
    namespace = globals()
    if 'CURRENT_CORPUS' not in namespace or namespace['CURRENT_CORPUS'].textacy_corpus is None:
        raise CorpusNotLoaded('Corpus not loaded or computed')
    return namespace['CURRENT_CORPUS']


2018-12-23 04:59:35,450 : INFO : WTI index loaded!


In [22]:
df = WTI_INDEX.treaties[WTI_INDEX.treaties.is_cultural].head()
td = TableDisplay(df[['party1']])

def highlight(row, column, td):
    """Cell-highlighter callback: echo the column index and cell value, colour every cell red.

    Args:
        row: row position of the cell.
        column: column position of the cell.
        td: table object whose ``values[row][column]`` holds the cell value.

    Returns:
        Color.RED, unconditionally.
    """
    print(column)
    cell_value = td.values[row][column]
    print(cell_value)
    # A per-value choice (e.g. a different colour for 'FRANCE') is not implemented.
    return Color.RED

td.addCellHighlighter(highlight)



In [None]:
%%groovy
// Groovy TableDisplay example. The body is Groovy, not Python, so the cell is
// marked with BeakerX's %%groovy cell magic instead of being sent to the Python kernel.
// NOTE(review): confirm the BeakerX polyglot magics are available in this environment.
def map = [
   [a:1, b:2, c:3],
   [a:4, b:5, c:6],
   [a:7, b:8, c:5]
]
def display5 = new TableDisplay(map)
// Only column index 2 is coloured: red below 5, green otherwise.
display5.addCellHighlighter { row, column, tableDisplay ->
  if (column == 2) {
    display5.values[row][column] < 5 ? Color.RED : Color.GREEN
  }
}
display5

## <span style='color: green'>PREPARE </span> Load and Prepare Corpus <span style='float: right; color: red'>MANDATORY</span>


In [None]:
import textacy_corpus_utility as textacy_utility
import textacy_corpus_gui

# Keep an already-populated CURRENT_CORPUS when the cell is re-run; otherwise
# start from an empty container that the load GUI receives.
CURRENT_CORPUS = globals().setdefault(
    'CURRENT_CORPUS',
    types.SimpleNamespace(
        language=None,
        source_path=None,
        prepped_source_path=None,
        textacy_corpus_path=None,
        textacy_corpus=None,
        nlp=None,
    ),
)

try:
    textacy_corpus_gui.display_corpus_load_gui(DATA_FOLDER, WTI_INDEX, CURRENT_CORPUS)
except Exception as load_error:
    # Best effort: report the failure through the logger instead of breaking the notebook run.
    logger.error(load_error)


## <span style='color: green'>EXPLORE </span> Document Similarity <span style='float: right; color: red'>WORK IN PROGRESS</span>


In [None]:
from similarity.alignment.alignment import Needleman, Hirschberg

# Raises CorpusNotLoaded when no corpus has been loaded yet.
corpus = get_current_corpus().textacy_corpus

# Part-of-speech ids that similarity_prepare_doc always drops.
SIMILARITY_SKIP_POS = {spacy.symbols.SPACE, spacy.symbols.NUM}
# NOTE(review): not referenced by the cells visible in this notebook.
SIMILARITY_INCLUDE_POS = {spacy.symbols.NOUN}

def similarity_prepare_token(x):
    """Map a token to its alignment symbol: the lemma for nouns, the POS tag name otherwise."""
    return x.lemma_ if x.pos == spacy.symbols.NOUN else x.pos_

def similarity_prepare_doc(doc, include_pos=None, min_length=2):
    """Reduce a document to the list of symbols used for sequence alignment.

    Args:
        doc: iterable of tokens exposing ``pos``, ``is_alpha``, ``is_stop`` and ``len()``.
        include_pos: optional container of POS ids; when given, only tokens whose
            ``pos`` is in it are kept.
        min_length: minimum ``len(token)`` for a token to be kept.

    Returns:
        List of ``similarity_prepare_token(token)`` for every kept token, in document order.
    """
    def keep(token):
        # Tokens with a POS id in SIMILARITY_SKIP_POS are always dropped.
        if token.pos in SIMILARITY_SKIP_POS:
            return False
        if not (token.is_alpha and not token.is_stop and len(token) >= min_length):
            return False
        return include_pos is None or token.pos in include_pos

    return [similarity_prepare_token(token) for token in doc if keep(token)]

# Corpus positions of the two treaties to align; named once instead of repeated inline.
DOC_INDEX_A, DOC_INDEX_B = 56, 62
doc_a, doc_b = corpus[DOC_INDEX_A], corpus[DOC_INDEX_B]

seqa = similarity_prepare_doc(doc_a, include_pos=[spacy.symbols.NOUN], min_length=4)
seqb = similarity_prepare_doc(doc_b, include_pos=[spacy.symbols.NOUN], min_length=4)

# Align using Needleman-Wunsch algorithm.
n = Needleman()
a, b = n.align(seqa, seqb)

# Trim both aligned sequences to the shorter length so the columns line up.
l = min(len(a), len(b))
df = pd.DataFrame(
    {
        'Doc #1 ' + doc_a.metadata['treaty_id']: a[:l],
        'Doc #2 ' + doc_b.metadata['treaty_id']: b[:l]
    }
)
display(df)


In [None]:
# Score every unordered pair of corpus documents with Hirschberg and export the table.
n = Hirschberg()

# Prepare each document once; the sequences are reused for every pair they take part in.
prepared = [
    similarity_prepare_doc(doc, include_pos=[spacy.symbols.NOUN], min_length=4)
    for doc in corpus
]

scores = []
# The outer index starts at 0 so the first document is compared as well.
for i in range(len(prepared) - 1):
    for j in range(i + 1, len(prepared)):
        scores.append({
            'treaty_1': corpus[i].metadata['treaty_id'],
            'treaty_2': corpus[j].metadata['treaty_id'],
            'score': n.score(prepared[i], prepared[j])
        })
df = pd.DataFrame(scores)
df.to_excel('hirschberg_scores_lemma_noun.xlsx')


In [None]:
# NOTE(review): TM_GUI_MODEL and gensim are neither defined nor imported in the visible
# part of this notebook; this cell presumably depends on state created elsewhere — confirm.
lda = TM_GUI_MODEL.model.tm_model
# NOTE: rebinds the global `corpus`, which other cells bind to the textacy corpus.
corpus = TM_GUI_MODEL.model.tm_corpus
# Build a similarity index over the model-transformed corpus and save it to disk.
index = gensim.similarities.MatrixSimilarity(lda[corpus])
index.save("simIndex.index")


In [None]:
import scipy

def sumorial(n):
    """Return 0 + 1 + ... + n (the n-th triangular number) as an int.

    Floor division keeps the result exact for arbitrarily large integers, where
    true division would go through a float and lose precision.
    """
    return int(n * (n + 1) // 2)

def compute_similarity(corpus, metric, extract_token_args, tick=utility.noop):
    """Score every unordered pair of documents and return the scores as a sparse matrix.

    Args:
        corpus: corpus handed to ``textacy_utility.extract_corpus_terms``.
        metric: object exposing ``score(tokens_a, tokens_b)``.
        extract_token_args: options forwarded to ``textacy_utility.extract_corpus_terms``.
        tick: progress callback; called with 0 before and after the computation and
            without arguments once per outer-loop document.

    Returns:
        ``scipy.sparse.coo_matrix`` of shape (n_docs, n_docs) with the score of
        documents i and j stored at row i, column j for every i < j.
    """
    # Imported locally so the function does not rely on a later cell having run.
    import numpy as np
    import scipy.sparse

    document_tokens = [ list(x) for x in textacy_utility.extract_corpus_terms(corpus, extract_token_args) ]
    n_docs = len(document_tokens)
    # Number of unordered pairs (i < j): exactly one slot per computed score.
    n_pairs = n_docs * (n_docs - 1) // 2

    tick(0)
    # NOTE(review): scores are stored with dtype=int, so fractional scores would be truncated.
    row = np.zeros(n_pairs, dtype=int)
    col = np.zeros(n_pairs, dtype=int)
    data = np.zeros(n_pairs, dtype=int)

    p = 0
    # Iterate over documents (not over the pair count), starting with the first one.
    for i in range(n_docs - 1):
        tick()
        for j in range(i + 1, n_docs):
            data[p] = metric.score(document_tokens[i], document_tokens[j])
            row[p] = i
            col[p] = j
            p += 1

    m = scipy.sparse.coo_matrix((data, (row, col)), shape=(n_docs, n_docs))
    tick(0)
    return m

# Token extraction options forwarded to compute_similarity.
extract_token_args = dict(
    args=dict(
        ngrams=[ 1 ],
        named_entities=False,
        normalize='lemma',
        as_strings=True
    ),
    kwargs=dict(
        min_freq=2,
        include_pos=['NOUN', 'PROPN'],
        filter_stops=True,
        filter_punct=True
    ),
    mask_gpe=True,
    min_freq=2, # tokens below this threshold is added to extra_stop_words
    max_doc_freq=100,
    extra_stop_words=set([]),
    min_length=2
)

corpus = get_current_corpus().textacy_corpus
metric = Hirschberg()

# Size the progress bar from the corpus itself: `document_tokens` is not defined by any
# earlier cell, so using it here fails on a fresh kernel.
# NOTE(review): assumes one token list per corpus document — confirm against extract_corpus_terms.
gui = types.SimpleNamespace(
    progress=widgets.IntProgress(min=0, max=len(corpus), value=0)
)

display(gui.progress)

def tick(n=None):
    """Set the progress bar to `n`, or advance it by one when `n` is None."""
    gui.progress.value = n if n is not None else gui.progress.value + 1

m = compute_similarity(corpus, metric, extract_token_args, tick=tick)

# Position of each treaty in the corpus, keyed by treaty id.
treaty_index = { doc.metadata['treaty_id']: i for i, doc in enumerate(corpus) }


In [None]:
# One row per stored entry of the sparse score matrix `m`.
df = pd.DataFrame(dict(
    treaty_i=list(m.row),
    treaty_j=list(m.col),
    score=list(m.data),
))
df.head()

In [None]:
def _with_treaty_info(frame, key, suffix):
    """Inner-join year and party columns from WTI_INDEX.treaties onto `frame` via `key`.

    `frame` is indexed by `key` and joined index-to-index; afterwards the index is reset,
    an 'index' column (if present) is renamed to `key`, and the joined columns get `suffix`.
    """
    treaty_columns = WTI_INDEX.treaties[['signed_year', 'party1', 'party2']]
    renames = {'index': key}
    renames.update({name: name + suffix for name in ('signed_year', 'party1', 'party2')})
    return (
        frame.set_index(key)
        .merge(treaty_columns, how='inner', left_index=True, right_index=True)
        .reset_index()
        .rename(columns=renames)
    )

# NOTE(review): treaty_i / treaty_j come from the matrix row/column positions, while the
# join is against the WTI_INDEX.treaties index — confirm that these keys line up.
df_i = _with_treaty_info(df, 'treaty_i', '_i')
df_ij = _with_treaty_info(df_i, 'treaty_j', '_j')

document_tokens = [list(terms) for terms in textacy_utility.extract_corpus_terms(corpus, extract_token_args)]

# Two side-by-side output areas.
output_left = widgets.Output()
output_right = widgets.Output()
display(widgets.HBox([output_left, output_right]))

# Highest-scoring pairs first.
df_ij.sort_values('score', ascending=False).head()


In [None]:
# libraries
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
 
# Build a dataframe with your connections
df = pd.DataFrame({ 'from':['A', 'B', 'C','A'], 'to':['D', 'A', 'E','C'], 'value':[1, 10, 5, 5]})

# Build your graph. from_pandas_edgelist replaces the removed from_pandas_dataframe;
# 'value' is stored as an edge attribute so it travels with each edge.
G = nx.from_pandas_edgelist(df, 'from', 'to', edge_attr='value', create_using=nx.Graph())

# Read the colours back in the graph's own edge order, which need not match the dataframe row order.
edge_values = [value for _, _, value in G.edges(data='value')]

# Custom the nodes:
nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_color=edge_values, width=10.0, edge_cmap=plt.cm.Blues)

In [None]:
import graph_tool as gt
import graph_tool.draw
import graph_tool.collection
import matplotlib.pyplot as plt

%matplotlib inline

# Look up the "karate" entry in graph_tool's collection.
# NOTE(review): `graph` is not used by any cell visible in this notebook.
graph = gt.collection.data["karate"]
# NOTE(review): this runs after `%matplotlib inline` and presumably replaces that backend — confirm intended.
plt.switch_backend('cairo')  # the only supported backend

In [37]:
class A:
    """Scratch class demonstrating a lazily created shared instance."""

    # Shared instance, created on the first call to singleton().
    _singleton = None

    def __init__(self):
        self._b = 'HEJ'

    @staticmethod
    def singleton():
        """Return the shared A instance, creating it on first use."""
        # Explicit None check so the cached instance is reused regardless of its truthiness.
        if A._singleton is None:
            A._singleton = A()
        return A._singleton

    @staticmethod
    def b():
        """Print a marker; static so it can be called on the class and on instances."""
        print('b called')

    def c(self):
        """Print a marker."""
        print('c called')
    
# singleton() is a staticmethod, so calling it through `x` returns the shared
# instance held on the class, not `x` itself.
x = A()
x.singleton().c()

c called


In [38]:
A.singleton

<function __main__.A.singleton()>

In [39]:
A.singleton

<function __main__.A.singleton()>