## The Culture of International Relations

### <span style='color: green'>SETUP </span> Prepare and Setup Notebook <span style='float: right; color: red'>MANDATORY</span>

In [13]:
# Setup
%load_ext autoreload
%autoreload 2

import sys, os

def project_root():
    """Return the nearest ancestor of the working directory that contains ``common``.

    The search starts at ``os.getcwd()`` and moves up one directory at a time.

    Returns:
        str: The first folder (the working directory itself or one of its
        ancestors) that has an entry named ``common``.

    Raises:
        FileNotFoundError: If the filesystem root is reached without finding
            ``common``. The previous implementation looped forever in that case.
    """
    start = os.getcwd()
    folder = start
    while not os.path.exists(os.path.join(folder, "common")):
        parent = os.path.dirname(folder)
        # At the filesystem root the parent equals the folder itself; stop
        # instead of spinning forever.
        if parent == folder:
            raise FileNotFoundError(
                "No 'common' folder found in {} or any of its parents".format(start)
            )
        folder = parent
    return folder

# Put the project root and its "3_text_analysis" subfolder on the module
# search path so the project-local imports below can be resolved.
sys.path.append(project_root())
sys.path.append(os.path.join(project_root(), "3_text_analysis"))

import glob, re
import pandas as pd
import ipywidgets as widgets
import common.utility as utility
import common.treaty_state as treaty_repository
import common.config as config
import textacy_corpus_utility as textacy_utility
import treaty_corpus

# from beakerx.object import beakerx
# from beakerx import *

from IPython.display import display, set_matplotlib_formats

#import pyarrow.parquet as pq

# Notebook-wide logger obtained from the project's utility module.
logger = utility.getLogger('corpus_text_analysis')

# Project helper that receives the pandas module; presumably applies default
# display options (see common.utility) - TODO confirm.
utility.setup_default_pd_display(pd)

PERIOD_GROUP = 'years_1945-1972'
# Tab-separated tag set table; missing values are replaced by empty strings.
DF_TAGSET = pd.read_csv(os.path.join(config.DATA_FOLDER, 'tagset.csv'), sep='\t').fillna('')
# NOTE(review): presumably this must run before current_wti_index() is used
# on the next line and in later cells - confirm against common.treaty_state.
treaty_repository.load_wti_index_with_gui(data_folder=config.DATA_FOLDER)
TREATY_TIME_GROUPINGS = treaty_repository.current_wti_index().get_treaty_time_groupings()
# Path handed to the corpus-processing GUIs as `gpe_filename`.
GPE_FILENAME = os.path.join(config.DATA_FOLDER, 'gpe_substitutions.txt')

# Zero-argument accessors that look up the shared corpus container / corpus
# each time they are called instead of capturing them once.
current_corpus_container = lambda: textacy_utility.CorpusContainer.container()
current_corpus = lambda: textacy_utility.CorpusContainer.corpus()

# Direct Bokeh output to the notebook.
from bokeh.plotting import output_notebook, show
output_notebook()


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


VBox(children=(Dropdown(description='Load index', layout=Layout(width='300px'), options=(('WTI 7CULT', 'is_cul…

## <span style='color: green'>PREPARE </span> Load and Prepare Corpus <span style='float: right; color: red'>MANDATORY</span>


In [14]:
import textacy_corpus_gui
try:
    # Show the corpus loading GUI for the currently selected WTI index.
    container = current_corpus_container()
    textacy_corpus_gui.display_corpus_load_gui(config.DATA_FOLDER, treaty_repository.current_wti_index(), container)
except Exception as ex:
    # Log before re-raising: the original logged *after* `raise`, so the log
    # statement was unreachable and the error never reached the logger.
    logger.error(ex)
    raise


VBox(children=(IntProgress(value=0, layout=Layout(width='90%'), max=5), HBox(children=(Dropdown(description='C…

## <span style='color: green'>PREPARE </span> Compute Similarity Scores <span style='float: right; color: red'>MANDATORY</span>

#### gensim LSI or TF-IDF Document (BoW) Similarity
See https://radimrehurek.com/gensim/tut3.html and “Indexing by Latent Semantic Analysis, Deerwester et al. (1990)”.
Compute cosine similarity between documents based on LSI or TF-IDF models.

#### gensim LDA Document Similarity
Cosine similarity on document-topic assignments from (gensim) LDA topic models.

#### sklearn TF-IDF Document Similarity
Cosine similarity on sklearn TF-IDF models.

#### biopython Alignment Scores Document Similarity
See [Biopython](https://biopython.org/). Global sequence alignment using specifically [PairwiseAligner](http://biopython.org/DIST/docs/api/). The biopython package has many other sequence alignment packages. Note that the alignment is _character_ based, which might not be optimal for text similarity.

Since this is a very time-consuming computational method, the texts are uppercased, and characters not in a predefined alphabet are filtered out. This alphabet consists of the letters A-Z but can be configured. Note that all other characters are stripped away, including whitespace. To limit the search space, there is an option to first compute LSI similarity and then align only the top n most similar document pairs according to that metric.

From [documentation](http://biopython.org/DIST/docs/api/): "Based on the values of the gap scores, a PairwiseAligner object automatically chooses the appropriate alignment algorithm (the Needleman-Wunsch, Smith-Waterman, Gotoh, or Waterman-Smith-Beyer global or local alignment algorithm)."

#### {NOT IMPLEMENTED} SIMD Smith-Waterman Alignment Scores Document Similarity
An SIMD Smith-Waterman C/C++/Python/Java Library for Use in Genomic Applications

- [libssw](https://bioconda.github.io/recipes/libssw/README.html)
- [github](https://github.com/mengyao/Complete-Striped-Smith-Waterman-Library) Citation: https://doi.org/10.1371/journal.pone.0082138

#### {NOT IMPLEMENTED} VT-PASSIM
See separate tests.

#### Python "python-alignment" PACKAGE
https://github.com/eseraygun/python-alignment
```pip install alignment```



In [19]:
import string, heapq
import gensim
import generic_process_corpus_gui
import networkx as nx
import matplotlib.pyplot as plt
import itertools
#from Bio import Align
from sklearn.feature_extraction.text import TfidfVectorizer

%matplotlib inline
clocks = None
def plot_histogram(ax, values, bins=20, **kwargs):
    """Draw a histogram of ``values`` on ``ax`` and mark their mean with a vertical line.

    Args:
        ax: Matplotlib-style axes; ``hist``, ``axvline``, ``set_title``,
            ``set_xlabel`` and ``set_ylabel`` are called on it.
        values: Sized sequence of numbers to bin.
        bins: Number of bins passed to ``ax.hist``.
        **kwargs: Optional ``title``, ``xlabel`` and ``ylabel`` strings. Any
            other keyword (callers pass ``figsize`` and ``subplot``) is
            accepted and ignored.

    Changes compared to the previous version: ``ylabel`` is now applied (it was
    read but never used), an empty ``values`` no longer raises
    ``ZeroDivisionError``, and the debugging hook that copied ``locals()`` into
    the module-level ``clocks`` variable has been removed.
    """
    title = kwargs.get('title', None)
    xlabel = kwargs.get('xlabel', None)
    ylabel = kwargs.get('ylabel', None)

    ax.hist(values, bins=bins)

    # The mean is undefined for an empty sequence, so the marker is skipped.
    if len(values) > 0:
        mean_value = sum(values) / float(len(values))
        ax.axvline(mean_value, color='#e41a1c')

    if title is not None:
        ax.set_title(title)

    if xlabel is not None:
        ax.set_xlabel(xlabel)

    if ylabel is not None:
        ax.set_ylabel(ylabel)
    
# Holds the most recently computed similarity score table (None until a
# similarity computation has stored one).
__current_similarity_model = None
def current_similarity_model():
    """Return the most recently stored similarity model.

    Raises:
        AssertionError: If no model has been stored yet. The check is an
            explicit ``raise`` (same exception type and message as before) so
            that it is not stripped when Python runs with ``-O``.
    """
    if __current_similarity_model is None:
        raise AssertionError('Please create a similarity model first')
    return __current_similarity_model

class IdentityModel:
    """Pass-through stand-in for a model: subscripting returns the subscript unchanged.

    Lets the plain bag-of-words case go through the same ``model[corpus]``
    code path as the real gensim models.
    """

    def __getitem__(self, key):
        # No transformation: hand back exactly what was passed in.
        unchanged = key
        return unchanged
    
def compute_gensim_similarity_scores(model_type, terms, process_opts, n_features=25):
    """Score every document pair with a gensim model and a ``MatrixSimilarity`` index.

    Args:
        model_type: One of ``"lsi"``, ``"lda"``, ``"bow"`` or ``"tfidf"``.
        terms: One token list per document.
        process_opts: Not used by this function.
        n_features: Number of topics for the LSI and LDA models.

    Returns:
        list: ``(i, j, score)`` tuples for every document pair with ``i < j``.
    """
    vocabulary = gensim.corpora.Dictionary(terms)
    doc_vectors = [vocabulary.doc2bow(doc_tokens) for doc_tokens in terms]

    # One lazily evaluated factory per supported model type; only the
    # selected model is actually built.
    factories = {
        "lsi": lambda: gensim.models.LsiModel(doc_vectors, id2word=vocabulary, num_topics=n_features),
        "lda": lambda: gensim.models.LdaModel(
            doc_vectors,
            id2word=vocabulary,
            num_topics=n_features,
            eval_every=1,
            iterations=2000,
            passes=4,
            alpha='auto'
        ),
        "bow": lambda: IdentityModel(),
        "tfidf": lambda: gensim.models.TfidfModel(doc_vectors),
    }
    assert model_type in factories, 'Unknown model type'
    model = factories[model_type]()

    projected = model[doc_vectors]
    index = gensim.similarities.MatrixSimilarity(projected, corpus_len=len(doc_vectors))

    # Keep each unordered pair once (i < j); this also drops self-similarity.
    pair_scores = []
    for i, row in enumerate(index[projected]):
        for j, score in enumerate(row):
            if i < j:
                pair_scores.append((i, j, score))
    return pair_scores

def get_top_candidates_by_lsi_scores(terms, process_opts, n_largest=500, n_features=25, p_threshold=0.90):
    """Pre-select document pairs by LSI similarity.

    Args:
        terms: One token list per document.
        process_opts: Forwarded to ``compute_gensim_similarity_scores``.
        n_largest: Maximum number of pairs to keep.
        n_features: Number of LSI topics.
        p_threshold: Minimum LSI score a pair needs to be considered.

    Returns:
        A generator of ``(i, j)`` index pairs, highest LSI score first.
    """
    scored_pairs = compute_gensim_similarity_scores('lsi', terms, process_opts, n_features=n_features)
    above_threshold = filter(lambda triple: triple[2] >= p_threshold, scored_pairs)
    best = heapq.nlargest(n_largest, above_threshold, key=lambda triple: triple[2])
    return ((first, second) for first, second, _ in best)

def compute_tfidf_similarity_scores(model_type, terms, process_opts, n_features=25):
    """Score document pairs with sklearn TF-IDF vectors.

    Each document's terms are joined with spaces, vectorised with a default
    ``TfidfVectorizer`` and multiplied with the transposed matrix.

    Args:
        model_type: Not used by this function.
        terms: One iterable of term strings per document.
        process_opts: Not used by this function.
        n_features: Not used by this function.

    Returns:
        list: ``(i, j, score)`` tuples for the stored entries of the sparse
        product where ``i < j``.
    """
    documents = []
    for doc_terms in terms:
        documents.append(' '.join(list(doc_terms)))

    weights = TfidfVectorizer().fit_transform(documents)
    products = (weights * weights.T).tocoo()

    upper_triangle = []
    for row, col, value in zip(products.row, products.col, products.data):
        if row < col:
            upper_triangle.append((row, col, value))
    return upper_triangle

def terms_to_text(terms, alphabet=string.ascii_uppercase):
    """Concatenate ``terms`` without separators, uppercase, and keep only characters in ``alphabet``.

    Args:
        terms: Iterable of strings.
        alphabet: Characters that are allowed to remain in the result.

    Returns:
        str: The filtered, uppercased text.
    """
    uppercased = ''.join(terms).upper()
    kept = [symbol for symbol in uppercased if symbol in alphabet]
    return ''.join(kept)

def compute_biopython_alignment_score(model_type, terms, process_opts, threshold=0.5, n_lsi_largest=None):
    """Yield character-level global alignment scores for document pairs (Biopython).

    Every document is reduced to an uppercase A-Z string with
    ``terms_to_text``. A pair's raw alignment score is divided by the length of
    the longer of the two strings.

    Args:
        model_type: Not used by this function.
        terms: One list of term strings per document.
        process_opts: Mapping read for ``tick`` (progress callback),
            ``match_score`` (default 1) and ``mismatch_score`` (default -1).
        threshold: Minimum normalised score for a pair to be yielded. Also
            used as the LSI cut-off when ``n_lsi_largest`` is given.
        n_lsi_largest: If not ``None``, only the pairs returned by
            ``get_top_candidates_by_lsi_scores`` (at most this many) are
            aligned; otherwise all pairs are aligned.

    Yields:
        tuple: ``(i, j, score)`` for each pair whose score reaches ``threshold``.

    Raises:
        ImportError: When the pair iteration starts and Biopython is not installed.
    """
    # Imported locally, like the other optional alignment back-ends in this
    # notebook; the module-level import is commented out, which made every use
    # of `Align` here fail with NameError.
    from Bio import Align

    tick           = process_opts.get('tick', utility.noop)
    match_score    = process_opts.get('match_score', 1)
    mismatch_score = process_opts.get('mismatch_score', -1)
    alphabet       = string.ascii_uppercase # 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'

    aligner = Align.PairwiseAligner()
    aligner.mode = 'global'
    # NOTE(review): `match_score` / `mismatch_score` are the attribute names in
    # the current PairwiseAligner API; the previous `match` / `mismatch`
    # assignments do not set the scores there - confirm against the installed
    # Biopython version.
    aligner.match_score = match_score
    aligner.mismatch_score = mismatch_score

    terms_corpus  = [ terms_to_text(doc, alphabet) for doc in terms ]

    if n_lsi_largest is not None:
        # Materialise the pre-selection so the progress total is the real
        # number of pairs rather than the requested maximum.
        candidates = list(get_top_candidates_by_lsi_scores(terms, process_opts, n_largest=n_lsi_largest, n_features=25, p_threshold=threshold))
        n_candidates = len(candidates)
    else:
        corpus_length = len(terms_corpus)
        candidates = itertools.combinations(range(corpus_length), r=2)
        # Number of unordered pairs: n * (n - 1) / 2.
        n_candidates = corpus_length * (corpus_length - 1) // 2

    tick(0, n_candidates)

    for i, j in candidates:
        longest = max(len(terms_corpus[i]), len(terms_corpus[j]))
        tick()
        # Two empty texts cannot be normalised (division by zero); skip them.
        if longest == 0:
            continue
        score = aligner.score(terms_corpus[i], terms_corpus[j]) / longest
        if score >= threshold:
            yield (i, j, score)

    tick(0)
    
def compute_leebird_alignment_score(model_type, terms, process_opts, threshold=0.5, n_lsi_largest=None):
    """Yield alignment scores for document pairs using the ``similarity.alignment`` package.

    Args:
        model_type: ``"needleman"``, ``"hirschberg"`` or ``"segmentalignment"``;
            selects the aligner class.
        terms: One sequence of terms per document.
        process_opts: Forwarded to ``get_top_candidates_by_lsi_scores`` when
            ``n_lsi_largest`` is given.
        threshold: Minimum normalised score for a pair to be yielded. Also
            used as the LSI cut-off when ``n_lsi_largest`` is given.
        n_lsi_largest: If not ``None``, only the pairs returned by
            ``get_top_candidates_by_lsi_scores`` are aligned; otherwise all
            pairs are. New optional parameter: the body always referred to this
            name but it was never defined, so the function raised NameError.

    Yields:
        tuple: ``(i, j, score)`` for each pair whose score reaches ``threshold``.

    Raises:
        KeyError: If ``model_type`` is not one of the supported names.
    """
    from similarity.alignment.alignment import Needleman, Hirschberg, SegmentAlignment

    model_types = {
        "needleman": Needleman,
        "hirschberg": Hirschberg,
        "segmentalignment": SegmentAlignment
    }

    aligner = model_types[model_type]()

    # `terms_corpus` was referenced but never assigned.
    # NOTE(review): the term sequences are aligned as-is, as in
    # compute_python_alignment_score - confirm this is the input the aligner
    # expects.
    terms_corpus = terms

    if n_lsi_largest is not None:
        candidates = get_top_candidates_by_lsi_scores(terms, process_opts, n_largest=n_lsi_largest, n_features=25, p_threshold=threshold)
    else:
        candidates = itertools.combinations(range(len(terms_corpus)), r=2)

    for i, j in candidates:
        longest = max(len(terms_corpus[i]), len(terms_corpus[j]))
        # Two empty documents cannot be normalised (division by zero); skip them.
        if longest == 0:
            continue
        score = aligner.score(terms_corpus[i], terms_corpus[j]) / longest
        if score >= threshold:
            yield (i, j, score)

def compute_python_alignment_score(model_type, terms, process_opts, threshold=0.7, n_lsi_largest=500):
    """Yield term-level global alignment scores using the ``alignment`` package.

    A pair's alignment score is divided by the length (in terms) of the longer
    of the two documents.

    Args:
        model_type: Not used by this function.
        terms: One sequence of terms per document.
        process_opts: Mapping read for ``tick`` (progress callback),
            ``match_score`` (default 1), ``mismatch_score`` (default -1) and
            ``gap_score`` (default -1).
        threshold: Minimum normalised score for a pair to be yielded. Also
            used as the LSI cut-off for the pre-selection.
        n_lsi_largest: Maximum number of LSI pre-selected pairs to align.
            ``None`` now means "align every pair"; previously ``None`` was
            passed straight to ``heapq.nlargest`` and raised ``TypeError``.

    Yields:
        tuple: ``(i, j, score)`` for each pair whose score reaches ``threshold``.
    """
    from alignment.sequence import Sequence
    from alignment.vocabulary import Vocabulary
    from alignment.sequencealigner import SimpleScoring, GlobalSequenceAligner

    tick           = process_opts.get('tick', utility.noop)
    match_score    = process_opts.get('match_score', 1)
    mismatch_score = process_opts.get('mismatch_score', -1)
    gap_score      = process_opts.get('gap_score', -1)

    terms_corpus = terms

    if n_lsi_largest is not None:
        candidates = list(get_top_candidates_by_lsi_scores(terms, process_opts, n_largest=n_lsi_largest, n_features=25, p_threshold=threshold))
        n_candidates = len(candidates)
    else:
        corpus_length = len(terms_corpus)
        # Kept lazy: the full pair list can be very large.
        candidates = itertools.combinations(range(corpus_length), r=2)
        n_candidates = corpus_length * (corpus_length - 1) // 2

    # A single vocabulary is shared by all documents so that equal terms are
    # encoded identically across sequences.
    v = Vocabulary()

    scoring = SimpleScoring(match_score, mismatch_score)
    aligner = GlobalSequenceAligner(scoring, gap_score)

    tick(0, n_candidates)

    # The former `if False:` joblib branch was unreachable (and used
    # `return data` inside a generator), so it has been removed.
    for i, j in candidates:
        longest = max(len(terms_corpus[i]), len(terms_corpus[j]))
        tick()
        # Two empty documents cannot be normalised (division by zero); skip them.
        if longest == 0:
            continue

        a_encoded = v.encodeSequence(Sequence(terms_corpus[i]))
        b_encoded = v.encodeSequence(Sequence(terms_corpus[j]))
        score = aligner.align(a_encoded, b_encoded, backtrace=False) / longest

        if score >= threshold:
            yield (i, j, score)

    tick(0)
    
def compute_jaccard_similarity_scores(model_type, terms, process_opts, threshold=0.7, n_largest=1000):
    """Return the highest Jaccard similarities between the documents' term sets.

    Args:
        model_type: Not used by this function.
        terms: One iterable of hashable terms per document.
        process_opts: Not used by this function.
        threshold: Minimum Jaccard index for a pair to be kept.
        n_largest: Maximum number of pairs to return.

    Returns:
        list: Up to ``n_largest`` ``(i, j, score)`` tuples with ``i < j``,
        highest score first.
    """
    def jaccard(s1, s2):
        union_size = len(s1 | s2)
        # Two empty documents share nothing that could indicate similarity;
        # score them 0.0 instead of dividing by zero.
        if union_size == 0:
            return 0.0
        return len(s1 & s2) / union_size

    sets = [ set(x) for x in terms ]
    ij_s = itertools.combinations(range(len(sets)), r=2)

    scores = ( (i, j, jaccard(sets[i], sets[j])) for i, j in ij_s)

    return heapq.nlargest(n_largest, (x for x in scores if x[2] >= threshold), key=lambda x: x[2])

#    for i, j in itertools.combinations(range(terms), r=2):
#        score = len(sets[i] & sets[j]) / len(sets[i] | sets[j])
#        if score >= threshold:
#            yield (i, j, len(sets[i] & sets[j]) / len(sets[i] | sets[j]))

def compute_wmd_word2vec_similarity_scores(model_type, terms, process_opts, threshold=0.7, n_largest=1000):
    """Return the most similar document pairs according to Word Mover's Distance.

    The distance ``d`` returned by ``wmdistance`` is mapped to a similarity
    ``1 / (1 + d)`` so that, as with the other engines, larger scores mean
    more similar documents and scores lie in ``[0, 1]``. Previously the raw
    distance was filtered with ``>= threshold`` and ranked with ``nlargest``,
    which selected the most *dissimilar* pairs (including infinite distances,
    which break the integer score columns computed by the caller).

    Args:
        model_type: ``'googlenews'`` loads the pre-trained GoogleNews vectors
            from ``config.DATA_FOLDER``; any other value trains a Word2Vec
            model on ``terms``.
        terms: One token list per document.
        process_opts: Not used by this function.
        threshold: Minimum similarity for a pair to be kept.
        n_largest: Maximum number of pairs to return.

    Returns:
        list: Up to ``n_largest`` ``(i, j, similarity)`` tuples with ``i < j``,
        highest similarity first.

    Raises:
        ValueError: If the GoogleNews model file does not exist.
    """
    if model_type == 'googlenews':
        # `DATA_FOLDER` alone is not defined in this notebook; the folder
        # lives on the config module.
        model_path = os.path.join(config.DATA_FOLDER, 'GoogleNews-vectors-negative300.bin.gz')
        if not os.path.exists(model_path):
            raise ValueError("SKIP: Model file {} not found".format(model_path))
        vectors = gensim.models.KeyedVectors.load_word2vec_format(model_path, binary=True)
    else:
        # NOTE(review): `size=` is the gensim 3.x keyword (later releases call
        # it `vector_size`) - confirm against the installed gensim version.
        model = gensim.models.Word2Vec(terms, workers=3, size=100)
        # Use the model's keyed vectors when it exposes them.
        vectors = getattr(model, 'wv', model)

    def wmd_similarity(i, j):
        distance = vectors.wmdistance(terms[i], terms[j])
        # An infinite distance maps to similarity 0.0.
        return 1.0 / (1.0 + distance)

    ij_s = itertools.combinations(range(len(terms)), r=2)

    scores = ( (i, j, wmd_similarity(i, j)) for i, j in ij_s)

    return heapq.nlargest(n_largest, (x for x in scores if x[2] >= threshold), key=lambda x: x[2])

#    for i, j in itertools.combinations(range(terms), r=2):
#        score = len(sets[i] & sets[j]) / len(sets[i] | sets[j])
#        if score >= threshold:
#            yield (i, j, len(sets[i] & sets[j]) / len(sets[i] | sets[j]))
        
    
def compute_similarity_scores(corpus, process_opts, extract_args):
    """Compute pairwise document similarity scores, store them and display them.

    The selected model value has the form ``"<engine>_<model>"`` (for example
    ``"gensim_lsi"``). The scores are enriched with treaty metadata, sorted by
    score (highest first), stored as the current similarity model and shown in
    the GUI output area as a table or as three histograms.

    Args:
        corpus: Indexable collection of documents; each document exposes a
            ``metadata`` mapping with ``treaty_id``, ``signed_year``,
            ``party1`` and ``party2``.
        process_opts: Mapping with the widgets ``n_features``, ``model_type``
            and ``output_type`` (read through ``.value``), a ``gui`` object
            with an ``output`` widget and, optionally, a ``tick`` callback.
        extract_args: Passed to ``textacy_utility.extract_corpus_terms``.

    Raises:
        ValueError: If the engine part of the model value is not supported.

    Changes compared to the previous version: the unused gensim dictionary and
    bag-of-words corpus are no longer built, the ``alignment`` engine uses the
    callee's default LSI pre-selection instead of passing ``None``, and the
    table shows the sorted scores that are also stored as the current model.
    """
    global __current_similarity_model

    tick = process_opts.get('tick', utility.noop)

    tick(1)

    n_features = process_opts.get('n_features').value
    model_type = process_opts.get('model_type').value

    # Split once only, so a model name may itself contain an underscore.
    engine, model_type = model_type.split('_', 1)

    terms = [ list(doc) for doc in textacy_utility.extract_corpus_terms(corpus, extract_args) ]

    tick()

    if engine == 'gensim':

        similarity_scores = compute_gensim_similarity_scores(model_type, terms, process_opts, n_features=n_features)

    elif engine == 'sklearn':

        similarity_scores = compute_tfidf_similarity_scores(model_type, terms, process_opts)

    elif engine == 'biopython':

        similarity_scores = list(compute_biopython_alignment_score(model_type, terms, process_opts, n_lsi_largest=None))

    elif engine == 'alignment':

        # Rely on the function's default pre-selection size; passing None
        # made its LSI pre-selection fail.
        similarity_scores = list(compute_python_alignment_score(model_type, terms, process_opts))

    elif engine == 'jaccard':

        similarity_scores = list(compute_jaccard_similarity_scores(model_type, terms, process_opts, threshold=0.7, n_largest=1000))

    elif engine == 'wmd':

        similarity_scores = list(compute_wmd_word2vec_similarity_scores(model_type, terms, process_opts, threshold=0.7, n_largest=1000))

    else:
        raise ValueError('Unknown engine')

    tick()

    df = pd.DataFrame(similarity_scores, columns=['doc_id1', 'doc_id2', 'score'])

    tick()

    def get_parties(index):
        metadata = corpus[index].metadata
        return '{} vs {}'.format(metadata['party1'], metadata['party2'])

    df['treaty_id1'] = df.doc_id1.apply(lambda x: corpus[x].metadata['treaty_id'])
    df['year1'] = df.doc_id1.apply(lambda x: corpus[x].metadata['signed_year'])
    df['parties1'] = df.doc_id1.apply(get_parties)

    df['treaty_id2'] = df.doc_id2.apply(lambda x: corpus[x].metadata['treaty_id'])
    df['year2'] = df.doc_id2.apply(lambda x: corpus[x].metadata['signed_year'])
    df['parties2'] = df.doc_id2.apply(get_parties)

    # Integer versions of the score (percent and per mille), truncated.
    df['score%'] = df.score.apply(lambda x: int(x*100))
    df['score1K'] = df.score.apply(lambda x: int(x*1000))

    # Sort once and use the same frame for storage and display, so the table
    # below shows the best pairs rather than an arbitrary first 1000.
    df = df.sort_values(['score'], ascending=False)
    __current_similarity_model = df

    tick()

    process_opts.get('gui').output.clear_output()

    with process_opts.get('gui').output:

        if process_opts.get('output_type').value == 'table':
            display(df.head(1000))
        else:

            f, axs = plt.subplots(1,3,figsize=(16,4))

            plot_histogram(axs[0], list(df['score%'].values), bins=100, figsize=None, subplot=None, title='Score distribution')

            values = [ len(doc) for doc in corpus ]
            plot_histogram(axs[1], values, bins=100, figsize=None, subplot=None, title='Corpus doc lengths')

            values = [ len(doc) for doc in terms ]
            plot_histogram(axs[2], values, bins=100, figsize=None, subplot=None, title='Reduced doc lengths')

            plt.show()


    tick(0)
try:
    # (label, value) pairs; each value has the form "<engine>_<model>" and is
    # split on the underscore by compute_similarity_scores.
    model_options = [
        ('bag-of-word', 'gensim_bow'),
        ('Jaccard', 'jaccard_simple'),
        ('gensim LSI', 'gensim_lsi'),
        ('gensim TF-IDF', 'gensim_tfidf'),
        ('gensim LDA', 'gensim_lda'),
        ('biopython Pairwise (slow)', 'biopython_pairwise-score'),
        ('sklearn TF-IDF', 'sklearn_tfidf'),
        ('alignment PACKAGE (slow)', 'alignment_GlobalSequenceAligner'),
        ('WMD GoogleNews (slow)', 'wmd_googlenews'),
        ('WMD Corpus (slow)', 'wmd_corpus'),
    ]
    output_options = [
        ('Network', 'network'),
        ('Table', 'table')
    ]
    n_features = widgets.IntSlider(description='Features', min=2, max=30, value=10)
    model_type = widgets.Dropdown(description='Model', options=model_options, value='gensim_lsi')
    output_type = widgets.Dropdown(description='Output', options=output_options, value='network')
    # NOTE(review): `output_type` is handed to the processing GUI below but is
    # not part of this HBox - confirm it is displayed elsewhere, otherwise the
    # 'Table' output cannot be selected.
    display(
        widgets.HBox([model_type, n_features])
    )
    _ = generic_process_corpus_gui.process_corpus_gui(
        current_corpus_container(),
        treaty_repository.current_wti_index(),
        compute_similarity_scores,
        tagset=DF_TAGSET,
        gpe_filename=GPE_FILENAME,
        model_type=model_type,
        n_features=n_features,
        output_type=output_type,
        match_score=1,
        mismatch_score=-1,
        gap_score=-1
    )
except Exception as ex:
    # Log before re-raising: the original logged *after* `raise`, so the log
    # statement was unreachable.
    logger.error(ex)
    raise
    

HBox(children=(Dropdown(description='Model', index=2, options=(('bag-of-word', 'gensim_bow'), ('Jaccard', 'jac…

2020-06-02 14:28:32,126 : INFO : generic_process_corpus_gui.py.process_corpus_gui() : ...loading term substitution mappings...


VBox(children=(IntProgress(value=0, layout=Layout(width='90%'), max=5), HBox(children=(VBox(children=(HBox(chi…

## <span style='color: green'>Explore </span> Similarity Scores <span style='float: right; color: red'>OPTIONAL</span>
Explore previously computed similarity scores.

In [21]:
import common.network.plot as network_plot
import types

def plot_similarity_network(n_top, p_threshold, output_type):
    """Show the strongest similarity pairs as a network plot or as a table.

    Args:
        n_top: Maximum number of pairs to show (the stored model is sorted by
            score when it is created, so these are the first rows).
        p_threshold: Minimum score in percent (0-100); divided by 100 before
            it is compared with the ``score`` column.
        output_type: ``"network"`` draws the pairs with
            ``network_plot.plot_df``; any other value displays the table.

    If no similarity model has been computed yet, a hint is printed and nothing
    is drawn. Previously the accessor's ``AssertionError`` propagated out of
    the widget callback and the ``None`` check below could never trigger.
    """
    try:
        df = current_similarity_model()
    except AssertionError as ex:
        print(ex)
        df = None

    if df is None:
        return

    df = df[df.score >= (p_threshold / 100.0)].head(n_top)

    if output_type != "network":
        display(df)
        return

    plot_opts = dict(
        line_opts=dict(color='black', alpha=0.5),
        figsize=(900, 900),
        node_opts=dict(color='lightgreen', level='overlay', alpha=1.0),
        node_size=20,
        node_label='name',
        node_label_opts=dict(y_offset=15, x_offset=0),
    )

    layout_opts = {
        'algorithm': 'nx_spring_layout',
        'args': {}
    }

    # Treaties are the nodes, similarity scores the edge weights.
    network_plot.plot_df(df, source='treaty_id1', target='treaty_id2', weight='score', layout_opts=layout_opts, plot_opts=plot_opts)
        
def display_similarity_plot_gui():
    """Display the controls for exploring the stored similarity scores.

    Creates a "Most similar" slider, a "Threshold" slider and an "Output"
    dropdown, connects them to ``plot_similarity_network`` through
    ``widgets.interactive`` and renders once with the initial values.
    """
    top_slider = widgets.IntSlider(description='Most similar', min=10, max=1000, value=25)
    threshold_slider = widgets.FloatSlider(description='Threshold', min=0.0, max=100.0, value=90.0)
    output_dropdown = widgets.Dropdown(
        description='Output',
        options=[('Network', 'network'), ('Table', 'table')],
        value='network'
    )

    interactive_plot = widgets.interactive(
        plot_similarity_network,
        n_top=top_slider,
        p_threshold=threshold_slider,
        output_type=output_dropdown
    )

    # The last child of the interactive container is its output area; the
    # controls are laid out by hand in a single row above it.
    controls = widgets.HBox([top_slider, threshold_slider, output_dropdown])
    plot_area = interactive_plot.children[-1]
    display(widgets.VBox([controls, plot_area]))

    interactive_plot.update()
        
display_similarity_plot_gui()
        

VBox(children=(HBox(children=(IntSlider(value=25, description='Most similar', max=1000, min=10), FloatSlider(v…

### <span style='color: green;'>PREPARE</span> Generate PASSIM input file<span style='color: blue; float: right'>OPTIONAL</span>

### Explore Text using PASSIM
http://www.ccs.neu.edu/home/dasmith/infect-dl-2014.pdf
https://github.com/ViralTexts/vt-passim/wiki/Passim-data-model

#### Alternatives?:

[tess] Forstall, C., Coffee, N., Buck, T., Roache, K., & Jacobson, S.
       (2014). Modeling the scholars: Detecting intertextuality through
       enhanced word-level n-gram matching. Digital Scholarship in the
       Humanities, 30(4), 503-515.
       
https://github.com/tesserae/tesserae-v5

Detect and align similar passages: https://github.com/dasmiq/passim

Install pyarrow for easy bindings to Apache Arrow data (i.e. parquet): ```sudo pip3 install pyarrow```



In [23]:
import generic_process_corpus_gui
import json

def corpus_to_passim(corpus, process_opts, extract_args):
    """Write the corpus to a JSON-lines file, one JSON object per document.

    Each object has the keys ``id`` and ``series`` (both the treaty id),
    ``text`` (the extracted terms joined by spaces, with ``&``, ``<`` and
    ``>`` escaped) and ``date`` (the signing year as a string).

    Args:
        corpus: Iterable of documents; each exposes a ``metadata`` mapping
            with ``treaty_id`` and ``signed_year``.
        process_opts: Mapping holding ``filename_widget``, whose ``value`` is
            the output path.
        extract_args: Passed to ``textacy_utility.extract_document_terms``.

    Changes compared to the previous version: ``cgi`` was never imported and
    ``cgi.escape`` no longer exists in current Python, so ``html.escape`` is
    used instead; the terms are extracted once per document instead of twice;
    the output encoding is explicit.
    """
    import html

    def terms_to_text(terms):
        # quote=False keeps cgi.escape's default behaviour: only &, < and >
        # are escaped, quotes are left alone.
        return html.escape(' '.join(terms), quote=False)

    def doc_to_json(doc):

        treaty_id     = doc.metadata['treaty_id']
        signed_year   = doc.metadata['signed_year']
        terms         = [ x for x in textacy_utility.extract_document_terms(doc, extract_args)]

        document_text = terms_to_text(terms)

        doc_data      = dict(id=treaty_id, series=treaty_id, text=document_text, date=str(signed_year))
        json_data     = json.dumps(doc_data, ensure_ascii=True, sort_keys=True)

        return json_data

    filename = process_opts.get('filename_widget').value

    # ensure_ascii=True above means the content is plain ASCII; the explicit
    # encoding just makes the file independent of the platform default.
    with open(filename, 'w', encoding='utf-8') as f:

        for doc in corpus:
            f.write(doc_to_json(doc) + '\n')

try:

    # Text box holding the output path that corpus_to_passim reads.
    filename_widget = widgets.Text(value='treaties_passim.json', placeholder='filename', description='Filename:', disabled=False)

    _ = generic_process_corpus_gui.process_corpus_gui(
        current_corpus_container(), treaty_repository.current_wti_index(), corpus_to_passim, tagset=DF_TAGSET, gpe_filename=GPE_FILENAME, filename_widget=filename_widget)

    display(filename_widget)


except Exception as ex:
    # Log before re-raising: the original logged *after* `raise`, so the log
    # statement was unreachable.
    logger.error(ex)
    raise
    

2020-06-02 14:29:56,790 : INFO : generic_process_corpus_gui.py.process_corpus_gui() : ...loading term substitution mappings...


VBox(children=(IntProgress(value=0, layout=Layout(width='90%'), max=5), HBox(children=(VBox(children=(HBox(chi…

Text(value='treaties_passim.json', description='Filename:', placeholder='filename')

### <span style='color: green;'>EXPLORE</span> Explore PASSIM outputs<span style='color: blue; float: right'>OPTIONAL</span>

>>
From [ViralTexts Wiki](https://github.com/ViralTexts/vt-passim/wiki/Passim-data-model):
>>
*By default, passim outputs a set of clusters of reprinted passages. While the passages are grouped into clusters, the concrete output is a flat series of records, which are easier to manage than nested or relational data, especially when not much data is associated with the clusters as such rather than their constituent passages.
>>
As noted above, passim passes many input fields unchanged into the output. New or changed fields include:
>>
- cluster is a long integer uniquely identifies the cluster.
- size is an integer count of the number of passages in the cluster. It is a convenience field that allows us to sort clusters in descending order of size.
- uid is a long integer unique key computed from the input id field, which is preserved unchanged. It is used for internal passim computations and is present in the output for debugging purposes.
- text is a substring of the input text field that contains the reused passage.
- begin is the character offset into the input text where the reused passage begins.
- end is the character offset into the input text where the reused passage ends.
- pages is an array of pages and bounding boxes on those pages corresponding to the reused passage.
- locs is an array of canonical citations corresponding to the reused passage.*


In [None]:

def display_passim_output_table(data_folder, kind):
    """Display the PASSIM output ``<data_folder>/<kind>.parquet`` as a DataFrame.

    Args:
        data_folder: Folder that holds the PASSIM output.
        kind: Base name of the parquet output, e.g. ``'align'`` or ``'pairs'``.

    The data is read with ``pd.read_parquet``. The previous code used ``pq``,
    which is never imported in this notebook (the import is commented out), so
    the function failed with NameError.
    """
    source = os.path.join(data_folder, '{}.parquet'.format(kind))

    df = pd.read_parquet(source)
    display(df)
    
def display_passim_output_gui():
    """Display a small GUI for browsing PASSIM parquet outputs.

    Offers every path matching ``./output*`` as a batch and a fixed list of
    output kinds. Clicking "Display" shows the first rows of
    ``<batch>/<kind>.parquet`` in the output area.

    Changes compared to the previous version: the data is read with
    ``pd.read_parquet`` (``pq`` is never imported in this notebook), ``types``
    is imported locally so this cell does not depend on an optional earlier
    cell, and the button is re-enabled even if reading the data fails.
    """
    import types

    folders = glob.glob('./output*')

    kind_options = ['dfpost', 'pass', 'clusters', 'out', 'extents', 'align', 'pairs']

    gui = types.SimpleNamespace(
        folder=widgets.Dropdown(description='Batch', options=folders, value=None),
        kind=widgets.Dropdown(description='Kind', options=kind_options, value='pairs'),
        display=widgets.Button(description='Display', button_style='Success'),
        output=widgets.Output()
    )

    display(
        widgets.VBox([
            widgets.HBox([gui.folder, gui.kind, gui.display]),
            gui.output
        ])
    )

    def display_handler(*args):

        gui.output.clear_output()

        folder = gui.folder.value
        kind = gui.kind.value

        if folder is None or kind is None:
            return

        # Block repeated clicks while the data is being read.
        gui.display.disabled = True

        try:
            with gui.output:

                filename = os.path.join(folder, '{}.parquet'.format(kind))

                # The output is expected to be a directory.
                # NOTE(review): presumably a partitioned parquet dataset -
                # confirm against the PASSIM output layout.
                if not os.path.isdir(filename):

                    logger.info('File %s not found.' % filename)

                else:

                    df = pd.read_parquet(filename)

                    if kind == 'align':
                        headers = ['id1', 'id2', 'matches', 'score', 's1', 's2']
                        df = df[headers]

                    display(df.head())
        finally:
            gui.display.disabled = False

    gui.display.on_click(display_handler)
    
#display_passim_output_table('./output_xxx', 'align')

#['dfpost', 'pass', 'clusters', 'out', 'extents', 'align', 'pairs']
#data = types.SimpleNamespace(
#    dfpost = pq.ParquetDataset('./output_xxx/dfpost.parquet').read().to_pandas(),
#    passage = pq.ParquetDataset('./output_xxx/pass.parquet').read().to_pandas(),
#    clusters = pq.ParquetDataset('./output_xxx/clusters.parquet').read().to_pandas(),
#    out = pq.ParquetDataset('./output_xxx/out.parquet').read().to_pandas(),
#    extents = pq.ParquetDataset('./output_xxx/extents.parquet').read().to_pandas(),
#    align = pq.ParquetDataset('./output_xxx/align.parquet').read().to_pandas(),
#    pairs = pq.ParquetDataset('./output_xxx/pairs.parquet').read().to_pandas()
#)

display_passim_output_gui()



In [None]:
import difflib
# Imported here so this cell does not depend on an optional earlier cell
# having been run.
import types

# PASSIM alignment output, best-scoring alignments first. Read with pandas:
# `pq` (pyarrow.parquet) is never imported in this notebook.
filename = './output_xxx/align.parquet'
df = pd.read_parquet(filename).sort_values(['score'], ascending=False)

gui = types.SimpleNamespace(
    left=widgets.Output(layout={'border': '1px solid black'}),
    right=widgets.Output(layout={'border': '1px solid black'}),
    score=widgets.Text(description='SCORE', disabled=True),
    #difference=widgets.HTML(),
    left_header=widgets.Label(layout={'border': '1px solid black', 'font-weight': 'bold'}),
    right_header=widgets.Label(layout={'border': '1px solid black', 'font-weight': 'bold'}),
    # max() keeps the slider range valid (max >= min) for an empty table.
    index=widgets.IntSlider(description='Index', min=0, max=max(len(df) - 1, 0), value=0),
    forward=widgets.Button(description='>>', button_style='Success'),
    backward=widgets.Button(description='<<', button_style='Success'),
)

def on_index_value_change(*args):
    """Refresh the browser for the alignment row selected by the index slider.

    Clears both text panes, looks up the two treaties of the selected row in
    the current WTI index, updates the header labels and the score box, and
    prints the aligned passages ``s1`` and ``s2`` into the left and right pane.
    """
    for pane in (gui.left, gui.right):
        pane.clear_output()

    row = df.iloc[gui.index.value].to_dict()

    # Besides the fields used below, a row also carries (for each side)
    # uid, gid, series, tok, b/e, bw/ew as well as `matches`.
    treaty_left = treaty_repository.current_wti_index().treaties.loc[row['id1']]
    treaty_right = treaty_repository.current_wti_index().treaties.loc[row['id2']]

    template = 'Treaty {} between {} and {}. Signed: {} Length: {}'
    sides = (
        (gui.left_header, treaty_left, 'len1'),
        (gui.right_header, treaty_right, 'len2'),
    )
    for header, treaty, length_key in sides:
        header.value = template.format(treaty.name, treaty.party1, treaty.party2, treaty.signed_year, row[length_key])

    # The score is shown as a truncated integer.
    gui.score.value = str(int(row['score']))

    for pane, text_key in ((gui.left, 's1'), (gui.right, 's2')):
        with pane:
            print(row[text_key])

def on_stepper_click(sender, *args):
    """Move the index slider one step back ('<<') or forward ('>>').

    The direction is taken from the clicked button's description; the slider
    stays within ``0`` and its ``max``.
    """
    slider = gui.index
    label = sender.description

    if label == '<<' and slider.value > 0:
        slider.value = slider.value - 1
    elif label == '>>' and slider.value < slider.max:
        slider.value = slider.value + 1
    
# Re-render whenever the slider value changes; both stepper buttons share one
# handler that tells them apart by their description.
gui.index.observe(on_index_value_change, names='value')
gui.forward.on_click(on_stepper_click)
gui.backward.on_click(on_stepper_click)

# Layout: navigation row, score row, then the two passages side by side, each
# under its own header label.
display(
    widgets.VBox([
        #widgets.HBox([gui.difference]),
        widgets.HBox([gui.index, gui.backward, gui.forward]),
        widgets.HBox([gui.score]),
        widgets.HBox([
            widgets.VBox([
                gui.left_header,
                gui.left
            ]),
            widgets.VBox([
                gui.right_header,
                gui.right
            ])
        ])
    ])
)
# Render the initially selected row; the observer only fires on changes.
on_index_value_change()
