## tCoIR - Text Analysis
### <span style='color: green'>SETUP </span> Prepare and Setup Notebook <span style='float: right; color: red'>MANDATORY</span>

In [3]:
# Setup
%load_ext autoreload
%autoreload 2

import sys, os, collections, zipfile
import re, typing.re
import nltk, textacy, spacy 
import pandas as pd
import ipywidgets as widgets

sys.path = list(set(['.', '..']) - set(sys.path)) + sys.path

import matplotlib.pyplot as plt
import common.utility as utility
import common.widgets_utility as widgets_utility
import common.widgets_config as widgets_config
import common.config as config
import common.utility as utility
import text_corpus
import textacy.keyterms
import gui_utility

from beakerx.object import beakerx
from beakerx import *
from IPython.display import display, set_matplotlib_formats

logger = utility.getLogger('corpus_text_analysis')

utility.setup_default_pd_display(pd)

DATA_FOLDER = '../../data'
DF_TAGSET = pd.read_csv(os.path.join(DATA_FOLDER, 'tagset.csv'), sep='\t').fillna('')

%matplotlib inline
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

current_corpus_container = lambda: textacy_utility.CorpusContainer.container()
current_corpus = lambda: textacy_utility.CorpusContainer.corpus()
current_document_index = lambda: current_corpus_container().document_index

import domain_logic_vatican as domain_logic

extract_args = dict(
    args=dict(
        ngrams=[1],
        named_entities=False,
        normalize='lemma',
        as_strings=True
    ),
    kwargs=dict(
        min_freq=1,
        include_pos=['NOUN'],
        filter_stops=True,
        filter_punct=True
    ),
    extra_stop_words=None,
    substitutions=None
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## <span style='color: green'>PREPARE </span> Load and Prepare Corpus <span style='float: right; color: red'>MANDATORY</span>


In [2]:
import textacy_corpus_utility as textacy_utility
import textacy_corpus_gui

try:
    container = current_corpus_container()
    textacy_corpus_gui.display_corpus_load_gui(DATA_FOLDER, container=container)
except Exception as ex:
    raise
    logger.error(ex)


VBox(children=(IntProgress(value=0, layout=Layout(width='90%'), max=5), HBox(children=(Dropdown(description='C…

## <span style='color: green'>PREPARE </span> HAL Co-Windows Ratio (CWR)<span style='float: right; color: red'>MANDATORY</span>

\begin{aligned}
nw(x) &= \text{number of sliding windows that contains term $x$} \\
nw(x, y) &= \text{number of sliding windows that contains $x$ and $y$} \\
\\
f(x, y) &= \text{normalized version of nw(x, y)} \\
CWR(x, y) &= \frac{nw(x, y)}{nw(x) + nw(y) - nw(x, y)}\\
\end{aligned}

Term co-occurrence frequencies is calculated in accordance with Hyperspace Analogue to Language (Lund; Burgess, 1996) vector-space model. The computation is specified in detail in section 3.1 in (Chen; Lu, 2011).

- Chen Z.; Lu Y., "A Word Co-occurrence Matrix Based Method for Relevance Feedback"
- Lund, K.; Burgess, C. & Atchley, R. A. (1995). "Semantic and associative priming in high-dimensional semantic space".[Link](https://books.google.de/books?id=CSU_Mj07G7UC).
- Lund, K.; Burgess, C. (1996). "Producing high-dimensional semantic spaces from lexical co-occurrence". doi:10.3758/bf03204766 [Link](https://dx.doi.org/10.3758%2Fbf03204766).



In [202]:
import array
import scipy.sparse as sp
import itertools

class HyperspaceAnalogueToLanguageVectorizer():
    
    def __init__(self, corpus=None, token2id=None, tick=utility.noop):
        """
        Build vocabulary and create nw_xy term-term matrix and nw_x term global occurence vector
        
        Parameter:
            corpus Iterable[Iterable[str]]

        """
        self.token2id = token2id
        self.corpus = corpus
        self.term_count = 0
        
        if corpus is not None and token2id is None:
            self._build_vocabulary(corpus)
        
        self.nw_xy = None
        self.nw_x = None
        self._id2token = None
        self.tick = tick
        
    def _build_vocabulary(self, corpus):
        ''' Iterates corpus and add distict terms to vocabulary '''
        logger.info('Builiding vocabulary...')
        token2id = collections.defaultdict()
        token2id.default_factory = token2id.__len__
        term_count = 0
        for doc in corpus:
            for term in doc:
                token2id[term]
                term_count += 1
        self.token2id = token2id
        self.term_count = term_count
        logger.info('Vocabulary of size {} built from {} terms.'.format(len(token2id), term_count))
    
    @property
    def id2token(self):
        if self._id2token is None:
            if self.token2id is not None:
                self._id2token = { v:k for k,v in self.token2id.items() }
        return self._id2token
    
    def sliding_window(self, seq, n):
        it = itertools.chain(iter(seq), [None] * n)
        memory = tuple(itertools.islice(it, n+1))
        if len(memory) == n+1:
            yield memory
        for x in it:
            memory = memory[1:] + (x,)
            yield memory
        
    def fit(self, corpus=None, size=2, weighing=0, zero_out_diag=False):
        
        '''Trains HAL for a document. Note that sentence borders (for now) are ignored'''
        
        if corpus is not None:
            self.corpus = corpus
            self._build_vocabulary(corpus)
            
        assert self.token2id is not None, "Fit with no vocabulary!"
        assert self.corpus is not None, "Fit with no corpus!"

        nw_xy = sp.lil_matrix ((len(self.token2id), len(self.token2id)), dtype=np.int32)
        nw_x = np.zeros(len(self.token2id), dtype=np.int32)
        
        for terms in corpus:
            
            id_terms = ( self.token2id[size] for size in terms)
            
            self.tick()
            
            for win in self.sliding_window(id_terms, size):
                
                #logger.info([ self.id2token[x] if x is not None else None for x in win])
                
                if win[0] is None:
                    continue
                    
                for x in win:
                    if x is not None:
                        nw_x[x] += 1

                for i in range(1, size+1):

                    if win[i] is None:
                        continue
                        
                    if zero_out_diag:
                        if win[0] == win[i]:
                            continue
                        
                    d = i # abs(n - i)
                    if weighing == 0: #  linear i.e. adjacent equals window size, then decreasing by one
                        w = size - d + 1
                    elif weighing == 1: # f(d) = 1 / d
                        w = 1.0 / d
                    elif weighing == 2: # Constant value of 1
                        w = 1

                    #print('*', i, self.id2token[win[0]], self.id2token[win[i]], w, [ self.id2token[x] if x is not None else None for x in win])
                    nw_xy[win[0], win[i]] += w
                    
        self.nw_x = nw_x
        self.nw_xy = nw_xy
        #self.f_xy = nw_xy / np.max(nw_xy)

        return self
    
    def to_df(self):
        columns = [ self.id2token[i] for i in range(0,len(self.token2id))]
        return pd.DataFrame(
            data=self.nw_xy.todense(),
            index=list(columns),
            columns=list(columns),
            dtype=np.float64
        ).T
    
    def cwr(self, direction_sensitive=False, normalize='size'):

        n = self.nw_x.shape[0]
        
        nw = self.nw_x.reshape(n,1)
        nw_xy = self.nw_xy
        
        norm = 1.0
        if normalize == 'size':
            norm = float(self.term_count)
        elif norm == 'max':
            norm = float(np.max(nw_xy))
        
        #nw.resize(nw.shape[0], 1)
        
        self.cwr = sp.lil_matrix(nw_xy / (-nw_xy + nw + nw.T)) #nw.reshape(n,1).T))
        
        if norm != 1.0:
            self.cwr = self.cwr / norm
            
        coo_matrix = self.cwr.tocoo(copy=False)
        df = pd.DataFrame({
            'x_id': coo_matrix.row,
            'y_id': coo_matrix.col,
            'cwr': coo_matrix.data
        }).sort_values(['x_id', 'y_id']).reset_index(drop=True)
        
        df = df.assign(
            x_term=df.x_id.apply(lambda x: self.id2token[x]),
            y_term=df.y_id.apply(lambda x: self.id2token[x])
        )
        df_nw_x = pd.DataFrame(self.nw_x, columns=['nw'])

        
        df = df.merge(df_nw_x, left_on='x_id', right_index=True, how='inner').rename(columns={'nw': 'nw_x'})
        df = df.merge(df_nw_x, left_on='y_id', right_index=True, how='inner').rename(columns={'nw': 'nw_y'})
        
        df = df[['x_id', 'y_id', 'x_term', 'y_term', 'cwr']]
        
        return df
    
    def cooccurence2(self, direction_sensitive=False, normalize='size', zero_diagonal=True):
        n = self.cwr.shape[0]
        df = pd.DataFrame([(
                i,
                j,
                self.id2token[i],
                self.id2token[j],
                self.nw_xy[i,j],
                self.nw_x[i],
                self.nw_x[j],
                self.cwr[i,j]
            ) for i,j in itertools.product(range(0,n), repeat=2) if self.cwr[i,j] > 0 ], columns=['x_id', 'y_id', 'x_term', 'y_term', 'nw_xy', 'nw_x', 'nw_y', 'cwr'])
        
        return df    
    
    def cooccurence(self, direction_sensitive=False, normalize='size', zero_diagonal=True):
        '''Return computed co-occurrence values'''
        
        matrix = self.nw_xy
        
        if not direction_sensitive:
            matrix += matrix.T
            matrix[np.tril_indices(matrix.shape[0])] = 0
            coo_matrix = matrix.tocoo(copy=False)
        else:
            if zero_diagonal:
                matrix.fill_diagonal(0)
            coo_matrix = matrix.tocoo(copy=False)
        
        df_nw_x = pd.DataFrame(self.nw_x, columns=['nw'])
        
        df = pd.DataFrame({
            'x_id': coo_matrix.row,
            'y_id': coo_matrix.col,
            'nw_xy': coo_matrix.data
        })[['x_id', 'y_id', 'nw_xy']].sort_values(['x_id', 'y_id']).reset_index(drop=True)
        
        df = df.assign(
            x_term=df.x_id.apply(lambda x: self.id2token[x]),
            y_term=df.y_id.apply(lambda x: self.id2token[x])
        )
        
        df = df.merge(df_nw_x, left_on='x_id', right_index=True, how='inner').rename(columns={'nw': 'nw_x'})
        df = df.merge(df_nw_x, left_on='y_id', right_index=True, how='inner').rename(columns={'nw': 'nw_y'})
        
        df = df[['x_id', 'y_id', 'x_term', 'y_term', 'nw_xy', 'nw_x', 'nw_y']]
        
        if normalize == 'size':
            df['nw_xy'] /= self.term_count
            df['nw_x']  /= self.term_count
            df['nw_y']  /= self.term_count
        elif normalize == 'max':
            df['nw_xy'] /= df.nw_xy.max()
            df['nw_x']  /= df.nw_x.max()
            df['nw_y']  /= df.nw_x.max()
        
        df = df.assign(cwr=df.nw_xy / (df.nw_x + df.nw_y - df.nw_xy))
        
        return df

def test_burgess_litmus_test():
    terms = 'The Horse Raced Past The Barn Fell .'.lower().split()
    answer = {
     'barn':  {'.': 4,  'barn': 0,  'fell': 5,  'horse': 0,  'past': 0,  'raced': 0,  'the': 0},
     'fell':  {'.': 5,  'barn': 0,  'fell': 0,  'horse': 0,  'past': 0,  'raced': 0,  'the': 0},
     'horse': {'.': 0,  'barn': 2,  'fell': 1,  'horse': 0,  'past': 4,  'raced': 5,  'the': 3},
     'past':  {'.': 2,  'barn': 4,  'fell': 3,  'horse': 0,  'past': 0,  'raced': 0,  'the': 5},
     'raced': {'.': 1,  'barn': 3,  'fell': 2,  'horse': 0,  'past': 5,  'raced': 0,  'the': 4},
     'the':   {'.': 3,  'barn': 6,  'fell': 4,  'horse': 5,  'past': 3,  'raced': 4,  'the': 2}
    }
    df_answer = pd.DataFrame(answer).astype(np.int32)[['the', 'horse', 'raced', 'past', 'barn', 'fell']].sort_index()
    #display(df_answer)
    vectorizer = HyperspaceAnalogueToLanguageVectorizer()
    vectorizer.fit([terms], size=5)
    df_imp = vectorizer.to_df().astype(np.int32)[['the', 'horse', 'raced', 'past', 'barn', 'fell']].sort_index()
    assert df_imp.equals(df_answer), "Test failed"
    #df_imp == df_answer

    # Example in Chen, Lu:
    terms = 'The basic concept of the word association'.lower().split()
    vectorizer = HyperspaceAnalogueToLanguageVectorizer().fit([terms], size=5)
    df_imp = vectorizer.to_df().astype(np.int32)[['the', 'basic', 'concept', 'of', 'word', 'association']].sort_index()
    df_answer = pd.DataFrame({
        'the': [2, 5, 4, 3, 6, 4],
        'basic': [3, 0, 5, 4, 2, 1],
        'concept': [4, 0, 0, 5, 3, 2], 
        'of': [5, 0, 0, 0, 4, 3],
        'word': [0, 0, 0, 0, 0, 5],
        'association': [0, 0, 0, 0, 0, 0]
        },
        index=['the', 'basic', 'concept', 'of', 'word', 'association'],
        dtype=np.int32
    ).sort_index()[['the', 'basic', 'concept', 'of', 'word', 'association']]
    assert df_imp.equals(df_answer), "Test failed"
    print('Test run OK')
    
    
    
test_burgess_litmus_test()


2019-02-25 15:35:28,994 : INFO : Builiding vocabulary...
2019-02-25 15:35:28,996 : INFO : Vocabulary of size 7 built from 8 terms.
2019-02-25 15:35:29,004 : INFO : Builiding vocabulary...
2019-02-25 15:35:29,006 : INFO : Vocabulary of size 6 built from 7 terms.


Test run OK


In [205]:

normalize = 'size'

terms = 'The basic concept of the word association'.lower().split()
vectorizer = HyperspaceAnalogueToLanguageVectorizer().fit([terms], size=5)
id2token = vectorizer.id2token
df = vectorizer.cwr(normalize=normalize)

df2 = vectorizer.cooccurence(direction_sensitive=True, normalize=normalize, zero_diagonal=False)

df.merge(df2[['x_id', 'y_id', 'cwr']], left_on=['x_id', 'y_id'], right_on=['x_id', 'y_id'], how='outer')
#df_imp = vectorizer.to_df().astype(np.int32)[vectorizer.id2token.values()] #.sort_index()
#a = { x: list(df_imp[x]) for x in df_imp.columns }; a['index'] = list(df_imp.index)
#a

2019-02-25 15:36:29,149 : INFO : Builiding vocabulary...
2019-02-25 15:36:29,151 : INFO : Vocabulary of size 6 built from 7 terms.


In [124]:
import itertools
list(itertools.product(range(0,6),repeat=2))


[(0, 0),
 (0, 1),
 (0, 2),
 (0, 3),
 (0, 4),
 (0, 5),
 (1, 0),
 (1, 1),
 (1, 2),
 (1, 3),
 (1, 4),
 (1, 5),
 (2, 0),
 (2, 1),
 (2, 2),
 (2, 3),
 (2, 4),
 (2, 5),
 (3, 0),
 (3, 1),
 (3, 2),
 (3, 3),
 (3, 4),
 (3, 5),
 (4, 0),
 (4, 1),
 (4, 2),
 (4, 3),
 (4, 4),
 (4, 5),
 (5, 0),
 (5, 1),
 (5, 2),
 (5, 3),
 (5, 4),
 (5, 5)]

In [84]:
a = np.array([
    [ 5, 1 , 3],
    [ 1, 1 , 2],
    [ 1, 2 , 1]
])

b = np.array(
    [1, 2, 3]
)
b.resize(3,1)
a
b
a - (b.T)

a - b


In [96]:
vectorizer.nw_x.resize(vectorizer.nw_x.shape[0], 1)
vectorizer.nw_xy / (-vectorizer.nw_xy + vectorizer.nw_x + vectorizer.nw_x.T)


  return np.true_divide(self.todense(), other)


matrix([[0.2       , 1.66666667, 0.8       , 0.42857143, 1.        ,
         0.5       ],
        [0.6       , 0.        ,        inf, 2.        , 0.33333333,
         0.14285714],
        [0.8       , 0.        , 0.        , 2.5       , 0.5       ,
         0.28571429],
        [1.        , 0.        , 0.        , 0.        , 0.66666667,
         0.42857143],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.71428571],
        [0.        , 0.        , 0.        , 0.        , 0.        ,
         0.        ]])

In [37]:
#corpus = [ current_corpus()[0] ]
#terms = [ list(doc) for doc in textacy_utility.extract_corpus_terms(corpus, extract_args) ]
terms = 'The Horse Raced Past The Barn Fell .'.lower().split()
vectorizer = HyperspaceAnalogueToLanguageVectorizer()
vectorizer.fit(terms, size=5)
matrix = vectorizer.p_ij

matrix += matrix.T
matrix[np.tril_indices(matrix.shape[0])] = 0
coo_matrix = matrix.tocoo(copy=False)
list(coo_matrix.todense())

df_p_i = pd.DataFrame(self.p_i, columns=['p_i_count'])

df = pd.DataFrame({
    'x_id': coo_matrix.row,
    'y_id': coo_matrix.col,
    'p_xy': coo_matrix.data
})[['x_id', 'y_id', 'p_xy']]\
.sort_values(['x_id', 'y_id'])\
.reset_index(drop=True)

df = df.assign(
    x_term=df.x_id.apply(lambda x: self.id2token[x]),
    y_term=df.y_id.apply(lambda x: self.id2token[x])
)

df = df.merge(df_p_i, left_on='x_id', right_index=True, how='inner').rename(columns={'p_i_count': 'p_x'})
df = df.merge(df_p_i, left_on='y_id', right_index=True, how='inner').rename(columns={'p_i_count': 'p_y'})

df = df[['x_id', 'y_id', 'x_term', 'y_term', 'p_xy', 'p_x', 'p_y']]

if normalize_by_term_count:
    df['p_xy'] /= self.term_count
    df['p_x']  /= self.term_count
    df['p_y']  /= self.term_count

df = df.assign(score=df.p_xy / (df.p_x + df.p_y - df.p_xy))     

2019-02-25 08:56:18,036 : INFO : Builiding vocabulary...
2019-02-25 08:56:18,038 : INFO : Vocabulary of size 15 built from 29 terms.


[matrix([[ 0., 10.,  8.,  0.,  0.,  5.,  4.,  0.,  0.,  3.,  0.,  0.,  0.,
           0.,  0.]]),
 matrix([[ 0.,  0., 12.,  5.,  4.,  3.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
           0.,  0.]]),
 matrix([[0., 0., 0., 3., 7., 5., 4., 5., 5., 0., 0., 0., 5., 9., 0.]]),
 matrix([[0., 0., 0., 0., 5., 4., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 matrix([[ 0.,  0.,  0.,  0.,  0.,  5., 10.,  4.,  2.,  0.,  4.,  5.,  0.,
           0.,  0.]]),
 matrix([[0., 0., 0., 0., 0., 0., 5., 0., 0., 4., 0., 0., 0., 0., 0.]]),
 matrix([[0., 0., 0., 0., 0., 0., 0., 5., 3., 5., 5., 4., 0., 0., 0.]]),
 matrix([[0., 0., 0., 0., 0., 0., 0., 0., 4., 0., 0., 0., 0., 0., 0.]]),
 matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 3., 0., 0., 0.]]),
 matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]),
 matrix([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [22]:
import spacy
   
def compute_co_occurrence(corpus, gui, documents, document_filters, terms_filter, window_size, group_by_columns, weighting, tick):
    
    # FIXME: Split based on group_by_filters
    docs = list(gui_utility.get_documents_by_field_filters(corpus, documents, document_filters))
    print(len(docs))
    gui.progress.max = len(docs)
    
    #def terms():
    #    for doc in doc:
    #        gui.progress.value += 1
    #        yield textacy_utility.extract_corpus_terms(docs, terms_filter)
            
    #terms = [ list(doc) for doc in textacy_utility.extract_corpus_terms(docs, terms_filter) ]
    
    vectorizer = HyperspaceAnalogueToLanguageVectorizer(tick=tick)
    vectorizer.fit(terms, size=window_size, weighing=weighting)
    logger.warning('Computation is mased on (P_ij + P_ij.T) by defauöt (before/after weights are sumed up) )')
    df = vectorizer.cooccurence()
    df.sort_values('score', ascending=False)
    return df.head(500)
    
def display_co_occurrences(gui, df):
    display(df)

def word_co_occurrence_gui(documents, corpus, compute_callback, display_callback, filter_options, group_by_options):
    
    lw = lambda w: widgets.Layout(width=w)
    
    include_pos_tags = [ 'ADJ', 'VERB', 'NUM', 'ADV', 'NOUN', 'PROPN' ]
    weighting_options = { 'Linear': 0, 'Reciprocal': 1, 'Constant': 2 }
    normalize_options = { '':  False, 'Lemma': 'lemma', 'Lower': 'lower' }
    pos_options = include_pos_tags
    
    default_include_pos = ['NOUN', 'PROPN']
    frequent_words = [ x[0] for x in textacy_utility.get_most_frequent_words(corpus, 100, include_pos=default_include_pos) ]

    output_type_options = [ ( 'List', 'table' ), ( 'Rank', 'rank' ) ]
    ngrams_options = { '-': None, '1': [1], '1,2': [1,2], '1,2,3': [1,2,3]}
    
    document_filters = gui_utility.generate_field_filters(documents, filter_options)
    
    gui = types.SimpleNamespace(
        progress=widgets.IntProgress(value=0, min=0, max=5, step=1, description='', layout=lw('98%')),
        document_filters=document_filters,
        ngrams=widgets.Dropdown(description='n-grams', options=ngrams_options, value=[1], layout=lw('200px')),
        normalize=widgets.Dropdown(description='Normalize', options=normalize_options, value='lemma', layout=lw('200px')),
        weighting=widgets.Dropdown(description='Weighting', options=weighting_options, value=0, layout=lw('200px')),
        include_pos=widgets.SelectMultiple(description='POS', options=pos_options, value=default_include_pos, rows=7, layout=lw('150px')),
        stop_words=widgets.SelectMultiple(description='STOP', options=frequent_words, value=list([]), rows=7, layout=lw('200px')),
        group_by_columns=widgets.Dropdown(description='Group by', value=group_by_options[0][1], options=group_by_options, layout=lw('200px')),
        #output_type=widgets.Dropdown(description='Output', value='rank', options=output_type_options, layout=lw('200px')),
        window_size=widgets.IntSlider(description='Window', min=5, max=20, value=5, layout=lw('200px')),
        min_freq=widgets.IntSlider(description='Min freq', min=1, max=10, value=1, layout=lw('200px')),
        compute=widgets.Button(description='Compute', button_style='Success', layout=lw('120px')),
        output=widgets.Output(layout={'border': '1px solid black'})
    )
    
    def tick():
        gui.progress.value += 1
        
    boxes = widgets.VBox([
        gui.progress,
        widgets.HBox([
            widgets.VBox([
                gui.normalize,
                gui.ngrams,
                gui.weighting,
                gui.group_by_columns,
                gui.min_freq,
                gui.window_size
                #gui.output_type,
            ]),
            widgets.VBox([ x['widget'] for x in gui.document_filters]),
            gui.include_pos,
            gui.stop_words,
            widgets.VBox([
                gui.compute,
            ], layout=widgets.Layout(align_items='flex-end')),
        ]),
        gui.output
    ])
    
    display(boxes)
    
    def pos_change_handler(*args):
        with gui.output:
            gui.compute.disabled = True
            selected = set(gui.stop_words.value)
            frequent_words = [
                x[0] for x in textacy_utility.get_most_frequent_words(
                    corpus,
                    100,
                    normalize=gui.normalize.value,
                    include_pos=gui.include_pos.value,
                    weighting=gui.weighting.value
                )
            ]
            gui.stop_words.options = frequent_words
            selected = selected & set(gui.stop_words.options)
            gui.stop_words.value = list(selected)
            gui.compute.disabled = False
        
    gui.include_pos.observe(pos_change_handler, 'value')    
    gui.weighting.observe(pos_change_handler, 'value')    
    
    def compute_callback_handler(*_args):
        gui.output.clear_output()
        with gui.output:
            try:
                gui.compute.disabled = True
                terms_filter = dict(
                    args=dict(
                        ngrams=gui.ngrams.value,
                        named_entities=None,
                        normalize=gui.normalize.value,
                        as_strings=True
                    ),
                    kwargs=dict(
                        min_freq=gui.min_freq.value,
                        include_pos=gui.include_pos.value,
                        filter_stops=True,
                        filter_punct=True
                    ),
                    extra_stop_words=set(gui.stop_words.value),
                    substitutions=None
                )
                df = compute_callback(
                    corpus=corpus,
                    gui=gui,
                    documents=documents,
                    document_filters=[ (x['field'], x['widget'].value) for x in gui.document_filters],
                    terms_filter=terms_filter,
                    window_size=gui.window_size.value,
                    group_by_columns=gui.group_by_columns.value,
                    weighting=gui.weighting.value,
                    tick=tick
                )
                display_callback(gui, df)
            finally:
                gui.compute.disabled = False
                gui.progress.value = 0

    gui.compute.on_click(compute_callback_handler)
    return gui
                
try:
    document_index = domain_logic.compile_documents(current_corpus())
    word_co_occurrence_gui(
        document_index,
        current_corpus(),
        compute_callback=compute_co_occurrence,
        display_callback=display_co_occurrences,
        filter_options=domain_logic.DOCUMENT_FILTERS,
        group_by_options=domain_logic.GROUP_BY_OPTIONS
    )
except Exception as ex:
    raise
    logger.error(ex)
    


VBox(children=(IntProgress(value=0, layout=Layout(width='98%'), max=5), HBox(children=(VBox(children=(Dropdown…

In [None]:
document_index.query('')

In [130]:
cimport cython
@cython.boundscheck(False)
@cython.wraparound(False)
cpdef make_lower_triangular(double[:,:] A, int k):
    """ Set all the entries of array A that lie above
    diagonal k to 0. """
    cdef int i, j
    for i in range(min(A.shape[0], A.shape[0] - k)):
        for j in range(max(0, i+k+1), A.shape[1]):
            A[i,j] = 0.

SyntaxError: invalid syntax (<ipython-input-130-b3b046b78708>, line 1)