## tCoIR - Text Analysis
### <span style='color: green'>SETUP </span> Prepare and Setup Notebook <span style='float: right; color: red'>MANDATORY</span>

In [5]:
# Setup
%load_ext autoreload
%autoreload 2

import sys
import pandas as pd
import text_analytic_tools.utility as utility
import text_analytic_tools.utility.widgets as widgets
import text_analytic_tools.common.text_corpus as text_corpus
import text_analytic_tools.common.textacy_utility as textacy_utility

from beakerx.object import beakerx
from beakerx import *
from IPython.display import display #, set_matplotlib_formats
from IPython.core.interactiveshell import InteractiveShell
from text_analytic_tools.domain_logic_config import current_domain as domain_logic

# Echo the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Notebook-wide logger; the functions defined in later cells log through this name.
logger = utility.getLogger('corpus_text_analysis')

# Presumably applies the project's default pandas display options -- see text_analytic_tools.utility.
utility.setup_default_pd_display(pd)


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## <span style='color: green'>PREPARE </span> HAL Co-Windows Ratio (CWR)<span style='float: right; color: red'>MANDATORY</span>

Term co-occurrence frequencies are calculated in accordance with the Hyperspace Analogue to Language (HAL) vector-space model (Lund & Burgess, 1996). The computation is specified in detail in section 3.1 of (Chen & Lu, 2011).

\begin{aligned}
nw(x) &= \text{number of sliding windows that contains term $x$} \\
nw(x, y) &= \text{number of sliding windows that contains $x$ and $y$} \\
\\
f(x, y) &= \text{normalized version of nw(x, y)} \\
CWR(x, y) &= \frac{nw(x, y)}{nw(x) + nw(y) - nw(x, y)}\\
\end{aligned}

- Chen, Z.; Lu, Y. (2011). "A Word Co-occurrence Matrix Based Method for Relevance Feedback".
- Lund, K.; Burgess, C. & Atchley, R. A. (1995). "Semantic and associative priming in high-dimensional semantic space". [Link](https://books.google.de/books?id=CSU_Mj07G7UC).
- Lund, K.; Burgess, C. (1996). "Producing high-dimensional semantic spaces from lexical co-occurrence". doi:10.3758/bf03204766 [Link](https://dx.doi.org/10.3758%2Fbf03204766).


In [6]:
import itertools
import glove
import pandas as pd
import collections

def build_vocab(corpus):
    '''Build a term -> integer id vocabulary from a tokenized corpus.

    Ids are assigned densely (0, 1, 2, ...) in order of first appearance.

    Parameters:
        corpus: Iterable[Iterable[str]] -- documents, each an iterable of terms.
            The corpus is iterated exactly once.

    Returns:
        collections.defaultdict mapping term -> id. The mapping is frozen
        before it is returned: looking up an unknown term raises KeyError
        instead of silently allocating a new id.
    '''
    logger.info('Builiding vocabulary...')
    token2id = collections.defaultdict()
    # A missing key is assigned the current size of the mapping, i.e. the next free id.
    token2id.default_factory = token2id.__len__
    for doc in corpus:
        for term in doc:
            token2id[term]
    # Freeze the vocabulary. Consumers size their matrices from len(token2id)
    # before they look terms up, so a vocabulary that keeps growing on lookup
    # would hand out ids that fall outside those matrices.
    token2id.default_factory = None
    logger.info('Vocabulary of size {} built.'.format(len(token2id)))
    return token2id

# See http://www.foldl.me/2014/glove-python/
class GloveVectorizer():
    '''Term co-occurrence counts computed with glove-python's ``Corpus`` class.

    Attributes:
        token2id: dict mapping term -> integer id. Built from the corpus with
            ``build_vocab`` when not supplied.
        term_count: total number of terms in the current corpus.
        nw_xy: co-occurrence matrix produced by ``fit``; None before ``fit``.
    '''

    def __init__(self, corpus=None, token2id=None):
        '''
        Parameters:
            corpus: Iterable[Iterable[str]] or None -- tokenized documents.
            token2id: optional pre-built vocabulary (term -> id).
        '''
        self.token2id = token2id
        self._id2token = None
        # Set by fit(); initialised here so that cooccurence() can detect a missing fit.
        self.nw_xy = None
        self.corpus = corpus

    @property
    def corpus(self):
        '''The tokenized corpus most recently assigned.'''
        return self._corpus

    @corpus.setter
    def corpus(self, value):
        '''Store the corpus, recount its terms and build a vocabulary if none exists yet.'''
        self._corpus = value
        self.term_count = sum(map(len, value or []))

        if self.token2id is None and value is not None:
            self.token2id = build_vocab(value)
            # Drop the cached inverse mapping; it belongs to the previous vocabulary.
            self._id2token = None

    @property
    def id2token(self):
        '''Inverse vocabulary (id -> term), built lazily and cached. None without a vocabulary.'''
        if self._id2token is None:
            if self.token2id is not None:
                self._id2token = {v: k for k, v in self.token2id.items()}
        return self._id2token

    def fit(self, corpus=None, size=2):
        '''Compute the co-occurrence matrix.

        Parameters:
            corpus: optional corpus replacing the one given to the constructor.
            size: window size handed to glove as ``window``.

        Returns:
            self, to allow call chaining.
        '''
        if corpus is not None:
            self.corpus = corpus

        assert self.token2id is not None, "Fit with no vocabulary!"
        assert self.corpus is not None, "Fit with no corpus!"

        glove_corpus = glove.Corpus(dictionary=self.token2id)
        # Fit on self.corpus rather than the argument: the argument is None
        # whenever the corpus was supplied through the constructor.
        glove_corpus.fit(self.corpus, window=size)

        self.nw_xy = glove_corpus.matrix

        return self

    def cooccurence(self, normalize='size', zero_diagonal=True):
        '''Return the computed co-occurrence values as a DataFrame.

        Parameters:
            normalize: 'size' divides by the corpus term count, 'max' divides
                by the largest matrix value, None returns absolute counts.
            zero_diagonal: accepted for signature parity with the HAL
                vectorizer; not used by this implementation.

        Returns:
            DataFrame with columns x_id, y_id, x_term, y_term, nw_xy, nw_x,
            nw_y and cwr, restricted to rows with cwr > 0. nw_x and nw_y are
            zero placeholders: per-term window counts are not computed here.
        '''
        assert self.nw_xy is not None, "Co-occurrence requested before fit!"

        coo_matrix = self.nw_xy.tocoo(copy=False)
        id2token = self.id2token

        df = pd.DataFrame({
            'x_id': coo_matrix.row,
            'y_id': coo_matrix.col,
            'x_term': [id2token[i] for i in coo_matrix.row],
            'y_term': [id2token[i] for i in coo_matrix.col],
            'nw_xy': coo_matrix.data,
            'nw_x': 0,
            'nw_y': 0,
        })

        norm = 1.0
        if normalize == 'size':
            norm = self.term_count
        elif normalize == 'max':
            # Use the matrix' own reduction; an empty matrix has no maximum, and
            # then there are no rows to scale anyway.
            norm = float(coo_matrix.max()) if coo_matrix.nnz else 1.0
        elif normalize is None:
            logger.warning('No normalize method specified. Using absolute counts...')
        else:
            assert False, 'Unknown normalize specifier'

        df_nw_xy = df.assign(cwr=df.nw_xy / norm)

        return df_nw_xy[df_nw_xy.cwr > 0]


In [7]:
import sys, array, collections
import scipy.sparse as sp
import numpy as np
import pandas as pd
import itertools
import text_analytic_tools.utility as utility

from glove import Corpus 

logger = utility.getLogger('corpus_text_analysis')

class HyperspaceAnalogueToLanguageVectorizer():
    """Hyperspace Analogue to Language (HAL) co-occurrence vectorizer.

    ``fit`` slides a window over every document and accumulates

    * ``nw_xy`` -- term x term matrix of distance-weighted co-occurrences of a
      term with the terms that follow it inside the window, and
    * ``nw_x``  -- per-term counter incremented for every window position the
      term occupies.

    ``cwr`` additionally stores its ratio matrix in ``cwr_matrix``. (It is kept
    under a separate name so that it does not replace the ``cwr`` method on
    the instance.)
    """

    def __init__(self, corpus=None, token2id=None, tick=utility.noop):
        """
        Parameter:
            corpus Iterable[Iterable[str]]  tokenized documents (optional here,
                may also be passed to ``fit``)
            token2id dict                   optional vocabulary (term -> id);
                built from the corpus when omitted
            tick callable                   called once per document by ``fit``
        """
        self.token2id = token2id
        self._id2token = None
        self.corpus = corpus

        self.nw_xy = None
        self.nw_x = None
        self.cwr_matrix = None
        self.tick = tick

    @property
    def corpus(self):
        """The tokenized corpus most recently assigned."""
        return self._corpus

    @corpus.setter
    def corpus(self, value):
        """Store the corpus, recount its terms and build a vocabulary if none exists yet."""
        self._corpus = value
        self.term_count = sum(map(len, value or []))

        if self.token2id is None and value is not None:
            self.token2id = build_vocab(value)
            # Drop the cached inverse mapping; it belongs to the previous vocabulary.
            self._id2token = None

    @property
    def id2token(self):
        """Inverse vocabulary (id -> term), built lazily and cached. None without a vocabulary."""
        if self._id2token is None:
            if self.token2id is not None:
                self._id2token = {v: k for k, v in self.token2id.items()}
        return self._id2token

    def sliding_window(self, seq, n):
        """Yield one (n + 1)-tuple per element of ``seq``: the element and its n successors.

        The sequence is padded with n trailing ``None`` values, so the last
        real element still heads a full-width window. Nothing is yielded for
        an empty sequence.
        """
        it = itertools.chain(iter(seq), [None] * n)
        memory = tuple(itertools.islice(it, n + 1))
        if len(memory) == n + 1:
            yield memory
        for x in it:
            memory = memory[1:] + (x,)
            yield memory

    def fit(self, corpus=None, size=2, distance_metric=0, zero_out_diag=False):
        """Train HAL on the corpus. Note that sentence borders (for now) are ignored.

        Parameters:
            corpus: optional corpus replacing the one given to the constructor.
            size: number of following terms paired with each window head.
            distance_metric: weight of a pair at distance d (1..size):
                0 -> linear, size - d + 1 (adjacent terms get ``size``);
                1 -> inverse, 1 / d;
                2 -> constant 1.
            zero_out_diag: skip pairs in which both terms are the same.

        Returns:
            self, to allow call chaining.

        Raises:
            ValueError: if ``distance_metric`` is not 0, 1 or 2.
        """
        if corpus is not None:
            self.corpus = corpus

        assert self.token2id is not None, "Fit with no vocabulary!"
        assert self.corpus is not None, "Fit with no corpus!"

        # weights[d - 1] is the weight of a pair whose terms are d positions apart.
        if distance_metric == 0:
            weights = [size - d + 1 for d in range(1, size + 1)]
        elif distance_metric == 1:
            weights = [1.0 / d for d in range(1, size + 1)]
        elif distance_metric == 2:
            weights = [1] * size
        else:
            raise ValueError('Unknown distance metric: {!r}'.format(distance_metric))

        # Inverse-distance weights are fractional; an integer matrix would
        # truncate every one of them except d == 1 to zero.
        dtype = np.float64 if distance_metric == 1 else np.int32

        n_terms = len(self.token2id)
        nw_xy = sp.lil_matrix((n_terms, n_terms), dtype=dtype)
        nw_x = np.zeros(n_terms, dtype=np.int32)

        # Iterate self.corpus rather than the argument: the argument is None
        # whenever the corpus was supplied through the constructor.
        for terms in self.corpus:

            id_terms = (self.token2id[term] for term in terms)

            self.tick()

            for win in self.sliding_window(id_terms, size):

                head = win[0]
                if head is None:
                    continue

                for x in win:
                    if x is not None:
                        nw_x[x] += 1

                for i in range(1, size + 1):

                    other = win[i]
                    if other is None:
                        # Padding past the end of the document.
                        continue

                    if zero_out_diag and head == other:
                        continue

                    nw_xy[head, other] += weights[i - 1]

        self.nw_x = nw_x
        self.nw_xy = nw_xy
        # Any ratio matrix computed earlier belongs to the previous fit.
        self.cwr_matrix = None

        return self

    def to_df(self):
        """Return the transpose of ``nw_xy`` as a dense float64 DataFrame labelled with terms."""
        columns = [self.id2token[i] for i in range(0, len(self.token2id))]
        return pd.DataFrame(
            data=self.nw_xy.todense(),
            index=list(columns),
            columns=list(columns),
            dtype=np.float64
        ).T

    def cwr(self, direction_sensitive=False, normalize='size'):
        """Compute nw_xy / (nw_x + nw_y - nw_xy) for all term pairs.

        The ratio matrix is stored in ``self.cwr_matrix``. Cells whose
        denominator is zero are set to 0.

        Parameters:
            direction_sensitive: accepted but currently not used.
            normalize: 'size' divides by the corpus term count, 'max' by the
                largest nw_xy value, 'sum' by the sum of nw_xy. Any other
                value leaves the ratios unscaled.

        Returns:
            DataFrame with columns x_id, y_id, x_term, y_term, cwr, one row per
            non-zero ratio, ordered by (x_id, y_id).
        """
        assert self.nw_xy is not None and self.nw_x is not None, "CWR requested before fit!"

        counts = self.nw_xy.toarray().astype(np.float64)
        nw = self.nw_x.astype(np.float64).reshape(-1, 1)

        denominator = nw + nw.T - counts
        ratio = np.zeros_like(counts)
        # Cells excluded by `where` keep the 0 they were initialised with.
        np.divide(counts, denominator, out=ratio, where=denominator != 0)

        norm = 1.0
        if normalize == 'size':
            norm = float(self.term_count)
        elif normalize == 'max':
            norm = float(counts.max()) if counts.size else 1.0
        elif normalize == 'sum':
            norm = float(counts.sum())

        # A zero norm only occurs when there is nothing to scale.
        if norm not in (0.0, 1.0):
            ratio = ratio / norm

        self.cwr_matrix = sp.lil_matrix(ratio)

        coo_matrix = self.cwr_matrix.tocoo(copy=False)
        df = pd.DataFrame({
            'x_id': coo_matrix.row,
            'y_id': coo_matrix.col,
            'cwr': coo_matrix.data
        }).sort_values(['x_id', 'y_id']).reset_index(drop=True)

        id2token = self.id2token
        df = df.assign(
            x_term=[id2token[i] for i in df.x_id],
            y_term=[id2token[i] for i in df.y_id]
        )

        return df[['x_id', 'y_id', 'x_term', 'y_term', 'cwr']]

    def cooccurence2(self, direction_sensitive=False, normalize='size', zero_diagonal=True):
        """Tabulate every term pair with a positive ratio in ``cwr_matrix``.

        ``cwr_matrix`` is computed with the given ``direction_sensitive`` and
        ``normalize`` if ``cwr`` has not been called since the last fit; an
        existing matrix is reused as is. ``zero_diagonal`` is not used.

        Returns:
            DataFrame with columns x_id, y_id, x_term, y_term, nw_xy, nw_x,
            nw_y, cwr, ordered by (x_id, y_id).
        """
        if self.cwr_matrix is None:
            self.cwr(direction_sensitive=direction_sensitive, normalize=normalize)

        # Visit only the stored cells instead of probing all n * n positions.
        coo_matrix = self.cwr_matrix.tocoo()
        order = np.lexsort((coo_matrix.col, coo_matrix.row))
        id2token = self.id2token

        rows = []
        for i, j, value in zip(coo_matrix.row[order], coo_matrix.col[order], coo_matrix.data[order]):
            if value > 0:
                i, j = int(i), int(j)
                rows.append((
                    i,
                    j,
                    id2token[i],
                    id2token[j],
                    self.nw_xy[i, j],
                    self.nw_x[i],
                    self.nw_x[j],
                    value
                ))

        return pd.DataFrame(rows, columns=['x_id', 'y_id', 'x_term', 'y_term', 'nw_xy', 'nw_x', 'nw_y', 'cwr'])

    def cooccurence(self, direction_sensitive=False, normalize='size', zero_diagonal=True):
        '''Return computed co-occurrence values.

        Parameters:
            direction_sensitive: when False, both directions of a pair are
                summed and reported once with x_id < y_id (diagonal dropped).
                When True the directed counts are used as they are.
            normalize: 'size' divides by the corpus term count, 'max' by the
                largest value of the matrix being reported, None returns
                unscaled ratios.
            zero_diagonal: only used when direction_sensitive is True; drops
                pairs of a term with itself.

        Returns:
            DataFrame with columns x_id, y_id, x_term, y_term, nw_xy, nw_x,
            nw_y, cwr, restricted to rows with cwr > 0.

        ``self.nw_xy`` is left unmodified.
        '''
        assert self.nw_xy is not None and self.nw_x is not None, "Co-occurrence requested before fit!"

        if not direction_sensitive:
            # Strict upper triangle of the symmetric sum. Built as a new matrix
            # so that the fitted counts stay intact for later calls.
            coo_matrix = sp.triu(self.nw_xy + self.nw_xy.T, k=1, format='coo')
        else:
            matrix = self.nw_xy.copy()
            if zero_diagonal:
                matrix.setdiag(0)
            coo_matrix = matrix.tocoo()
            coo_matrix.eliminate_zeros()

        df = pd.DataFrame({
            'x_id': coo_matrix.row,
            'y_id': coo_matrix.col,
            'nw_xy': coo_matrix.data
        }).sort_values(['x_id', 'y_id']).reset_index(drop=True)

        id2token = self.id2token
        df = df.assign(
            x_term=[id2token[i] for i in df.x_id],
            y_term=[id2token[i] for i in df.y_id],
            nw_x=self.nw_x[df.x_id.values],
            nw_y=self.nw_x[df.y_id.values]
        )

        df = df[['x_id', 'y_id', 'x_term', 'y_term', 'nw_xy', 'nw_x', 'nw_y']]

        norm = 1.0
        if normalize == 'size':
            norm = self.term_count
        elif normalize == 'max':
            # An empty matrix has no maximum, and then there are no rows to scale anyway.
            norm = float(coo_matrix.max()) if coo_matrix.nnz else 1.0
        elif normalize is None:
            logger.warning('No normalize method specified. Using absolute counts...')
        else:
            assert False, 'Unknown normalize specifier'

        ratio = (df.nw_xy / (df.nw_x + df.nw_y - df.nw_xy)) / norm
        # Negative and undefined (0/0) ratios count as "no association".
        ratio = ratio.clip(lower=0.0).fillna(0.0)

        df_nw_xy = df.assign(cwr=ratio)

        return df_nw_xy[df_nw_xy.cwr > 0]

def test_burgess_litmus_test():
    """Check the HAL vectorizer against two worked examples.

    Both sentences are fitted with a window of 5 and the linear distance
    metric; the resulting term matrices must equal the tabulated values
    exactly. Prints 'Test run OK' when both comparisons pass.
    """

    def hal_frame(sentence, columns):
        # Lower-case, tokenize on whitespace, fit and return the selected columns.
        tokens = sentence.lower().split()
        fitted = HyperspaceAnalogueToLanguageVectorizer().fit([tokens], size=5, distance_metric=0)
        return fitted.to_df().astype(np.int32)[columns].sort_index()

    # Example 1: "the horse raced past the barn fell".
    burgess_columns = ['the', 'horse', 'raced', 'past', 'barn', 'fell']
    burgess_values = {
        'barn':  {'.': 4, 'barn': 0, 'fell': 5, 'horse': 0, 'past': 0, 'raced': 0, 'the': 0},
        'fell':  {'.': 5, 'barn': 0, 'fell': 0, 'horse': 0, 'past': 0, 'raced': 0, 'the': 0},
        'horse': {'.': 0, 'barn': 2, 'fell': 1, 'horse': 0, 'past': 4, 'raced': 5, 'the': 3},
        'past':  {'.': 2, 'barn': 4, 'fell': 3, 'horse': 0, 'past': 0, 'raced': 0, 'the': 5},
        'raced': {'.': 1, 'barn': 3, 'fell': 2, 'horse': 0, 'past': 5, 'raced': 0, 'the': 4},
        'the':   {'.': 3, 'barn': 6, 'fell': 4, 'horse': 5, 'past': 3, 'raced': 4, 'the': 2},
    }
    burgess_expected = pd.DataFrame(burgess_values).astype(np.int32)[burgess_columns].sort_index()
    burgess_actual = hal_frame('The Horse Raced Past The Barn Fell .', burgess_columns)
    assert burgess_actual.equals(burgess_expected), "Test failed"

    # Example 2: the example in Chen, Lu.
    chen_lu_columns = ['the', 'basic', 'concept', 'of', 'word', 'association']
    chen_lu_actual = hal_frame('The basic concept of the word association', chen_lu_columns)
    chen_lu_values = {
        'the': [2, 5, 4, 3, 6, 4],
        'basic': [3, 0, 5, 4, 2, 1],
        'concept': [4, 0, 0, 5, 3, 2],
        'of': [5, 0, 0, 0, 4, 3],
        'word': [0, 0, 0, 0, 0, 5],
        'association': [0, 0, 0, 0, 0, 0],
    }
    chen_lu_expected = pd.DataFrame(
        chen_lu_values,
        index=['the', 'basic', 'concept', 'of', 'word', 'association'],
        dtype=np.int32
    ).sort_index()[chen_lu_columns]
    assert chen_lu_actual.equals(chen_lu_expected), "Test failed"

    print('Test run OK')
    
    
    
# Run the litmus test whenever this cell executes, so a broken vectorizer fails here.
test_burgess_litmus_test()


2019-09-01 10:18:23,050 : INFO : Builiding vocabulary...
2019-09-01 10:18:23,052 : INFO : Vocabulary of size 7 built.
2019-09-01 10:18:23,059 : INFO : Builiding vocabulary...
2019-09-01 10:18:23,060 : INFO : Vocabulary of size 6 built.


Test run OK


## <span style='color: green'>PREPARE </span> Compute Using Prepared Tokenized Corpus <span style='float: right; color: red'>MANDATORY</span>


In [8]:
import ipywidgets
import text_corpus
import time

class PreparedCorpusUserInterface():
    '''Widget form for choosing a tokenized corpus and the co-occurrence settings.'''

    def __init__(self, data_folder):
        '''
        Parameters:
            data_folder: folder that is searched for '*.tokenized.zip' corpus files.
        '''
        self.data_folder = data_folder

    def display(self, compute_handler):
        '''Build the form and wire the Compute button to ``compute_handler``.

        Parameters:
            compute_handler: callable invoked as
                compute_handler(filepath, window_size=..., distance_metric=...,
                                direction_sensitive=False, method=...)
                when the button is clicked. Its output is captured in the
                form's output area.

        Returns:
            The top-level VBox widget containing the form.
        '''
        # Imported locally: this cell uses both modules but never imports them itself.
        import glob
        import os

        def on_button_clicked(b):

            if self.filepath.value is None:
                return

            self.out.clear_output()
            with self.out:
                self.button.disabled = True
                try:
                    compute_handler(
                        self.filepath.value,
                        window_size=self.window_size.value,
                        distance_metric=self.distance_metric.value,
                        direction_sensitive=False, # self.direction_sensitive.value,
                        method=self.method.value
                    )
                finally:
                    # Re-enable even when the handler raises; otherwise the form
                    # stays locked until the cell is re-run.
                    self.button.disabled = False

        corpus_files = sorted(glob.glob(os.path.join(self.data_folder, '*.tokenized.zip')))
        # (label, value) pairs; the value is passed on as distance_metric.
        distance_metric_options = [
            ('linear', 0),
            ('inverse', 1),
            ('constant', 2)
        ]

        self.filepath            = widgets.Dropdown(description='Corpus', options=corpus_files, value=None, layout=widgets.Layout(width='400px'))
        self.window_size         = widgets.IntSlider(description='Window', min=2, max=40, value=5, layout=widgets.Layout(width='250px'))
        self.method              = widgets.Dropdown(description='Method', options=['HAL', 'Glove'], value='HAL', layout=widgets.Layout(width='200px'))
        self.button              = widgets.Button(description='Compute', button_style='Success', layout=widgets.Layout(width='115px',background_color='blue'))
        self.out                 = widgets.Output()

        self.distance_metric     = widgets.Dropdown(description='Dist.f.', options=distance_metric_options, value=2, layout=widgets.Layout(width='200px'))
        #self.direction_sensitive = widgets.ToggleButton(description='L/R', value=False, layout=widgets.Layout(width='115px',background_color='blue'))
        #self.zero_diagonal       = widgets.ToggleButton(description='Zero Diag', value=False, layout=widgets.Layout(width='115px',background_color='blue'))

        self.button.on_click(on_button_clicked)

        return widgets.VBox([
            widgets.HBox([
                widgets.VBox([
                    self.filepath,
                    self.method
                ]),
                widgets.VBox([
                    self.window_size,
                    self.distance_metric
                ]),
                widgets.VBox([
                    #self.direction_sensitive,
                    self.button
                ])
            ]),
            self.out])
    
import re
def source_corpus_filename(tokenized_corpus_name):
    '''Derive the source corpus archive name from a tokenized corpus name.

    Everything up to and including '.txt' that precedes '_preprocessed' is
    kept and '.zip' is appended.

    Parameters:
        tokenized_corpus_name: name or path of the tokenized corpus file.

    Returns:
        The derived name, or None when the name does not follow the
        '<source>.txt_preprocessed...' pattern or is not a string.
    '''
    try:
        m = re.match(r'(.*\.txt)_preprocessed.*', tokenized_corpus_name)
    except TypeError:
        # Not a string (e.g. None): there is nothing to derive a name from.
        return None
    if m is None:
        return None
    return m.group(1) + '.zip'

#FIXME: UNESCO-specific logic
def get_source_specific_index(source_name):
    '''Load the corpus index for ``source_name``, keyed by its 'local_number' column.

    Returns None when the domain logic has no index for the source.
    '''
    loaded = domain_logic.load_corpus_index(source_name)
    return None if loaded is None else loaded.set_index('local_number')
    
def do_some_stuff(
    filepath,
    window_size=5,
    distance_metric=0,
    direction_sensitive=False,
    normalize='size',
    zero_diagonal=True,
    method='HAL'
):
    '''Compute per-year term co-occurrences for a tokenized corpus and save them to Excel.

    A single vocabulary is built over the whole corpus; one vectorizer is then
    fitted per year on that year's documents. The yearly results are
    concatenated, 'cwr' is rescaled by its overall maximum, and the table is
    written to '<method>_<window_size>_result_co_occurrence_<timestamp>.xlsx'
    in the current working directory.

    Parameters:
        filepath: path of the tokenized corpus archive.
        window_size: sliding window size passed to the vectorizer.
        distance_metric: HAL distance weighting (0 linear, 1 inverse, 2 constant).
        direction_sensitive: passed to the HAL vectorizer's cooccurence().
        normalize: normalisation passed to cooccurence().
        zero_diagonal: passed to cooccurence().
        method: 'HAL' selects the HAL vectorizer, anything else the Glove one.

    Returns:
        None.
    '''
    corpus = text_corpus.SimplePreparedTextCorpus(filepath, lowercase=True)
    # Only tokens longer than two characters are kept; they are lower-cased and
    # stripped of surrounding underscores.
    doc_terms = [ [ t.lower().strip('_') for t in terms if len(t) > 2] for terms in corpus.get_texts() ]

    common_token2id = build_vocab(doc_terms)

    source_name = source_corpus_filename(filepath)
    document_index = domain_logic.compile_documents(corpus)

    # NOTE(review): the source index is loaded but not used below. The call is
    # kept because it may act as an existence check -- confirm, then remove.
    source_index = get_source_specific_index(source_name)

    if len(document_index) == 0:
        logger.warning('Corpus contains no documents. Nothing to compute.')
        return

    # Position of each document in doc_terms (assumes document_index lists the
    # documents in the order get_texts() yields them -- TODO confirm).
    document_index['sequence_id'] = range(0, len(document_index))
    min_year, max_year = int(document_index.year.min()), int(document_index.year.max())

    result_columns = ['year', 'x_term', 'y_term', 'nw_xy', 'nw_x', 'nw_y', 'cwr']
    yearly_frames = []

    for year in range(min_year, max_year + 1):

        year_indexes = list(document_index.loc[document_index.year == year].sequence_id)
        if not year_indexes:
            # Gap year: nothing to fit, and it would contribute no rows anyway.
            continue

        docs = [ doc_terms[y] for y in year_indexes ]

        logger.info('Year %s...', year)

        if method == "HAL":

            vectorizer = HyperspaceAnalogueToLanguageVectorizer(token2id=common_token2id)\
                .fit(docs, size=window_size, distance_metric=distance_metric)

            df_year = vectorizer.cooccurence(direction_sensitive=direction_sensitive, normalize=normalize, zero_diagonal=zero_diagonal)

        else:

            vectorizer = GloveVectorizer(token2id=common_token2id)\
                .fit(docs, size=window_size)

            df_year = vectorizer.cooccurence(normalize=normalize, zero_diagonal=zero_diagonal)

        # assign() returns a new frame; writing a column into the filtered
        # frame returned by cooccurence() triggers chained-assignment warnings.
        yearly_frames.append(df_year.assign(year=year)[result_columns])

    df = pd.concat(yearly_frames, ignore_index=True)

    # Rescale so that the strongest association over all years equals 1.
    df['cwr'] = df.cwr / df.cwr.max()

    result_filename = '{}_{}_result_co_occurrence_{}.xlsx'.format(method, window_size, time.strftime("%Y%m%d_%H%M%S"))
    df.to_excel(result_filename)
    print('Result saved to file {}'.format(result_filename))

    print('Now you are ready to do some serious stuff!')
    
# Render the corpus selection form; clicking Compute runs do_some_stuff with the chosen settings.
display(PreparedCorpusUserInterface(domain_logic.DATA_FOLDER).display(do_some_stuff))


VBox(children=(HBox(children=(VBox(children=(Dropdown(description='Corpus', layout=Layout(width='400px'), opti…