## <span style='color: green'>PREPARE </span> UNESCO Courier Sample Corpus <span style='float: right; color: green'>TRY IT</span>

In [20]:
# spaCy model name; same value as the default `model` argument of the helpers below.
model_name = 'en_core_web_sm'
import domain_logic_unesco as domain_logic

# Streaming spaCy docs...
def create_nlp(model='en_core_web_sm', disable=None):
    '''Load the spaCy pipeline `model` and replace its tokenizer.

    `disable` is forwarded unchanged to `spacy.load`. The tokenizer is
    replaced by whatever `textacy_utility.keep_hyphen_tokenizer` returns for
    the loaded pipeline.
    '''
    pipeline = spacy.load(model, disable=disable)
    replacement_tokenizer = textacy_utility.keep_hyphen_tokenizer(pipeline)
    pipeline.tokenizer = replacement_tokenizer
    return pipeline

def create_source_stream(source_path, lang, document_index=None):
    '''Open the compressed corpus at `source_path` and return the document stream
    produced by `domain_logic.get_document_stream` for it.'''
    return domain_logic.get_document_stream(
        text_corpus.CompressedFileReader(source_path),
        lang,
        document_index=document_index,
    )

def source_document_stream(source_path, document_index=None, model='en_core_web_sm', files_of_interest=None):
    '''Yield (filename, spaCy doc) pairs for the text files in the given zip file.

    Parameters
    ----------
    source_path : path of the compressed corpus, passed to `create_source_stream`.
    document_index : optional index, forwarded to `create_source_stream`.
    model : spaCy model name; its first two characters are used as language code.
    files_of_interest : optional container of filenames; when given, every
        other file is skipped (and not counted).
    '''
    logger.info('Loading model {}...'.format(model))
    nlp = create_nlp(model=model, disable=('ner', 'parser', 'textcat'))
    logger.info('Processing {}...'.format(source_path))
    # Forward the caller's index; it used to be accepted but silently dropped.
    stream = create_source_stream(source_path, model[:2], document_index=document_index)
    file_counter = 0

    for filename, text, _ in stream:
        if files_of_interest is not None and filename not in files_of_interest:
            continue

        file_counter += 1
        doc = nlp(text)
        if file_counter % 10 == 0:
            logger.info('Processed {} files...'.format(file_counter))
            # NOTE(review): this `break` ends the stream at the 10th accepted
            # file, and that file is parsed but never yielded. It looks like a
            # sample-size limit for this "TRY IT" cell - confirm before
            # relying on the stream being complete.
            break
        yield filename, doc
        # Drop the reference so the parsed document can be collected early.
        doc = None


In [22]:
import numpy as np
import pandas as pd
import text_corpus
import re, os



In [61]:
import os
import pandas as pd
import text_corpus
import re
import logging
import numpy as np

# Logger shared by the functions in this cell; its level is set to INFO.
logger = logging.getLogger('UNESCO')
logger.setLevel(logging.INFO)

# FIXME VARYING ASPECTS: 
# Document filters (currently none are defined).
DOCUMENT_FILTERS = [
    ]
        
# (label, list of column names) pairs.
GROUP_BY_OPTIONS = [
    ('Year', ['year']),
]

# Filename layout parsed by split_name: group 1 is the leading run of digits,
# group 2 the next two word characters; any further word characters are
# ignored up to the '.txt' suffix.
FILE_PATTERN = r'([0-9]+)(\w{2})\w*\.txt'

def split_name(filename):
    '''Split `filename` into the two groups captured by FILE_PATTERN.

    Returns the pair (digits, two-character code) as strings, or (None, None)
    after logging an error when the filename does not match.
    '''
    match = re.match(FILE_PATTERN, filename)
    if match is not None:
        number, code = match.groups()
        return number, code
    logger.error('Parse failed for filename: ' + filename)
    return None, None

def compile_documents_by_filename(filenames):
    '''Build a document index DataFrame from a collection of filenames.

    Each filename is parsed with `split_name`. The result has one row per
    filename and the columns `local_number`, `document_id` (both the integer
    parsed from the filename), `filename`, `lang`, `year` (always 0) and
    `title` (the document id zero-padded to 10 characters).

    An empty `filenames` collection yields an empty frame with the same
    columns (it used to fail while unpacking the parsed pairs).

    Raises TypeError when a filename cannot be parsed, because `split_name`
    then returns None for the number.
    '''
    filenames = list(filenames)
    parsed = [split_name(x) for x in filenames]
    local_numbers = [int(number) for number, _ in parsed]
    langs = [lang for _, lang in parsed]

    df = pd.DataFrame({
        'local_number': local_numbers,
        'document_id': local_numbers,
        'filename': filenames,
        'lang': langs,
        'year': 0
    })

    df['title'] = [str(x).zfill(10) for x in local_numbers]

    return df

def chunk_text(text, step=50000):
    '''Yield consecutive slices of `text`, each at most `step` items long.'''
    for start in range(0, len(text), step):
        yield text[start:start + step]

def load_corpus_index(corpus_name):
    '''Load the tab-separated document index that belongs to a corpus file.

    Given a corpus filename "xxxxx.zip", the document index name is expected
    to be "xxxxx_index.txt". If that file does not exist,
    "./global_corpus_index.txt" is used instead.

    Returns the index as a DataFrame, or None (after logging) when neither
    file exists.
    '''
    basename, _ = os.path.splitext(corpus_name)
    corpus_index_name = '{}_index.txt'.format(basename)
    global_index_name = './global_corpus_index.txt'

    # The corpus-specific index takes precedence over the global one.
    if os.path.isfile(corpus_index_name):
        index_name = corpus_index_name
    elif os.path.isfile(global_index_name):
        index_name = global_index_name
    else:
        logger.info('No Corpus Index found (looked for {} or {}.)'.format(corpus_index_name, global_index_name))
        return None

    # The placeholder was missing, so the chosen file was never logged.
    logger.info('Using corpus index: {}'.format(index_name))

    return pd.read_csv(index_name, sep='\t')

def compile_unesco_corpus_index(source):
    '''Return a document index (indexed by `local_number`) for `source`.

    `source` may be a path string or an object with a `path` and/or
    `filenames` attribute. A pre-compiled index file is preferred; otherwise
    the index is derived from `source.filenames`. Returns None when neither
    is possible.
    '''
    has_path = hasattr(source, 'path')
    if has_path or isinstance(source, str):
        # Try to load a pre-compiled index if one exists
        precompiled = load_corpus_index(source.path if has_path else source)
        if precompiled is not None:
            precompiled['local_number'] = precompiled.local_number.astype(np.int64)
            precompiled = precompiled.set_index('local_number')
            # Keep the key available as ordinary columns as well
            precompiled['local_number'] = precompiled.index
            precompiled['document_id'] = precompiled.index
            return precompiled

    if not hasattr(source, 'filenames'):
        return None

    # Fallback: create the index out of the file names
    fallback = compile_documents_by_filename(source.filenames).set_index('local_number')
    fallback['local_number'] = fallback.index
    if 'year' not in fallback.columns:
        fallback['year'] = 0
    return fallback

def compile_documents(corpus, index=None):
    '''Build the document index from the `filename` metadata of each document
    in `corpus`. The `index` argument is not used.'''
    return compile_documents_by_filename(
        [document.metadata['filename'] for document in corpus]
    )

def get_document_stream(source, lang, **kwargs):
    '''Yield (filename, text, metadata) for the selected documents of a corpus.

    Parameters
    ----------
    source : path string (opened with `text_corpus.CompressedFileReader`) or
        an already constructed reader.
    lang : accepted for interface compatibility.
        NOTE(review): the language filter below is hard-coded to 'en' and
        does not consult this argument - confirm whether that is intended.
    **kwargs : accepted and ignored.

    Only documents whose index `year` lies in the decades 196x or 197x are
    read. Documents are skipped when the index reports fewer than 50 words or
    a language other than 'en'; texts whose indexed `n_bytes` exceeds 50000
    are cut to their first 50000 characters. `metadata` is the index row of
    the document as a dict.
    '''
    reader = text_corpus.CompressedFileReader(source) if isinstance(source, str) else source

    df_corpus_index = compile_unesco_corpus_index(reader)

    # Restrict the reader to files from the 1960s and 1970s.
    in_scope = (df_corpus_index.year // 10).isin([196, 197])
    reader.filenames = sorted(df_corpus_index[in_scope].filename.values)

    df_corpus_index = df_corpus_index.loc[df_corpus_index.filename.isin(reader.filenames)].sort_values('filename')

    assert len(reader.filenames) == len(df_corpus_index)

    trunc_size = 50000
    for filename, text in reader:

        # Use a separate name for the parsed code so the `lang` argument is
        # not overwritten by the loop.
        local_number, _filename_lang = split_name(filename)
        if local_number is None:
            # split_name has already logged the parse failure.
            continue

        metadata = df_corpus_index.loc[int(local_number)].to_dict()

        if metadata['n_words'] < 50:
            logger.info('WARNING: Skipping empty file {} '.format(filename))
            continue

        if metadata['lang'] != 'en':
            logger.info('WARNING: Skipping file (unknown language) {} '.format(filename))
            continue

        if metadata['n_bytes'] > trunc_size:
            logger.info('WARNING: Truncating huge file {} '.format(filename))
            text = text[:trunc_size]

        yield filename, text, metadata

# FIXME VARYING ASPECTs: What attributes to extend
def add_domain_attributes(df, document_index):
    '''Inner-join `df` with `document_index` on their indexes and return only
    the `filename` and `year` columns of the result.'''
    wanted_columns = ['filename', 'year']
    joined = pd.merge(df, document_index, how='inner', left_index=True, right_index=True)
    return joined[wanted_columns]

    

In [62]:
# Exploratory cell: build the corpus index for the test archive, restrict it to
# the 1960s/1970s and print the first item of the document stream.
source_path = './../../data/unesco_test_corpus_20190307.txt_preprocessed.zip'
reader = text_corpus.CompressedFileReader(source_path)
df_corpus_index = compile_unesco_corpus_index(reader)

# NOTE(review): the reader and the index are created a second time here,
# repeating the two statements above.
reader = text_corpus.CompressedFileReader(source_path)

df_corpus_index = compile_unesco_corpus_index(reader)

# Same decade filter and consistency check as inside get_document_stream.
reader.filenames = sorted(list(df_corpus_index[(df_corpus_index.year//10).isin([196, 197])].filename.values))

df_corpus_index = df_corpus_index.loc[df_corpus_index.filename.isin(reader.filenames)].sort_values('filename')

assert len(reader.filenames) == len(df_corpus_index)

#len(df_corpus_index[(df_corpus_index.year>=1960)&(df_corpus_index.year<1970)].filename.values)
# The value of this expression is discarded (it is not the last statement of the cell).
len(df_corpus_index[(df_corpus_index.year//10).isin([196, 197])].filename.values)
#filenames = df_corpus_index.loc[((df_corpus_index.year >= 1960 & df_corpus_index.year >= 1960))].valuesprint('hej')
# get_document_stream opens its own reader from the path.
stream = get_document_stream(source_path, 'en')

# Print only the first (filename, text, metadata) item.
for x in stream:
    print(x)
    break

('001173engo.txt', "Cultural policy\n a preliminary\n study\n Unesco\nStudies and documents on cultural policies 1\n Published by the\n United Nations Educational, Scientific\n and Cultural Organization\n Place de Fontenoy, 75 Pari~-7~\n First edition 1969\n Second, revised edition 1969\nPrinted by Imprimerie Blanchard, Paris\n 0 Unesco 1969\n Printed in France\n SHC,69/XIX. 1 a/A\nThis publication is the nrst in the Studies and Documents on Cultural Policies series,\npublished as part of the programme adopted by the Unesco General Conference at its\nfifteenth session for the study of cultural policies.\n In this context, 'cultural policy' is taken to mean a body of operational principles,\nadministrative and budgetary practices and procedures which provide a basis for cultural\naction by the State. Obviously, there cannot be oue cultural policy suited to all countries;\neach Member State determines its own cultural policy according to the cultural values,\naims and choices it sets for

In [None]:
    # NOTE(review): this cell is an indented fragment with no enclosing `def`,
    # so it cannot run as it stands. It resembles the loop in
    # get_document_stream, but it reads `lookup` and `df_index`, neither of
    # which is defined in the visible notebook, and it yields nothing.
    row_id = 0
    for filename, text in reader:
        
        local_number, lang = split_name(filename)
        local_number = int(local_number)
        # Merge the per-filename row of `lookup` into the per-number row of `df_index`.
        namedata = lookup.loc[filename].to_dict()
        metadata = df_index.loc[local_number].to_dict()
        metadata.update(namedata)
        
        # Keep only documents from 1960 up to, but not including, 1970.
        if not (1960 <= metadata['year'] < 1970):
            #print('SKIPPING: file since not 1960ths ' + filename)
            continue
            
        if metadata['n_words'] < 50:
            print('WARNING: skipping empty file ' + filename)
            continue
            
        if metadata['lang'] != 'en':
            print('WARNING: Unknown language, skipping file ' + filename)
            continue
        
        trunc_size = 50000
        if metadata['n_bytes'] > trunc_size:
            print('WARNING: Truncating file ' + filename)
            text = text[:trunc_size]
            
#        i = 0
#        for segment in chunk_text(text, step=50000):
#            basename = str(local_number).zfill(10) + '_' + str(i).zfill(3)
#            yield filename, text[:100000], metadata
#            i += 1
            
        # Counts the documents that passed every filter; the count is not used afterwards.
        row_id += 1

## <span style='color: green'>PREPARE/DESCRIBE </span> Find Key Terms - Semantic Network<span style='float: right; color: green'>WORK IN PROGRESS</span>



In [None]:
# NOTE(review): this cell looks like the body of a key-term extraction function
# pasted without its `def` line: the `return` statements and the `kwargs`
# lookups below are only valid inside a function, so the cell cannot run as a
# script. The statements up to `n_keyterms=10` read like the former parameter
# list with its defaults.
doc
normalize='lemma'
window_width=2
edge_weighting='binary'
ranking_algo='pagerank'
# NOTE(review): the trailing comma makes this the tuple `(False,)`, so the
# `join_key_words is False` test further down can never be true.
join_key_words=False,
n_keyterms=10


# A float n_keyterms is interpreted as a fraction of the document length.
if isinstance(n_keyterms, float):
    if not 0.0 < n_keyterms <= 1.0:
        raise ValueError('`n_keyterms` must be an int, or a float between 0.0 and 1.0')
    n_keyterms = int(round(len(doc) * n_keyterms))

    
include_pos = {'NOUN', 'PROPN', 'ADJ'}
# NOTE(review): called without arguments, and the result is overwritten by
# every branch below.
word_list = textacy.extract.words()
# word_list holds every token in the chosen normal form; good_word_list only
# the tokens that are neither stop words nor punctuation and whose POS tag is
# in include_pos.
if normalize == 'lemma':
    word_list = [word.lemma_ for word in doc]
    good_word_list = [word.lemma_ for word in doc
                      if not word.is_stop and not word.is_punct and word.pos_ in include_pos]
elif normalize == 'lower':
    word_list = [word.lower_ for word in doc]
    good_word_list = [word.lower_ for word in doc
                      if not word.is_stop and not word.is_punct and word.pos_ in include_pos]
elif not normalize:
    word_list = [word.text for word in doc]
    good_word_list = [word.text for word in doc
                      if not word.is_stop and not word.is_punct and word.pos_ in include_pos]
else:
    # Any other truthy `normalize` is called as a function on each token.
    word_list = [normalize(word) for word in doc]
    good_word_list = [normalize(word) for word in doc
                      if not word.is_stop and not word.is_punct and word.pos_ in include_pos]

# Drop empty normal forms.
good_word_list = [word for word in good_word_list if word]

graph = network.terms_to_semantic_network(good_word_list, window_width=window_width, edge_weighting=edge_weighting)

# rank nodes by algorithm, and sort in descending order
# NOTE(review): rank_nodes_by_divrank / rank_nodes_by_bestcoverage are not
# defined in the visible notebook, and `word_ranks` stays unbound when
# `ranking_algo` matches none of the three names.
if ranking_algo == 'pagerank':
    word_ranks = nx.pagerank_scipy(graph, weight='weight')
elif ranking_algo == 'divrank':
    word_ranks = rank_nodes_by_divrank(graph, r=None, lambda_=kwargs.get('lambda_', 0.5), alpha=kwargs.get('alpha', 0.5))
elif ranking_algo == 'bestcoverage':
    word_ranks = rank_nodes_by_bestcoverage(graph, k=n_keyterms, c=kwargs.get('c', 1), alpha=kwargs.get('alpha', 1.0))

# bail out here if all we wanted was key *words* and not *terms*
if join_key_words is False:
    return [(word, score) for word, score in sorted(word_ranks.items(), key=operator.itemgetter(1), reverse=True)[:n_keyterms]]

# Only the best-ranked quarter of the words may take part in joined terms.
top_n = int(0.25 * len(word_ranks))
top_word_ranks = { word: rank for word, rank in sorted(word_ranks.items(), key=operator.itemgetter(1), reverse=True)[:top_n] }

# join consecutive key words into key terms
seen_joined_key_terms = set()
joined_key_terms = []
for key, group in itertools.groupby(word_list, lambda word: word in top_word_ranks):
    if key is True:
        words = list(group)
        term = ' '.join(words)
        if term in seen_joined_key_terms:
            continue
        seen_joined_key_terms.add(term)
        # A joined term scores the sum of the ranks of its words.
        joined_key_terms.append((term, sum(word_ranks[word] for word in words)))

return sorted(joined_key_terms, key=operator.itemgetter(1, 0), reverse=True)[:n_keyterms]


corpus = current_corpus()
doc = corpus[0]
include_pos = { 'NOUN', 'PROPN', 'ADJ' }
#exclude_pos = { 'PRON' }
window_width = 10
edge_weighting = 'cooc_freq'
#key_terms = textacy.keyterms.key_terms_from_semantic_network(
#    doc, normalize='lemma', window_width=4, edge_weighting='cooc_freq', ranking_algo='pagerank', join_key_words=False, n_keyterms=1000000
#)

terms = [x for x in textacy.extract.words(doc, filter_stops=True, filter_punct=True, filter_nums=False, include_pos=include_pos, exclude_pos=exclude_pos, min_freq=1)]
graph = textacy.network.terms_to_semantic_network(terms, normalize='lemma', window_width=window_width, edge_weighting=edge_weighting)
nodes = [{'name': str(i)} for i in graph.nodes(data=True)]
links = [{'source': u[0], 'target': u[1], 'weight': u[2] } for u in graph.edges(data='weight')]

import json
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
%matplotlib inline
fig, ax = plt.subplots(1, 1, figsize=(22, 12));
nx.draw_networkx(graph, ax=ax)


In [None]:
import glove
import numpy as np
import pandas as pd
# Use fully qualified option names: the short forms 'max_columns' and
# 'precision' also match the styler.* options of newer pandas releases, and
# set_option rejects a pattern that matches more than one key.
pd.set_option('display.max_columns', 20)
pd.set_option('display.precision', 2)

def m2df(matrix, columns):
    '''Wrap a square matrix in a DataFrame that uses `columns` as both the row
    and the column labels.

    `columns` is materialised once, so a one-shot iterable (e.g. a generator)
    works too; previously it was consumed twice and the second pass was empty.
    '''
    labels = list(columns)
    return pd.DataFrame(data=matrix, index=labels, columns=labels)

corpus_words = ['the', 'basic', 'concept', 'of', 'the', 'word', 'association']

# Give every distinct token a dense id in order of first appearance.
token2id = { }
for w in corpus_words:
    if w not in token2id:
        token2id[w] = len(token2id)

# Inverse mapping, and the tokens listed in id order (used as table labels).
id2token = { k:w for w, k in token2id.items() }
columns = [ id2token[i] for i in range(0, len(id2token))]

print(token2id)
print(id2token)

corpus = [corpus_words, corpus_words]

# Pass the vocabulary built above; the previously referenced name `dictionary`
# was never defined in this cell.
model = glove.Corpus(dictionary=token2id)
model.fit(corpus, window=5)

X = model.matrix.astype(np.float64)

# Divide by the row sums; presumably this normalises every row of the
# co-occurrence matrix - confirm the broadcasting for the matrix type that
# glove returns. NaN entries produced by the division are replaced by 0.
Xi = X.sum(axis=1)
Pij = (X / Xi)
Pij[np.isnan(Pij)] = 0
Pji = Pij.T

print(m2df(model.matrix.todense(), columns))
print(m2df(Pij, columns))
print(m2df(Pji, columns))
print(m2df(Pij+Pji, columns))

Pij.sum(axis=1)

#print(model.dictionary)

#Pij = np.divide(X, Xi, out=np.zeros_like(X), where=Xi!=0)
    
#for word in corpus_words:
#    assert word in model.dictionary

#assert model.matrix.shape == (len(corpus_words), len(corpus_words))

#expected = [[0.0, 1.0, 0.5],
#            [0.0, 0.0, 1.0],
#            [0.0, 0.0, 0.0]]

#assert (model.matrix.todense().tolist() == expected)


\begin{aligned}
p(A, B, k) &= \frac{\operatorname{cooccurrence}(A, B, k)}{\operatorname{count}(A)}
\end{aligned}

In [None]:
# Example sentence, lower-cased and split on whitespace.
terms = 'The Horse Raced Past The Barn Fell .'.lower().split()
# Hand-written table of pair weights for the example sentence: outer keys
# become the columns and inner keys the rows of df_answer.
# NOTE(review): presumably the expected co-occurrence weights to compare with
# hal(mycorpus, 5).to_dict() in a later cell - confirm. There is no outer
# entry for '.'.
answer = {
 'barn':  {'.': 4,  'barn': 0,  'fell': 5,  'horse': 0,  'past': 0,  'raced': 0,  'the': 0},
 'fell':  {'.': 5,  'barn': 0,  'fell': 0,  'horse': 0,  'past': 0,  'raced': 0,  'the': 0},
 'horse': {'.': 0,  'barn': 2,  'fell': 1,  'horse': 0,  'past': 4,  'raced': 5,  'the': 3},
 'past':  {'.': 2,  'barn': 4,  'fell': 3,  'horse': 0,  'past': 0,  'raced': 0,  'the': 5},
 'raced': {'.': 1,  'barn': 3,  'fell': 2,  'horse': 0,  'past': 5,  'raced': 0,  'the': 4},
 'the':   {'.': 3,  'barn': 6,  'fell': 4,  'horse': 5,  'past': 3,  'raced': 4,  'the': 2}
}
df_answer = pd.DataFrame(answer)
df_answer
#list(sliding_window3(chain(terms, [None]*5), n=5))
#list(sliding_window_it(terms, 2))

In [None]:
from itertools import chain, combinations
import pandas as pd
import nltk
from nltk.util import ngrams

def skipgrams(sequence, n, k):
    """Yield (skipgram, weight) pairs; modified from NLTK to give a distance.

    The sequence is scanned with right-padded windows of n + k items. The
    first item of a window is the head; every combination of n - 1 of the
    remaining items is appended to it. The weight is k minus the enumeration
    index of the combination, so for n == 2 the word directly after the head
    gets weight k. Combinations that end in the padding symbol are skipped.
    """
    pad = None
    windows = ngrams(sequence, n + k, pad_right=True, right_pad_symbol=pad)
    for window in windows:
        anchor, followers = window[:1], window[1:]
        position = 0
        for picked in combinations(followers, n - 1):
            if picked[-1] is not pad:
                yield anchor + picked, k - position
            position += 1
            
def hal(corpus, window):
    """Sum the 2-word skipgram weights of `corpus` into a word-by-word table.

    The weights from `skipgrams(corpus, 2, window)` are summed per
    (word1, word2) pair and pivoted with word1 as rows and word2 as columns;
    missing pairs become 0. The transposed table is returned, so its rows are
    the second words and its columns the first words.
    """
    pairs = pd.DataFrame(skipgrams(corpus, 2, window), columns=['skipgram', 'weight'])
    # Split the (word1, word2) tuples into two columns.
    pairs[['word1', 'word2']] = pairs['skipgram'].apply(pd.Series)
    pairs = pairs.drop('skipgram', axis=1)
    totals = pairs.groupby(['word1', 'word2']).sum()
    table = totals.unstack().fillna(0).astype(int)
    # Flatten the ('weight', word2) column index down to the word2 labels.
    table.columns = table.columns.levels[1].values
    table.index = table.index.values
    return table.T

# Lower-cased tokens of the example sentence.
mycorpus = [word.lower() for word in 'The horse raced past the barn fell .'.split()]

hal(mycorpus, 5).to_dict()
#list(ngrams(mycorpus, 2+5, pad_right=True, right_pad_symbol=None))

In [None]:
from scipy.sparse import lil_matrix

WINDOW_SIZE = 2
LOG_BASE = 2

def run_hal(train_data):
    """Accumulate distance- and IDF-weighted neighbour counts into `voc`.

    `train_data` is a list of token lists. For every token, each other token
    within WINDOW_SIZE positions on either side (inside the same sentence)
    adds `(1 / distance) * log(num_words / occ[neighbour], LOG_BASE)` to
    `voc[token][neighbour]`.

    Reads the module-level names `num_words`, `occ`, `WINDOW_SIZE` and
    `LOG_BASE`, and updates the module-level dict `voc` in place, which must
    already hold a dict for every token.
    """
    import math  # local import: this cell does not import math itself

    for sentence in train_data:
        sentence_size = len(sentence)

        for j, key in enumerate(sentence):
            # Clamp the sliding window to the sentence boundaries.
            start_ind = max(0, j - WINDOW_SIZE)
            end_ind = min(sentence_size, j + WINDOW_SIZE + 1)

            for index in range(start_ind, end_ind):
                if index == j:
                    continue

                neighbour = sentence[index]
                weight = 1 / abs(j - index)
                idf = math.log((num_words / occ[neighbour]), LOG_BASE)
                # The first co-occurrence of a pair used to be stored as 0,
                # which dropped its contribution; it is now counted like
                # every later one.
                voc[key][neighbour] = voc[key].get(neighbour, 0) + weight * idf
                        

In [None]:
# preprocessing
import re
import unidecode
import math
from scipy.sparse import lil_matrix
import operator
# sorting map
import numpy as np
# local imports
import czech_stemmer as stem
from sklearn.metrics.pairwise import cosine_similarity

############################ CONSTANTS AND LOADING DATA

# Input files, read once below.
TRAIN_DATA_PATH = '../data/train.txt'
STOPWORDS_PATH = '../data/stopwords.txt'
# Matches every character that is not an ASCII letter.
REGEX = re.compile('[^a-zA-Z]')
# Number of neighbours considered on each side of a word in run_hal.
WINDOW_SIZE = 2
# Logarithm base of the IDF term in run_hal.
LOG_BASE = 2
# Number of most frequent words kept by select_top_occurs.
TOP_WORD_OCCUR = 3000
# Number of entries shown by print_results.
TOP_RESULTS = 10

# load data from txt files except stopwords (without accents)
# readlines() keeps the trailing newline of every line.
with open(STOPWORDS_PATH, encoding="utf8") as file:
    stopwords = file.readlines()
with open(TRAIN_DATA_PATH, encoding="utf8") as file:
    train_data = file.readlines()

############################ METHODS

# preprocess string    
def to_string(str):
    """Return `str` transliterated with unidecode, lower-cased, and with every
    character matched by REGEX removed."""
    lowered = unidecode.unidecode(str).lower()
    return REGEX.sub('', lowered)

# check if is string among stopwords if yes returns empty string
# otherwise returns string
def remove_stopwords(str):
    """Return '' when `str` is a stopword, otherwise return `str` unchanged.

    Each entry of the module-level `stopwords` list is compared after
    stripping surrounding whitespace, since the list is read with
    `readlines()` and keeps its newlines. The previous substring test also
    removed every word that merely occurs inside a stopword.
    """
    if any(str == stopword.strip() for stopword in stopwords):
        return ''
    return str

# expected list of sentences which tokenize
# preprocess and returns
def preprocess_sentences(loaded_sentences, use_stopwords = True, 
                         use_stemm = False):
    """Tokenize and normalise every sentence.

    Each sentence is split on whitespace and each token passed through
    `to_string`. With `use_stopwords` the token also goes through
    `remove_stopwords`; tokens that end up empty are dropped. With
    `use_stemm` the surviving token is stemmed with `stem.cz_stem`.
    Returns one token list per input sentence (possibly empty).
    """
    sentences = []
    for raw_sentence in loaded_sentences:
        tokens = []
        for raw_token in raw_sentence.split():
            token = to_string(raw_token)
            if use_stopwords:
                token = remove_stopwords(token)
            if token == '':
                continue
            if use_stemm:
                token = stem.cz_stem(token)
            tokens.append(token)
        sentences.append(tokens)
    return sentences

# build scipy sparse matrix which represents ccoocurencest matrix
def build_sparse_matrix(voc, indexes):
    """Copy the nested dict `voc` into a square scipy LIL matrix.

    `voc[a][b]` is stored at row `indexes[a]`, column `indexes[b]`; the matrix
    has one row and one column per key of `voc`.
    """
    size = len(voc)
    matrix = lil_matrix((size, size), dtype=float)
    for row_word in voc:
        for col_word, weight in voc[row_word].items():
            matrix[indexes[row_word], indexes[col_word]] = weight
    return matrix

# compute cosine similarity between pivot matrix and all others words matrix   
def compute_results(indexes, context_word, sparse_matrix):
    """Compare the matrix column of `context_word` with every word's column.

    For each word in `indexes`, the column of `sparse_matrix` at that word's
    index is transposed and passed to `cosine_similarity` together with the
    transposed pivot column. Returns a dict mapping each word to that call's
    result (requested with `dense_output=False`).
    """
    pivot_column = sparse_matrix[:, indexes[context_word]]
    pivot_vector = np.transpose(pivot_column)
    results = {}
    for word, column_index in indexes.items():
        candidate = np.transpose(sparse_matrix[:, column_index])
        results[word] = cosine_similarity(pivot_vector, candidate, dense_output=False)
    return results

# select only TOP most frequented words
def select_top_occurs(train_data, occ):
    """Count word occurrences into `occ` and keep only the most frequent words.

    `occ` is updated in place with the count of every token in `train_data`.
    The TOP_WORD_OCCUR words with the highest counts are retained; all other
    tokens are removed from the sentences, and sentences left empty are
    dropped. Returns the reduced list of sentences.
    """
    for sentence in train_data:
        for token in sentence:
            occ[token] = occ.get(token, 0) + 1

    ranked = sorted(occ.items(), key=operator.itemgetter(1), reverse=True)
    kept = dict(ranked[:TOP_WORD_OCCUR])

    reduced = []
    for sentence in train_data:
        survivors = [token for token in sentence if token in kept]
        if survivors:
            reduced.append(survivors)
    return reduced


# print results    
def print_results(res):
    """Print the TOP_RESULTS highest-scoring words of `res`.

    Every value of `res` is first replaced, in place, by the first element of
    its flattened dense array. The header uses the module-level
    `context_word`.
    """
    for key in res:
        res[key] = res[key].toarray().flatten()[0]
    ranked = sorted(res.items(), key=operator.itemgetter(1), reverse=True)[:TOP_RESULTS]
    print("##### TOP " + str(TOP_RESULTS) + " results for word '" + context_word + "'")
    for word, score in ranked:
        print(word + ": %0.3f" % score)

# find counts of neighbours for building coocurency matrix        
def run_hal(train_data):
    """Accumulate distance- and IDF-weighted neighbour counts into `voc`.

    `train_data` is a list of token lists. For every token, each other token
    within WINDOW_SIZE positions on either side (inside the same sentence)
    adds `(1 / distance) * log(num_words / occ[neighbour], LOG_BASE)` to
    `voc[token][neighbour]`.

    Reads the module-level names `num_words`, `occ`, `WINDOW_SIZE` and
    `LOG_BASE`, and updates the module-level dict `voc` in place, which must
    already hold a dict for every token.
    """
    for sentence in train_data:
        sentence_size = len(sentence)
        for j, key in enumerate(sentence):
            # Clamp the sliding window to the sentence boundaries.
            start_ind = max(0, j - WINDOW_SIZE)
            end_ind = min(sentence_size, j + WINDOW_SIZE + 1)
            for index in range(start_ind, end_ind):
                if index == j:
                    continue
                neighbour = sentence[index]
                weight = 1 / abs(j - index)
                idf = math.log((num_words / occ[neighbour]), LOG_BASE)
                # The first co-occurrence of a pair used to be stored as 0,
                # which dropped its contribution; it is now counted like
                # every later one.
                voc[key][neighbour] = voc[key].get(neighbour, 0) + weight * idf
                        
############################ MAIN   
                        
# remove accents, lower case and eventually stem or remove stopwords
# Normalise the raw lines (accents removed, lower case, stopwords dropped).
train_data = preprocess_sentences(train_data)
# Keep only the most frequent words so the run is faster; `occ` receives the counts.
occ = {}
train_data = select_top_occurs(train_data, occ)
# Prepare an empty neighbour dict and a matrix index per word, and count all tokens.
voc = {}
indexes = {}
counter = 0
num_words = 0
for sentence in train_data:
    for token in sentence:
        num_words += 1
        voc[token] = {}
        if token not in indexes:
            indexes[token] = counter
            counter += 1
# Fill `voc` with the weighted co-occurrences.
run_hal(train_data)
# A sparse matrix keeps the memory footprint small.
sparse_matrix = build_sparse_matrix(voc, indexes)
# Cosine similarity of every word against the pivot word.
context_word = 'washington'
res = compute_results(indexes, context_word, sparse_matrix)
# Show the best matches.
print_results(res)


https://github.com/geoffbacon/semrep/blob/366d5740a117f47cda73807a8b9e6b7cf1ca8138/semrep/models/hal/HAL.ipynb

https://github.com/fozziethebeat/S-Space/blob/master/src/main/java/edu/ucla/sspace/hal/HyperspaceAnalogueToLanguage.java

In [None]:
  /**
   * Processes one document token by token. For each focus word it collects
   * weighted co-occurrence counts with the up to {@code windowSize} words
   * that precede it, and adds the collected counts to
   * {@code cooccurrenceMatrix} once the whole document has been read.
   *
   * Only {@code prevWords} is passed to {@code addTokens}; {@code nextWords}
   * serves as a look-ahead buffer that supplies the next focus word.
   *
   * @param document reader over the text to tokenize
   * @throws IOException declared by the signature
   */
  public void  processDocument(BufferedReader document) throws IOException {
        Queue<String> nextWords = new ArrayDeque<String>();
        Queue<String> prevWords = new ArrayDeque<String>();

        Iterator<String> documentTokens =
            IteratorFactory.tokenizeOrdered(document);

        String focus = null;

        // Rather than updating the matrix every time an occurrence is seen,
        // keep a thread-local count of what needs to be modified in the matrix
        // and update after the document has been processed.  This saves
        // potential contention from concurrent writes.
        Map<Pair<Integer>,Double> matrixEntryToCount =
            new HashMap<Pair<Integer>,Double>();

        //Load the first windowSize words into the Queue
        for(int i = 0;  i < windowSize && documentTokens.hasNext(); i++)
            nextWords.offer(documentTokens.next());

        while(!nextWords.isEmpty()) {

            // Load the top of the nextWords Queue into the focus word
            focus = nextWords.remove();

            // Add the next word to nextWords queue (if possible)
            if (documentTokens.hasNext())
                nextWords.offer(documentTokens.next());

            // If the filter does not accept this word, skip the semantic
            // processing, continue with the next word
            if (!focus.equals(IteratorFactory.EMPTY_TOKEN)) {
                int focusIndex = termToIndex.getDimension(focus);
                // Only process co-occurrences with words with non-negative
                // dimensions.
                if (focusIndex >= 0) {
                    // in front of the focus word
                    // (the expression below simplifies to -prevWords.size())
                    int wordDistance = -windowSize + (windowSize - prevWords.size());
                    addTokens(prevWords, focusIndex, wordDistance, matrixEntryToCount);
                }
            }

            // last, put this focus word in the prev words and shift off the
            // front if it is larger than the window
            prevWords.offer(focus);
            if (prevWords.size() > windowSize)
                prevWords.remove();
        }

        // Once the document has been processed, update the co-occurrence matrix
        // accordingly.
        for (Map.Entry<Pair<Integer>,Double> e : matrixEntryToCount.entrySet()){
            Pair<Integer> p = e.getKey();
            cooccurrenceMatrix.addAndGet(p.x, p.y, e.getValue());
        }
    }

    '''
     * Adds co-occurrence counts between the list of previous words in {@code
     * words} and the focus word represented by {@code focusIndex} which start
     * at {@code distance} tokens away from the focus word.  All Counts will be
     * added into {@code matrixEntryToCount}.
    '''

# NOTE(review): unfinished Python port of the Java helper described in the
# comment above. It is not valid Python yet: `distance++;` is Java syntax,
# `index` is never assigned, `weighting` and `windowSize` are not defined in
# this notebook, `value` is computed but not used, and the parameters `i` and
# `j` are unused. How `index` should be derived from `word` is not shown here.
def addTokens(words, focusIndex, distance, matrix, i, j):
    for word in words:
        # skip adding co-occurrence values for words that are not accepted by the filter
        # Get the current number of times that the focus word has
        # co-occurred with this word before after it.  Weight the
        # word appropriately based on distance
        value = weighting.weight(distance, windowSize);
        matrix[index, focusIndex] += weighting.weight(distance, windowSize)
        distance++;


In [None]:
        
def sliding_window(seq, n):
    """Yield a centred window of width 2 * n + 1 for every element of `seq`.

    The sequence is padded with `n` None values on each side, so each window
    holds the `n` items before the element, the element itself and the `n`
    items after it. An empty `seq` yields nothing.
    """
    y = [None] * n + list(seq) + [None] * n
    # The centres run over the original elements only; the previous upper
    # bound `len(y) + n` produced 2 * n extra, truncated windows.
    for i in range(n, len(y) - n):
        yield y[i - n:i + n + 1]

def sliding_window_it(it, n):
    """Yield list windows over `it`, padded with `n` leading and 2 * n
    trailing None values.

    Each window is the concatenation of the `n` items kept behind, the
    n + 1 items kept ahead and the item just read from the padded stream.
    """
    padded = itertools.chain([None] * n, it, [None] * (n * 2))
    behind = tuple(itertools.islice(padded, n))
    ahead = tuple(itertools.islice(padded, n + 1))
    for incoming in padded:
        yield [*behind, *ahead, incoming]
        # Shift both buffers one position forward.
        behind = (*behind[1:], ahead[0])
        ahead = (*ahead[1:], incoming)

def sliding_window3(seq, n=2):
    """Yield tuple windows of width `n` over the items of `seq`.

    s -> (s0, s1, ..., s[n-1]), (s1, s2, ..., sn), ...
    Nothing is yielded when `seq` has fewer than `n` items.
    """
    source = iter(seq)
    window = tuple(itertools.islice(source, n))
    if len(window) == n:
        yield window
    for item in source:
        window = (*window[1:], item)
        yield window

In [5]:
import holoviews as hv
# NOTE(review): this fails with the AttributeError shown in the cell output,
# because holoviews has no attribute `GraphNode`. Presumably another element
# name was intended - confirm against the holoviews reference.
hv.help(hv.GraphNode)

AttributeError: module 'holoviews' has no attribute 'GraphNode'