In [31]:
import itertools
from glove import Corpus
import pandas as pd
    
# See http://www.foldl.me/2014/glove-python/
def compute_GloVe_df(sentences, window=2, dictionary=None):
    """Fit a python-glove ``Corpus`` and return its co-occurrence matrix as a labelled DataFrame.

    Parameters
    ----------
    sentences : iterable of token sequences
        Forwarded unchanged to ``Corpus.fit``.
    window : int, default 2
        Forwarded to ``Corpus.fit`` as ``window``.
    dictionary : dict, optional
        Token -> id mapping forwarded to ``Corpus``; ``None`` is passed through as-is.

    Returns
    -------
    pandas.DataFrame
        The transposed dense co-occurrence matrix. Rows (index named ``'word'``)
        and columns are both labelled with the tokens, ordered by token id.
    """
    corpus = Corpus(dictionary=dictionary)
    corpus.fit(sentences, window=window)

    # Dense, transposed copy of the fitted matrix.
    dense_transposed = corpus.matrix.todense().T

    # Tokens ordered by their integer id, so position k carries the label of id k
    # (assumes ids are the contiguous range 0..n-1, as the original code did).
    id2token = [
        token
        for token, _ in sorted(corpus.dictionary.items(), key=lambda item: item[1])
    ]

    # Label the rows through the index directly. Routing the labels through a
    # temporary 'word' column (assign + set_index) would overwrite, and then
    # drop, the co-occurrence column of a token that is literally named 'word'.
    return pd.DataFrame(
        dense_transposed,
        index=pd.Index(id2token, name='word'),
        columns=id2token,
    )

# Create sorted dictionary to make HAL comparison easier
def create_sorted_dictionary(sentences):
    """Map each distinct token found in *sentences* to its rank in sorted order.

    Parameters
    ----------
    sentences : iterable of iterables
        Each inner iterable yields the tokens of one sentence.

    Returns
    -------
    dict
        ``{token: position}`` where ``position`` is the token's index in the
        ascending sort of all distinct tokens (0-based).
    """
    vocabulary = {token for sentence in sentences for token in sentence}
    return {token: rank for rank, token in enumerate(sorted(vocabulary))}
# Demo input: one sentence, title-cased and split on whitespace.
sentences = [
    "The Horse Raced Past The Barn Fell".title().split(),
]

# Token ids follow the sorted token order, so rows/columns of the table come out sorted.
dictionary = create_sorted_dictionary(sentences)

df = compute_GloVe_df(sentences, dictionary=dictionary, window=5)
df



Unnamed: 0_level_0,Barn,Fell,Horse,Past,Raced,The
word,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Barn,0.0,0.0,0.0,0.0,0.0,0.0
Fell,1.0,0.0,0.0,0.0,0.0,0.0
Horse,0.25,0.2,0.0,0.0,0.0,0.0
Past,0.5,0.333333,0.5,0.0,0.0,0.0
Raced,0.333333,0.25,1.0,1.0,0.0,0.0
The,1.2,0.5,1.333333,1.333333,1.0,0.0


In [39]:
# Glove CO-OCCURRENCE (as implemented in python-glove):
#  The counts are ALWAYS FORWARD, i.e. the window is only added to the words that follow; the values are ABSOLUTE counts
#  Added increment for each pair = 1 / distance between the two words
#  NO normalization

import pandas as pd
import numpy as np
import glove
from nltk.tokenize import word_tokenize

pd.options.display.precision = 2
window = 4
raw_docs = [
    'one two two one two two one two two one two two',
    'one two two one two two one two two one two two',
    #'This is the first document.',
    #'This document is the second document.',
    #'And this is the third one.',
    #'Is this the first document?',
]

# Tokenise each document, lower-case the tokens and drop single-character tokens.
docs = [
    [w.lower() for w in word_tokenize(doc) if len(w) > 1] for doc in raw_docs
]

model = glove.Corpus()
model.fit(docs, window=window)

# Add the transpose so the displayed table is symmetric.
X = model.matrix + model.matrix.T
T = len(model.dictionary)
id2token = {i: w for w, i in model.dictionary.items()}

# Order the labels by token id explicitly instead of relying on the insertion
# order of the dictionary, and label the rows through the index directly:
# adding a temporary 'word' column would overwrite the co-occurrence column of
# a token that is literally named 'word'.
id_ordered_tokens = [id2token[i] for i in sorted(id2token)]

df = pd.DataFrame(
    data=X.todense(),
    index=pd.Index(id_ordered_tokens, name='word'),
    columns=id_ordered_tokens,
)
df

[(0, 17), (1, 1), (2, 1), (3, 10), (4, 2), (5, 1), (6, 3), (7, 1), (8, 2), (9, 2), (10, 1), (11, 3), (12, 1), (13, 4), (14, 2), (15, 1), (16, 6), (17, 90), (18, 2), (19, 2), (20, 2), (21, 2), (22, 1), (23, 2), (24, 1), (25, 2), (26, 1), (27, 28), (28, 1), (29, 1), (30, 15), (31, 3), (32, 21), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 1), (39, 2), (40, 23), (41, 4), (42, 1), (43, 1), (44, 18), (45, 1), (46, 1), (47, 1), (48, 5), (49, 2), (50, 2), (51, 2), (52, 3), (53, 21), (54, 1), (55, 1), (56, 3), (57, 1), (58, 2), (59, 1), (60, 3), (61, 1), (62, 1), (63, 2), (64, 1), (65, 1), (66, 2), (67, 1), (68, 1), (69, 3), (70, 2), (71, 1), (72, 1), (73, 1), (74, 1), (75, 1), (76, 1), (77, 16), (78, 1), (79, 1), (80, 1), (81, 11), (82, 1), (83, 1), (84, 1), (85, 1), (86, 1), (87, 7), (88, 7), (89, 1), (90, 2), (91, 1), (92, 1), (93, 2), (94, 1), (95, 27), (96, 14), (97, 1), (98, 1), (99, 1), (100, 1), (101, 3), (102, 2), (103, 15), (104, 1), (105, 1), (106, 1), (107, 1), (108, 2), (109,