In [2]:
import pandas as pd
import numpy as np

In [7]:
from collections import Counter

In [3]:
from sklearn.neighbors import NearestNeighbors

In [9]:
# Toy corpus: every sentence is whitespace-tokenised into a list of words.
corpus = [
    sentence.split()
    for sentence in (
        "knowing the name of something is different from knowing something",
        "knowing something about everything is alright",
    )
]

In [11]:
# Token frequencies over the whole corpus. Iterating over every sentence
# (instead of indexing corpus[0] and corpus[1]) keeps the count correct
# for a corpus of any length, including a single sentence.
V = Counter(token for sentence in corpus for token in sentence)
V.most_common()

[('knowing', 3),
 ('something', 3),
 ('is', 2),
 ('the', 1),
 ('name', 1),
 ('of', 1),
 ('different', 1),
 ('from', 1),
 ('about', 1),
 ('everything', 1),
 ('alright', 1)]

In [12]:
# Size of the vocabulary (number of distinct tokens), not the number of tokens in the corpus
len(V)

11

In [13]:
# Stop words to exclude from the vocabulary.
to_drop = "of the alright about from".split()
for t in to_drop:
    # pop() with a default is a no-op for absent keys on both Counter and
    # plain dict, so this cell stays re-runnable after V has been rebuilt
    # as an ordinary dict (where `del V[t]` would raise KeyError).
    V.pop(t, None)

In [16]:
# Vocabulary words that remain after dropping the stop words, in alphabetical order
sorted(V)

['different', 'everything', 'is', 'knowing', 'name', 'something']

In [19]:
# Rebuild V as a plain dict whose insertion order is alphabetical by word.
# Keys are unique, so sorting the (word, count) pairs orders by word only.
V = dict(sorted(V.items()))
V

{'different': 1,
 'everything': 1,
 'is': 2,
 'knowing': 3,
 'name': 1,
 'something': 3}

In [21]:
# Remove every stop word from each sentence. Slice assignment mutates the
# existing token lists in place, so `corpus` keeps referring to the same objects.
for sentence in corpus:
    sentence[:] = [token for token in sentence if token not in to_drop]
            

In [22]:
# Display the tokenised sentences as they now stand
corpus

[['knowing', 'name', 'something', 'is', 'different', 'knowing', 'something'],
 ['knowing', 'something', 'everything', 'is']]

In [66]:
def co_occurence(word, context, window_size, corpus):
    """Count how often `context` appears within `window_size` tokens of `word`.

    For every occurrence of `word` in every sentence, the tokens up to
    `window_size` positions to its left and right are inspected and each
    appearance of `context` among them adds one to the total. The centre
    token itself is never counted, so ``co_occurence(w, w, ...)`` only
    reports *other* occurrences of ``w`` nearby. Windows are clipped at the
    sentence boundaries and never span two sentences.

    Parameters
    ----------
    word : hashable
        Target token.
    context : hashable
        Token looked for around each occurrence of `word`.
    window_size : int
        Number of positions inspected on each side of `word`; must be >= 0.
    corpus : iterable of sequences
        Tokenised sentences; each sentence must support slicing and ``count``.

    Returns
    -------
    int
        Total number of (word occurrence, neighbouring context occurrence) pairs.

    Raises
    ------
    ValueError
        If `window_size` is negative (a negative size would otherwise wrap
        around through negative slice indices and produce meaningless counts).
    """
    if window_size < 0:
        raise ValueError("window_size must be non-negative")

    n_occur = 0
    for sentence in corpus:
        for index, token in enumerate(sentence):
            if token != word:
                continue
            # Left and right neighbours are sliced separately so the centre
            # position is excluded; slicing past the end is clipped by Python.
            left = sentence[max(0, index - window_size):index]
            right = sentence[index + 1:index + window_size + 1]
            n_occur += left.count(context) + right.count(context)
    return n_occur

In [35]:
# Display the vocabulary counts; these keys label the rows and columns of the co-occurrence table
V

{'different': 1,
 'everything': 1,
 'is': 2,
 'knowing': 3,
 'name': 1,
 'something': 3}

In [67]:
# Square word-by-word table of integer zeros, labelled by the vocabulary
# on both axes; it is filled with co-occurrence counts afterwards.
C = pd.DataFrame(
    data=np.zeros((len(V),) * 2, dtype=int),
    index=list(V),
    columns=list(V),
)
C

Unnamed: 0,different,everything,is,knowing,name,something
different,0,0,0,0,0,0
everything,0,0,0,0,0,0
is,0,0,0,0,0,0
knowing,0,0,0,0,0,0
name,0,0,0,0,0,0
something,0,0,0,0,0,0


In [69]:
# Fill every off-diagonal cell with the co-occurrence count for a window of
# one token on each side; the diagonal (a word paired with itself) stays zero.
for word in V:
    for context in V:
        if word == context:
            continue
        C.loc[word, context] = co_occurence(word, context, window_size=1, corpus=corpus)
C

Unnamed: 0,different,everything,is,knowing,name,something
different,0,0,1,1,0,0
everything,0,0,1,0,0,1
is,1,1,0,0,0,1
knowing,1,0,0,0,1,2
name,0,0,0,1,0,1
something,0,1,1,2,1,0


In [70]:
# Co-occurrence counts as a bare NumPy array (row/column labels dropped).
X = C.to_numpy()

In [71]:
# Display the raw co-occurrence count matrix
X

array([[0, 0, 1, 1, 0, 0],
       [0, 0, 1, 0, 0, 1],
       [1, 1, 0, 0, 0, 1],
       [1, 0, 0, 0, 1, 2],
       [0, 0, 0, 1, 0, 1],
       [0, 1, 1, 2, 1, 0]])

In [72]:
# Element-wise symmetry check: an all-True result means X equals its transpose
X == X.T

array([[ True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True],
       [ True,  True,  True,  True,  True,  True]])

In [73]:
# Number of non-zero entries in the co-occurrence matrix
(X != 0).sum()

16

In [79]:
# Scale every row of X to unit Euclidean (L2) length; keepdims keeps the
# norms as a column vector so the division broadcasts row by row.
# NOTE: an all-zero row (a word with no co-occurrences) would divide by zero.
X_normed = X / np.linalg.norm(X, axis=1, keepdims=True)

In [80]:
# X_normed is X with each row divided by its L2 norm, so the product of
# X_normed with its transpose holds the pairwise cosine similarities of the rows of X.
pd.DataFrame(np.matmul(X_normed, X_normed.T), index=list(V), columns=list(V))

Unnamed: 0,different,everything,is,knowing,name,something
different,1.0,0.5,0.0,0.0,0.5,0.801784
everything,0.5,1.0,0.408248,0.57735,0.5,0.267261
is,0.0,0.408248,1.0,0.707107,0.408248,0.218218
knowing,0.0,0.57735,0.707107,1.0,0.57735,0.154303
name,0.5,0.5,0.408248,0.57735,1.0,0.534522
something,0.801784,0.267261,0.218218,0.154303,0.534522,1.0


In [81]:
# Sanity check: the L2 norm of each row of X_normed
np.linalg.norm(X_normed, axis=1)

array([1., 1., 1., 1., 1., 1.])

In [82]:
def pmi(word, context, corpus, window_size=1):
    """Pointwise mutual information between `word` and `context`.

    Computes ``log2(count * N / (count_word * count_context))`` where

    * ``count`` is ``co_occurence(word, context, window_size, corpus)``,
    * ``count_word`` / ``count_context`` are the total occurrences of each
      token in `corpus`,
    * ``N`` is the number of unordered token pairs that lie within
      `window_size` positions of each other inside a sentence.

    ``N`` is derived from `corpus` and `window_size` rather than being
    hard-coded, so the function works for any corpus. For the two-sentence
    corpus used in this notebook with ``window_size=1`` it evaluates to 9.

    Parameters
    ----------
    word, context : hashable
        The two tokens to compare.
    corpus : sequence of sequences
        Tokenised sentences; it is iterated more than once, and each
        sentence must support ``len`` and ``count``.
    window_size : int, default 1
        Half-width of the co-occurrence window.

    Returns
    -------
    numpy.float64
        The PMI in bits; ``-inf`` (with a NumPy warning) when the two tokens
        never co-occur.

    Raises
    ------
    ZeroDivisionError
        If `word` or `context` does not occur in `corpus`.
    """
    count = co_occurence(word, context, window_size, corpus)
    count_context = sum(sentence.count(context) for sentence in corpus)
    count_word = sum(sentence.count(word) for sentence in corpus)
    # A sentence of length n has max(n - d, 0) token pairs at distance d.
    n_pairs = sum(
        max(len(sentence) - distance, 0)
        for sentence in corpus
        for distance in range(1, window_size + 1)
    )
    return np.log2(count * n_pairs / count_context / count_word)
pmi('knowing', 'something', corpus)

1.0

In [83]:
# Same pair with the roles of word and context swapped
pmi('something', 'knowing', corpus)

1.0

In [84]:
# Display the co-occurrence count table again for reference
C

Unnamed: 0,different,everything,is,knowing,name,something
different,0,0,1,1,0,0
everything,0,0,1,0,0,1
is,1,1,0,0,0,1
knowing,1,0,0,0,1,2
name,0,0,0,1,0,1
something,0,1,1,2,1,0


In [86]:
# np.linalg.svd returns the right singular vectors as the ROWS of its third
# result, so v[0, :] is the one paired with the largest singular value.
u, sig, v = np.linalg.svd(X_normed)
# Leading rank-1 term of the decomposition: sig[0] * u_0 * v_0^T.
rank1 = sig[0] * (u[:, 0][:, np.newaxis] * v[0, :][np.newaxis, :])

In [94]:
# Label the rank-1 matrix with the vocabulary; values are rounded to 3 decimals.
R1 = pd.DataFrame(np.round(rank1, 3), index=list(V), columns=list(V))

In [95]:
# Display the labelled rank-1 table
R1

Unnamed: 0,different,everything,is,knowing,name,something
different,0.138,0.131,0.266,0.331,0.116,0.438
everything,0.163,0.155,0.314,0.391,0.137,0.517
is,0.133,0.127,0.257,0.32,0.112,0.424
knowing,0.151,0.143,0.291,0.362,0.127,0.479
name,0.178,0.169,0.343,0.427,0.149,0.565
something,0.145,0.138,0.28,0.348,0.122,0.461


In [96]:
# Gram matrix (row-by-row dot products) of the rank-1 table.
# NOTE: this uses R1, i.e. the values already rounded to 3 decimals, not rank1.
XXT = pd.DataFrame(np.matmul(R1.values, R1.values.T), index=list(V), columns=list(V))

In [97]:
# Display the Gram matrix of the rank-1 table
XXT

Unnamed: 0,different,everything,is,knowing,name,something
different,0.421822,0.498082,0.407977,0.461333,0.544032,0.443826
everything,0.498082,0.588129,0.481734,0.544736,0.642386,0.524064
is,0.407977,0.481734,0.394587,0.446191,0.526176,0.429259
knowing,0.461333,0.544736,0.446191,0.504545,0.59499,0.485398
name,0.544032,0.642386,0.526176,0.59499,0.701649,0.572411
something,0.443826,0.524064,0.429259,0.485398,0.572411,0.466978
