In [5]:
def preprocess(text):
  """Tokenize a text and encode it as an array of integer word ids.

  The text is lower-cased, every '.' is detached from the preceding word
  (by inserting a space before it) and the result is split on single
  spaces. Ids are assigned in order of first appearance, starting at 0.

  Args:
    text: The input string.

  Returns:
    A tuple ``(corpus, word_to_id, id_to_word)`` where ``corpus`` is a
    NumPy array with one word id per token, ``word_to_id`` maps each
    distinct token to its id and ``id_to_word`` is the inverse mapping.
  """
  text = text.lower()
  text = text.replace('.', ' .')
  # Splitting on a literal ' ' (not arbitrary whitespace) means runs of
  # spaces produce empty-string tokens; kept as-is for compatibility.
  words = text.split(' ')

  # Both dictionaries must be local: relying on a same-named global would
  # either raise NameError or silently mutate state from another cell.
  word_to_id = {}
  id_to_word = {}

  for word in words:
    if word not in word_to_id:
      new_id = len(word_to_id)
      word_to_id[word] = new_id
      # Store the token itself so id_to_word is the true inverse mapping.
      id_to_word[new_id] = word

  corpus = np.array([word_to_id[w] for w in words])

  return corpus, word_to_id, id_to_word

In [6]:
def create_co_matrix(corpus, vocab_size, window_size=1):
  """Count how often each word id appears near each other word id.

  For every position in ``corpus`` the ids found up to ``window_size``
  positions to the left and to the right are counted as co-occurring
  with the id at that position.

  Args:
    corpus: Sequence of integer word ids; each id is used as a row and
      column index into the result, so it must be below ``vocab_size``.
    vocab_size: Number of rows and columns of the returned matrix.
    window_size: How many neighbours on each side are counted.

  Returns:
    A ``(vocab_size, vocab_size)`` int32 array whose entry ``[a, b]`` is
    the number of times id ``b`` occurred inside the window around ``a``.
  """
  n_tokens = len(corpus)
  counts = np.zeros((vocab_size, vocab_size), dtype=np.int32)

  for center, center_id in enumerate(corpus):
    for offset in range(1, window_size + 1):
      # Visit the left neighbour first, then the right one; positions
      # falling outside the corpus are simply skipped.
      for neighbour in (center - offset, center + offset):
        if 0 <= neighbour < n_tokens:
          counts[center_id, corpus[neighbour]] += 1

  return counts

In [7]:
def cos_similarity(x, y, eps=1e-8):
  """Return the cosine similarity of two vectors.

  Each vector is divided by its Euclidean length plus ``eps`` and the
  dot product of the two scaled vectors is returned. Adding ``eps`` to
  the length keeps the division defined for an all-zero vector.

  Args:
    x: First vector (NumPy array).
    y: Second vector (NumPy array).
    eps: Small constant added to each vector length before dividing.

  Returns:
    The dot product of the two scaled vectors.
  """
  def _scaled(vec):
    # Length is sqrt of the sum of squares; eps guards against 0 / 0.
    length = np.sqrt(np.sum(vec ** 2))
    return vec / (length + eps)

  return np.dot(_scaled(x), _scaled(y))

In [8]:
import sys
sys.path.append('..')
from common.util import preprocess, create_co_matrix, cos_similarity

# NOTE(review): the three functions called below are the ones imported from
# common.util in this cell; that import rebinds the same-named functions
# defined in the earlier cells (presumably identical - confirm).
text = 'You say goodbye and I say hello.'
# Encode the sentence as word ids and get the two vocabulary mappings.
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
# Co-occurrence matrix for the corpus, using the default window size.
C = create_co_matrix(corpus, vocab_size)

# Rows of C selected by the ids of 'you' and 'i'.
c0 = C[word_to_id['you']]
c1 = C[word_to_id['i']]
# Bare last expression: its value is shown as the cell output.
cos_similarity(c0, c1)

0.7071067691154799