In [None]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import nltk
import random
from sklearn.model_selection import train_test_split
import time

In [None]:
class Skipgram1(object):
  """Skip-gram word-embedding model trained with a full softmax and mini-batch SGD.

  The network is one-hot input (d) -> linear embedding (k) -> linear output (d)
  -> softmax, where d is the vocabulary size and k the embedding size.

  NOTE: constructing an instance builds the vocabulary AND runs the complete
  training loop (see ``train``), printing one progress line per batch.

  Attributes set by the constructor:
    eta, context, epochs, k, batch -- the hyper-parameters passed in.
    d        -- number of distinct tokens in ``train_data``.
    m        -- total number of tokens in ``train_data``.
    weights  -- [W0 (d x k), W1 (k x d)].
    d_weights -- gradients of the last batch, same shapes as ``weights``.
    word_loc -- token -> vocabulary index.
    words    -- vocabulary index -> token.
  """

  def __init__(self, train_data, learning_rate, epochs, context, k, batch, seed=None):
    """Build the vocabulary, initialise the weights and train the model.

    Args:
      train_data: sequence of tokens (already pre-processed).
      learning_rate: SGD step size.
      epochs: number of passes over ``train_data``.
      context: number of words taken on each side of the centre word.
      k: embedding dimension.
      batch: number of centre words per mini-batch (must be >= 1).
      seed: optional int; when given the weights are drawn from a private
        ``RandomState(seed)`` so runs are reproducible. When None the global
        numpy random state is used, exactly as before.

    Raises:
      ValueError: if ``batch`` is smaller than 1 (the training loop would
        otherwise never advance).
    """
    if batch < 1:
      raise ValueError("batch must be >= 1, got {!r}".format(batch))

    self.eta = learning_rate
    self.context = context
    self.d = len(set(train_data))
    self.epochs = epochs
    self.m = len(train_data)
    self.k = k
    self.batch = batch

    # np.random (module) and RandomState both expose .randn
    rng = np.random if seed is None else np.random.RandomState(seed)
    self.weights = [rng.randn(self.d, k), rng.randn(k, self.d)]

    # Gradients are overwritten on every backward pass; zeros are enough here.
    self.d_weights = [np.zeros((self.d, k)), np.zeros((k, self.d))]

    self.one_hot(train_data, self.d)

    self.train(train_data)

  def one_hot(self, tokens, d):
    """Assign each distinct token an index in order of first appearance.

    Fills ``self.word_loc`` (token -> index) and ``self.words`` (index -> token).

    Args:
      tokens: sequence of tokens, e.g. ['I', 'love', 'game', 'football'].
      d: number of distinct tokens; scanning stops once d indices are assigned.
    """
    self.word_loc = {}
    self.words = {}

    for token in tokens:
      if token not in self.word_loc:
        loc = len(self.word_loc)
        self.word_loc[token] = loc
        self.words[loc] = token

      if len(self.word_loc) == d:
        break

  def create_input(self, train_data, J):
    """Build the one-hot inputs and multi-hot targets for one mini-batch.

    The batch covers centre positions J .. J+batch-1, clipped to the corpus.
    Results are stored in ``self.X_train`` and ``self.Y_train``, both arrays of
    shape (rows, d). Row r of ``Y_train`` has a 1 for every word that occurs
    within ``context`` positions of the centre word (the centre position itself
    is excluded).

    Args:
      train_data: the token sequence used to build the vocabulary.
      J: index of the first centre word of the batch.
    """
    stop = min(J + self.batch, self.m)
    n_rows = max(stop - J, 0)

    inputs = np.zeros((n_rows, self.d))
    targets = np.zeros((n_rows, self.d))

    for row, j in enumerate(range(J, stop)):
      inputs[row, self.word_loc[train_data[j]]] = 1

      # Context window, clipped at both ends of the corpus.
      lo = max(j - self.context, 0)
      hi = min(j + self.context + 1, self.m)
      for i in range(lo, hi):
        if i != j:
          targets[row, self.word_loc[train_data[i]]] = 1

    self.X_train = inputs
    self.Y_train = targets

  def forward(self, x):
    """Run the forward pass and store the intermediate results.

    Sets ``self.embedding`` (x @ W0), ``self.output_net`` (embedding @ W1) and
    ``self.pred`` (softmax of ``output_net``).

    Args:
      x: a single one-hot vector of length d, or a (batch, d) matrix of them.
    """
    x = np.asarray(x, dtype=float)
    self.embedding = np.matmul(x, self.weights[0])
    self.output_net = np.matmul(self.embedding, self.weights[1])

    # Softmax must be taken per example (last axis). Normalising over the whole
    # batch makes all rows share a single unit of probability mass.
    # Subtracting the row maximum keeps exp() from overflowing.
    shifted = self.output_net - np.max(self.output_net, axis=-1, keepdims=True)
    exps = np.exp(shifted)
    self.pred = exps / np.sum(exps, axis=-1, keepdims=True)

  def backward(self, x, y):
    """Compute gradients for the last forward pass and take one SGD step.

    Must be called right after ``forward`` on the same ``x``.

    Sets:
      self.loss       -- (batch, d) gradient of the cross-entropy w.r.t. the
                         logits: n_context * pred - y.
      self.batch_loss -- mean cross-entropy per centre word (float).
      self.d_weights  -- gradients w.r.t. W0 and W1 (summed over the batch).

    Args:
      x: (batch, d) one-hot inputs (a single length-d vector is accepted).
      y: (batch, d) multi-hot context targets, same leading shape as x.
    """
    x = np.atleast_2d(np.asarray(x, dtype=float))
    y = np.atleast_2d(np.asarray(y, dtype=float))
    pred = np.atleast_2d(self.pred)
    hidden = np.atleast_2d(self.embedding)

    # Loss is -sum_c log p(context_c | centre); its gradient w.r.t. the logits
    # is (number of context words) * pred - y.
    n_context = y.sum(axis=1, keepdims=True)
    self.loss = n_context * pred - y
    self.batch_loss = float(-np.sum(y * np.log(pred + 1e-12)) / max(x.shape[0], 1))

    # Transposes, not reshapes: reshape((B, k) -> (k, B)) only reinterprets the
    # buffer and mixes values from different examples.
    grad_w1 = np.matmul(hidden.T, self.loss)
    grad_w0 = np.matmul(x.T, np.matmul(self.loss, self.weights[1].T))
    self.d_weights[1] = grad_w1
    self.d_weights[0] = grad_w0

    # Gradient descent step.
    self.weights[0] = self.weights[0] - self.eta * grad_w0
    self.weights[1] = self.weights[1] - self.eta * grad_w1

  def train(self, train_data):
    """Run ``self.epochs`` passes of mini-batch SGD over ``train_data``.

    Prints one line per batch with the mean cross-entropy of that batch and the
    wall-clock time spent in input creation, forward, backward and bookkeeping.
    """
    for i in range(self.epochs):
      j = 0
      while j < self.m:
        t0 = time.time()
        self.create_input(train_data, j)
        t1 = time.time()
        self.forward(self.X_train)
        t2 = time.time()
        self.backward(self.X_train, self.Y_train)
        t3 = time.time()

        j += self.batch
        t4 = time.time()

        print('epoch: ', i+1, ' batch: ', j, 'loss: ', self.batch_loss, 't0, t1, t2, t3: ', t1-t0, t2-t1, t3-t2, t4-t3, 'total time: ', t4-t0)

  def predict(self, s):
    """Print the most probable context words for ``s`` and return all probabilities.

    Up to 2 * context words are printed (fewer if the vocabulary is smaller),
    most probable first; ties keep vocabulary order.

    Args:
      s: a token that was present in the training data.

    Returns:
      The length-d softmax output for ``s`` (also left in ``self.pred``).

    Raises:
      KeyError: if ``s`` is not in the vocabulary.
    """
    vec = np.zeros(self.d)
    vec[self.word_loc[s]] = 1
    self.forward(vec)

    probs = np.asarray(self.pred)
    n_top = min(2 * self.context, self.d)
    # Stable sort on the negated values == repeatedly taking the first maximum.
    ranked = np.argsort(-probs, kind="stable")[:n_top]
    for loc in ranked:
      print("Predicted word is: ", self.words[int(loc)], " with probability: ", probs[loc])
    return self.pred


In [None]:
# train_data = "The earth revolves around the sun. The moon revolves around the earth"
# # # train_data = "situation power rather much way disposition think little well disadvantage threaten alloy many enjoyment danger however present unperceived mean rank misfortune sorrow come gentle sorrow shape disagreeable consciousness miss taylor married miss taylor loss first brought grief wedding day beloved friend emma first sat mournful thought continuance wedding bride people go father left dine together prospect third cheer long even father compose sleep dinner usual sit think lose event every promise happiness friend mr weston man unexceptionable character easy fortune suitable age pleasant manner satisfaction consider self deny generous friendship always wish promote match black morning work want miss taylor would felt every hour every day recall past kindness kindness affection sixteen year taught played five year old devote power attach amuse health nurse various illness childhood large debt gratitude owe intercourse last seven year equal footing perfect unreserve soon follow isabella marriage left yet dearer tenderer recollection friend companion possess intelligent well inform useful gentle know way family interested concern peculiarly interested every pleasure every scheme one could speak every thought arose affection could never find fault bear change true friend go half mile emma aware great must difference mr weston half mile miss taylor house advantage natural domestic great danger suffer intellectual solitude dearly love father companion could meet conversation rational playful evil actual disparity age mr woodhouse married early much increase constitution habit valetudinarian life without activity mind body much old man way year though everywhere beloved friendliness heart amiable temper talent could recommend time sister though comparatively little remove matrimony settle london sixteen mile much beyond daily reach many long october november even must struggle hartfield christmas brought next visit isabella husband little child fill house 
give pleasant society highbury large populous village almost amount town hartfield spite separate lawn shrubbery name really belong afford equal woodhouses first consequence look many acquaintance place father universally civil one among could accepted lieu miss taylor even half day melancholy change emma could sigh wish impossible thing till father awoke make necessary cheerful spirit require support nervous man easily depressed fond every body use hat part hat change every kind matrimony origin change always disagreeable mean yet reconcile daughter marry could ever speak compassion though entirely match affection oblige part miss taylor habit gentle selfishness never able suppose people could feel differently much dispose think miss taylor do sad thing would great deal happier spent rest life hartfield emma smile chat cheerfully could keep thought tea come impossible say exactly say dinner poor miss taylor wish pity mr weston ever thought agree papa know mr weston good humour pleasant excellent man thoroughly"
# train_data = train_data.split()

In [None]:
# Load the whitespace-separated training corpus. The context manager closes the
# file handle as soon as the text has been read.
with open('/content/Dataset_new.txt', 'r') as corpus_file:
  train_data = corpus_file.read().split()

# Hyper-parameters passed to Skipgram1 in the next cell.
learning_rate = 0.03  # SGD step size
epochs = 2            # passes over the corpus
context = 5           # words taken on each side of the centre word
k = 100               # embedding dimension
batch_size = 100      # centre words per mini-batch

In [None]:
# Constructing the model also trains it; one progress line is printed per batch.
model = Skipgram1(
  train_data=train_data,
  learning_rate=learning_rate,
  epochs=epochs,
  context=context,
  k=k,
  batch=batch_size,
)

epoch:  1  batch:  100 loss:  954.0000000000005 t0, t1, t2, t3:  0.005074501037597656 0.18266558647155762 0.2710440158843994 4.291534423828125e-06 total time:  0.4587883949279785
epoch:  1  batch:  200 loss:  972.0000000000002 t0, t1, t2, t3:  0.011068344116210938 0.22922372817993164 0.2734055519104004 3.5762786865234375e-06 total time:  0.5137012004852295
epoch:  1  batch:  300 loss:  988.0000000000003 t0, t1, t2, t3:  0.011019468307495117 0.2350142002105713 0.265653133392334 4.5299530029296875e-06 total time:  0.5116913318634033
epoch:  1  batch:  400 loss:  986.0000000000003 t0, t1, t2, t3:  0.010630369186401367 0.22225499153137207 0.3051722049713135 5.0067901611328125e-06 total time:  0.538062572479248
epoch:  1  batch:  500 loss:  985.9999999999995 t0, t1, t2, t3:  0.01129460334777832 0.23186612129211426 0.2791104316711426 4.5299530029296875e-06 total time:  0.5222756862640381
epoch:  1  batch:  600 loss:  999.0000000000008 t0, t1, t2, t3:  0.011075258255004883 0.2277238368988037 

In [None]:
# Probe the trained model with a single word; predict() prints the most likely
# context words and the cell displays the returned probability vector.
test = "around"
model.predict(s=test)

Predicted word is:  strut  with probability:  1.0
Predicted word is:  persuasion  with probability:  0.0
Predicted word is:  jane  with probability:  0.0
Predicted word is:  austen  with probability:  0.0
Predicted word is:  chapter  with probability:  0.0
Predicted word is:  sir  with probability:  0.0
Predicted word is:  walter  with probability:  0.0
Predicted word is:  elliot  with probability:  0.0
Predicted word is:  kellynch  with probability:  0.0
Predicted word is:  hall  with probability:  0.0


array([0., 0., 0., ..., 0., 0., 0.])

In [None]:
# Total number of tokens in the training corpus.
n_tokens = len(train_data)
print(n_tokens)

1033136


In [None]:
# Export both weight matrices to CSV, one matrix row per CSV row.
# `csv` is already imported in the notebook's import cell.
w = model.weights
w0 = w[0]  # input -> embedding weights, model.d rows
w1 = w[1]  # embedding -> output weights, model.k rows

for path, matrix in (('skipgram_weights0.csv', w0), ('skipgram_weights1.csv', w1)):
  # `with` guarantees the file is flushed and closed even if writing fails.
  with open(path, 'w', newline='') as file:
    csv.writer(file).writerows(matrix)

In [None]:
def predict(s):
  """Answer the analogy "s[0] is to s[1] as s[2] is to ?" with the trained model.

  Each word is mapped to the model's softmax output for that word (a length
  ``model.d`` vector); words missing from the vocabulary get a constant 0.5
  vector. The answer is the vocabulary word with the largest value of
  ``vec(s[1]) - vec(s[0]) + vec(s[2])``.

  Relies on the notebook-level ``model`` (a trained Skipgram1) and overwrites
  its ``pred``/``embedding`` attributes through ``model.forward``.

  Args:
    s: sequence whose first three items are the words a, b, c.

  Returns:
    The predicted word (str). It is also printed.

  Raises:
    ValueError: if fewer than three words are given.
  """
  if len(s) < 3:
    raise ValueError("predict() needs three words, got {}".format(len(s)))

  data = []
  for word in s:
    # Dict lookup instead of scanning the whole token list for every word.
    if word not in model.word_loc:
      # No information about the word: use an uninformative constant vector.
      data.append(np.full(model.d, 0.5))
    else:
      vec = np.zeros(model.d)
      vec[model.word_loc[word]] = 1
      model.forward(vec)
      data.append(model.pred)

  # a : b :: c : ?  ->  b - a + c  (was a + b - c, which subtracts the query word)
  scores = data[1] - data[0] + data[2]
  best_word = model.words[int(np.argmax(scores))]

  print("Predicted word is:", best_word)
  return best_word

In [None]:
# Validation.txt holds whitespace-separated analogy quadruples "a b c d":
# the model is given (a, b, c) and scored on whether it returns d.
with open('Validation.txt', 'r') as valfile:
  contents = valfile.read().split()
print(contents)

# Only complete quadruples are scored; trailing leftover tokens are ignored
# instead of raising IndexError.
tw = len(contents) // 4
c = 0
for pair_no in range(tw):
  i = 4 * pair_no
  print("pair: ", pair_no + 1)
  s = [contents[i], contents[i+1], contents[i+2]]
  word = predict(s)
  if word == contents[i+3]:
    c = c + 1

# Guard against an empty validation file (would divide by zero).
accuracy = c / tw if tw else 0.0
print("Accuracy: ", accuracy, "Total matched:", c, "out of: ", tw)

['walk', 'walks', 'see', 'sees', 'walk', 'walks', 'shuffle', 'shuffles', 'walk', 'walks', 'sing', 'sings', 'walk', 'walks', 'sit', 'sits', 'walk', 'walks', 'slow', 'slows', 'walk', 'walks', 'speak', 'speaks', 'walk', 'walks', 'swim', 'swims', 'walk', 'walks', 'talk', 'talks', 'walk', 'walks', 'think', 'thinks', 'walk', 'walks', 'vanish', 'vanishes', 'work', 'works', 'write', 'writes', 'work', 'works', 'decrease', 'decreases', 'work', 'works', 'describe', 'describes', 'work', 'works', 'eat', 'eats', 'work', 'works', 'enhance', 'enhances', 'work', 'works', 'estimate', 'estimates', 'work', 'works', 'find', 'finds', 'work', 'works', 'generate', 'generates', 'think', 'thinks', 'say', 'says', 'think', 'thinks', 'scream', 'screams', 'think', 'thinks', 'search', 'searches', 'think', 'thinks', 'see', 'sees', 'think', 'thinks', 'shuffle', 'shuffles', 'think', 'thinks', 'sing', 'sings', 'think', 'thinks', 'sit', 'sits', 'think', 'thinks', 'slow', 'slows', 'think', 'thinks', 'speak', 'speaks', 'th

In [None]:
# Classic analogy probe: "king" is to "queen" as "man" is to ?
s = "king queen man".split()
predict(s)

Predicted word is: strut


'strut'