In [51]:
import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import csv
import nltk
import random
from sklearn.model_selection import train_test_split
import csv
from sklearn.metrics.pairwise import cosine_similarity as similarity
from numpy.linalg import norm
import time

In [56]:
class Cbow(object):
    def __init__(self, train_data, learning_rate, epochs, context, k, batch):
        # k is the dimension of embedding layer.
        # so, embedding has k*1 dimension

        # d is dimension of word vectors (one-hot) -> d*1
        self.eta = learning_rate
        self.context = context
        self.d = len(set(train_data))
        self.epochs = epochs
        self.m = len(train_data)
        self.k = k
        self.batch = batch

        self.weights = []
        self.weights.append(np.random.randn(self.d, k))
        self.weights.append(np.random.randn(k, self.d))

        self.d_weights = []
        self.d_weights.append(np.random.randn(self.d, k))
        self.d_weights.append(np.random.randn(k, self.d))

        self.one_hot(train_data, self.d)

        self.train(train_data)

    def one_hot(self, tokens, d):
        # tokens has the work tokens after preprocessing
        # tokens = ['I', 'love', 'game', 'football'] from 'I love the game of football'

        # d is the number of unique words in tokens
        # d -> dimension of word vectors

        self.word_loc = {}
        self.words = {}
        loc = 0

        for i in tokens:
            try:
                temp = self.word_loc[i]
            except KeyError:
                self.word_loc[i] = loc
                # self.word_loc[i][loc] = 1
                self.words[loc] = i
                # print(loc, i)
                loc += 1

            if loc == d:
                break

    def create_input(self, train_data, J):
        # train_data is 1*n where n is the number of words
        # context is the context size

        # output is a m*d matrix -> X_train

        self.X_train = []
        self.Y_train = []

        x = [0] * self.d
        y = [0] * self.d

        for j in range(J, J + self.batch):
            if j >= self.m:
                break

            y[self.word_loc[train_data[j]]] = 1

            if j < self.context:
                for i in range(j):
                    x[self.word_loc[train_data[i]]] = 1
            else:
                for i in range(j - self.context, j):
                    x[self.word_loc[train_data[i]]] = 1

            if j > self.m - self.context - 1:
                for i in range(j + 1, self.m):
                    x[self.word_loc[train_data[i]]] = 1
            else:
                for i in range(j + 1, j + self.context + 1):
                    x[self.word_loc[train_data[i]]] = 1

            self.X_train.append(x)
            self.Y_train.append(y)

        # left = max(0, J-self.context)
        # right = min(J+self.context, self.m-1)

        # x = [0]*self.d
        # for i in range(left, right+1):
        #   x += self.word_loc[train_data[i]]
        # x -= self.word_loc[train_data[J]]

        # self.X_train.append(x)
        # self.Y_train.append(self.word_loc[train_data[J]])

        # for j in range(J+1, min(J+self.batch, self.m)):
        #   x -= self.word_loc[train_data[left]]
        #   left += 1
        #   right += 1
        #   x += self.word_loc[train_data[right]]
        #   x -= self.word_loc[j]

        #   self.X_train.append(x)
        #   self.Y_train.append(self.word_loc[train_data[j]])

        self.X_train = np.array(self.X_train)
        self.Y_train = np.array(self.Y_train)

    def forward(self, x):
        # x -> 1*d matrix
        # weights[0] -> d*k matrix
        # embedding -> 1*k matrix
        # weights[1] -> k*d matrix
        # output_net -> 1*d matrix
        # pred -> softmax(output_net) -> 1*d matrix

        # m -> number of words in train set
        # d -> dimension of word vectors

        self.embedding = np.matmul(x, self.weights[0])
        self.output_net = np.matmul(self.embedding, self.weights[1])
        # self.pred = np.exp(self.output_net)/np.sum(np.exp(self.output_net))
        self.pred = np.exp(self.output_net - np.max(self.output_net)) / np.sum(
            np.exp(self.output_net - np.max(self.output_net))
        )

    def backward(self, x, y):
        # loss -> m*d matrix

        self.loss = np.array(y) - self.pred

        # print(self.weights[1].T.shape)
        size = x.shape
        self.d_weights[1] = np.matmul(
            self.embedding.reshape(self.k, size[0]), self.loss.reshape(size[0], self.d)
        )
        self.d_weights[0] = np.matmul(
            x.reshape(self.d, size[0]),
            np.matmul(self.loss, self.weights[1].T).reshape(size[0], self.k),
        )

        self.weights[0] = self.weights[0] - self.eta * self.d_weights[0]
        self.weights[1] = self.weights[1] - self.eta * self.d_weights[1]

    def train(self, train_data):
        epochlosses = []
        for i in range(self.epochs):
            loss = 0
            j = 0
            while j < self.m:
                t0 = time.time()
                self.create_input(train_data, j)

                # print('Epoch: ', i, " Instance: ", j)

                t1 = time.time()
                self.forward(self.X_train)
                t2 = time.time()
                self.backward(self.X_train, self.Y_train)
                t3 = time.time()

                temp = 0
                # loss = -np.sum(self.Y_train * np.log(self.pred))

                j += self.batch
                t4 = time.time()
                
                loss += np.sum(self.loss)

#                 print("epoch: ",i + 1," batch: ",j,"loss: ",np.sum(self.loss),"t0, t1, t2, t3: ",t1 - t0,t2 - t1,t3 - t2,t4 - t3, "total time: ",t4 - t0, )
                print("epoch: ",i + 1," batch: ",j,"loss: ",np.sum(self.loss),"total time: ",t4 - t0)
            self.write_weight(i)
            epochlosses.append(loss)
        print(epochlosses)

    def predict(self, s):
        data = []
        for i in s:
            if(i not in train_data):
                print("Word not in training data")
                return
            
            vec = [0] * self.d
            vec[self.word_loc[i]] = 1
            self.forward(np.array(vec))
            data.append(self.pred)
        
        prediction = list(data[0] + data[1] - data[2])
        
        loc = prediction.index(max(prediction))

        print("Predicted word is: ", self.words[loc], " with probability: ", max(prediction))
        print("Similarity of", s[0], "and", s[1], "is:", self.get_similarity(data[0], data[1]))
        print("Similarity of", s[2], "and", self.words[loc], "is:", self.get_similarity(data[2], self.pred))
    
    def write_weight(self, i):
        file = open('epoch_'+str(i)+'_weight_0.csv', 'w', newline='')
        write = csv.writer(file)
        
        for j in range(self.d):
            write.writerow(list(self.weights[0][j]))
        file.close()
        
        file = open('epoch_'+str(i)+'_weight_1.csv', 'w', newline='')
        write = csv.writer(file)
        
        for j in range(self.k):
            write.writerow(list(self.weights[1][j]))
        file.close()
        
        print("Epoch", i, "weights are noted")

    def get_similarity(self, l):
        vec1 = [0] * self.d
        vec1[self.word_loc[i]] = 1
        
        vec2 = [0] * self.d
        vec2[self.word_loc[i]] = 1
        
        d = np.dot(vec1.T, vec2)
        d = d/(norm(vec1)*norm(vec2))
        
        return d[0][0]

In [57]:
train_data = open('Dataset_new.txt', 'r').read().split()
learning_rate = 0.0003
epochs = 5
context = 10
k = 150
batch_size = 100

In [58]:
l1 = [[1],[2],[3]]
l2 = [[9],[8],[7]]

def simi(l1, l2):
    d = np.dot(l1.T, l2)
    d = d/(norm(l1)*norm(l2))
    print(d)

simi(np.array(l1), np.array(l2))

[[0.88265899]]


In [59]:
model = Cbow(train_data, learning_rate, epochs, context, k, batch_size)

epoch:  1  batch:  100 loss:  8798.999999999984 total time:  0.4371325969696045
epoch:  1  batch:  200 loss:  8099.0000000000155 total time:  0.48169946670532227
epoch:  1  batch:  300 loss:  9198.999999999987 total time:  0.48748326301574707
epoch:  1  batch:  400 loss:  8598.999999999984 total time:  0.4963409900665283
epoch:  1  batch:  500 loss:  8398.999999999984 total time:  0.4750638008117676
epoch:  1  batch:  600 loss:  9298.999999999984 total time:  0.4809708595275879
epoch:  1  batch:  700 loss:  8698.999999999998 total time:  0.44937562942504883
epoch:  1  batch:  800 loss:  8498.999999999984 total time:  0.46777772903442383
epoch:  1  batch:  900 loss:  8699.000000000007 total time:  0.48575830459594727
epoch:  1  batch:  1000 loss:  8798.999999999996 total time:  0.4816164970397949
epoch:  1  batch:  1100 loss:  9598.999999999987 total time:  0.49791455268859863
epoch:  1  batch:  1200 loss:  9299.0 total time:  0.45111083984375
epoch:  1  batch:  1300 loss:  8698.9999999

  self.output_net = np.matmul(self.embedding, self.weights[1])
  self.pred = np.exp(self.output_net - np.max(self.output_net)) / np.sum(
  np.exp(self.output_net - np.max(self.output_net))


epoch:  4  batch:  8900 loss:  8998.999999999995 total time:  0.6571040153503418
epoch:  4  batch:  9000 loss:  8598.99999999999 total time:  0.6436007022857666
epoch:  4  batch:  9100 loss:  8798.999999999993 total time:  0.6542258262634277
epoch:  4  batch:  9200 loss:  8098.999999999987 total time:  0.6427736282348633
epoch:  4  batch:  9300 loss:  8699.0 total time:  0.672175407409668
epoch:  4  batch:  9400 loss:  8698.999999999984 total time:  0.657362699508667
epoch:  4  batch:  9500 loss:  7898.999999999988 total time:  0.6491003036499023
epoch:  4  batch:  9600 loss:  7599.0 total time:  0.6642787456512451
epoch:  4  batch:  9700 loss:  8899.0 total time:  0.6433353424072266
epoch:  4  batch:  9800 loss:  8798.999999999993 total time:  0.6665976047515869
epoch:  4  batch:  9900 loss:  8699.0 total time:  0.6650216579437256
epoch:  4  batch:  10000 loss:  8698.999999999993 total time:  0.6462423801422119
epoch:  4  batch:  10100 loss:  8198.999999999984 total time:  0.646608114

  self.pred = np.exp(self.output_net - np.max(self.output_net)) / np.sum(
  np.exp(self.output_net - np.max(self.output_net))


epoch:  4  batch:  11600 loss:  nan total time:  0.45589709281921387
epoch:  4  batch:  11700 loss:  nan total time:  0.4381697177886963
epoch:  4  batch:  11800 loss:  nan total time:  0.4601633548736572
epoch:  4  batch:  11900 loss:  nan total time:  0.4474623203277588
epoch:  4  batch:  12000 loss:  nan total time:  0.4713129997253418
epoch:  4  batch:  12100 loss:  nan total time:  0.48488950729370117
epoch:  4  batch:  12200 loss:  nan total time:  0.4787776470184326
epoch:  4  batch:  12300 loss:  nan total time:  0.4807322025299072
epoch:  4  batch:  12400 loss:  nan total time:  0.45871567726135254
epoch:  4  batch:  12500 loss:  nan total time:  0.48670172691345215
epoch:  4  batch:  12600 loss:  nan total time:  0.4897475242614746
epoch:  4  batch:  12700 loss:  nan total time:  0.48766636848449707
epoch:  4  batch:  12800 loss:  nan total time:  0.47017908096313477
epoch:  4  batch:  12900 loss:  nan total time:  0.48475027084350586
epoch:  4  batch:  13000 loss:  nan total

In [60]:
s = ['convenient', 'inconvenient', 'likely']
model.predict(s)

Predicted word is:  persuasion  with probability:  nan


TypeError: get_similarity() takes 2 positional arguments but 3 were given

In [61]:
model.get_similarity(['convenient', 'inconvenient'])

NameError: name 'i' is not defined