In [60]:
import os
import nltk
import pickle
import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
# nltk.download('punkt')

In [61]:
embedding_size = 8192
nrows = 10000
window_size = 3
num_neg_samples = 3
WEIGHTS_PATH = './../data/weights/'

In [62]:
df = pd.read_csv('./../data/processed/processed_data_2_no_guttenberg.csv', nrows = nrows)
df = df.dropna()

result = ''
for row in df['sentences']:
    row += ' '
    result += row

# Tokenize the book
tokens = nltk.word_tokenize(result)

# Create the vocabulary
vocab = list(set(tokens))
vocab_size = len(vocab)

print("No. of tokens : ",len(tokens))
print("Vocabulary Length : ", len(vocab))

# Create the word-to-index and index-to-word dictionaries
word2idx = {w: i for i, w in enumerate(vocab)}
idx2word = {i: w for i, w in enumerate(vocab)}

No. of tokens :  33086
Vocabulary Length :  5171


In [63]:
def relu(z):
    return np.maximum(0.1,z)
def relu_prime(z):
    return np.where(z > 0, 1.0, 0.1)
def softmax(z):
    return np.exp(z) / np.sum(np.exp(z), axis=1, keepdims=True)
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def sigmoid_prime(z):
    return z * (1 - z)

def cosine_similarity(A, B):
    C = np.squeeze(np.asarray(A))
    D = np.squeeze(np.asarray(B))
    return np.dot(C, D) / (np.linalg.norm(C) * np.linalg.norm(D))

def get_embeddings(word):
    try:
        val = word2idx[word]
        input_arr = np.zeros((vocab_size), dtype=np.float32)
        input_arr[val] = 1
        emb = (input_arr @ net.weights[0] + net.biases[0])
        return emb
    except:
        return np.array([])
    
def get_similarity(word1, word2):
    w1_emb = get_embeddings(word1)
    w2_emb = get_embeddings(word2)
    return cosine_similarity(w1_emb, w2_emb)
    
def get_result(word1, word2, word3):
    w1_emb = get_embeddings(word1)
    w2_emb = get_embeddings(word2)    
    w3_emb = get_embeddings(word3)    
    if(w1_emb.any() and w2_emb.any() and w3_emb.any()):
        w4_emb = w1_emb + w3_emb - w2_emb
        output = softmax(w4_emb @ net.weights[1] + net.biases[1])
        ans = idx2word[np.argmax(output)]
        return ans
    return "UNK"

def get_accuracy(validation):
    accuracy = np.array([])
    for _, row in validation.iterrows():
        w1_emb = get_embeddings(row['word1'])
        w2_emb = get_embeddings(row['word2'])    
        w3_emb = get_embeddings(row['word3'])
        w4_emb = get_embeddings(row['word4'])
        
        if(w1_emb.any() and w2_emb.any() and w3_emb.any()) and w4_emb.any():
            pred_emb = w1_emb + w3_emb - w2_emb
            accuracy = np.append(accuracy, cosine_similarity(pred_emb, w4_emb))
    return np.average(accuracy)

def cross_entropy_loss(y_pred, y_true):
    return -np.mean(np.sum(y_true * np.log(y_pred), axis=-1))


def train(
    net, optimizer, lamda, max_epochs, dev_input, dev_target, batch_size):
    m = int(len(tokens)*0.8)

    for e in range(max_epochs):
        epoch_loss = 0.
        for i in range(0, m, batch_size):
            batch_input, batch_target = read_data(batch_size, i)
            pred = net(batch_input)

            # Compute gradients of loss w.r.t. weights and biases
            dW, db = net.backward(batch_input, batch_target, lamda)

            # Get updated weights based on current weights and gradients
            weights_updated, biases_updated = optimizer.step(net.weights, net.biases, dW, db)

            # Update model's weights and biases
            net.weights = weights_updated
            net.biases = biases_updated
            print(e, i, cross_entropy_loss(pred, batch_target))

        dev_pred = net(dev_input)
        indices = np.argpartition(dev_pred, -1, axis=1)[:, -1:]
        converted_matrix = np.zeros_like(dev_pred)
        converted_matrix[np.arange(dev_pred.shape[0])[:, np.newaxis], indices] = 1
        print('F1 Score on dev data: {:.5f}'.format(f1_score(dev_target, converted_matrix, average='micro')))

def read_data(batch_size, index):
    
    # Initialize the input and output arrays
    input_arr = np.zeros((batch_size, vocab_size), dtype=np.float32)
    output_arr = np.zeros((batch_size, vocab_size), dtype=np.float32)

    # Loop over each word in the tokens list
    if index + batch_size > len(tokens): k = len(tokens)
    else: k = index + batch_size
    for i in range(index, k):
        # Get the context words
        context_words = tokens[max(0, i - window_size): i] + tokens[i + 1: i + window_size + 1]
        context_indices = [word2idx[w] for w in context_words]

        # # Get the negative samples
        negative_indices = np.random.choice(len(vocab), num_neg_samples, replace=False)
        negative_indices = [idx for idx in negative_indices if idx not in context_indices]

        # # Update the input and output arrays
        input_arr[i - index, word2idx[tokens[i]]] = 1
        for ctx_idx in context_indices:
            output_arr[i - index, ctx_idx] = 1
        for neg_idx in negative_indices:
            output_arr[i - index, neg_idx] = 1
    train_input = input_arr.astype(np.int32)
    train_target = output_arr.astype(np.int32)


    return train_target, train_input

def dev_data():
    # Initialize the input and output arrays
    input_arr = np.zeros((int(len(tokens)*0.2) + 1, len(vocab)), dtype=np.float32)
    output_arr = np.zeros((int(len(tokens)*0.2) + 1, len(vocab)), dtype=np.float32)
    for i in range(int(len(tokens)*0.8), len(tokens)):
        # Get the context words
        context_words = tokens[max(0, i - window_size): i] + tokens[i + 1: i + window_size + 1]
        context_indices = [word2idx[w] for w in context_words]

        input_arr[i - int(len(tokens)*0.8), word2idx[tokens[i]]] = 1
        for ctx_idx in context_indices:
            output_arr[i - int(len(tokens)*0.8), ctx_idx] = 1
    train_input = input_arr.astype(np.int32)
    train_target = output_arr.astype(np.int32)
    return train_target, train_input

In [64]:
class Net(object):
    def __init__(self):
        if os.path.exists(WEIGHTS_PATH + 'cbow_biases_' + str(vocab_size) + '.pickle'):
            with open(WEIGHTS_PATH + 'cbow_biases_' + str(vocab_size) + '.pickle', "rb") as f:
                self.weights = pickle.load(f)
            with open(WEIGHTS_PATH + 'cbow_biases_' + str(vocab_size) + '.pickle', "rb") as f:
                self.biases = pickle.load(f)
        else:
            self.weights = [np.random.randn(vocab_size, embedding_size) / np.sqrt(embedding_size)]
            self.weights.append(np.random.randn(embedding_size, vocab_size) / np.sqrt(embedding_size))
            self.biases = [np.random.randn(embedding_size) / np.sqrt(embedding_size)]
            self.biases.append(np.random.randn(vocab_size) / np.sqrt(embedding_size))

    def __call__(self, X):
        a = (X @ self.weights[0] + self.biases[0])
        output = softmax(a @ self.weights[1] + self.biases[1])
        return output

    def backward(self, X, y, lamda):
        batch_size = len(X)
        a = (X @ self.weights[0] + self.biases[0])
        output = softmax(a @ self.weights[1] + self.biases[1])
        del_W = [np.zeros(w.shape) for w in self.weights]
        del_b = [np.zeros(b.shape) for b in self.biases]

        delta = output - y
        del_b[1] = np.sum(delta, axis = 0, keepdims = True)
        del_W[1] = a.T @ delta + lamda * (self.weights[1])

        delta = delta @ self.weights[1].T * (a)
        del_b[0] = np.sum(delta, axis = 0, keepdims = True)
        del_W[0] = X.T @ delta + lamda * (self.weights[0])
        return del_W, del_b


class Optimizer(object):
    '''
    '''
    def __init__(self, learning_rate, weights, biases):
        self.m_dw = [np.zeros(w.shape) for w in weights]
        self.m_db = [np.zeros(b.shape) for b in biases]
        self.v_dw = [np.zeros(w.shape) for w in weights]
        self.v_db = [np.zeros(b.shape) for b in biases]
        self.beta1 = 0.9
        self.beta2 = 0.999
        self.epsilon = 1e-8
        self.eta = learning_rate
        self.t = 0

    def step(self, weights, biases, delta_weights, delta_biases):
        self.t += 1

        self.m_dw = [self.beta1 * m + (1 - self.beta1) * del_w for m, del_w in zip(self.m_dw, delta_weights)]
        self.m_db = [self.beta1 * m + (1 - self.beta1) * del_b for m, del_b in zip(self.m_db, delta_biases)]
        self.v_dw = [self.beta2 * v + (1 - self.beta2) * (del_w**2) for v, del_w in zip(self.v_dw, delta_weights)]
        self.v_db = [self.beta2 * v + (1 - self.beta2) * (del_b**2) for v, del_b in zip(self.v_db, delta_biases)]

        # bias correction
        m_hat_dw = [m / (1 - self.beta1 ** self.t) for m in self.m_dw]
        v_hat_dw = [v / (1 - self.beta2 ** self.t) for v in self.v_dw]

        m_hat_db = [m / (1 - self.beta1 ** self.t) for m in self.m_db]
        v_hat_db = [v / (1 - self.beta2 ** self.t) for v in self.v_db]	

        # update weights and biases
        weights = [w - self.eta * m_hat / ((np.sqrt(v_hat) + self.epsilon)) for w, m_hat, v_hat in zip(weights, m_hat_dw, v_hat_dw)] 
        biases = [b - self.eta * m_hat / ((np.sqrt(v_hat) + self.epsilon)) for b, m_hat, v_hat in zip(biases, m_hat_db, v_hat_db)]

        return weights, biases


In [65]:
# Hyper-parameters 
max_epochs = 10
learning_rate = 0.01
lamda = 1 # Regularization Parameter
batch_size = 16192


net = Net()
optimizer = Optimizer(learning_rate, net.weights, net.biases)
dev_input, dev_target = dev_data()
train(net, optimizer, lamda, max_epochs, dev_input, dev_target, batch_size)

0 0 8.551934865338378
0 16192 7.956681470743348
F1 Score on dev data: 0.05455
1 0 7.54775618994377
1 16192 8.125862748326966
F1 Score on dev data: 0.11574
2 0 6.431450885334984
2 16192 5.662905938491707
F1 Score on dev data: 0.17860
3 0 4.682940031758605
3 16192 4.241846158073566
F1 Score on dev data: 0.23602
4 0 3.6918613236441082
4 16192 3.6437318248218014
F1 Score on dev data: 0.29556
5 0 3.1823407418542407
5 16192 3.5199231430059412
F1 Score on dev data: 0.27667
6 0 3.4096018553335874
6 16192 3.6825483603646805
F1 Score on dev data: 0.29601
7 0 3.8305430963804534
7 16192 3.922825709002432
F1 Score on dev data: 0.31006
8 0 4.132589193531979
8 16192 4.074528880526385
F1 Score on dev data: 0.31596
9 0 4.5136348736679075
9 16192 4.884191046286319
F1 Score on dev data: 0.31520


In [None]:
with open(WEIGHTS_PATH + 'cbow_weights_' + str(vocab_size) + '_' + str(embedding_size) + '.pickle', 'wb') as handle:
    pickle.dump(net.weights, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open(WEIGHTS_PATH + 'cbow_biases_' + str(vocab_size) + '_' + str(embedding_size) + '.pickle', 'wb') as handle:
    pickle.dump(net.biases, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Get Results

In [52]:
validation = pd.read_csv('./../data/Validation.txt', sep=' ', names=['word1','word2','word3','word4'])

validation['word1'] = validation['word1'].apply(lambda x : x.lower())
validation['word2'] = validation['word2'].apply(lambda x : x.lower())
validation['word3'] = validation['word3'].apply(lambda x : x.lower())
validation['word4'] = validation['word4'].apply(lambda x : x.lower())

validation['result'] = validation[['word1','word2','word3']].apply(lambda x : get_result(x['word1'], x['word2'], x['word3']), axis = 1)

print("Accuracy is :", get_accuracy(validation))
print("Coverage is :", validation[validation['result']!=""].shape[0]/991)
validation[validation['result']==validation['word4']]

Accuracy is : 0.9596947795349271
Coverage is : 0.6952573158425832


Unnamed: 0,word1,word2,word3,word4,result


In [53]:
value = validation[validation['result']!=""].shape[0]

In [54]:
validation

Unnamed: 0,word1,word2,word3,word4,result
0,walk,walks,see,sees,must
1,walk,walks,shuffle,shuffles,must
2,walk,walks,sing,sings,must
3,walk,walks,sit,sits,must
4,walk,walks,slow,slows,must
...,...,...,...,...,...
986,argentina,peso,nigeria,naira,man
987,argentina,peso,iran,rial,world
988,argentina,peso,japan,yen,world
989,india,rupee,iran,rial,world


In [None]:
validation.to_csv("./../data/results.csv",index=False)

In [None]:
word1 = 'king'
word2 = 'man'
word3 = 'queen'

print(get_result('king','man','queen'))

get_similarity('queen', 'woman')

In [59]:
validation = pd.read_csv('./../data/Analogy_dataset.txt', sep=' ', names=['word1','word2','word3','word4'])

validation['word1'] = validation['word1'].apply(lambda x : x.lower())
validation['word2'] = validation['word2'].apply(lambda x : x.lower())
validation['word3'] = validation['word3'].apply(lambda x : x.lower())
validation['word4'] = validation['word4'].apply(lambda x : x.lower())

validation['result'] = validation[['word1','word2','word3']].apply(lambda x : get_result(x['word1'], x['word2'], x['word3']), axis = 1)

print("Accuracy is :", get_accuracy(validation))
print("Coverage is :", validation[validation['result']!=""].shape[0]/991)
validation[validation['result']==validation['word4']]

Accuracy is : 0.977057841861813
Coverage is : 0.05247225025227043


Unnamed: 0,word1,word2,word3,word4,result


In [27]:
df = pd.read_csv('./../data/Validation.txt', sep=' ', names=['w1','w2','w3','w4'])
df2 = df[['w3','w4']].rename(columns={'w3':'w1', 'w4':'w2'})
df = pd.concat([df[['w1','w2']],df2], ignore_index=True)
df = df.drop_duplicates()

df['w1'] = df['w1'].apply(lambda x : x.lower())
df['w2'] = df['w2'].apply(lambda x : x.lower())

df['w1_present'] = df['w1'].apply(lambda x : x in result)
df['w2_present'] = df['w2'].apply(lambda x : x in result)

print(df['w1_present'].value_counts())
print(df['w2_present'].value_counts())

True     324
False      2
Name: w1_present, dtype: int64
True     299
False     27
Name: w2_present, dtype: int64


In [None]:
import pickle
with open('weights.pickle', 'wb') as handle:
    pickle.dump(net.weights, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('biases.pickle', 'wb') as handle:
    pickle.dump(net.biases, handle, protocol=pickle.HIGHEST_PROTOCOL)