We will build the Skip-gram and CBOW models from scratch and train them on a relatively small corpus, namely the BBC news data set.

In [1]:
import numpy as np

import tensorflow as tf
import operator
from tensorflow import keras
import keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape, Lambda
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing import sequence
from sklearn.metrics.pairwise import cosine_distances

from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors as nn
from matplotlib import pylab
import pandas as pd
from gensim.models import Word2Vec
from tensorflow.keras.models import load_model



In [2]:
# Load the BBC data set; the printed frame below shows its two columns (category, text)
df = pd.read_csv('bbc-text.csv')
print(df)
# Raw article texts. The sentence list itself is built in the preprocessing cell,
# so no placeholder `sentences` value is created here.
articles = list(df['text'])

           category                                               text
0              tech  tv future in the hands of viewers with home th...
1          business  worldcom boss  left books alone  former worldc...
2             sport  tigers wary of farrell  gamble  leicester say ...
3             sport  yeading face newcastle in fa cup premiership s...
4     entertainment  ocean s twelve raids box office ocean s twelve...
...             ...                                                ...
2220       business  cars pull down us retail figures us retail sal...
2221       politics  kilroy unveils immigration policy ex-chatshow ...
2222  entertainment  rem announce new glasgow concert us band rem h...
2223       politics  how political squabbles snowball it s become c...
2224          sport  souness delight at euro progress boss graeme s...

[2225 rows x 2 columns]


## Skip Gram

In [3]:
%%time

sentences = []

# Split the first 200 articles into sentences on '.'
for article in articles[:200]:
    sentences += article.split('.')

# Keep only sentences that contain at least 12 spaces (roughly 13 or more words);
# shorter fragments are dropped
corpus = [sentence for sentence in sentences if sentence.count(" ") >= 12]

# Remove punctuation in text and fit tokenizer on entire corpus.
# The backslash is written as '\\' because '\]' is not a valid escape sequence;
# the resulting filter string is unchanged.
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'+"'")
tokenizer.fit_on_texts(corpus)

# Convert text to sequence of integer values
corpus = tokenizer.texts_to_sequences(corpus)
n_samples = sum(len(s) for s in corpus) # Total number of tokens in the corpus
V = len(tokenizer.word_index) + 1 # Number of unique words plus one (word indices start at 1)

CPU times: user 82.6 ms, sys: 4.06 ms, total: 86.7 ms
Wall time: 84.7 ms


In [4]:
# Number of kept sentences, total number of tokens, and V (= number of distinct words + 1)
len(corpus), n_samples, V

(3172, 70848, 9088)

In [5]:
# Example of how word to integer mapping looks like in the tokenizer
print(list((tokenizer.word_index.items()))[:5])

[('the', 1), ('to', 2), ('of', 3), ('and', 4), ('a', 5)]


In [6]:

# Parameters
window_size = 2  # number of context positions taken on each side of a word
window_size_corpus = 4  # NOTE(review): not referenced by any cell visible in this notebook

# Set the NumPy and TensorFlow seeds for reproducible results. Seeding NumPy alone
# is not enough: the Keras weight initialisers draw from TensorFlow's generator.
np.random.seed(42)
tf.random.set_seed(42)


In [7]:

# Prepare data for the skipgram model
# The function returns two arrays: all_in, which contains the target words, and 
# all_out, which contains the corresponding one-hot encoded context words.

def generate_data_skipgram(corpus, window_size, V):
    """Build (target word, context word) training pairs for the skip-gram model.

    Parameters
    ----------
    corpus : iterable of sequences of int
        Tokenised sentences; every sentence is a sequence of word indices.
    window_size : int
        Number of positions to look at on each side of the target word.
    V : int
        Number of classes used for the one-hot encoding of the context word.

    Returns
    -------
    tuple of (np.ndarray, np.ndarray)
        `all_in` holds the target word index of every pair; `all_out` holds the
        one-hot encoding of the matching context word, one row per pair. A target
        word is repeated once for each context word that falls inside the sentence.
    """
    all_in = []
    all_out = []
    for words in corpus:
        L = len(words)
        for index, word in enumerate(words):
            # Positions index - window_size .. index + window_size, excluding the
            # target itself and anything outside the sentence (no padding is used)
            for i in range(index - window_size, index + window_size + 1):
                if i != index and 0 <= i < L:
                    # Add the input word
                    all_in.append(word)
                    # Add one-hot of the context word
                    all_out.append(to_categorical(words[i], V))

    return (np.array(all_in), np.array(all_out))

In [8]:
%%time

# Create training data
X_skip, y_skip = generate_data_skipgram(corpus, window_size, V)
X_skip.shape, y_skip.shape

CPU times: user 1.57 s, sys: 3.06 s, total: 4.63 s
Wall time: 5.84 s


((264360,), (264360, 9088))

In [9]:
print(X_skip[0:13])
print(y_skip[0:13])

[205 205 385 385 385   6   6   6   6   1   1   1   1]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [10]:
%%time

# Create skipgram architecture: Embedding -> Reshape -> Dense(softmax)

dim = 300
# List of models that the training and export cells iterate over;
# this cell appends a single model with `dim`-dimensional embeddings
skipgram_models = []

# Initialize a Keras Sequential model
skipgram = Sequential()

# Add an Embedding layer with one `dim`-dimensional vector per word index (V rows);
# every input sample is a single word index (input_length=1)
skipgram.add(Embedding(input_dim=V,
                        output_dim=dim,
                        input_length=1,
                        embeddings_initializer='glorot_uniform'))

# Add a Reshape layer, which reshapes the output of the embedding layer (1,dim) to (dim,)
skipgram.add(Reshape((dim, )))

# Add a final Dense layer with V softmax outputs, one per class of the one-hot targets
skipgram.add(Dense(V, activation='softmax', kernel_initializer='glorot_uniform'))

# Compile the model with categorical cross-entropy, which matches the one-hot targets.
# NOTE: the original note here says the paper used Adagrad; this cell compiles with Adam.
skipgram.compile(optimizer=keras.optimizers.Adam(),
                loss='categorical_crossentropy',
                metrics=['accuracy'])

skipgram.summary()
print("")
skipgram_models.append(skipgram)



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 300)            2726400   
                                                                 
 reshape (Reshape)           (None, 300)               0         
                                                                 
 dense (Dense)               (None, 9088)              2735488   
                                                                 
Total params: 5,461,888
Trainable params: 5,461,888
Non-trainable params: 0
_________________________________________________________________

CPU times: user 63.9 ms, sys: 45.8 ms, total: 110 ms
Wall time: 94.7 ms


In [11]:
%%time


# Train every model in skipgram_models for 10 epochs with mini-batches of 64 pairs.
# The loop variable rebinds the global name `skipgram` to the model trained last.
for skipgram in skipgram_models:
    skipgram.fit(X_skip, y_skip, batch_size=64, epochs=10, verbose=1)
    print("")

Epoch 1/10
   1/4131 [..............................] - ETA: 10:28 - loss: 9.1146 - accuracy: 0.0000e+00

2023-11-30 01:46:38.885518: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

CPU times: user 36min 14s, sys: 9min 56s, total: 46min 11s
Wall time: 9min 10s


In [12]:

for skipgram in skipgram_models:
    # Export the embeddings of each skipgram model to a text file named after
    # the embedding width (vectors_skipgram_<dim>.txt)
    weights = skipgram.get_weights()

    # Get the embedding matrix (first weight array of the model)
    embedding = weights[0]

    # Create columns for the words and the values in the matrix, makes it easier to read as dataframe
    columns = ["word"] + [f"value_{i+1}" for i in range(embedding.shape[1])]

    # The context manager closes the file even if a write fails; utf-8 is set
    # explicitly because the cells that read this file back open it as utf-8
    with open(f"vectors_skipgram_{len(embedding[0])}.txt", "w", encoding="utf-8") as f:
        # Header row with the column names
        f.write(" ".join(columns))
        f.write("\n")

        # One row per word: the word followed by its embedding values
        for word, i in tokenizer.word_index.items():
            f.write(word)
            f.write(" ")
            f.write(" ".join(map(str, list(embedding[i,:]))))
            f.write("\n")

In [13]:
skipgram.get_weights()[0]

array([[-0.00214251, -0.0121221 , -0.01510321, ..., -0.00877267,
         0.01666906,  0.01842307],
       [-0.02199289,  0.03579663, -0.08151455, ...,  0.09725461,
        -0.1169652 , -0.03868316],
       [ 0.03874718,  0.0458426 , -0.2206451 , ..., -0.22560804,
        -0.02905687,  0.10799085],
       ...,
       [-0.17046127,  0.25113133,  0.0097746 , ..., -0.01196183,
         0.06048154, -0.04334079],
       [-0.22128603,  0.19090584, -0.31348667, ..., -0.06036747,
        -0.00521338, -0.0671095 ],
       [-0.3874851 ,  0.30454206, -0.25570133, ..., -0.34445858,
         0.18289243,  0.06144356]], dtype=float32)

In [14]:
len(skipgram.get_weights())

3

In [15]:
len(skipgram.get_weights()[0])

9088

In [16]:
len(skipgram.get_weights()[0][0])

300

In [17]:
skipgram.get_weights()[0][1]

array([-0.02199289,  0.03579663, -0.08151455, -0.1271285 ,  0.0899231 ,
        0.09212574, -0.13506354,  0.01736196, -0.10423771, -0.12857589,
       -0.07840447,  0.14748764, -0.02269663,  0.1504304 ,  0.08632143,
        0.08043239,  0.233483  , -0.17061205,  0.21647973, -0.06183635,
       -0.15550244,  0.03549013, -0.13641395, -0.12326496, -0.06626974,
       -0.12543938,  0.11566662, -0.13591912, -0.09772446,  0.04019888,
       -0.20902066, -0.25967604,  0.14875628, -0.06806808,  0.03003336,
        0.13652973, -0.26442957, -0.06407244, -0.04192676, -0.10788668,
        0.20165382, -0.12643032, -0.07505339, -0.0200877 ,  0.11866522,
        0.02889065,  0.22051248,  0.0500199 , -0.10028892, -0.03501618,
       -0.00347714, -0.10132656,  0.12784095,  0.18562132, -0.059568  ,
        0.1118096 , -0.0298992 , -0.158858  ,  0.02338173, -0.03961867,
        0.09792748,  0.1591202 , -0.05342468, -0.17649677,  0.29455078,
       -0.32039142, -0.11046364,  0.10844326, -0.0681861 , -0.02

To get the word embedding:

In [18]:
# Integer index that the tokenizer assigned to 'king'; the next cell uses it as a row of the embedding matrix
index = tokenizer.word_index['king']

In [19]:
skipgram.get_weights()[0][index]

array([-2.92883784e-01,  2.17304260e-01,  8.81510004e-02, -1.46533027e-01,
        7.45297968e-02, -6.23810142e-02,  1.28183931e-01, -2.75252044e-01,
        3.03194314e-01, -7.32355639e-02,  3.08283120e-01,  1.03407875e-01,
        2.07831711e-01, -3.96618620e-02,  1.54671028e-01, -2.04340547e-01,
        2.34738082e-01, -3.80781710e-01, -7.57350102e-02,  2.28793994e-01,
        4.72351551e-01,  1.73734844e-01,  3.90264481e-01, -1.38064316e-02,
        1.53361797e-01, -1.34653762e-01, -1.99924886e-01, -1.53927356e-02,
        1.71992108e-02,  2.41581514e-01, -4.41187203e-01, -9.62597281e-02,
        5.02280146e-02,  1.64076075e-01,  3.07196468e-01,  6.52240932e-01,
        3.65278751e-01, -2.21509159e-01,  5.04673898e-01,  4.57919359e-01,
        2.88236111e-01,  5.91901958e-01,  8.61732513e-02,  1.82284012e-01,
       -1.20459944e-01,  4.69002984e-02, -4.55317438e-01, -1.97141275e-01,
       -1.06191911e-01,  6.60986602e-02,  3.52048650e-02, -1.28402174e-01,
        6.70622289e-02,  

In [20]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load the skip-gram vectors exported above into a dictionary that maps each
# word to its embedding vector, then list the words closest to `target_word`.

# Target word for which we want to find the k-nearest words
target_word = "king"

word_vectors = {}
with open("vectors_skipgram_300.txt", "r", encoding="utf-8") as file:
    next(file, None)  # skip the header row (column names)
    for line in file:
        parts = line.strip().split()
        word_vectors[parts[0]] = np.array([float(x) for x in parts[1:]])

# Calculate cosine similarities with all other words in the vocabulary.
# A single call on the stacked matrix replaces one sklearn call per word.
target_vector = word_vectors[target_word]
other_words = [word for word in word_vectors if word != target_word]
if other_words:
    other_matrix = np.vstack([word_vectors[word] for word in other_words])
    scores = cosine_similarity([target_vector], other_matrix)[0]
    similarities = dict(zip(other_words, scores))
else:
    similarities = {}

# Sort the words by their cosine similarity scores in descending order
sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

# Select the top-k words as the k-nearest words
k = 10  # Number of nearest words you want to find
nearest_words = sorted_similarities[:k]

# Print the k-nearest words as (word, similarity) tuples
print(f"The {k} nearest words to '{target_word}' are: ")
for entry in nearest_words:
    print(entry)


# Keep the skip-gram dictionary under its own name; `word_vectors` is reused later
skipgram_word_emd = word_vectors

The 10 nearest words to 'king' are: 
('mervyn', 0.37212365375762424)
('comeback', 0.3562591653005176)
('macfarlane', 0.35449350877782304)
('supreme', 0.3504704872297048)
('governor', 0.34194228643766056)
('adaptation', 0.3326355888781102)
('hails', 0.3194552383976981)
('teen', 0.31453885023265993)
('duke', 0.3102329390138116)
('prince', 0.3079823184950587)


## CBOW

In [21]:

from keras.preprocessing import sequence

# The function returns two arrays: all_in, which contains the context words, 
# and all_out, which contains the corresponding one-hot encoded target words.

def generate_data_cbow(corpus, window_size, V):
    """Build (context window, target word) training pairs for the CBOW model.

    For every word of every sentence the surrounding `window_size` positions on
    each side are collected as context. Positions that fall outside the sentence
    are filled with index 0, so each context row has 2 * window_size entries.

    Returns a tuple (contexts, targets): `contexts` holds the context word
    indices per target word, `targets` the one-hot encoding (V classes) of the
    target word itself.
    """
    contexts, targets = [], []

    for tokens in corpus:
        n_tokens = len(tokens)
        for centre, target in enumerate(tokens):
            # Neighbouring positions, skipping the target; 0 stands in for "no word"
            window = [
                tokens[pos] if 0 <= pos < n_tokens else 0
                for pos in range(centre - window_size, centre + window_size + 1)
                if pos != centre
            ]
            contexts.append(window)
            # One-hot encoding of the target word
            targets.append(to_categorical(target, V))

    return (np.array(contexts), np.array(targets))

In [22]:
%%time


# Create the training data
X_cbow, y_cbow = generate_data_cbow(corpus, window_size, V)
X_cbow.shape, y_cbow.shape

CPU times: user 447 ms, sys: 868 ms, total: 1.31 s
Wall time: 1.67 s


((70848, 4), (70848, 9088))

In [23]:
print(X_cbow[:10])
print(y_cbow[:10])


[[   0    0  385    6]
 [   0  205    6    1]
 [ 205  385    1 1802]
 [ 385    6 1802    3]
 [   6    1    3  797]
 [   1 1802  797   17]
 [1802    3   17  150]
 [   3  797  150  798]
 [ 797   17  798 1803]
 [  17  150 1803 4896]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [24]:
%%time

# Create the CBOW architecture: Embedding -> mean over context positions -> Dense(softmax)
# List of models that the training and export cells iterate over (a single model is appended below)
cbow_models = []
dim = 300
cbow = Sequential()

# Add an Embedding layer with one `dim`-dimensional vector per word index (V rows)
cbow.add(Embedding(input_dim=V,
                    output_dim=dim,
                    input_length=window_size*2, # each input row holds 2 * window_size context word indices
                    embeddings_initializer='glorot_uniform'))

# Average the context embeddings over axis 1 (the context positions). No masking is
# configured, so positions padded with index 0 contribute the row-0 embedding to the mean.
cbow.add(Lambda(lambda x: K.mean(x, axis=1), output_shape=(dim, )))

# Final Dense layer with V softmax outputs, one per class of the one-hot targets
cbow.add(Dense(V, activation='softmax', kernel_initializer='glorot_uniform'))

# Same optimizer, loss and metric as the skipgram model
cbow.compile(optimizer=keras.optimizers.Adam(),
                loss='categorical_crossentropy',
                metrics=['accuracy'])

cbow.summary()
print("")
cbow_models.append(cbow)



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 4, 300)            2726400   
                                                                 
 lambda (Lambda)             (None, 300)               0         
                                                                 
 dense_1 (Dense)             (None, 9088)              2735488   
                                                                 
Total params: 5,461,888
Trainable params: 5,461,888
Non-trainable params: 0
_________________________________________________________________

CPU times: user 43.4 ms, sys: 23.8 ms, total: 67.2 ms
Wall time: 41 ms


In [25]:
%%time

# Train every model in cbow_models for 50 epochs with mini-batches of 64 samples.
# The loop variable rebinds the global name `cbow` to the model trained last.
for cbow in cbow_models:
    cbow.fit(X_cbow, y_cbow, batch_size=64, epochs=50, verbose=1)
    print("")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

CPU times: user 48min 17s, sys: 13min 9s, total: 1h 1min 27s
Wall time: 12min 27s


In [26]:

for cbow in cbow_models:
    # Export the embeddings of each cbow model to a text file named after
    # the embedding width (vectors_cbow_<dim>.txt)
    weights = cbow.get_weights()

    # Get the embedding matrix (first weight array of the model)
    embedding = weights[0]

    # Create columns for the words and the values in the matrix, makes it easier to read as dataframe
    columns = ["word"] + [f"value_{i+1}" for i in range(embedding.shape[1])]

    # The context manager closes the file even if a write fails; utf-8 is set
    # explicitly because the cells that read this file back open it as utf-8
    with open(f'vectors_cbow_{len(embedding[0])}.txt', 'w', encoding='utf-8') as f:
        # Header row with the column names
        f.write(" ".join(columns))
        f.write("\n")

        # One row per word: the word followed by its embedding values
        for word, i in tokenizer.word_index.items():
            f.write(word)
            f.write(" ")
            f.write(" ".join(map(str, list(embedding[i,:]))))
            f.write("\n")

In [27]:
from sklearn.metrics.pairwise import cosine_similarity

# Load the CBOW vectors exported above into a dictionary that maps each
# word to its embedding vector, then list the words closest to `target_word`.

# Target word for which we want to find the k-nearest words
target_word = "king"

word_vectors = {}
with open("vectors_cbow_300.txt", "r", encoding="utf-8") as file:
    next(file, None)  # skip the header row (column names)
    for line in file:
        parts = line.strip().split()
        word_vectors[parts[0]] = np.array([float(x) for x in parts[1:]])

# Calculate cosine similarities with all other words in the vocabulary.
# A single call on the stacked matrix replaces one sklearn call per word.
target_vector = word_vectors[target_word]
other_words = [word for word in word_vectors if word != target_word]
if other_words:
    other_matrix = np.vstack([word_vectors[word] for word in other_words])
    scores = cosine_similarity([target_vector], other_matrix)[0]
    similarities = dict(zip(other_words, scores))
else:
    similarities = {}

# Sort the words by their cosine similarity scores in descending order
sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

# Select the top-k words as the k-nearest words
k = 10  # Number of nearest words you want to find
nearest_words = sorted_similarities[:k]

# Print the k-nearest words as (word, similarity) tuples
print(f"The {k} nearest words to '{target_word}' are: ")
for entry in nearest_words:
    print(entry)


# Keep the CBOW dictionary under its own name; `word_vectors` is reused later
cbow_word_emd = word_vectors

The 10 nearest words to 'king' are: 
('libertine', 0.3516973069914492)
('supreme', 0.34110856190558064)
('noted', 0.34027599147963167)
('godfather', 0.33859327999732125)
('closet', 0.33632438378379603)
('prince', 0.3302863194394854)
('builder', 0.3234979292828204)
('wishes', 0.31825724747904155)
('heineken', 0.31698993556321864)
('creators', 0.31483609386753375)


To get the word embedding:

In [28]:
len(skipgram_word_emd),len(cbow_word_emd)

(9087, 9087)

In [29]:
skipgram_word_emd['king']

array([-2.92883780e-01,  2.17304260e-01,  8.81510000e-02, -1.46533030e-01,
        7.45298000e-02, -6.23810140e-02,  1.28183930e-01, -2.75252040e-01,
        3.03194300e-01, -7.32355640e-02,  3.08283120e-01,  1.03407875e-01,
        2.07831710e-01, -3.96618620e-02,  1.54671030e-01, -2.04340550e-01,
        2.34738080e-01, -3.80781700e-01, -7.57350100e-02,  2.28794000e-01,
        4.72351550e-01,  1.73734840e-01,  3.90264480e-01, -1.38064320e-02,
        1.53361800e-01, -1.34653760e-01, -1.99924890e-01, -1.53927360e-02,
        1.71992100e-02,  2.41581510e-01, -4.41187200e-01, -9.62597300e-02,
        5.02280150e-02,  1.64076070e-01,  3.07196470e-01,  6.52240930e-01,
        3.65278750e-01, -2.21509160e-01,  5.04673900e-01,  4.57919360e-01,
        2.88236100e-01,  5.91901960e-01,  8.61732500e-02,  1.82284010e-01,
       -1.20459944e-01,  4.69003000e-02, -4.55317440e-01, -1.97141270e-01,
       -1.06191910e-01,  6.60986600e-02,  3.52048650e-02, -1.28402170e-01,
        6.70622300e-02,  

In [30]:
cbow_word_emd['king']

array([-0.6037104 , -0.14906797, -0.7002209 , -0.23938021, -0.52839136,
        0.31163195, -1.2044381 ,  1.145831  ,  0.09468543, -1.8114032 ,
        0.91334033, -0.00373117, -0.35494474,  0.3004534 ,  0.41489485,
       -0.56522894,  0.60035366, -0.86665237, -0.2697187 ,  0.07676263,
       -0.12388305,  1.3607634 , -0.35560882,  0.35863099, -1.2167796 ,
        0.43340203,  0.2445653 ,  0.8155262 , -1.5011925 , -1.2113218 ,
        0.3685103 ,  0.41266364, -0.07298842, -0.6470388 , -0.3452553 ,
       -0.46185672, -0.56018996, -0.2410603 , -0.7877787 ,  0.29061368,
       -1.349227  , -0.12626357,  0.46147326,  0.03018703, -0.5605507 ,
       -0.6152717 ,  0.569724  ,  0.02015091,  0.6176803 ,  0.34510842,
       -0.41328484, -0.85300803,  0.04969532, -0.32964206, -1.2043262 ,
        0.45830834,  0.35117427, -0.9182906 ,  0.42735833, -0.05536522,
        0.11157465, -0.05814839,  0.12074371, -0.0684804 ,  0.90823823,
       -0.03561955,  0.98094517, -0.6104293 ,  0.36698398,  1.22

In [31]:
# Compare the skip-gram and CBOW vectors of the same word. The two models are trained
# separately, so their embedding spaces are presumably not aligned with each other;
# a value near zero here says little about the quality of either model.
cosine_similarity([skipgram_word_emd['king']], [cbow_word_emd['king']])

array([[-0.0260033]])

In [32]:
cosine_similarity([skipgram_word_emd['queen']], [cbow_word_emd['queen']])

array([[-0.00408487]])

In [33]:
# Similarity of two different words inside the same (skip-gram) embedding space
cosine_similarity([skipgram_word_emd['king']], [skipgram_word_emd['queen']])

array([[0.28526006]])

In [34]:
cosine_similarity([cbow_word_emd['king']], [cbow_word_emd['queen']])

array([[0.24861495]])

## Analogy function

We now implement a function that performs the analogy task. With this function, we want to check whether an analogy such as "a king is to a queen as a man is to a woman" holds in the embedding space, i.e. whether e_king - e_queen + e_woman ≈ e_man. In a perfect scenario, the left-hand side (e_king - e_queen + e_woman) would be exactly the embedding of the word "man"; in practice it rarely results in exactly the same embedding. In this context, we call "man" the true (or actual) word t.
We therefore look for the word p in the vocabulary whose embedding e_p is closest to the predicted embedding (i.e., the result of the formula). Then we can check whether p is the same word as the true word t.

### Computing the distance between the predicted and true word

In [35]:
def embed(word, embedding, vocab_size=V, tokenizer=tokenizer):
    """Embed a word by taking the dot product of its one-hot encoding with the embedding matrix.

    Parameters
    ----------
    word : str
        The word to embed.
    embedding : np.ndarray
        Embedding matrix with one row per word index.
    vocab_size : int
        Number of classes of the one-hot encoding; defaults to the notebook's V.
    tokenizer : Tokenizer
        Fitted tokenizer used to map the word to its integer index.

    Returns
    -------
    np.ndarray
        2-D array with one embedding row per token the tokenizer produced for
        `word` (a single row for a single in-vocabulary word).
    """
    # get the index of the word from the tokenizer, i.e. convert the string to its corresponding integer in the vocabulary
    int_word = tokenizer.texts_to_sequences([word])[0]
    # get the one-hot encoding of the word; honour the vocab_size argument instead of the global V
    bin_word = to_categorical(int_word, vocab_size)
    return np.dot(bin_word, embedding)

In [36]:
def compute_distance(word_a, word_b, word_c, word_d, models):
    """Print the cosine distance between the predicted and the true word (word_d), once per model.

    Our analogy function is: 'word_a is to word_b as word_c is to ?'
    Here, ? is predicted based on the embeddings as e_b - e_a + e_c. Then, we
    compare ? to word_d, which is the true word.

    Parameters
    ----------
    word_a, word_b, word_c, word_d : str
        The four words of the analogy.
    models : iterable
        Trained models; the first weight array of each model is used as its
        embedding matrix.
    """
    for model in models:
        embedding = model.get_weights()[0]
        # `embed` is defined twice in this notebook (one version returns a row
        # vector, the other a flat vector). cosine_distances needs 2-D input, so
        # every vector is forced to shape (1, dim), which works with both versions.
        vec_a, vec_b, vec_c, vec_d = (
            np.reshape(embed(word, embedding), (1, -1))
            for word in (word_a, word_b, word_c, word_d)
        )
        predicted_embedding = vec_b - vec_a + vec_c
        dist_exp_true = cosine_distances(predicted_embedding, vec_d)
        print(dist_exp_true[0][0])

In [None]:

# Example distances between the predicted and true word for skipgram and cbow.
# The models are passed explicitly: the name `models` is only assigned in a later
# cell, so referring to it here fails on a fresh top-to-bottom run.
compute_distance('king', 'queen', 'woman', 'man', [skipgram, cbow])

### Listing the top z closest words based on an analogy function

In [41]:
from scipy.spatial.distance import cosine, cdist


def embed(word, embedding, vocab_size=V, tokenizer=tokenizer):
    """Return the embedding of `word` as a flat (1-D) vector.

    The word is mapped to its integer index with `tokenizer`, one-hot encoded
    with `vocab_size` classes, multiplied with `embedding`, and the result is
    flattened with reshape(-1).
    """
    # Get the index of the word from the tokenizer, i.e. convert the string to its corresponding integer in the vocabulary
    int_word = tokenizer.texts_to_sequences([word])[0]
    # Get the one-hot encoding of the word; honour the vocab_size argument instead of the global V
    bin_word = to_categorical(int_word, vocab_size)
    return np.dot(bin_word, embedding).reshape(-1)


def get_nearest_words(model_name, embed_word, used_words, nr=10):
    """Returns the `nr` nearest words to the `embed_word` for a certain `model_name`

    Parameters
    ----------
    model_name : str
        Selects the vector file to read: vectors_<model_name>.txt.
    embed_word : np.ndarray
        Query embedding; it is reshaped to a single row before comparison.
    used_words : list of str
        Words to exclude from the candidates (the words of the analogy).
    nr : int
        Number of nearest words to return.

    Returns
    -------
    list of (str, float)
        (word, cosine distance rounded to 4 decimals), ordered from the smallest
        distance to the largest.
    """
    # Load the model embedding matrix. The default NA handling is switched off and
    # the word column is read as text: otherwise vocabulary entries such as "null"
    # or "nan" are parsed as missing values and come back as NaN instead of a word.
    df = pd.read_csv(
        f"vectors_{model_name}.txt",
        sep=" ",
        keep_default_na=False,
        dtype={"word": str},
    )

    # Filter out words that are in the analogy
    df = df[~(df["word"].isin(used_words))]

    # Build the candidate word list once instead of once per returned index
    candidate_words = df["word"].tolist()

    # Store the embedded representation of the words
    embedded_words = df.iloc[:, 1:].values
    embedded_word = embed_word.reshape(1, -1)

    # Get the distances between the input embedding and the embedded words such that we can look for the smallest one
    # cdist makes it easy for us to compute the cosine distance between each pair of the two collections of inputs
    distances = cdist(embedded_word, embedded_words, "cosine").reshape(-1)

    # Sort distances and store the indices of the `nr` lowest distances
    top_sorted_indices = distances.argsort()[:nr]

    # Pair every selected word with its rounded distance
    return [(candidate_words[i], round(distances[i], 4)) for i in top_sorted_indices]


def print_analogy(analogy, embeddings, model_names, nr=10):
    """Evaluate one analogy for every model and print the results.

    Parameters
    ----------
    analogy : tuple of four str
        (word_a, word_b, word_c, word_true), read as
        "word_a is to word_b as word_c is to word_true".
    embeddings : list of np.ndarray
        One embedding matrix per model, in the same order as `model_names`.
    model_names : list of str
        Model names; each one also selects the vector file that is searched for
        the nearest words.
    nr : int
        Number of nearest words to print per model.

    Prints the task, the `nr` nearest words to the predicted embedding
    (e_b - e_a + e_c), the best prediction and whether it matches the true word.
    Returns None; if any of the four words is missing from the tokenizer
    vocabulary, that word is reported and nothing else is printed.
    """
    # Retrieve the words from the analogy we need to compute
    word_a, word_b, word_c, word_true = analogy

    # Formulate the analogy task
    analogy_task = f"{word_a} is to {word_b} as {word_c} is to ?"

    print(f"Analogy Task: {analogy_task}")
    print("---------------------------------------------------")

    # Stop at the first word that is not in the vocabulary
    for candidate in (word_a, word_b, word_c, word_true):
        if candidate not in tokenizer.word_index:
            print(candidate, "not in vocab")
            return

    # The analogy's own words are excluded from the nearest-word search
    used_words = [word_a, word_b, word_c]

    # Iterate over all models available
    for model_name, embedding in zip(model_names, embeddings):
        # Obtain embeddings for all the words
        embed_true = embed(word_true, embedding)
        embed_a, embed_b, embed_c = embed(word_a, embedding), embed(word_b, embedding), embed(word_c, embedding)

        # Obtain the predicted embedding based on the analogy function
        embed_prediction = embed_b - embed_a + embed_c

        # Cosine distance between the predicted embedding and the true word's embedding
        # (despite the name, `sim1` is a distance: smaller means closer)
        sim1 = round(cosine(embed_true, embed_prediction), 4)

        # Read and rank the vector file once per model. The closest word is the
        # prediction, so at least one candidate is requested even when nr < 1.
        candidates = get_nearest_words(model_name, embed_prediction, used_words, max(nr, 1))
        word_prediction, sim2 = candidates[0]

        # Get the top `nr` nearest words
        nearest_words = candidates[:nr]

        # Whether or not the true word was in the top nr
        partially_correct = word_true in [word[0] for word in nearest_words]

        print(f"Embedding: {model_name}")
        # Print all top nr words with their distance
        for word in nearest_words:
            print(f"{word[0]} => {round(word[1], 4)}")
        print(f"Predicted: {word_prediction} ({round(sim2, 4)}) - True: {word_true} ({sim1})")
        print(f"Correct? {word_prediction == word_true} - In the top {nr}? {partially_correct}")
        print("---------------------------------------------------\n\n")

#### Summary of the method
The method we have created above is relatively simple. Let us consider the major steps of the method. The method boils down to: 1) concatenating all models such that it is easier to iterate over all models, 2) get the embeddings of each model such that we can easily iterate over them, 3) store the model names in a list such that we can easily iterate over them, 4) create a list of tuples of size four where each word in the tuple represents a word in the analogy, 5) iterate over each tuple in the analogies we want to look at, 6) compute the embedding of each word in the tuple, 7) fill in the analogy function using the first three words, 8) make a prediction based on the outcome of the analogy function and return the nr nearest words using the cosine distance 9) compare if the actual word (given as input parameter) is equal to the predicted word. This is the main idea behind the method. We have also made it easier to return the top nr of nearest words and print the top nr nearest words for each prediction together with the cosine distances to give us more of an idea as to what the model is predicting.



In [42]:
# Collect the trained models in a list such that we can easily iterate over all models




models = [skipgram , cbow]
# Store the embeddings of all models such that we can easily iterate over them
# (the first weight array of each model is used as its word-embedding matrix)
word_embeddings = [model.get_weights()[0] for model in models]

# Store the model names such that we can easily iterate over them;
# get_nearest_words builds the file name vectors_<name>.txt from each entry
model_names = ["skipgram_300", "cbow_300"]

# Set the number of top words to print
nr = 10

# Analogy tuple order: (word_a, word_b, word_c, expected word)
print_analogy(analogy=('queen', 'king', 'woman', 'man'), embeddings=word_embeddings, model_names=model_names, nr=nr)

Analogy Task: queen is to king as woman is to ?
---------------------------------------------------
Embedding: skipgram_300
trapped => 0.7019
stewart => 0.7104
starred => 0.7187
embroiled => 0.7285
wonder => 0.7365
ian => 0.7398
declares => 0.745
craig => 0.747
hill => 0.7499
nerve => 0.7533
Predicted: trapped (0.7019) - True: man (0.9634)
Correct? False - In the top 10? False
---------------------------------------------------


Embedding: cbow_300
venue => 0.6792
valued => 0.6878
entitled => 0.7001
libertine => 0.7009
scientist => 0.701
clarke => 0.7019
embroiled => 0.7054
sharapova => 0.7205
factor => 0.7207
distributor => 0.7215
Predicted: venue (0.6792) - True: man (0.9134)
Correct? False - In the top 10? False
---------------------------------------------------




In [43]:
# Each tuple is (word_a, word_b, word_c, expected word):
# "word_a is to word_b as word_c is to expected word"
analogies = [
    ('he', 'is', 'we', 'are'),
    ('love', 'hate', 'little', 'large'),
    ('small', 'smaller', 'large', 'larger'),
    ('man', 'woman', 'king', 'queen'),
    ('mouse', 'mice', 'cat', 'cats'),
]
# Evaluate every analogy with the default number of nearest words
for analogy in analogies:
    print_analogy(
        analogy=analogy,
        embeddings=word_embeddings,
        model_names=model_names,
    )
    

Analogy Task: he is to is as we is to ?
---------------------------------------------------
Embedding: skipgram_300
differentiate => 0.6941
waves => 0.7017
papers => 0.711
rescue => 0.7119
isps => 0.7133
demo => 0.7176
imprisoning => 0.7246
dreaming => 0.7324
owed => 0.7335
pains => 0.736
Predicted: differentiate (0.6941) - True: are (0.8858)
Correct? False - In the top 10? False
---------------------------------------------------


Embedding: cbow_300
waves => 0.6879
villages => 0.6944
thus => 0.7145
extremely => 0.7207
does => 0.7285
sell => 0.7299
earnest => 0.7322
munich => 0.7341
frustration => 0.7388
crosses => 0.7404
Predicted: waves (0.6879) - True: are (0.918)
Correct? False - In the top 10? False
---------------------------------------------------


Analogy Task: love is to hate as little is to ?
---------------------------------------------------
Embedding: skipgram_300
abandoned => 0.5758
wound => 0.5987
able => 0.6151
wiped => 0.6258
deported => 0.6286
navigate => 0.6358
i

In [None]:
import pandas as pd

# Reload the BBC articles and build the corpus: one string per article, each
# terminated by a newline. Iterating the 'text' column directly avoids the
# slow row-wise DataFrame.apply(axis=1) and also works on an empty frame.
df = pd.read_csv('bbc-text.csv')
corpus = [f"{text}\n" for text in df['text']]

In [None]:
# Preprocess the text into character n-grams and whole words
def extract_tokens(word, min_n=3, max_n=6):
    """Return the word followed by all of its character n-grams.

    Args:
        word: The string to break into n-grams.
        min_n: Smallest n-gram length to emit (inclusive).
        max_n: Largest n-gram length to emit (inclusive).

    Returns:
        A list whose first element is ``word`` itself, followed by every
        contiguous substring of length ``min_n`` .. ``max_n`` in order of
        increasing length and, within a length, left to right. Lengths longer
        than the word contribute nothing, and a word whose length lies in
        ``[min_n, max_n]`` appears a second time as its own n-gram.
    """
    pieces = [word]
    for size in range(min_n, max_n + 1):
        # Number of valid start offsets for a window of this size (<= 0 if too long).
        window_count = len(word) - size + 1
        pieces.extend(word[start:start + size] for start in range(window_count))
    return pieces

# Build one flat token list per corpus entry: every whitespace-separated word
# contributes itself plus its character n-grams (see extract_tokens).
sentences = []
for sentence in corpus:
    flattened = []
    for word in sentence.split():
        flattened.extend(extract_tokens(word))
    sentences.append(flattened)

In [None]:
%%time
# Train a Word2Vec model on the word + n-gram token lists built above.
model = Word2Vec(
    sentences,
    sg=1,             # 1 selects the skip-gram training algorithm
    vector_size=300,  # dimensionality of the learned vectors
    window=5,
    min_count=1,      # keep every token, however rare
    workers=4,
    epochs=50,
)

In [None]:
# Persist the trained Word2Vec model to "fasttext.model" in the working directory.
# NOTE(review): despite the file name, the model was created with gensim's
# Word2Vec class on n-gram-expanded tokens, not with a FastText class.
model.save("fasttext.model")

In [75]:

# Load the model from the same relative location the training cell saves it to
# ("fasttext.model") instead of a machine-specific absolute path, so the
# notebook can be re-run top to bottom on any machine.
FASTTEXT_MODEL_PATH = "fasttext.model"

loaded_model = Word2Vec.load(FASTTEXT_MODEL_PATH)
word_vectors = loaded_model.wv

input_word = "racing"
if input_word in word_vectors:
    embedding_vector = word_vectors[input_word]
    # The vocabulary mixes whole words with the character n-grams produced by
    # extract_tokens, so the nearest neighbours may be n-gram fragments.
    similar_words = word_vectors.similar_by_word(input_word)

    print(f"\nNearest words to '{input_word}':")
    for word, similarity in similar_words:
        print(f"{word}: {similarity}")
else:
    print(f"'{input_word}' is not present in the vocabulary.")


Nearest words to 'racing':
racin: 0.6850551962852478
bracin: 0.5699722170829773
tracin: 0.5525078773498535
mbraci: 0.5490640997886658
seraci: 0.5483604669570923
eracin: 0.5347764492034912
rserac: 0.5334939956665039
orsera: 0.529446542263031
traci: 0.515548586845398
tycoons: 0.5088850259780884


In [38]:
# Parse the exported skip-gram vectors into a {word: numpy vector} dictionary.
word_vectors = {}

target_word = "king"


with open("vectors_skipgram_300.txt", "r", encoding="utf-8") as file:
    # Skip the first line (presumably the "<vocab_size> <dim>" header of the
    # word2vec text format -- TODO confirm); the default keeps an empty file
    # from raising StopIteration.
    next(file, None)
    for line in file:
        parts = line.split()
        if not parts:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # failing with IndexError on parts[0].
            continue
        word = parts[0]
        vector = np.array([float(x) for x in parts[1:]])
        word_vectors[word] = vector

In [39]:
# Display the word -> vector dictionary parsed from vectors_skipgram_300.txt.
# NOTE(review): this renders every entry of the dictionary; previewing a few
# items would keep the notebook output small.
word_vectors

{'the': array([-0.02199289,  0.03579663, -0.08151455, -0.1271285 ,  0.0899231 ,
         0.09212574, -0.13506354,  0.01736196, -0.1042377 , -0.12857589,
        -0.07840447,  0.14748764, -0.02269663,  0.1504304 ,  0.08632143,
         0.08043238,  0.233483  , -0.17061205,  0.21647973, -0.06183635,
        -0.15550244,  0.03549013, -0.13641395, -0.12326496, -0.06626974,
        -0.12543938,  0.11566662, -0.13591912, -0.09772446,  0.04019888,
        -0.20902066, -0.25967604,  0.14875628, -0.06806808,  0.03003336,
         0.13652973, -0.26442957, -0.06407244, -0.04192676, -0.10788668,
         0.20165382, -0.12643032, -0.07505339, -0.0200877 ,  0.11866522,
         0.02889064,  0.22051248,  0.05001991, -0.10028892, -0.03501617,
        -0.00347714, -0.10132656,  0.12784095,  0.18562132, -0.059568  ,
         0.1118096 , -0.0298992 , -0.158858  ,  0.02338173, -0.03961867,
         0.09792748,  0.1591202 , -0.05342468, -0.17649677,  0.29455078,
        -0.32039142, -0.11046364,  0.1084432

In [51]:
# Same analogy quadruples as the earlier evaluation cell.
analogies = [('he', 'is', 'we', 'are'), ('love', 'hate', 'little', 'large'), ('small', 'smaller', 'large', 'larger'), ('man', 'woman', 'king', 'queen'), ('mouse', 'mice', 'cat', 'cats')]

# NOTE(review): this call raised
#   ValueError: shapes (1,9088) and (300,) not aligned
# when it was run. The earlier, working call passed `word_embeddings` (a list of
# 2-D arrays); here a flat list of the individual word vectors is passed
# instead, which is presumably the cause -- confirm against the input that
# print_analogy expects before relying on this cell.
for analogy in analogies:
    print_analogy(analogy, list(word_vectors.values()), model_names)

Analogy Task: he is to is as we is to ?
---------------------------------------------------


ValueError: shapes (1,9088) and (300,) not aligned: 9088 (dim 1) != 300 (dim 0)

In [52]:
# Inspect the list of embedding arrays that the working analogy cell passes to
# print_analogy as `embeddings`.
word_embeddings

[array([[-0.00214251, -0.0121221 , -0.01510321, ..., -0.00877267,
          0.01666906,  0.01842307],
        [-0.02199289,  0.03579663, -0.08151455, ...,  0.09725461,
         -0.1169652 , -0.03868316],
        [ 0.03874718,  0.0458426 , -0.2206451 , ..., -0.22560804,
         -0.02905687,  0.10799085],
        ...,
        [-0.17046127,  0.25113133,  0.0097746 , ..., -0.01196183,
          0.06048154, -0.04334079],
        [-0.22128603,  0.19090584, -0.31348667, ..., -0.06036747,
         -0.00521338, -0.0671095 ],
        [-0.3874851 ,  0.30454206, -0.25570133, ..., -0.34445858,
          0.18289243,  0.06144356]], dtype=float32),
 array([[ 0.13612565, -0.24560535, -0.03289436, ..., -0.19434509,
         -0.08385199, -0.53212786],
        [-0.28227025,  0.5302428 ,  0.34517342, ...,  0.53482145,
          0.28042147,  0.06472909],
        [-0.29531002,  0.27147973, -0.421036  , ..., -0.26960206,
          0.2286051 , -0.4589049 ],
        ...,
        [ 1.0469862 ,  0.35403857,  0.8

In [65]:
# Save the skipgram and cbow models to HDF5 files in the working directory.
skipgram.save('sgm.h5')
cbow.save('cbm.h5')

In [61]:
# Reload the saved skip-gram model from 'sgm.h5' to check that it round-trips.
ss = load_model('sgm.h5')

In [63]:
# Print every weight array of the reloaded model (presumably to compare by eye
# with the in-memory `skipgram` weights printed in the next cell).
for i in ss.get_weights():
    print(i)

[[-0.00214251 -0.0121221  -0.01510321 ... -0.00877267  0.01666906
   0.01842307]
 [-0.02199289  0.03579663 -0.08151455 ...  0.09725461 -0.1169652
  -0.03868316]
 [ 0.03874718  0.0458426  -0.2206451  ... -0.22560804 -0.02905687
   0.10799085]
 ...
 [-0.17046127  0.25113133  0.0097746  ... -0.01196183  0.06048154
  -0.04334079]
 [-0.22128603  0.19090584 -0.31348667 ... -0.06036747 -0.00521338
  -0.0671095 ]
 [-0.3874851   0.30454206 -0.25570133 ... -0.34445858  0.18289243
   0.06144356]]
[[ 0.7297914   0.1471582   0.232067   ...  0.57133955  0.32845506
   0.2688621 ]
 [-0.5046034  -0.09191171 -0.16231588 ... -0.49722597 -0.37766653
  -0.16959591]
 [ 0.5734418   0.18164833  0.08242726 ...  0.9565141   0.62622017
   0.33612272]
 ...
 [ 0.6836971   0.03457901  0.10446284 ...  0.44699436  0.48894548
   0.34679094]
 [-0.6635938  -0.06483159  0.04416826 ... -0.4570687  -0.478053
  -0.5025285 ]
 [-0.474631   -0.07111412 -0.09948257 ... -0.6744731  -0.4046962
  -0.24938357]]
[-6.4804125   4.3299

In [64]:
# Print every weight array of the in-memory skipgram model (presumably for
# comparison with the weights of the model reloaded from 'sgm.h5').
for i in skipgram.get_weights():
    print(i)

[[-0.00214251 -0.0121221  -0.01510321 ... -0.00877267  0.01666906
   0.01842307]
 [-0.02199289  0.03579663 -0.08151455 ...  0.09725461 -0.1169652
  -0.03868316]
 [ 0.03874718  0.0458426  -0.2206451  ... -0.22560804 -0.02905687
   0.10799085]
 ...
 [-0.17046127  0.25113133  0.0097746  ... -0.01196183  0.06048154
  -0.04334079]
 [-0.22128603  0.19090584 -0.31348667 ... -0.06036747 -0.00521338
  -0.0671095 ]
 [-0.3874851   0.30454206 -0.25570133 ... -0.34445858  0.18289243
   0.06144356]]
[[ 0.7297914   0.1471582   0.232067   ...  0.57133955  0.32845506
   0.2688621 ]
 [-0.5046034  -0.09191171 -0.16231588 ... -0.49722597 -0.37766653
  -0.16959591]
 [ 0.5734418   0.18164833  0.08242726 ...  0.9565141   0.62622017
   0.33612272]
 ...
 [ 0.6836971   0.03457901  0.10446284 ...  0.44699436  0.48894548
   0.34679094]
 [-0.6635938  -0.06483159  0.04416826 ... -0.4570687  -0.478053
  -0.5025285 ]
 [-0.474631   -0.07111412 -0.09948257 ... -0.6744731  -0.4046962
  -0.24938357]]
[-6.4804125   4.3299

In [66]:
# NOTE(review): this cell failed with "NameError: name 'loaded_model' is not
# defined" when it was run; `loaded_model` is only created by the cell that
# calls Word2Vec.load, so that cell has to be executed first.
# It also rebinds `word_vectors` to the model's `.wv` attribute, replacing the
# dictionary parsed from vectors_skipgram_300.txt.
word_vectors = loaded_model.wv
for i in word_vectors:
    print(i)

NameError: name 'loaded_model' is not defined

In [None]:
# Rebuild the {word: numpy vector} dictionary from the exported skip-gram file.
# The cell is self-contained: it no longer relies on a counter `i` or an
# existing `word_vectors` object left behind by previously executed cells.
word_vectors = {}

with open("vectors_skipgram_300.txt", "r", encoding="utf-8") as file:
    # Skip the first line (presumably the "<vocab_size> <dim>" header of the
    # word2vec text format -- TODO confirm).
    next(file, None)
    for line in file:
        parts = line.split()
        if not parts:
            # Ignore blank lines rather than failing on parts[0].
            continue
        word = parts[0]
        vector = np.array([float(x) for x in parts[1:]])
        word_vectors[word] = vector