# Skip gram and CBOW


We will build the Skip-gram and CBOW models from scratch and train them on a relatively small corpus, namely the BBC data set.

In [1]:
import numpy as np

import tensorflow as tf
import operator
from tensorflow import keras
import keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Reshape, Lambda
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import plot_model
from tensorflow.keras.preprocessing import sequence
from sklearn.metrics.pairwise import cosine_distances

from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors as nn
from matplotlib import pylab
import pandas as pd

In [2]:
# Load the BBC articles; the CSV provides a 'category' and a 'text' column.
df = pd.read_csv('bbc-text.csv')
print(df)

# Raw article texts, one string per row; these are split into sentences below.
articles = list(df['text'])

           category                                               text
0              tech  tv future in the hands of viewers with home th...
1          business  worldcom boss  left books alone  former worldc...
2             sport  tigers wary of farrell  gamble  leicester say ...
3             sport  yeading face newcastle in fa cup premiership s...
4     entertainment  ocean s twelve raids box office ocean s twelve...
...             ...                                                ...
2220       business  cars pull down us retail figures us retail sal...
2221       politics  kilroy unveils immigration policy ex-chatshow ...
2222  entertainment  rem announce new glasgow concert us band rem h...
2223       politics  how political squabbles snowball it s become c...
2224          sport  souness delight at euro progress boss graeme s...

[2225 rows x 2 columns]


## Skip Gram

In [3]:
%%time

# Split the first 200 articles into rough sentences on '.'
sentences = []
for article in articles[:200]:
    sentences.extend(article.split('.'))

# Keep only sentences that contain at least 5 spaces (about six words);
# shorter fragments are dropped.
corpus = [sentence for sentence in sentences if sentence.count(" ") >= 5]

# Remove punctuation, tabs, newlines and apostrophes, then fit the tokenizer
# on the entire corpus. The backslash is written as '\\' because '\]' is not
# a valid escape sequence in a regular string literal (the resulting filter
# string is unchanged).
tokenizer = Tokenizer(filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n' + "'")
tokenizer.fit_on_texts(corpus)

# Convert each sentence to a sequence of integer word ids
corpus = tokenizer.texts_to_sequences(corpus)
n_samples = sum(len(s) for s in corpus)  # Total number of tokens in the corpus
# Number of unique words + 1: word ids start at 1, so id 0 never denotes a
# real word (generate_data_cbow uses it as padding).
V = len(tokenizer.word_index) + 1

CPU times: user 48.9 ms, sys: 1.22 ms, total: 50.2 ms
Wall time: 49.7 ms


In [4]:
# Total token count and V (number of unique words + 1)
n_samples, V

(74390, 9322)

In [5]:
# Example of what the word-to-integer mapping in the tokenizer looks like
# (first five entries of word_index)
print(list((tokenizer.word_index.items()))[:5])

[('the', 1), ('to', 2), ('of', 3), ('and', 4), ('a', 5)]


In [6]:

# Parameters
window_size = 2  # number of context positions taken on each side of a target word
window_size_corpus = 4  # NOTE(review): not referenced by any other visible cell

# Set seeds for more repeatable results. np.random.seed only seeds NumPy;
# TensorFlow has its own random generator (used by the Keras models built
# below), so it is seeded as well.
np.random.seed(42)
tf.random.set_seed(42)


In [7]:

# Prepare data for the skipgram model
# The function returns two arrays: all_in, which contains the target words, and 
# all_out, which contains the corresponding one-hot encoded context words.

def generate_data_skipgram(corpus, window_size, V):
    """Build (target, context) training pairs for the skip-gram model.

    Every word of every sentence is paired with each word located at most
    ``window_size`` positions to its left or right within the same sentence.

    Args:
        corpus: iterable of sentences, each a sequence of integer word ids.
        window_size: number of positions considered on each side of a target.
        V: width of the one-hot encoding; every word id must be smaller than V.

    Returns:
        Tuple ``(all_in, all_out)``. ``all_in`` is a 1-D array holding the
        target word id of each pair. ``all_out`` is a float32 array of shape
        ``(n_pairs, V)`` whose rows are the one-hot encoded context words.
    """
    all_in = []
    context_ids = []
    for words in corpus:
        L = len(words)
        for index, word in enumerate(words):
            # Clamp the window to the sentence boundaries so that only valid
            # positions are visited.
            first = max(0, index - window_size)
            last = min(L, index + window_size + 1)
            for i in range(first, last):
                if i != index:
                    all_in.append(word)
                    context_ids.append(words[i])

    # Fill one preallocated matrix instead of creating a separate one-hot
    # array per pair and copying them all into a new array at the end.
    context_ids = np.asarray(context_ids, dtype=np.intp)
    all_out = np.zeros((len(context_ids), V), dtype=np.float32)
    all_out[np.arange(len(context_ids)), context_ids] = 1.0

    return (np.array(all_in), all_out)

In [8]:
%%time

# Create training data: X_skip holds the target word ids and y_skip the
# one-hot encoded context words (one row per (target, context) pair).
X_skip, y_skip = generate_data_skipgram(corpus, window_size, V)
X_skip.shape, y_skip.shape

CPU times: user 1.67 s, sys: 3.28 s, total: 4.95 s
Wall time: 6.2 s


((275978,), (275978, 9322))

In [9]:
# Peek at the first 13 pairs: target word ids, then their one-hot context rows
print(X_skip[0:13])
print(y_skip[0:13])

[208 208 407 407 407   6   6   6   6   1   1   1   1]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [10]:
%%time

# Skip-gram network: embedding lookup -> drop sequence axis -> softmax over V

dim = 300

skipgram = Sequential([
    # Maps a single word id to a dense vector of size `dim`
    Embedding(input_dim=V,
              output_dim=dim,
              input_length=1,
              embeddings_initializer='glorot_uniform'),
    # Reshape the embedding output from (1, dim) to (dim,)
    Reshape((dim, )),
    # One softmax output per vocabulary index
    Dense(V, activation='softmax', kernel_initializer='glorot_uniform'),
])

# Categorical cross-entropy against the one-hot context word, optimised with
# Adam (an earlier note here said the paper used Adagrad).
skipgram.compile(loss='categorical_crossentropy',
                 optimizer=keras.optimizers.Adam(),
                 metrics=['accuracy'])

skipgram.summary()
print("")

# The training and export cells iterate over this list of models
skipgram_models = [skipgram]



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 300)            2796600   
                                                                 
 reshape (Reshape)           (None, 300)               0         
                                                                 
 dense (Dense)               (None, 9322)              2805922   
                                                                 
Total params: 5,602,522
Trainable params: 5,602,522
Non-trainable params: 0
_________________________________________________________________

CPU times: user 53.5 ms, sys: 32.2 ms, total: 85.7 ms
Wall time: 54.1 ms


In [11]:
%%time


# Train every model in skipgram_models on the (target id, one-hot context)
# pairs: 10 epochs with mini-batches of 64 samples.
for skipgram in skipgram_models:
    skipgram.fit(X_skip, y_skip, batch_size=64, epochs=10, verbose=1)
    print("")

Epoch 1/10


2023-11-28 05:27:27.713762: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

CPU times: user 39min 43s, sys: 10min 50s, total: 50min 34s
Wall time: 9min 56s


In [12]:

for skipgram in skipgram_models:
    # Save the embeddings learned by each skip-gram model to a text file
    weights = skipgram.get_weights()

    # Get the embedding matrix (first weight array: one row per word id)
    embedding = weights[0]

    # Create columns for the words and the values in the matrix, makes it easier to read as dataframe
    columns = ["word"] + [f"value_{j+1}" for j in range(embedding.shape[1])]

    # Open with an explicit UTF-8 encoding (the file is read back as UTF-8
    # further down) and a context manager so the handle is always closed,
    # even if writing fails part-way.
    with open(f"vectors_skipgram_{embedding.shape[1]}.txt", "w", encoding="utf-8") as f:
        # Header line with the column names
        f.write(" ".join(columns))
        f.write("\n")

        # One line per vocabulary word: the word followed by its vector
        for word, i in tokenizer.word_index.items():
            f.write(word)
            f.write(" ")
            f.write(" ".join(map(str, embedding[i, :])))
            f.write("\n")

In [13]:
# Embedding matrix of the trained skip-gram model (first entry of get_weights())
skipgram.get_weights()[0]

array([[-0.00779338,  0.01972531,  0.01419954, ..., -0.02101514,
         0.01292091,  0.02110568],
       [-0.09163388, -0.02635568,  0.07773658, ...,  0.01926036,
        -0.03169642,  0.1490469 ],
       [ 0.16329095, -0.20949455, -0.10299218, ...,  0.18321548,
        -0.05173499,  0.0817973 ],
       ...,
       [ 0.37874895, -0.01725492,  0.3530954 , ..., -0.06208007,
        -0.08999639,  0.09179325],
       [ 0.01896744, -0.25777036, -0.20103447, ..., -0.03838814,
         0.0492724 ,  0.22370675],
       [-0.08725878,  0.01326364,  0.20278992, ..., -0.5401249 ,
         0.01658141,  0.24005595]], dtype=float32)

In [14]:
# Number of weight arrays held by the model
len(skipgram.get_weights())

3

In [15]:
# Number of rows in the embedding matrix (the Embedding layer's input_dim, V)
len(skipgram.get_weights()[0])

9322

In [16]:
# Length of a single embedding vector (the Embedding layer's output_dim, dim)
len(skipgram.get_weights()[0][0])

300

In [17]:
# Embedding vector stored at row 1, i.e. for the word with id 1 in tokenizer.word_index
skipgram.get_weights()[0][1]

array([-0.09163388, -0.02635568,  0.07773658,  0.08947439,  0.15617695,
       -0.15438677, -0.11931635,  0.14262363, -0.08654329,  0.14073399,
       -0.11981915,  0.07665411, -0.00315618,  0.02960972,  0.04841485,
        0.12730306, -0.02558832, -0.06607738, -0.16928354,  0.04165031,
        0.21741606, -0.0013554 , -0.0768519 , -0.04429504, -0.08227623,
        0.09248411, -0.13111536, -0.09208474,  0.35656106, -0.06911632,
        0.02089642,  0.04338039, -0.13249621, -0.03986635, -0.17646879,
       -0.07252575, -0.13664111, -0.07110459, -0.07233532,  0.04831076,
       -0.03079399, -0.04340337,  0.11197045,  0.0440909 ,  0.11237387,
       -0.06821577, -0.0976437 ,  0.06544045, -0.0836357 ,  0.12521996,
       -0.21540208, -0.17066208, -0.07709809,  0.06175427, -0.13189538,
        0.11937968,  0.07496584,  0.03332197, -0.08093476, -0.14861879,
        0.10410377,  0.11623706, -0.0116831 , -0.12481596,  0.06477566,
        0.13426732,  0.1289558 ,  0.0925407 ,  0.04530833,  0.12

To get the word embedding:

In [18]:
# Integer id of 'king'; this lookup raises KeyError if the word is not in the vocabulary
index = tokenizer.word_index['king']

In [19]:
# Row `index` of the embedding matrix is the learned vector for 'king'
skipgram.get_weights()[0][index]

array([ 1.27446085e-01,  9.77579132e-02, -2.30674505e-01, -5.93292229e-02,
       -1.31638408e-01, -5.27725637e-01, -6.47630811e-01, -6.72960877e-02,
       -2.58396268e-02,  1.60156917e-02,  7.50044659e-02, -2.78036743e-01,
       -1.02429293e-01, -3.28077190e-02,  4.93224487e-02, -2.48573691e-01,
       -2.88832426e-01, -5.05673289e-01, -1.46271408e-01,  6.91479594e-02,
       -2.97959328e-01, -8.52769464e-02, -6.10925630e-02, -2.45853797e-01,
        1.58908412e-01,  4.77355540e-01,  1.39180496e-01,  7.22390227e-03,
        9.57334414e-02, -4.00800584e-03, -4.19922590e-01,  2.61898398e-01,
        2.37221509e-01,  1.05325691e-03,  2.73265511e-01,  2.07189262e-01,
        5.26713729e-01, -1.86639935e-01, -2.68109739e-01, -1.41160205e-01,
       -7.61407688e-02, -1.87115476e-01,  1.74072847e-01,  1.15389414e-01,
       -1.74125761e-01, -3.69330138e-01,  1.39214039e-01, -7.86893591e-02,
       -4.91450191e-01,  1.48090109e-01,  2.78602894e-02,  6.70169294e-02,
        2.70318270e-01, -

In [20]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Read the saved skip-gram vectors back into a dictionary whose keys are words
# and whose values are their embedding vectors, then list the k words whose
# vectors are most similar (by cosine similarity) to the target word.

target_word = "king"
k = 10  # Number of nearest words you want to find

word_vectors = {}
with open("vectors_skipgram_300.txt", "r", encoding="utf-8") as file:
    next(file, None)  # skip the header line with the column names
    for line in file:
        parts = line.split()
        if not parts:
            continue  # ignore blank lines
        word_vectors[parts[0]] = np.array([float(x) for x in parts[1:]])

if target_word not in word_vectors:
    raise KeyError(f"'{target_word}' is not in the vocabulary")
target_vector = word_vectors[target_word]

# Calculate the cosine similarity with all words in the vocabulary using a
# single call on the stacked vectors rather than one call per word.
vocab_words = list(word_vectors)
vocab_matrix = np.vstack([word_vectors[w] for w in vocab_words])
scores = cosine_similarity([target_vector], vocab_matrix)[0]
similarities = {
    word: score for word, score in zip(vocab_words, scores) if word != target_word
}

# Sort the words by their cosine similarity scores in descending order
sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

# Select the top-k (word, similarity) pairs as the k-nearest words
nearest_words = sorted_similarities[:k]

# Print the k-nearest words
print(f"The {k} nearest words to '{target_word}' are: ")
for entry in nearest_words:
    print(entry)


# Keep the skip-gram vectors under their own name for the comparison cells
skipgram_word_emd = word_vectors

The 10 nearest words to 'king' are: 
('tote', 0.34790592104473206)
('adaptation', 0.34316849037173064)
('name', 0.32730016328623873)
('macfarlane', 0.3272542045187647)
('governor', 0.3254736139367036)
('mervyn', 0.3173878787141464)
('charles', 0.31286011151883036)
('dead', 0.29567151691208293)
('comeback', 0.2950116375327063)
('duke', 0.29199742666194783)


## CBOW

In [21]:

from keras.preprocessing import sequence

# The function returns two arrays: all_in, which contains the context words, 
# and all_out, which contains the corresponding one-hot encoded target words.

def generate_data_cbow(corpus, window_size, V):
    """Build (context, target) training samples for the CBOW model.

    For every word of every sentence the context consists of the
    ``window_size`` positions on each side of it; positions that fall outside
    the sentence are padded with 0.

    Args:
        corpus: iterable of sentences, each a sequence of integer word ids.
        window_size: number of context positions on each side of the target.
        V: width of the one-hot encoding; every word id must be smaller than V.

    Returns:
        Tuple ``(all_in, all_out)``. ``all_in`` has one row of
        ``2 * window_size`` context word ids per target word. ``all_out`` is a
        float32 array of shape ``(n_words, V)`` whose rows are the one-hot
        encoded target words.
    """
    all_in = []
    target_ids = []

    # Iterate over all sentences
    for sentence in corpus:
        L = len(sentence)
        for index, word in enumerate(sentence):
            # Context ids around the target, skipping the target itself and
            # padding with zero where the window leaves the sentence.
            context_words = [
                sentence[i] if 0 <= i < L else 0
                for i in range(index - window_size, index + window_size + 1)
                if i != index
            ]
            all_in.append(context_words)
            target_ids.append(word)

    # Fill one preallocated matrix instead of creating a separate one-hot
    # array per target and copying them all into a new array at the end.
    target_ids = np.asarray(target_ids, dtype=np.intp)
    all_out = np.zeros((len(target_ids), V), dtype=np.float32)
    all_out[np.arange(len(target_ids)), target_ids] = 1.0

    return (np.array(all_in), all_out)

In [22]:
%%time


# Create the training data: X_cbow holds the (zero-padded) context word ids
# and y_cbow the one-hot encoded target word for each position in the corpus.
X_cbow, y_cbow = generate_data_cbow(corpus, window_size, V)
X_cbow.shape, y_cbow.shape

CPU times: user 538 ms, sys: 986 ms, total: 1.52 s
Wall time: 1.92 s


((74390, 4), (74390, 9322))

In [23]:
# Peek at the first 10 samples: context word ids, then their one-hot target rows
print(X_cbow[:10])
print(y_cbow[:10])


[[   0    0  407    6]
 [   0  208    6    1]
 [ 208  407    1 1865]
 [ 407    6 1865    3]
 [   6    1    3  785]
 [   1 1865  785   17]
 [1865    3   17  160]
 [   3  785  160  843]
 [ 785   17  843 1605]
 [  17  160 1605 5047]]
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [24]:
%%time

# Create the CBOW architecture
cbow_models = []
dim = 300
cbow = Sequential()

# Embedding layer: each of the 2 * window_size context word ids of a sample is
# mapped to a vector of size `dim`. No masking is configured, so padded
# positions (id 0) get an embedding like any other id.
cbow.add(Embedding(input_dim=V,
                    output_dim=dim,
                    input_length=window_size*2,
                    embeddings_initializer='glorot_uniform'))

# Average the context vectors over the sequence axis: (2 * window_size, dim) -> (dim,).
# tf.reduce_mean is used instead of keras.backend.mean so the layer relies only
# on the TensorFlow package that the other layers are imported from.
cbow.add(Lambda(lambda x: tf.reduce_mean(x, axis=1), output_shape=(dim, )))

# Softmax over the vocabulary to predict the target word
cbow.add(Dense(V, activation='softmax', kernel_initializer='glorot_uniform'))

# Categorical cross-entropy against the one-hot target word, optimised with Adam
cbow.compile(optimizer=keras.optimizers.Adam(),
                loss='categorical_crossentropy',
                metrics=['accuracy'])

cbow.summary()
print("")
cbow_models.append(cbow)



Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 4, 300)            2796600   
                                                                 
 lambda (Lambda)             (None, 300)               0         
                                                                 
 dense_1 (Dense)             (None, 9322)              2805922   
                                                                 
Total params: 5,602,522
Trainable params: 5,602,522
Non-trainable params: 0
_________________________________________________________________

CPU times: user 43.2 ms, sys: 25.1 ms, total: 68.3 ms
Wall time: 40.9 ms


In [25]:
%%time

# Train every model in cbow_models on the (context ids, one-hot target)
# samples: 50 epochs with mini-batches of 64 samples.
for cbow in cbow_models:
    cbow.fit(X_cbow, y_cbow, batch_size=64, epochs=50, verbose=1)
    print("")


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50

CPU times: user 53min 18s, sys: 14min 21s, total: 1h 7min 40s
Wall time: 13min 29s


In [26]:

for cbow in cbow_models:
    # Save the embeddings learned by each CBOW model to a text file
    weights = cbow.get_weights()

    # Get the embedding matrix (first weight array: one row per word id)
    embedding = weights[0]

    # Create columns for the words and the values in the matrix, makes it easier to read as dataframe
    columns = ["word"] + [f"value_{j+1}" for j in range(embedding.shape[1])]

    # Open with an explicit UTF-8 encoding (the file is read back as UTF-8
    # further down) and a context manager so the handle is always closed,
    # even if writing fails part-way.
    with open(f'vectors_cbow_{embedding.shape[1]}.txt', 'w', encoding='utf-8') as f:
        # Header line with the column names
        f.write(" ".join(columns))
        f.write("\n")

        # One line per vocabulary word: the word followed by its vector
        for word, i in tokenizer.word_index.items():
            f.write(word)
            f.write(" ")
            f.write(" ".join(map(str, embedding[i, :])))
            f.write("\n")

In [27]:
from sklearn.metrics.pairwise import cosine_similarity

# Read the saved CBOW vectors back into a dictionary whose keys are words and
# whose values are their embedding vectors, then list the k words whose
# vectors are most similar (by cosine similarity) to the target word.

target_word = "king"
k = 10  # Number of nearest words you want to find

word_vectors = {}
with open("vectors_cbow_300.txt", "r", encoding="utf-8") as file:
    next(file, None)  # skip the header line with the column names
    for line in file:
        parts = line.split()
        if not parts:
            continue  # ignore blank lines
        word_vectors[parts[0]] = np.array([float(x) for x in parts[1:]])

if target_word not in word_vectors:
    raise KeyError(f"'{target_word}' is not in the vocabulary")
target_vector = word_vectors[target_word]

# Calculate the cosine similarity with all words in the vocabulary using a
# single call on the stacked vectors rather than one call per word.
vocab_words = list(word_vectors)
vocab_matrix = np.vstack([word_vectors[w] for w in vocab_words])
scores = cosine_similarity([target_vector], vocab_matrix)[0]
similarities = {
    word: score for word, score in zip(vocab_words, scores) if word != target_word
}

# Sort the words by their cosine similarity scores in descending order
sorted_similarities = sorted(similarities.items(), key=lambda x: x[1], reverse=True)

# Select the top-k (word, similarity) pairs as the k-nearest words
nearest_words = sorted_similarities[:k]

# Print the k-nearest words
print(f"The {k} nearest words to '{target_word}' are: ")
for entry in nearest_words:
    print(entry)


# Keep the CBOW vectors under their own name for the comparison cells
cbow_word_emd = word_vectors

The 10 nearest words to 'king' are: 
('wishes', 0.3323208988812061)
('pops', 0.31664041545272287)
('prince', 0.3159132598837587)
('flagship', 0.3070605956052026)
('breathtaking', 0.2986077509119104)
('astounded', 0.29825930003995144)
('understands', 0.29567787820537805)
('rub', 0.2925689882841785)
('heineken', 0.29195099642702327)
('oversee', 0.2893588965674602)


To compare the skip-gram and CBOW word embeddings:

In [28]:
# Number of words loaded from the skip-gram and the CBOW vectors file
len(skipgram_word_emd),len(cbow_word_emd)

(9321, 9321)

In [31]:
# Cosine similarity between the skip-gram and the CBOW vector of 'king'.
# NOTE(review): the two models are trained separately, so their embedding
# spaces are presumably not aligned; a value near 0 here does not by itself
# indicate a problem with either model.
cosine_similarity([skipgram_word_emd['king']], [cbow_word_emd['king']])

array([[-0.09209947]])

In [32]:
# Cosine similarity between the skip-gram and the CBOW vector of 'queen'.
# NOTE(review): cross-model comparison; the two embedding spaces are
# presumably not aligned, so interpret with care.
cosine_similarity([skipgram_word_emd['queen']], [cbow_word_emd['queen']])

array([[0.03437119]])

In [33]:
# Cosine similarity between 'king' and 'queen' within the skip-gram embeddings
cosine_similarity([skipgram_word_emd['king']], [skipgram_word_emd['queen']])

array([[0.24368201]])

In [34]:
# Cosine similarity between 'king' and 'queen' within the CBOW embeddings
cosine_similarity([cbow_word_emd['king']], [cbow_word_emd['queen']])

array([[0.26642819]])