# Word embeding using fasttext

Input data

In [15]:
sentences = [
        "the quick brown fox jumps over the lazy dog",
        "the fast brown cat jumps over the sleeping dog",
        "a quick fox runs fast",
    ]

sentences

['the quick brown fox jumps over the lazy dog',
 'the fast brown cat jumps over the sleeping dog',
 'a quick fox runs fast']

### Preprocess the text and generate vocabulary

In [16]:
def build_vocabulary(sentences):
    """
    Build a vocabulary from the given sentences.
    """
    vocab = set()
    for sentence in sentences:
        words = sentence.split()
        
        vocab.update(words)
    return sorted(vocab)

vocab = build_vocabulary(sentences)
vocab

['a',
 'brown',
 'cat',
 'dog',
 'fast',
 'fox',
 'jumps',
 'lazy',
 'over',
 'quick',
 'runs',
 'sleeping',
 'the']

Fasttext generates n-grams which is used for capturing out of vocbulary words during prediction

In [17]:
def generate_n_grams(vocab, n_gram_size):
    n_grams_words = set()
    for word in vocab:
        temp = []
        for size in range(2, n_gram_size + 1):
            for i in range(len(word) - size + 1):  # Adjusted range to include valid n-grams
                n_gram = word[i:i + size]
                temp.append(n_gram)  # Append to the list of n-grams for the current word
                n_grams_words.add(n_gram)  # Add to the set of all n-grams
        print(f'N-Grams : {temp}')
    return n_grams_words

n_grams = generate_n_grams(vocab, 2)
print(len(n_grams))

N-Grams : []
N-Grams : ['br', 'ro', 'ow', 'wn']
N-Grams : ['ca', 'at']
N-Grams : ['do', 'og']
N-Grams : ['fa', 'as', 'st']
N-Grams : ['fo', 'ox']
N-Grams : ['ju', 'um', 'mp', 'ps']
N-Grams : ['la', 'az', 'zy']
N-Grams : ['ov', 've', 'er']
N-Grams : ['qu', 'ui', 'ic', 'ck']
N-Grams : ['ru', 'un', 'ns']
N-Grams : ['sl', 'le', 'ee', 'ep', 'pi', 'in', 'ng']
N-Grams : ['th', 'he']
39


Build vocabulary to index 

In [18]:
vocab.extend(n_grams)

word_to_index = {}
index_to_word = {}

for i , word in enumerate(vocab):
    word_to_index[word] = i
    index_to_word[i] = word

print(f'Vocabulary size : {len(word_to_index)}')

Vocabulary size : 52


### Train the model and generate word embedding
Skipgram approch is used for traning where the target add is used as input and context words as outputs

In [19]:
training_data = []

for index, sentence in enumerate(sentences):
    words = sentence.split()
    for i in range(len(words)):
        start = max(0,i -2)
        end = min(len(words),i + 2)
        for j in range( start,end):
            if j == i:
                continue
            training_data.append((words[i],words[j]))
            
training_data

[('the', 'quick'),
 ('quick', 'the'),
 ('quick', 'brown'),
 ('brown', 'the'),
 ('brown', 'quick'),
 ('brown', 'fox'),
 ('fox', 'quick'),
 ('fox', 'brown'),
 ('fox', 'jumps'),
 ('jumps', 'brown'),
 ('jumps', 'fox'),
 ('jumps', 'over'),
 ('over', 'fox'),
 ('over', 'jumps'),
 ('over', 'the'),
 ('the', 'jumps'),
 ('the', 'over'),
 ('the', 'lazy'),
 ('lazy', 'over'),
 ('lazy', 'the'),
 ('lazy', 'dog'),
 ('dog', 'the'),
 ('dog', 'lazy'),
 ('the', 'fast'),
 ('fast', 'the'),
 ('fast', 'brown'),
 ('brown', 'the'),
 ('brown', 'fast'),
 ('brown', 'cat'),
 ('cat', 'fast'),
 ('cat', 'brown'),
 ('cat', 'jumps'),
 ('jumps', 'brown'),
 ('jumps', 'cat'),
 ('jumps', 'over'),
 ('over', 'cat'),
 ('over', 'jumps'),
 ('over', 'the'),
 ('the', 'jumps'),
 ('the', 'over'),
 ('the', 'sleeping'),
 ('sleeping', 'over'),
 ('sleeping', 'the'),
 ('sleeping', 'dog'),
 ('dog', 'the'),
 ('dog', 'sleeping'),
 ('a', 'quick'),
 ('quick', 'a'),
 ('quick', 'fox'),
 ('fox', 'a'),
 ('fox', 'quick'),
 ('fox', 'runs'),
 ('runs'

Training and generating word embeddings without negative sampling

In [20]:
import numpy as np
import pandas as pd

# Hyperparameters
embedding_dim = 10
learning_rate = 0.001
epochs = 10000
vocab_size = len(word_to_index)

# Weight initialization
W1 = np.random.uniform(-1, 1, (vocab_size, embedding_dim))  # Input to hidden weights
W2 = np.random.uniform(-1, 1, (embedding_dim, vocab_size))  # Hidden to output weights

# One-hot encoding function
def one_hot_vector(word, word_to_index):
    one_hot = np.zeros(vocab_size)
    one_hot[word_to_index[word]] = 1
    return one_hot

# Training loop
for epoch in range(epochs):
    loss = 0
    for target, context in training_data:
        # Forward pass
        context_vectors = np.sum([one_hot_vector(target, word_to_index)], axis=0)
        h = np.dot(context_vectors, W1)  # Hidden layer
        u = np.dot(h, W2)  # Output layer
        y_pred = np.exp(u) / np.sum(np.exp(u)) # Softmax activation
        
        # Calculate loss (cross-entropy)
        target_one_hot = one_hot_vector(target, word_to_index)
        loss += -np.sum(target_one_hot * np.log(y_pred + 1e-8))

        # Backpropagation
        e = y_pred - target_one_hot
        dW2 = np.outer(h, e)
        dW1 = np.outer(context_vectors, np.dot(W2, e))

        # Update weights
        W1 -= learning_rate * dW1
        W2 -= learning_rate * dW2

    # Print loss every 1000 epochs
    if (epoch + 1) % 1000 == 0:
        print(f'Epoch {epoch + 1}, Loss: {loss:.4f}')

Epoch 1000, Loss: 1.8562
Epoch 2000, Loss: 0.6182
Epoch 3000, Loss: 0.3601
Epoch 4000, Loss: 0.2507
Epoch 5000, Loss: 0.1908
Epoch 6000, Loss: 0.1533
Epoch 7000, Loss: 0.1277
Epoch 8000, Loss: 0.1091
Epoch 9000, Loss: 0.0951
Epoch 10000, Loss: 0.0841


### Get similar words

In [21]:
 def get_word_vector(word):
        """Get vector representation of a word including its n-grams"""
        if word in word_to_index:
            word_idx = word_to_index[word]
            word_vec = W1[word_idx].copy()
        else:
            word_vec = np.zeros(embedding_dim)
        
        # Add n-gram vectors
        ngrams = generate_n_grams(word,2)
        for ngram in ngrams:
            if ngram in self.word_to_index:
                ngram_idx = word_to_index[ngram]
                word_vec += W1[ngram_idx]
        
        return word_vec / (len(ngrams) + 1)  # Average the vectors

def most_similar(word, top_n=5):
        """Find most similar words"""
        word_vec = get_word_vector(word)
        similarities = {}
        
        for other_word in word_to_index:
            other_vec = get_word_vector(other_word)
            similarity = np.dot(word_vec, other_vec) / (
                np.linalg.norm(word_vec) * np.linalg.norm(other_vec)
            )
            similarities[other_word] = similarity
        
        return sorted(similarities.items(), key=lambda x: x[1], reverse=True)[:top_n]

similar_words = most_similar("fox")
print("\nWords similar to 'fox':")
for word, similarity in similar_words:
    print(f"{word}: {similarity:.4f}")

N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []
N-Grams : []