<a href="https://colab.research.google.com/github/inderpreetsingh01/ml_machine_coding/blob/main/Word2vec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import re
from collections import Counter

class Word2Vec:
    """Skip-gram word embeddings trained against a full softmax output layer.

    ``W1`` has one row per vocabulary word (the embedding that
    ``get_vector`` returns); ``W2`` maps an embedding to one score per
    vocabulary word. Training is per-pair stochastic gradient descent.
    """

    def __init__(self, vocab_size, embed_size=50, window=2, negative_samples=5, lr=0.01, epochs=10):
        """Record the hyper-parameters and draw small random initial weights.

        ``negative_samples`` is stored for interface parity only; no method
        of this class reads it.
        """
        self.vocab_size, self.embed_size = vocab_size, embed_size
        self.window, self.negative_samples = window, negative_samples
        self.lr, self.epochs = lr, epochs

        # Scaled standard-normal initialisation; W1 is drawn before W2.
        input_shape = (vocab_size, embed_size)
        output_shape = (embed_size, vocab_size)
        self.W1 = np.random.randn(*input_shape) * 0.01  # input -> hidden
        self.W2 = np.random.randn(*output_shape) * 0.01  # hidden -> output

    def _softmax(self, x):
        """Return the softmax of ``x`` taken along axis 0."""
        # Subtracting the maximum keeps np.exp from overflowing.
        exps = np.exp(x - np.max(x))
        normaliser = exps.sum(axis=0)
        return exps / normaliser

    def generate_training_data(self, corpus, word_to_idx):
        """Build (center_index, context_index) skip-gram pairs.

        ``corpus`` is an iterable of token sequences. For each token, every
        other position within ``self.window`` steps on either side (clipped
        to the sentence) yields one pair, in left-to-right order.
        """
        pairs = []
        for tokens in corpus:
            length = len(tokens)
            for pos, token in enumerate(tokens):
                center_idx = word_to_idx[token]
                first = max(0, pos - self.window)
                stop = min(length, pos + self.window + 1)
                for ctx_pos in range(first, stop):
                    if ctx_pos == pos:
                        # A word is never its own context.
                        continue
                    pairs.append((center_idx, word_to_idx[tokens[ctx_pos]]))
        return pairs

    def train(self, training_pairs):
        """Run ``self.epochs`` SGD passes over ``training_pairs``.

        Each pair contributes a cross-entropy term; the per-epoch sum is
        printed once the pass is finished. Weights are updated in place.
        """
        for epoch in range(self.epochs):
            loss = 0
            for center, context in training_pairs:
                # Forward: embedding -> vocabulary scores -> probabilities.
                h = self.W1[center]
                y_pred = self._softmax(h @ self.W2)

                loss -= np.log(y_pred[context] + 1e-9)

                # Prediction minus one-hot target: only the true context
                # entry differs from the predicted distribution.
                e = y_pred.copy()
                e[context] -= 1.0

                # Both gradients are formed before either matrix changes.
                dW2 = np.outer(h, e)
                dW1 = self.W2 @ e

                self.W1[center] -= self.lr * dW1
                self.W2 -= self.lr * dW2

            print(f"Epoch {epoch+1}/{self.epochs}, Loss={loss:.4f}")

    def get_vector(self, word, word_to_idx):
        """Return the ``W1`` row (input embedding) for ``word``."""
        row = word_to_idx[word]
        return self.W1[row]

In [2]:
# Toy corpus: three short sentences.
corpus = [
    "data science is fun",
    "machine learning is part of data science",
    "python makes machine learning easy"
]

# Lower-case each sentence and split it into word tokens.
corpus = [re.findall(r"\b\w+\b", text.lower()) for text in corpus]
words = [token for tokens in corpus for token in tokens]

# Sorted vocabulary, plus lookups in both directions.
vocab = sorted(set(words))
word_to_idx = dict(zip(vocab, range(len(vocab))))
idx_to_word = dict(enumerate(vocab))

# Build the model, derive the skip-gram pairs, and train.
w2v = Word2Vec(
    vocab_size=len(vocab),
    embed_size=10,
    window=2,
    lr=0.05,
    epochs=50,
)
training_pairs = w2v.generate_training_data(corpus, word_to_idx)
w2v.train(training_pairs)

# Show the learned embeddings for two words.
print("Embedding for 'data':", w2v.get_vector("data", word_to_idx))
print("Embedding for 'science':", w2v.get_vector("science", word_to_idx))

Epoch 1/50, Loss=110.3050
Epoch 2/50, Loss=110.3009
Epoch 3/50, Loss=110.2965
Epoch 4/50, Loss=110.2917
Epoch 5/50, Loss=110.2860
Epoch 6/50, Loss=110.2789
Epoch 7/50, Loss=110.2698
Epoch 8/50, Loss=110.2580
Epoch 9/50, Loss=110.2425
Epoch 10/50, Loss=110.2218
Epoch 11/50, Loss=110.1942
Epoch 12/50, Loss=110.1570
Epoch 13/50, Loss=110.1069
Epoch 14/50, Loss=110.0393
Epoch 15/50, Loss=109.9480
Epoch 16/50, Loss=109.8248
Epoch 17/50, Loss=109.6590
Epoch 18/50, Loss=109.4364
Epoch 19/50, Loss=109.1396
Epoch 20/50, Loss=108.7470
Epoch 21/50, Loss=108.2338
Epoch 22/50, Loss=107.5740
Epoch 23/50, Loss=106.7438
Epoch 24/50, Loss=105.7280
Epoch 25/50, Loss=104.5271
Epoch 26/50, Loss=103.1647
Epoch 27/50, Loss=101.6883
Epoch 28/50, Loss=100.1627
Epoch 29/50, Loss=98.6558
Epoch 30/50, Loss=97.2221
Epoch 31/50, Loss=95.8937
Epoch 32/50, Loss=94.6794
Epoch 33/50, Loss=93.5715
Epoch 34/50, Loss=92.5536
Epoch 35/50, Loss=91.6070
Epoch 36/50, Loss=90.7145
Epoch 37/50, Loss=89.8619
Epoch 38/50, Loss=8

In [3]:
# Skip-gram Word2Vec trained with negative sampling instead of a full softmax

In [4]:
import numpy as np
import re
from collections import Counter
import random

class Word2VecNS:
    """Skip-gram word embeddings trained with negative sampling.

    ``W1`` has one row per vocabulary word (the embedding that
    ``get_vector`` returns); column ``j`` of ``W2`` is the output vector of
    word ``j``. Each training pair updates the output vector of the true
    context word and of ``negative_samples`` randomly drawn words.
    """

    def __init__(self, vocab_size, embed_size=50, window=2, negative_samples=5, lr=0.01, epochs=10):
        """Store the hyper-parameters and draw small random initial weights."""
        self.vocab_size = vocab_size
        self.embed_size = embed_size
        self.window = window
        self.negative_samples = negative_samples
        self.lr = lr
        self.epochs = epochs

        # Initialize embeddings
        self.W1 = np.random.randn(vocab_size, embed_size) * 0.01  # input -> hidden
        self.W2 = np.random.randn(embed_size, vocab_size) * 0.01  # hidden -> output

    def _sigmoid(self, x):
        """Return the logistic function of ``x``.

        The input is clipped to [-500, 500] so ``np.exp`` cannot overflow;
        inside that range the result is unchanged.
        """
        return 1 / (1 + np.exp(-np.clip(x, -500, 500)))

    def generate_training_data(self, corpus, word_to_idx):
        """Generate skip-gram (center, context) index pairs.

        ``corpus`` is an iterable of token sequences. For each token, every
        other position within ``self.window`` steps on either side (clipped
        to the sentence) yields one pair.
        """
        pairs = []
        for sentence in corpus:
            for i, word in enumerate(sentence):
                center = word_to_idx[word]
                context_range = range(max(0, i - self.window), min(len(sentence), i + self.window + 1))
                for j in context_range:
                    if i != j:
                        context = word_to_idx[sentence[j]]
                        pairs.append((center, context))
        return pairs

    def _get_negative_samples(self, true_idx):
        """Draw ``self.negative_samples`` word indices different from ``true_idx``.

        Indices are drawn uniformly with ``random.randint``; repeats are
        allowed, and the center word is not excluded.

        Raises:
            ValueError: if samples are requested but the vocabulary has
                fewer than two words, so no alternative to ``true_idx``
                can exist (rejection sampling would never terminate).
        """
        if self.negative_samples <= 0:
            return []
        if self.vocab_size < 2:
            raise ValueError(
                "negative sampling requires a vocabulary of at least 2 words"
            )
        negatives = []
        while len(negatives) < self.negative_samples:
            neg = random.randint(0, self.vocab_size - 1)
            if neg != true_idx:
                negatives.append(neg)
        return negatives

    def train(self, training_pairs):
        """Run ``self.epochs`` SGD passes over ``training_pairs``.

        For every (center, context) pair the true context word is pushed
        towards label 1 and each sampled negative towards label 0. The
        summed loss of the epoch is printed after each pass. Weights are
        updated in place.
        """
        for epoch in range(self.epochs):
            total_loss = 0
            for center, context in training_pairs:
                # Copy the row: plain indexing returns a view of W1, and
                # every update below must use the embedding as it was at
                # the start of this pair, not a partially updated one.
                h = self.W1[center].copy()
                # Gradient for the center embedding, applied once at the end.
                grad_h = np.zeros_like(h)

                # Positive sample (true context), target = 1
                u_pos = np.dot(h, self.W2[:, context])
                p_pos = self._sigmoid(u_pos)
                loss_pos = -np.log(p_pos + 1e-9)

                grad_pos = self.lr * (1 - p_pos)
                # The right-hand side is evaluated before W2 is modified.
                grad_h += grad_pos * self.W2[:, context]
                self.W2[:, context] += grad_pos * h

                # Negative samples, target = 0
                negatives = self._get_negative_samples(context)
                loss_neg = 0
                for neg in negatives:
                    u_neg = np.dot(h, self.W2[:, neg])
                    p_neg = self._sigmoid(-u_neg)  # probability of label 0
                    loss_neg += -np.log(p_neg + 1e-9)

                    grad_neg = self.lr * (0 - self._sigmoid(u_neg))
                    grad_h += grad_neg * self.W2[:, neg]
                    self.W2[:, neg] += grad_neg * h

                self.W1[center] += grad_h
                total_loss += (loss_pos + loss_neg)

            print(f"Epoch {epoch+1}/{self.epochs}, Loss={total_loss:.4f}")

    def get_vector(self, word, word_to_idx):
        """Return the ``W1`` row (input embedding) for ``word``."""
        return self.W1[word_to_idx[word]]

    def similarity(self, word1, word2, word_to_idx):
        """Return the cosine similarity of the embeddings of two words.

        Returns 0.0 when either embedding has zero norm, instead of
        dividing by zero.
        """
        v1 = self.get_vector(word1, word_to_idx)
        v2 = self.get_vector(word2, word_to_idx)
        denom = np.linalg.norm(v1) * np.linalg.norm(v2)
        if denom == 0:
            return 0.0
        return np.dot(v1, v2) / denom

In [5]:
# Toy corpus: three short sentences.
corpus = [
    "data science is fun",
    "machine learning is part of data science",
    "python makes machine learning easy"
]

# Lower-case each sentence and split it into word tokens.
corpus = [re.findall(r"\b\w+\b", text.lower()) for text in corpus]
words = [token for tokens in corpus for token in tokens]

# Sorted vocabulary, plus lookups in both directions.
vocab = sorted(set(words))
word_to_idx = dict(zip(vocab, range(len(vocab))))
idx_to_word = dict(enumerate(vocab))

# Build the negative-sampling model, derive the skip-gram pairs, and train.
w2v = Word2VecNS(
    vocab_size=len(vocab),
    embed_size=10,
    window=2,
    negative_samples=3,
    lr=0.05,
    epochs=50,
)
training_pairs = w2v.generate_training_data(corpus, word_to_idx)
w2v.train(training_pairs)

# Show one embedding and two cosine similarities.
print("Vector for 'data':", w2v.get_vector("data", word_to_idx))
print("Similarity(data, science):", w2v.similarity("data", "science", word_to_idx))
print("Similarity(machine, python):", w2v.similarity("machine", "python", word_to_idx))

Epoch 1/50, Loss=127.5374
Epoch 2/50, Loss=127.5320
Epoch 3/50, Loss=127.5233
Epoch 4/50, Loss=127.5142
Epoch 5/50, Loss=127.5028
Epoch 6/50, Loss=127.4939
Epoch 7/50, Loss=127.4603
Epoch 8/50, Loss=127.4186
Epoch 9/50, Loss=127.3511
Epoch 10/50, Loss=127.2475
Epoch 11/50, Loss=127.0989
Epoch 12/50, Loss=126.8821
Epoch 13/50, Loss=126.5178
Epoch 14/50, Loss=125.8956
Epoch 15/50, Loss=125.1592
Epoch 16/50, Loss=123.8950
Epoch 17/50, Loss=121.9493
Epoch 18/50, Loss=119.7874
Epoch 19/50, Loss=116.9568
Epoch 20/50, Loss=114.4596
Epoch 21/50, Loss=110.9105
Epoch 22/50, Loss=108.1127
Epoch 23/50, Loss=105.3525
Epoch 24/50, Loss=103.6663
Epoch 25/50, Loss=102.7386
Epoch 26/50, Loss=101.4306
Epoch 27/50, Loss=100.0268
Epoch 28/50, Loss=98.4538
Epoch 29/50, Loss=98.5548
Epoch 30/50, Loss=99.5451
Epoch 31/50, Loss=97.3730
Epoch 32/50, Loss=97.0366
Epoch 33/50, Loss=94.6951
Epoch 34/50, Loss=95.3362
Epoch 35/50, Loss=94.5077
Epoch 36/50, Loss=92.3070
Epoch 37/50, Loss=88.9826
Epoch 38/50, Loss=90