# Training Data

##### Today we will be learning about the fundamentals of data science and statistics. 
##### Data Science and statistics are hot and growing fields with alternative names of machine learning, artificial intelligence, big data, etc. 
##### I'm really excited to talk to you about data science and statistics because data science and statistics have long been a passions of mine. 
##### I didn't used to be very good at data science and statistics but after studying data science and statistics for a long time, I got better and better at it until I became a data science and statistics expert. 
##### I'm really excited to talk to you about data science and statistics, thanks for listening to me talk about data science and statistics.

# GENSIM & SkipGram & CBOW

In [1]:
from gensim.models import Word2Vec
import torch
import torch.nn as nn
import numpy as np
import string

import torch.optim as optim


torch.manual_seed(42)
np.random.seed(42)



class TextProcessor:
    def __init__(self, stopwords_file, training_text_file):
        self.stopwords = self._load_stopwords(stopwords_file)
        self.corpus = self._process_text(training_text_file)
    
    def _load_stopwords(self, stopwords_file):
        with open(stopwords_file) as f:
            stopwords = f.read().replace('\n', ' ').split()
        return stopwords

    def _process_text(self, training_text_file):
        with open(training_text_file, encoding='utf-8') as f:
            text = f.read().replace('\n', '')
            text = text.translate(str.maketrans('', '', string.punctuation))
            text = ''.join([t for t in text if t not in list('0123456789')])
            text = text.replace('”', '').replace('“', '').replace('’', '').lower().split()
        return [w for w in text if w not in self.stopwords][:2000]

    def get_corpus(self):
        return self.corpus


class GensimWord2Vec:
    def __init__(self, corpus, embed_size, window_size, learning_rate, epochs):
        self.corpus = [corpus]
        self.embed_size = embed_size
        self.window_size = window_size
        self.learning_rate = learning_rate
        self.epochs = epochs

    def train(self):
        self.model = Word2Vec(sentences=self.corpus,
                              vector_size=self.embed_size,
                              window=self.window_size,
                              alpha=self.learning_rate,
                              epochs=self.epochs,
                              sg=1,  # Use skip-gram
                              workers=1)  # Single worker for reproducibility

    def get_word_vector(self, word):
        return self.model.wv[word]

class SkipGram(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(SkipGram, self).__init__()
        self.in_embed = nn.Embedding(vocab_size, embed_size)
        self.out_linear = nn.Linear(embed_size, vocab_size)
        
    def forward(self, target):
        in_vectors = self.in_embed(target)
        out_logits = self.out_linear(in_vectors)
        return out_logits
    
    def train_model(self, corpus, vocab, learning_rate, epochs):
        optimizer = optim.SGD(self.parameters(), lr=learning_rate)
        loss_fn = nn.CrossEntropyLoss()
        
        for epoch in range(epochs):
            for idx, word in enumerate(corpus[:-1]):  # -1 to avoid out of bounds
                target = torch.tensor([vocab[word]], dtype=torch.long)
                context_idx = vocab[corpus[idx + 1]]
                
                optimizer.zero_grad()
                
                predictions = self(target)
                
                loss = loss_fn(predictions, torch.tensor([context_idx]))
                loss.backward()
                optimizer.step()
    def get_word_vector(self, word, vocab):
        word_idx = torch.tensor([vocab[word]], dtype=torch.long)
        return self.in_embed(word_idx).detach().numpy()
    
class CBOW(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(CBOW, self).__init__()
        self.in_embed = nn.Embedding(vocab_size, embed_size)
        self.embed_size = embed_size

    def forward(self, context, window_size):
        # Get the embeddings of the context words
        in_vectors = self.in_embed(context)
        
        # Average the embeddings of the context words
        in_vectors_avg = torch.mean(in_vectors, dim=0, keepdim=True)
        
        # Pass the averaged embeddings through the linear layer
        out_logits = self.out_linear(in_vectors_avg)
        return out_logits

    def train_model(self, corpus, vocab, learning_rate, epochs, window_size=2):
        self.out_linear = nn.Linear(self.embed_size, len(vocab)) 
        
        optimizer = optim.SGD(self.parameters(), lr=learning_rate)
        loss_fn = nn.CrossEntropyLoss()
        
        for epoch in range(epochs):
            for idx, word in enumerate(corpus[window_size:-window_size]):  # Leave space for context words
                # Gather context words based on the window size
                context_words = corpus[idx-window_size:idx] + corpus[idx+1:idx+1+window_size]
                context_indices = [vocab[w] for w in context_words]
                
                # Convert to tensor
                context = torch.tensor(context_indices, dtype=torch.long)
                target_idx = vocab[word]
                
                optimizer.zero_grad()
                
                predictions = self(context, window_size)
                
                loss = loss_fn(predictions, torch.tensor([target_idx]))
                loss.backward()
                optimizer.step()

    def get_word_vector(self, word, vocab):
        word_idx = torch.tensor([vocab[word]], dtype=torch.long)
        return self.in_embed(word_idx).detach().numpy()


    

# Vocabulary builder
def build_vocabulary(corpus):
    vocab = {}
    for word in corpus:
        if word not in vocab:
            vocab[word] = len(vocab)
    return vocab






# Training

In [2]:

torch.manual_seed(42)
np.random.seed(42)

# Load and process the data
processor = TextProcessor('stopwords.txt', 'training_text.txt')
corpus = processor.get_corpus()

# Build the vocabulary
vocab = build_vocabulary(corpus)
vocab_size = len(vocab)

# Gensim Model
print("Training Gensim Model...")
modelGensim = GensimWord2Vec(corpus, 20, 2, 0.025, 1000)
modelGensim.train()

# Test the Gensim model (change 'example_word' to any word in your corpus)
example_word = "data"
if example_word in vocab:
    print(f"Gensim Vector for {example_word}: ", modelGensim.get_word_vector(example_word))
else:
    print(f"{example_word} not in vocabulary.")

# SkipGram Model
print("\nTraining SkipGram Model...")
modelSkipGram = SkipGram(vocab_size, 20)
modelSkipGram.train_model(corpus, vocab, 0.025, 1000)

# Test the SkipGram model (change 'example_word' to any word in your corpus)
if example_word in vocab:
    print(f"SkipGram Vector for {example_word}: ", modelSkipGram.get_word_vector(example_word, vocab))
else:
    print(f"{example_word} not in vocabulary.")

# CBOW Model
print("\nTraining CBOW Model...")
modelCBOW = CBOW(vocab_size, 20)
modelCBOW.train_model(corpus, vocab, 0.025, 1000, window_size=2)



# Test the CBOW model (change 'example_word' to any word in your corpus)
if example_word in vocab:
    print(f"CBOW Vector for {example_word}: ", modelCBOW.get_word_vector(example_word, vocab))
else:
    print(f"{example_word} not in vocabulary.")

Training Gensim Model...
Gensim Vector for data:  [-0.03911165  0.22466028  0.15663825  0.23718707 -0.17009805 -0.32686406
  0.04607087  0.62932575 -0.24616416 -0.03631771 -0.08447191  0.0153377
 -0.01536816  0.06978628 -0.05106831 -0.19504882  0.37443215 -0.2380562
 -0.35111517 -0.24540682]

Training SkipGram Model...
SkipGram Vector for data:  [[-0.05340152  1.3186854  -0.99223226  0.8127032   1.1575003   0.49402523
   1.6774234   0.19818336  0.08226159 -0.268004   -0.8270626   1.0272753
  -0.01924454  0.49821365  0.0542079   0.09707388  0.55477446 -0.53761685
  -1.623687   -0.7710329 ]]

Training CBOW Model...
CBOW Vector for data:  [[-1.7172759  -1.2148739  -0.21443495 -0.59840095  0.21292756  1.4785212
  -1.8608963  -2.6497774   0.11688966  0.33669746 -1.4511781   4.2494607
   0.22541954  2.0907726   0.53882873  3.0997822  -0.80189586 -2.754468
   0.9497787  -0.35829023]]


# Cosine Similarity Comparison

In [3]:
def cosine_similarity(vec1, vec2):
    vec1 = vec1.squeeze()
    vec2 = vec2.squeeze()
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot_product / (norm1 * norm2)


word1 = 'data'
word2 = 'science'

if word1 in vocab and word2 in vocab:
    # Gensim Model
    vec1_gensim = modelGensim.get_word_vector(word1)
    vec2_gensim = modelGensim.get_word_vector(word2)
    similarity_gensim = cosine_similarity(vec1_gensim, vec2_gensim)
    print(f"Gensim cosine similarity between {word1} and {word2}: ", similarity_gensim)

    # SkipGram Model
    vec1_skipgram = modelSkipGram.get_word_vector(word1, vocab)
    vec2_skipgram = modelSkipGram.get_word_vector(word2, vocab)
    similarity_skipgram = cosine_similarity(vec1_skipgram, vec2_skipgram)
    print(f"SkipGram cosine similarity between {word1} and {word2}: ", similarity_skipgram)

    # CBOW Model
    vec1_cbow = modelCBOW.get_word_vector(word1, vocab)
    vec2_cbow = modelCBOW.get_word_vector(word2, vocab)
    similarity_cbow = cosine_similarity(vec1_cbow, vec2_cbow)
    print(f"CBOW cosine similarity between {word1} and {word2}: ", similarity_cbow)
else:
    print(f"One or both words are not in vocabulary.")


Gensim cosine similarity between data and science:  0.9797871
SkipGram cosine similarity between data and science:  0.097108334
CBOW cosine similarity between data and science:  -0.10008272


# Numpy Implementation attempt

In [4]:
import numpy as np

np.random.seed(42)


class Word2Vec:
    def __init__(self, corpus, embed_size, window_size, learning_rate, epochs):
        self.corpus = corpus
        self.embed_size = embed_size
        self.window_size = window_size
        self.learning_rate = learning_rate
        self.epochs = epochs
        
        # Create vocabulary
        self.words = set(corpus)
        self.word2index = {word: i for i, word in enumerate(self.words)}
        self.index2word = {i: word for i, word in enumerate(self.words)}
        self.vocab_size = len(self.words)
        
        # Initialize weights
        self.input_word = np.random.randn(self.vocab_size, self.embed_size)

        self.output_cotext_words = np.random.randn(self.embed_size, self.vocab_size)
        
    def train(self):
        for epoch in range(self.epochs):
            self.loss = 0
            for word_index, word in enumerate(self.corpus):
                # We are going to predict the context words from the current word
                # The context words are the words that fall within the window size
                # around the current word
                start = max(0, word_index - self.window_size)
                end = min(word_index + self.window_size, len(self.corpus))
                context_indices = [self.word2index[w] for w in self.corpus[start:end] if w != word]
                
                for context_index in context_indices:
                    
                    # Forward pass
                    input_word_embedding = self.input_word[context_index]
          
                    # print(self.embedding_to_word(input_word_embedding))
                    dot_product = np.dot(input_word_embedding, self.output_cotext_words)

                    softmaxed_probability = self.softmax(dot_product)
                    
                    # Compute loss
                    self.loss += -np.log(softmaxed_probability[self.word2index[word]])
                    
                    # Backpropagation
                    e = softmaxed_probability.copy()
                    # Subtract 1 from the word that was actually the context word
                    e[self.word2index[word]] -= 1
                    
                    #This is the gradient that shows how much the output embedding should change to minimize the loss
                    gradient_for_output_embedding = np.outer(input_word_embedding, e)
                    #This is the gradient that shows how much the input embedding should change to minimize the loss
                    gradient_for_input_embedding = np.dot(self.output_cotext_words, e.T)
                    
                    # Update weights
                    #Updating output embedding to take a STEP TOWARDS OPPOSITE TO THE GRADIENT
                    self.output_cotext_words -= self.learning_rate * gradient_for_output_embedding

                    #Updating input embedding ONLY FOR THE CONTEXT WORD in the context matrix
                    self.input_word[context_index] -= self.learning_rate * gradient_for_input_embedding
            
            # print(f'Epoch: {epoch + 1}/{self.epochs}, Loss: {self.loss/len(self.corpus)}')
            
   
    def softmax(self, x):
        exp_x = np.exp(x - np.max(x))
        return exp_x / exp_x.sum(axis=0)
    
    def get_word_vector(self, word):
        return self.input_word[self.word2index[word]]
    


def cosine_similarity(vec1, vec2):
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))

def euclidean_distance(vec1, vec2):
    return np.linalg.norm(vec1 - vec2)



#read training_text.txt and split it into words
corpus = open("training_text.txt", "r").read().split()


model = Word2Vec(corpus, embed_size=19, window_size=2, learning_rate=0.01, epochs=1000)
model.train()




print("Cosine Similarity (data, science):", cosine_similarity(model.get_word_vector("data"), model.get_word_vector("science")))
print("Cosine Similarity (data, really):", cosine_similarity(model.get_word_vector("data"), model.get_word_vector("really")))

#compare above two cosine similarities with each other and print the smaller pair
if cosine_similarity(model.get_word_vector("data"), model.get_word_vector("science")) > cosine_similarity(model.get_word_vector("data"), model.get_word_vector("really")):
    print("Cosine Similarity (data, science) is larger than (data, really): TRUE")
else:
    print("Cosine Similarity (data, science) is smaller than (data, really): FALSE")


print("Euclidean Distance (data, science):", euclidean_distance(model.get_word_vector("data"), model.get_word_vector("science")))
print("Euclidean Distance (data, really):", euclidean_distance(model.get_word_vector("data"), model.get_word_vector("really")))

#compare above two euclidean distances with each other and print the smaller pair
if euclidean_distance(model.get_word_vector("data"), model.get_word_vector("science")) > euclidean_distance(model.get_word_vector("data"), model.get_word_vector("really")):
    print("Euclidean Distance (data, science) is larger than (data, really): FALSE")
else:
    print("Euclidean Distance (data, science) is smaller than (data, really): TRUE")

Cosine Similarity (data, science): 0.15628131687288663
Cosine Similarity (data, really): -0.37585027669960913
Cosine Similarity (data, science) is larger than (data, really): TRUE
Euclidean Distance (data, science): 4.40646638292033
Euclidean Distance (data, really): 5.857802244317426
Euclidean Distance (data, science) is smaller than (data, really): TRUE
