In [24]:
import pyarrow as pa
import pandas as pd

df_read = pd.read_parquet('CL_fr-en (1).parquet', engine='pyarrow')

df_read = df_read.dropna()

df_read['fr'] = df_read['fr'].str.lower()
df_read['en'] = df_read['en'].str.lower()

df_read['fr'] = df_read['fr'].str.replace('[^\w\s]','')
df_read['en'] = df_read['en'].str.replace('[^\w\s]','')


#remove punctuation
import string
df_read['fr'] = df_read['fr'].str.translate(str.maketrans('', '', string.punctuation))
df_read['en'] = df_read['en'].str.translate(str.maketrans('', '', string.punctuation))

df_read=df_read.head(1000)
concatdf = pd.concat([df_read['en'], df_read['fr']], ignore_index=True)
#save concatdf to a txt file without quotes
concatdf.to_csv('concatdf.txt', index=False, header=False)

In [58]:
from gensim.models import Word2Vec
import torch
import torch.nn as nn
import numpy as np
import string

import torch.optim as optim
import torch.nn.functional as F

torch.manual_seed(42)
np.random.seed(42)

import sentencepiece as spm

from collections import Counter

class SentencePieceTokenizer:
    def __init__(self, input_file, model_prefix='sentence_piece', vocab_size=1000, model_type='unigram', character_coverage=1.0):
        self.input_file = input_file
        self.model_prefix = model_prefix
        self.vocab_size = vocab_size
        self.model_type = model_type
        self.character_coverage = character_coverage
        self.tokenized_corpus = []

    def train_tokenizer(self):
        spm.SentencePieceTrainer.train(f'--input={self.input_file} '
                                       f'--model_prefix={self.model_prefix} '
                                       f'--vocab_size={self.vocab_size} '
                                       f'--model_type={self.model_type} '
                                       f'--character_coverage={self.character_coverage}')
        self.sp = spm.SentencePieceProcessor()
        self.sp.load(f'{self.model_prefix}.model')

    def tokenize_corpus(self):
        with open(self.input_file, 'r', encoding='utf-8') as f:
            for line in f:
                self.tokenized_corpus.append(self.sp.encode_as_pieces(line.strip()))

    def get_tokenized_corpus(self):
        return self.tokenized_corpus

    def get_vocab(self):
        return {self.sp.id_to_piece(id): id for id in range(self.sp.get_piece_size())}

    def get_vocab_size(self):
        return self.sp.get_piece_size()


class GensimWord2Vec:
    def __init__(self, corpus, embed_size, window_size, learning_rate, epochs):
        self.corpus = [corpus]
        self.embed_size = embed_size
        self.window_size = window_size
        self.learning_rate = learning_rate
        self.epochs = epochs

    def train(self):
        self.model = Word2Vec(sentences=self.corpus,
                              vector_size=self.embed_size,
                              window=self.window_size,
                              alpha=self.learning_rate,
                              epochs=self.epochs,
                              sg=1,  # Use skip-gram
                              workers=1)  # Single worker for reproducibility

    def get_word_vector(self, word):
        return self.model.wv[word]
    
    def find_closest_word(self, word):
        max_similarity = -1.0  # Start with the lowest possible similarity value
        closest_word = None

        if word not in self.model.wv:  # Check if the word exists in the model
            return f"{word} not found in model"

        word_vector = np.array(self.model.wv[word]).reshape(1, -1)
        for w in self.model.wv.index_to_key:  # Loop through the model's vocab
            if w == word:  # Skip the word itself
                continue
            w_vector = np.array(self.model.wv[w]).reshape(1, -1)
            similarity_output = cosine_similarity(word_vector, w_vector)

            # Check the type of the output
            if np.isscalar(similarity_output):
                similarity = similarity_output
            else:
                similarity = similarity_output[0][0]

            if similarity > max_similarity:
                max_similarity = similarity
                closest_word = w

        return closest_word

class SkipGram(nn.Module):
    def __init__(self, vocab_size, embed_size):
        super(SkipGram, self).__init__()
        self.in_embed = nn.Embedding(vocab_size, embed_size)
        self.out_embed = nn.Embedding(vocab_size, embed_size)

    def forward(self, target, context):
        in_vectors = self.in_embed(target)
        out_vectors = self.out_embed(context)
        scores = torch.matmul(in_vectors, out_vectors.t())
        return scores

    def train_model(self, corpus, vocab, window_size, learning_rate, epochs, negative_samples=5):
        optimizer = optim.SGD(self.parameters(), lr=learning_rate)
        loss_fn = nn.BCEWithLogitsLoss()

        for epoch in range(epochs):
            for idx, word in enumerate(corpus):
                # Get the context words based on window size
                start = max(0, idx - window_size)
                end = min(len(corpus), idx + window_size + 1)
                context_words = corpus[start:idx] + corpus[idx+1:end]

                for context_word in context_words:
                    target_tensor = torch.tensor([vocab[word]], dtype=torch.long)
                    context_tensor = torch.tensor([vocab[context_word]], dtype=torch.long)
                    
                    negative_context = torch.multinomial(torch.tensor(list(vocab.values()), dtype=torch.float), negative_samples, replacement=True)
                    
                    optimizer.zero_grad()
                    
                    scores = self(target_tensor, torch.cat([context_tensor, negative_context]))
                    
                    # Create labels: 1 for the correct context word and 0 for the negative samples
                    labels = torch.cat([torch.tensor([1.0]), torch.zeros(negative_samples)]).unsqueeze(0)
                    
                    loss = loss_fn(scores, labels)
                    loss.backward()
                    optimizer.step()

                    
    def get_word_vector(self, word, vocab):
        word_idx = torch.tensor([vocab[word]], dtype=torch.long)
        return self.in_embed(word_idx).squeeze().detach().numpy()
    
    def find_closest_word(self, word, vocab):
            max_similarity = -1.0  # Start with the lowest possible similarity value
            closest_word = None

            if word not in vocab:  # Check if the word exists in the vocab
                return f"{word} not found in vocab"

            word_idx = torch.tensor([vocab[word]], dtype=torch.long)
            word_vector = self.in_embed(word_idx).detach().numpy().reshape(1, -1)

            for w, idx in vocab.items():  # Loop through the vocab
                if w == word:  # Skip the word itself
                    continue
                w_idx = torch.tensor([idx], dtype=torch.long)
                w_vector = self.in_embed(w_idx).detach().numpy().reshape(1, -1)

                similarity_output = cosine_similarity(word_vector, w_vector)
                
                # Check the type of the output
                if np.isscalar(similarity_output):
                    similarity = similarity_output
                else:
                    similarity = similarity_output[0][0]
                
                if similarity > max_similarity:
                    max_similarity = similarity
                    closest_word = w

            return closest_word
    
    

def SP_build_vocabulary(vocab):
    # Assuming vocab is a dictionary, invert it
    return {v: k for k, v in vocab.items()}


def cosine_similarity(vec1, vec2):
    vec1 = vec1.squeeze()
    vec2 = vec2.squeeze()
    dot_product = np.dot(vec1, vec2)
    norm1 = np.linalg.norm(vec1)
    norm2 = np.linalg.norm(vec2)
    return dot_product / (norm1 * norm2)



# Training

In [80]:
word1 = 'e'
word2 = 'e'
torch.manual_seed(42)
np.random.seed(42)
# example_word = 36
embed_size = 20
window_size = 2
learning_rate = 0.025
epochs = 100

### Sentence Piece Tokenizer

In [92]:
# SentencePieceTokenizer
tokenizer = SentencePieceTokenizer('training_text.txt', model_prefix='sentence_piece', vocab_size=90)
tokenizer.train_tokenizer()

# Tokenize the entire corpus
tokenizer.tokenize_corpus()

# Get tokenized corpus, vocab, and vocab_size
corpus = tokenizer.get_tokenized_corpus()
vocab = tokenizer.get_vocab()
vocab = SP_build_vocabulary(vocab)
vocab_size = tokenizer.get_vocab_size()

print("Vocabulary Size:", vocab_size)
for sentence in corpus:
    for word in sentence:
        if not isinstance(word, str):
            print(f"Non-string element found: {word}")

print(corpus)
print(type(corpus))
print(vocab)
print(type(vocab))
print(vocab_size)
print(type(vocab_size))

Vocabulary Size: 90
[['▁', 'T', 'od', 'a', 'y', '▁w', 'e', '▁wi', 'll', '▁be', '▁learning', '▁a', 'b', 'ou', 't', '▁th', 'e', '▁f', 'u', 'nd', 'ame', 'n', 'tal', 's', '▁', 'o', 'f', '▁data', '▁s', 'ci', 'ence', '▁a', 'nd', '▁statistics', '.', '▁', 'D', 'ata', '▁', 'S', 'ci', 'ence', '▁a', 'nd', '▁statistics', '▁a', 're', '▁h', 'o', 't', '▁a', 'nd', '▁g', 'r', 'o', 'wi', 'ng', '▁', 'fi', 'el', 'd', 's', '▁wi', 'th', '▁', 'al', 'ter', 'n', 'ati', 've', '▁', 'n', 'ame', 's', '▁', 'o', 'f', '▁m', 'a', 'c', 'h', 'ine', '▁learning', ',', '▁ar', 'ti', 'fi', 'ci', 'al', '▁', 'in', 'te', 'll', 'ig', 'ence', ',', '▁b', 'ig', '▁data', ',', '▁', 'e', 't', 'c', '.', '▁I', "'", 'm', '▁', 're', 'a', 'll', 'y', '▁ex', 'ci', 'te', 'd', '▁t', 'o', '▁', 'tal', 'k', '▁t', 'o', '▁', 'y', 'ou', '▁a', 'b', 'ou', 't', '▁data', '▁s', 'ci', 'ence', '▁a', 'nd', '▁statistics', '▁beca', 'use', '▁data', '▁s', 'ci', 'ence', '▁a', 'nd', '▁statistics', '▁h', 'a', 've', '▁l', 'o', 'ng', '▁be', 'en', '▁a', '▁', 'p', 'a'

In [None]:
#random word selector
import random
example_word = random.choice(list(vocab.keys()))
print(example_word)

In [None]:
# Gensim Model Training
print("Training Gensim Model...")
modelGensim = GensimWord2Vec(corpus, embed_size, window_size, learning_rate, epochs)
modelGensim.train()

# Test the Gensim model (change 'example_word' to any word in your corpus)

if example_word in vocab:
    print(f"Gensim Vector for {example_word}: ", modelGensim.get_word_vector(example_word))
    vec1_gensim = modelGensim.get_word_vector(word1)
    vec2_gensim = modelGensim.get_word_vector(word2)
    similarity_gensim = cosine_similarity(vec1_gensim, vec2_gensim)
    print(f"Gensim cosine similarity between {word1} and {word2}: ", similarity_gensim)
else:
    print(f"{example_word} not in vocabulary.")

In [None]:
# SkipGram Model Training
print("\nTraining SkipGram Model...")
modelSkipGram = SkipGram(vocab_size, embed_size)
modelSkipGram.train_model(corpus, vocab, window_size, learning_rate, epochs)

# Test the SkipGram model (change 'example_word' to any word in your corpus)
if example_word in vocab:
    print(f"SkipGram Vector for {example_word}: ", modelSkipGram.get_word_vector(example_word, vocab))
    vec1_skipgram = modelSkipGram.get_word_vector(word1, vocab)
    vec2_skipgram = modelSkipGram.get_word_vector(word2, vocab)
    similarity_skipgram = cosine_similarity(vec1_skipgram, vec2_skipgram)
    print(f"SkipGram cosine similarity between {word1} and {word2}: ", similarity_skipgram)
else:
    print(f"{example_word} not in vocabulary.")



In [51]:
closest_gensim = modelGensim.find_closest_word('data')
print(f"Closest word to 'data' using Gensim: {closest_gensim}")

closest_skipgram = modelSkipGram.find_closest_word('data', vocab)
print(f"Closest word to 'data' using SkipGram: {closest_skipgram}")

Closest word to 'data' using Gensim: statistics
Closest word to 'data' using SkipGram: thanks


# Cosine Similarity Comparison

In [8]:



word1 = 'data'
word2 = 'science'

if word1 in vocab and word2 in vocab:
    # Gensim Model
    vec1_gensim = modelGensim.get_word_vector(word1)
    vec2_gensim = modelGensim.get_word_vector(word2)
    similarity_gensim = cosine_similarity(vec1_gensim, vec2_gensim)
    print(f"Gensim cosine similarity between {word1} and {word2}: ", similarity_gensim)

    # SkipGram Model
    vec1_skipgram = modelSkipGram.get_word_vector(word1, vocab)
    vec2_skipgram = modelSkipGram.get_word_vector(word2, vocab)
    similarity_skipgram = cosine_similarity(vec1_skipgram, vec2_skipgram)
    print(f"SkipGram cosine similarity between {word1} and {word2}: ", similarity_skipgram)



Gensim cosine similarity between data and science:  0.9797871
SkipGram cosine similarity between data and science:  0.25304967
CBOW cosine similarity between data and science:  -0.015298413
