<h1>1. Word Tokenizer Evaluation</h1>
<h4>Content:</h4>
<ul>
    <li>1.1. Defining the evaluation functions and the training model</li>
    <li>1.2. Defining the model parameters and loading the model</li>
    <li>1.3. Syntactic and Semantic Evaluation</li>
</ul>

<h3>1.1. Defining the evaluation functions and the training model</h3>

In [3]:
from tokenizers import Tokenizer
import torch
import torch.nn as nn
import json
import numpy as np
from numpy.linalg import norm
from tokenizers import Tokenizer

def create_word_embedding(word, vocabulary, model):
    """Look up the embedding of a whole word in ``model.embeddings``.

    Args:
        word: Surface form to embed.
        vocabulary: Mapping from word to its index in the embedding table.
        model: Object exposing an ``embeddings`` layer.

    Returns:
        A 1-D tensor of length ``model.embeddings.embedding_dim``; all zeros
        when ``word`` is not a key of ``vocabulary``.
    """
    if word not in vocabulary:
        # Out-of-vocabulary words are represented by the zero vector.
        return torch.zeros(model.embeddings.embedding_dim)
    index_tensor = torch.tensor([vocabulary[word]], dtype=torch.long)
    # Summing over the single looked-up row drops the leading axis.
    return model.embeddings(index_tensor).sum(dim=0)

def load_vocabulary(vocab_path):
    """Read a UTF-8 JSON vocabulary file and return the decoded object.

    Args:
        vocab_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file.
    """
    with open(vocab_path, mode='r', encoding='utf-8') as vocab_file:
        return json.load(vocab_file)

class CBOW(nn.Module):
    """Embedding table followed by a linear layer that scores every vocabulary entry."""

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: Number of rows in the embedding table and number of
                output scores.
            embedding_dim: Width of each embedding vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, max_norm=1.0)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Average the embeddings along dim 1 and project them to vocabulary scores.

        Args:
            inputs: Integer index tensor; presumably shaped (batch, context) --
                confirm against the training code.

        Returns:
            Output of the linear layer applied to the averaged embeddings.
        """
        context_mean = self.embeddings(inputs).mean(dim=1)
        return self.linear(context_mean)



ModuleNotFoundError: No module named 'tokenizers'

<h3>1.2. Defining the model parameters and loading the model</h3>

In [None]:
## Constants
# Root directory of the project; every data/model path below is built from it.
parent_path = '/Users/kaganhitit_/Desktop/COMP442/0076757_word2vec/starter/'

unique_words_path = parent_path + 'evaluation/evaluation_training_unique_words.txt'
syntactic_file_path = parent_path + 'data/SynAnalogyTr.txt'  # syntactic analogy quadruples
semantic_file_path = parent_path + 'data/turkish-analogy-semantic.txt'  # semantic analogy quadruples
vocab_path = parent_path + 'embeddings/word_tokenizer_vocab.json'


## Parameters
vocabulary = load_vocabulary(vocab_path)
vocab_size = len(vocabulary)
embedding_dim = 512
model_name = 'word_model_final.pth'
# Model file name without its extension, suffixed with '_embeddings.pth'.
embeddings_path = model_name.rsplit('.', 1)[0] + '_embeddings.pth'

# Rebuild the network, restore its trained weights on CPU and switch to eval mode.
model = CBOW(vocab_size, embedding_dim)
model.load_state_dict(
    torch.load(parent_path + model_name, map_location=torch.device('cpu'))
)
model.eval()

# One word per line; surrounding whitespace is stripped.
with open(unique_words_path, 'r', encoding='utf-8') as file:
    unique_words = [line.strip() for line in file]

print(f'Unique word length :{len(unique_words)}')

<h3>1.3. Syntactic and Semantic Evaluation</h3>

In [None]:
def _embed(word):
    """Embed ``word`` with the model loaded above and return a NumPy vector."""
    return create_word_embedding(word, vocabulary, model).detach().numpy()


def _normalize_rows(matrix):
    """Return a copy of ``matrix`` whose non-zero rows have unit L2 norm."""
    lengths = norm(matrix, axis=1, keepdims=True)
    # Zero rows (e.g. out-of-vocabulary words) stay zero instead of being
    # divided by a zero norm.
    return np.divide(matrix, lengths, out=np.zeros_like(matrix), where=lengths > 0)


def _analogy_targets(analogies, dim):
    """Build the (dim, len(analogies)) matrix of unit-length ``b - a + c`` vectors.

    A column is left as zeros when ``b - a + c`` has zero norm (for instance
    when all three words embed to the zero vector); dividing by that norm
    would fill the column with NaN.
    """
    targets = np.zeros((len(analogies), dim))
    for row, (a, b, c, _expected) in enumerate(analogies):
        vector = _embed(b) - _embed(a) + _embed(c)
        length = norm(vector)
        if length > 0:
            targets[row] = vector / length
    return targets.transpose()


def _score_analogies(analogies, targets, vocab_matrix, vocab_words, top_k=5):
    """Score analogies by cosine similarity against every vocabulary row.

    Args:
        analogies: List of ``[a, b, c, d]`` word lists.
        targets: Matrix from ``_analogy_targets`` (one column per analogy).
        vocab_matrix: Row-normalised embeddings, one row per vocabulary word.
        vocab_words: Words in the same order as the rows of ``vocab_matrix``.
        top_k: Number of most similar words inspected per analogy.

    Returns:
        ``(accuracy, mrr, hits)`` where ``accuracy`` is the fraction of
        analogies whose ``d`` is among the ``top_k`` candidates, ``mrr`` is
        the mean reciprocal rank (0 for misses) and ``hits`` lists the
        correctly predicted ``d`` words. Both ratios are 0.0 when
        ``analogies`` is empty.
    """
    hits = []
    reciprocal_rank_sum = 0.0
    similarities = np.matmul(vocab_matrix, targets)
    for column, (_a, _b, _c, expected) in enumerate(analogies):
        if not targets[:, column].any():
            # Undefined target direction: every similarity is 0, so any
            # "match" would be arbitrary. Count it as a miss.
            continue
        best = np.argsort(similarities[:, column])[-top_k:][::-1]
        for position, index in enumerate(best, start=1):
            if vocab_words[index] == expected:
                hits.append(expected)
                reciprocal_rank_sum += 1 / position
                break
    total = len(analogies)
    if total == 0:
        return 0.0, 0.0, hits
    return len(hits) / total, reciprocal_rank_sum / total, hits


# Embed every evaluation word once and persist the result.
embeddings_dict = {word: _embed(word) for word in unique_words}

torch.save(embeddings_dict, embeddings_path)

print("Embeddings saved to", embeddings_path)

# NOTE: the dictionary is deliberately not re-read with torch.load. It is
# already in memory, and loading a pickle of NumPy arrays is rejected when
# torch.load runs with weights_only=True.

words_list = list(embeddings_dict)
embeddings_matrix = np.zeros((len(embeddings_dict), embedding_dim))
for i, embedding in enumerate(embeddings_dict.values()):
    embeddings_matrix[i, :] = embedding
embeddings_matrix = _normalize_rows(embeddings_matrix)
print("Embeddings matrix shape:", embeddings_matrix.shape)


## Syntactic Accuracy
# Keep only the lines that contain exactly four whitespace-separated words.
with open(syntactic_file_path, 'r', encoding='utf-8') as file:
    syntactic_data = [words for words in (line.split() for line in file) if len(words) == 4]

target_embeddings = _analogy_targets(syntactic_data, embedding_dim)
print("Target embeddings matrix shape:", target_embeddings.shape)

syntactic_accuracy, MRR_syntactic, accurate_words = _score_analogies(
    syntactic_data, target_embeddings, embeddings_matrix, words_list
)

print('Syntactic accuracy:', syntactic_accuracy)
print('Syntactic MRR:', MRR_syntactic)
print('Accurate words:', str(accurate_words))

## Semantic Accuracy
semantic_data = {}
current_category = None
with open(semantic_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if line.startswith(':'):
            # A line starting with ':' opens a new category.
            current_category = line[1:].strip()
            semantic_data[current_category] = []
        elif current_category is not None:
            words = line.split()
            if len(words) == 4:
                semantic_data[current_category].append(words)


for category_name, category in semantic_data.items():
    target_embeddings = _analogy_targets(category, embedding_dim)
    print(category_name + ", Target embeddings matrix shape:" + str(target_embeddings.shape))

    semantic_accuracy, MRR_semantic, accurate_words = _score_analogies(
        category, target_embeddings, embeddings_matrix, words_list
    )

    print(str(category_name) + ' semantic accuracy:', semantic_accuracy)
    print(str(category_name) + ' semantic MRR:', MRR_semantic)
    print('Accurate words:', str(accurate_words))



<h1>2. Character Trigram Tokenizer Evaluation</h1>
<h4>Content:</h4>
<ul>
    <li>2.1. Defining the evaluation functions and the training model</li>
    <li>2.2. Defining the model parameters and loading the model</li>
    <li>2.3. Syntactic and Semantic Evaluation</li>
</ul>

<h3>2.1. Defining the evaluation functions and the training model</h3>

In [None]:
from tokenizers import Tokenizer
import torch
import torch.nn as nn
import json
import numpy as np
from numpy.linalg import norm
from tokenizers import Tokenizer

def create_word_embedding(word, vocabulary, model):
    """Embed a word as the sum of its character-trigram embeddings.

    The word is wrapped in ``<`` and ``>`` and every window of three
    consecutive characters is looked up in ``vocabulary``; a trigram that is
    not a key falls back to the index stored under ``'<OOV>'``.

    Args:
        word: Surface form to embed.
        vocabulary: Mapping from trigram to its index in the embedding table.
        model: Object exposing an ``embeddings`` layer.

    Returns:
        The sum over all trigram embeddings of the word.
    """
    padded = '<' + word + '>'
    oov_index = vocabulary.get('<OOV>')
    trigram_ids = []
    for start in range(len(padded) - 2):
        trigram_ids.append(vocabulary.get(padded[start:start + 3], oov_index))
    id_tensor = torch.tensor(trigram_ids, dtype=torch.long)
    return model.embeddings(id_tensor).sum(dim=0)

def load_vocabulary(vocab_path):
    """Decode the UTF-8 JSON file at ``vocab_path`` and return its content.

    Args:
        vocab_path: Path of the JSON vocabulary file.

    Returns:
        The object produced by ``json.load``.
    """
    with open(vocab_path, mode='r', encoding='utf-8') as vocab_file:
        return json.load(vocab_file)

class CBOW(nn.Module):
    """Embedding table followed by a linear layer that scores every vocabulary entry."""

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: Number of rows in the embedding table and number of
                output scores.
            embedding_dim: Width of each embedding vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, max_norm=1.0)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Average the embeddings along dim 1 and project them to vocabulary scores.

        Args:
            inputs: Integer index tensor; presumably shaped (batch, context) --
                confirm against the training code.

        Returns:
            Output of the linear layer applied to the averaged embeddings.
        """
        averaged = self.embeddings(inputs).mean(dim=1)
        return self.linear(averaged)



<h3>2.2. Defining the model parameters and loading the model</h3>

In [None]:
## Constants
## parent_path = 'path/to/your/directory/'
# Root directory of the project; every data/model path below is built from it.
parent_path = '/Users/kaganhitit_/Desktop/COMP442/0076757_word2vec/starter/'
unique_words_path = parent_path + 'evaluation/evaluation_training_unique_words.txt'
syntactic_file_path = parent_path + 'data/SynAnalogyTr.txt'  # syntactic analogy quadruples
semantic_file_path = parent_path + 'data/turkish-analogy-semantic.txt'  # semantic analogy quadruples
vocab_path = parent_path + 'embeddings/trigram_tokenizer_vocab.json'


## Parameters
vocabulary = load_vocabulary(vocab_path)
vocab_size = len(vocabulary)
embedding_dim = 512
model_name = 'trigram_model_final.pth'
# Model file name without its extension, suffixed with '_embeddings.pth'.
embeddings_path = model_name.rsplit('.', 1)[0] + '_embeddings.pth'

# Rebuild the network, restore its trained weights on CPU and switch to eval mode.
model = CBOW(vocab_size, embedding_dim)
model.load_state_dict(
    torch.load(parent_path + model_name, map_location=torch.device('cpu'))
)
model.eval()

# One word per line; surrounding whitespace is stripped.
with open(unique_words_path, 'r', encoding='utf-8') as file:
    unique_words = [line.strip() for line in file]

print(f'Unique word length :{len(unique_words)}')



<h3>2.3. Syntactic and Semantic Evaluation</h3>

In [None]:
def _embed(word):
    """Embed ``word`` with the model loaded above and return a NumPy vector."""
    return create_word_embedding(word, vocabulary, model).detach().numpy()


def _normalize_rows(matrix):
    """Return a copy of ``matrix`` whose non-zero rows have unit L2 norm."""
    lengths = norm(matrix, axis=1, keepdims=True)
    # Zero rows stay zero instead of being divided by a zero norm.
    return np.divide(matrix, lengths, out=np.zeros_like(matrix), where=lengths > 0)


def _analogy_targets(analogies, dim):
    """Build the (dim, len(analogies)) matrix of unit-length ``b - a + c`` vectors.

    A column is left as zeros when ``b - a + c`` has zero norm; dividing by
    that norm would fill the column with NaN.
    """
    targets = np.zeros((len(analogies), dim))
    for row, (a, b, c, _expected) in enumerate(analogies):
        vector = _embed(b) - _embed(a) + _embed(c)
        length = norm(vector)
        if length > 0:
            targets[row] = vector / length
    return targets.transpose()


def _score_analogies(analogies, targets, vocab_matrix, vocab_words, top_k=5):
    """Score analogies by cosine similarity against every vocabulary row.

    Args:
        analogies: List of ``[a, b, c, d]`` word lists.
        targets: Matrix from ``_analogy_targets`` (one column per analogy).
        vocab_matrix: Row-normalised embeddings, one row per vocabulary word.
        vocab_words: Words in the same order as the rows of ``vocab_matrix``.
        top_k: Number of most similar words inspected per analogy.

    Returns:
        ``(accuracy, mrr, hits)`` where ``accuracy`` is the fraction of
        analogies whose ``d`` is among the ``top_k`` candidates, ``mrr`` is
        the mean reciprocal rank (0 for misses) and ``hits`` lists the
        correctly predicted ``d`` words. Both ratios are 0.0 when
        ``analogies`` is empty.
    """
    hits = []
    reciprocal_rank_sum = 0.0
    similarities = np.matmul(vocab_matrix, targets)
    for column, (_a, _b, _c, expected) in enumerate(analogies):
        if not targets[:, column].any():
            # Undefined target direction: every similarity is 0, so any
            # "match" would be arbitrary. Count it as a miss.
            continue
        best = np.argsort(similarities[:, column])[-top_k:][::-1]
        for position, index in enumerate(best, start=1):
            if vocab_words[index] == expected:
                hits.append(expected)
                reciprocal_rank_sum += 1 / position
                break
    total = len(analogies)
    if total == 0:
        return 0.0, 0.0, hits
    return len(hits) / total, reciprocal_rank_sum / total, hits


# Embed every evaluation word once and persist the result.
embeddings_dict = {word: _embed(word) for word in unique_words}

# Saving the embeddings
torch.save(embeddings_dict, embeddings_path)

print("Embeddings saved to", embeddings_path)

# NOTE: the dictionary is deliberately not re-read with torch.load. It is
# already in memory, and loading a pickle of NumPy arrays is rejected when
# torch.load runs with weights_only=True.

# Fill the embeddings matrix in dictionary order, then normalise each row.
words_list = list(embeddings_dict)
embeddings_matrix = np.zeros((len(embeddings_dict), embedding_dim))
for i, embedding in enumerate(embeddings_dict.values()):
    embeddings_matrix[i, :] = embedding
embeddings_matrix = _normalize_rows(embeddings_matrix)
print("Embeddings matrix shape:", embeddings_matrix.shape)


## Syntactic Accuracy
# Keep only the lines that contain exactly four whitespace-separated words.
with open(syntactic_file_path, 'r', encoding='utf-8') as file:
    syntactic_data = [words for words in (line.split() for line in file) if len(words) == 4]

target_embeddings = _analogy_targets(syntactic_data, embedding_dim)
print("Target embeddings matrix shape:", target_embeddings.shape)

syntactic_accuracy, MRR_syntactic, accurate_words = _score_analogies(
    syntactic_data, target_embeddings, embeddings_matrix, words_list
)

print('Syntactic accuracy:', syntactic_accuracy)
print('Syntactic MRR:', MRR_syntactic)
print('Accurate words:', str(accurate_words))

## Semantic Accuracy
semantic_data = {}
current_category = None
with open(semantic_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if line.startswith(':'):
            # A line starting with ':' opens a new category.
            current_category = line[1:].strip()
            semantic_data[current_category] = []
        elif current_category is not None:
            words = line.split()
            if len(words) == 4:
                semantic_data[current_category].append(words)


for category_name, category in semantic_data.items():
    target_embeddings = _analogy_targets(category, embedding_dim)
    print(category_name + ", Target embeddings matrix shape:" + str(target_embeddings.shape))

    semantic_accuracy, MRR_semantic, accurate_words = _score_analogies(
        category, target_embeddings, embeddings_matrix, words_list
    )

    print(str(category_name) + ' semantic accuracy:', semantic_accuracy)
    print(str(category_name) + ' semantic MRR:', MRR_semantic)
    print('Accurate words:', str(accurate_words))



<h1>3. BPE Tokenizer Evaluation</h1>
<h4>Content:</h4>
<ul>
    <li>3.1. Defining the evaluation functions and the training model</li>
    <li>3.2. Defining the model parameters and loading the model</li>
    <li>3.3. Syntactic and Semantic Evaluation</li>
</ul>

<h3>3.1. Defining the evaluation functions and the training model</h3>


In [None]:
from tokenizers import Tokenizer
import torch
import torch.nn as nn
import json
import numpy as np
from numpy.linalg import norm
from tokenizers import Tokenizer

def create_word_embedding(word, bpe_tokenizer, model):
    """Embed a word as the sum of the embeddings of its BPE token ids.

    Args:
        word: Surface form to embed.
        bpe_tokenizer: Tokenizer whose ``encode(word)`` result exposes ``ids``.
        model: Object exposing an ``embeddings`` layer.

    Returns:
        The sum over the embeddings of all token ids of the word.
    """
    # The embedding layer is indexed with a long tensor of token ids.
    id_tensor = torch.tensor(bpe_tokenizer.encode(word).ids, dtype=torch.long)
    return model.embeddings(id_tensor).sum(dim=0)

class CBOW(nn.Module):
    """Embedding table followed by a linear layer that scores every vocabulary entry."""

    def __init__(self, vocab_size, embedding_dim):
        """Create the embedding table and the output projection.

        Args:
            vocab_size: Number of rows in the embedding table and number of
                output scores.
            embedding_dim: Width of each embedding vector.
        """
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim, max_norm=1.0)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Average the embeddings along dim 1 and project them to vocabulary scores.

        Args:
            inputs: Integer index tensor; presumably shaped (batch, context) --
                confirm against the training code.

        Returns:
            Output of the linear layer applied to the averaged embeddings.
        """
        pooled = self.embeddings(inputs).mean(dim=1)
        return self.linear(pooled)



<h3>3.2. Defining the model parameters and loading the model</h3>


In [None]:
from tokenizers import Tokenizer
import torch
import torch.nn as nn
import json
import numpy as np
from numpy.linalg import norm
from tokenizers import Tokenizer

def create_word_embedding(word, bpe_tokenizer, embeddings_dict):
    """Embed a word as the sum of the stored vectors of its BPE sub-tokens.

    Each sub-token is looked up by its integer id, then by its token text,
    then by the id rendered as a string. The loader in this notebook stores
    string keys, so an id-only lookup would never match.
    NOTE(review): which of the two string forms the embeddings file uses is
    not visible here -- confirm and drop the unused fallback.

    Args:
        word: Surface form to embed.
        bpe_tokenizer: Tokenizer whose ``encode(word)`` result exposes
            ``ids`` and ``tokens``.
        embeddings_dict: Mapping from sub-token key to a NumPy vector.

    Returns:
        A new float array holding the sum of the vectors that were found.
        When nothing is found (or the word encodes to no tokens) a zero
        vector is returned, shaped like the entries of ``embeddings_dict``
        or, if the mapping is empty, of length ``embedding_dim``.
    """
    encoded = bpe_tokenizer.encode(word)
    total = None
    for token_id, token_text in zip(encoded.ids, encoded.tokens):
        for key in (token_id, token_text, str(token_id)):
            vector = embeddings_dict.get(key)
            if vector is not None:
                break
        else:
            # Unknown sub-token: contributes nothing to the sum.
            continue
        if total is None:
            # Copy so the stored vector is never mutated by the caller.
            total = np.array(vector, dtype=float)
        else:
            total = total + vector
    if total is not None:
        return total
    for sample in embeddings_dict.values():
        return np.zeros(np.shape(sample))
    return np.zeros(embedding_dim)

## Constants
## parent_path = 'path/to/your/directory/'
# Root directory of the project; every data/model path below is built from it.
parent_path = '/Users/kaganhitit_/Desktop/COMP442/0076757_word2vec/starter/'
unique_words_path = parent_path + 'evaluation/evaluation_training_unique_words.txt'
syntactic_file_path = parent_path + 'data/SynAnalogyTr.txt'  # syntactic analogy quadruples
semantic_file_path = parent_path + 'data/turkish-analogy-semantic.txt'  # semantic analogy quadruples

## Parameters
vocab_size = 4000
embedding_dim = 512
tokenizer_name = 'embeddings/bpe_tokenizer_vocab.json'
embeddings_path = parent_path + 'embeddings/bpe_tokenizer_embeddings.txt'

bpe_tokenizer = Tokenizer.from_file(parent_path + tokenizer_name)

# Each line of the embeddings file has the form "<token>: <JSON list>".
embeddings_dict = {}

with open(embeddings_path, 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        # Split on the LAST colon: a JSON list of numbers contains no colon,
        # whereas the token itself may be or contain ':'.
        token, vector = line.rsplit(':', 1)
        embeddings_dict[token.strip()] = np.array(json.loads(vector))

print("Loaded embeddings for", len(embeddings_dict), "tokens.")

# One word per line; surrounding whitespace is stripped.
with open(unique_words_path, 'r', encoding='utf-8') as file:
    unique_words = [line.strip() for line in file]

print('Unique word length :' + str(len(unique_words)))


<h3>3.3. Syntactic and Semantic Evaluation</h3>

In [None]:
# Keep the sub-token table loaded in the previous cell. Rebinding
# embeddings_dict to an empty dict here would leave create_word_embedding
# nothing to look up, so every word would embed to zeros.
token_embeddings = embeddings_dict


def _embed(word):
    """Embed ``word`` from the loaded sub-token vectors (NumPy vector)."""
    return create_word_embedding(word, bpe_tokenizer, token_embeddings)


def _normalize_rows(matrix):
    """Return a copy of ``matrix`` whose non-zero rows have unit L2 norm."""
    lengths = norm(matrix, axis=1, keepdims=True)
    # Zero rows stay zero instead of being divided by a zero norm.
    return np.divide(matrix, lengths, out=np.zeros_like(matrix), where=lengths > 0)


def _analogy_targets(analogies, dim):
    """Build the (dim, len(analogies)) matrix of unit-length ``b - a + c`` vectors.

    A column is left as zeros when ``b - a + c`` has zero norm; dividing by
    that norm would fill the column with NaN.
    """
    targets = np.zeros((len(analogies), dim))
    for row, (a, b, c, _expected) in enumerate(analogies):
        vector = _embed(b) - _embed(a) + _embed(c)
        length = norm(vector)
        if length > 0:
            targets[row] = vector / length
    return targets.transpose()


def _score_analogies(analogies, targets, vocab_matrix, vocab_words, top_k=5):
    """Score analogies by cosine similarity against every vocabulary row.

    Args:
        analogies: List of ``[a, b, c, d]`` word lists.
        targets: Matrix from ``_analogy_targets`` (one column per analogy).
        vocab_matrix: Row-normalised embeddings, one row per vocabulary word.
        vocab_words: Words in the same order as the rows of ``vocab_matrix``.
        top_k: Number of most similar words inspected per analogy.

    Returns:
        ``(accuracy, mrr, hits)`` where ``accuracy`` is the fraction of
        analogies whose ``d`` is among the ``top_k`` candidates, ``mrr`` is
        the mean reciprocal rank (0 for misses) and ``hits`` lists the
        correctly predicted ``d`` words. Both ratios are 0.0 when
        ``analogies`` is empty.
    """
    hits = []
    reciprocal_rank_sum = 0.0
    similarities = np.matmul(vocab_matrix, targets)
    for column, (_a, _b, _c, expected) in enumerate(analogies):
        if not targets[:, column].any():
            # Undefined target direction: every similarity is 0, so any
            # "match" would be arbitrary. Count it as a miss.
            continue
        best = np.argsort(similarities[:, column])[-top_k:][::-1]
        for position, index in enumerate(best, start=1):
            if vocab_words[index] == expected:
                hits.append(expected)
                reciprocal_rank_sum += 1 / position
                break
    total = len(analogies)
    if total == 0:
        return 0.0, 0.0, hits
    return len(hits) / total, reciprocal_rank_sum / total, hits


# Word-level embeddings live in their own dictionary so the sub-token table
# stays intact and this cell can be re-run.
word_embeddings = {word: _embed(word) for word in unique_words}

words_list = list(word_embeddings)
embeddings_matrix = np.zeros((len(word_embeddings), embedding_dim))
for i, embedding in enumerate(word_embeddings.values()):
    embeddings_matrix[i, :] = embedding
embeddings_matrix = _normalize_rows(embeddings_matrix)
print("Embeddings matrix shape:", embeddings_matrix.shape)


## Syntactic Accuracy
# Keep only the lines that contain exactly four whitespace-separated words.
with open(syntactic_file_path, 'r', encoding='utf-8') as file:
    syntactic_data = [words for words in (line.split() for line in file) if len(words) == 4]

target_embeddings = _analogy_targets(syntactic_data, embedding_dim)
print("Target embeddings matrix shape:", target_embeddings.shape)

syntactic_accuracy, MRR_syntactic, accurate_words = _score_analogies(
    syntactic_data, target_embeddings, embeddings_matrix, words_list
)

print('Syntactic accuracy:', syntactic_accuracy)
print('Syntactic MRR:', MRR_syntactic)
print('Accurate words:', str(accurate_words))

## Semantic Accuracy
semantic_data = {}
current_category = None
with open(semantic_file_path, 'r', encoding='utf-8') as file:
    for line in file:
        line = line.strip()
        if line.startswith(':'):
            # A line starting with ':' opens a new category.
            current_category = line[1:].strip()
            semantic_data[current_category] = []
        elif current_category is not None:
            words = line.split()
            if len(words) == 4:
                semantic_data[current_category].append(words)


for category_name, category in semantic_data.items():
    target_embeddings = _analogy_targets(category, embedding_dim)
    print(category_name + ", Target embeddings matrix shape:" + str(target_embeddings.shape))

    semantic_accuracy, MRR_semantic, accurate_words = _score_analogies(
        category, target_embeddings, embeddings_matrix, words_list
    )

    print(str(category_name) + ' semantic accuracy:', semantic_accuracy)
    print(str(category_name) + ' semantic MRR:', MRR_semantic)
    print('Accurate words:', str(accurate_words))