<h1>1. Word Tokenizer</h1>

<h2>1.1. Training the Word Tokenizer</h2>

In [3]:
## CHANGE YOUR PATH HERE.
# Root directory of the assignment files. Later cells build the data,
# vocabulary and checkpoint paths by plain string concatenation
# (drive_path + "data/..."), so the value has to end with a path separator.
drive_path = '/Users/kaganhitit_/Desktop/COMP442/HW1/starter/'
##drive_path = "path/to/your/directory"

In [5]:
import re
from collections import Counter
import re

def read_articles(file_path, max_articles=400000):
    """
    Reads articles from a text file.

    An article starts at a header of the form "== Title == " that is followed
    by an empty line; everything up to the next such header is its text.

    Args:
        file_path (str): Path to the text file containing articles.
        max_articles (int, optional): Maximum number of articles to read. Defaults to 400000.

    Returns:
        list: A list of dictionaries, where each dictionary represents an article with keys "title" and "text".
    """
    with open(file_path, "r", encoding='utf-8') as fi:
        content = fi.read()

    # Split once with a capturing group so every text stays paired with the
    # header that introduced it. Collecting titles and texts with two
    # different patterns lets a "== x ==" fragment inside a body (for example
    # a section heading) shift all following titles by one position.
    parts = re.split("(== .* ==) \n\n", content)
    titles = parts[1::2]
    texts = parts[2::2]

    return [
        {"title": title.strip("==").strip(), "text": text.strip()}
        for title, text in list(zip(titles, texts))[:max_articles]
    ]

def filter_vocabulary(tokenized_articles, vocabulary_count=50000):
    """
    Keeps the most frequent tokens and replaces every other token with '<OOV>'.

    Args:
        tokenized_articles (list): A list of lists, where each inner list holds the tokens of one article.
        vocabulary_count (int, optional): Size of the vocabulary including the '<OOV>' entry. Defaults to 50000.

    Returns:
        tuple: The filtered articles (new lists, the input is left untouched) and
        the set of kept tokens, which always contains '<OOV>'.
    """
    print('Filtering tokens according to vocabulary count:', vocabulary_count)

    # Tokens are counted in reading order, one article after the other.
    frequencies = Counter()
    for article in tokenized_articles:
        frequencies.update(article)

    # One slot of the budget is reserved for the out-of-vocabulary marker.
    kept_tokens = {token for token, _ in frequencies.most_common(vocabulary_count - 1)}
    kept_tokens.add('<OOV>')

    filtered_articles = [
        [token if token in kept_tokens else '<OOV>' for token in article]
        for article in tokenized_articles
    ]

    print('Filtering finished.')
    return filtered_articles, kept_tokens

def turkish_lower(text):
    """
    Lowercases text using the Turkish rules for the letter I.

    Args:
        text (str): The text to convert.

    Returns:
        str: The text with 'I' mapped to 'ı', 'İ' mapped to 'i', and every
        other character lowercased by str.lower().
    """
    # The two capitals are translated first so that str.lower() never sees them.
    turkish_capitals = str.maketrans({'I': 'ı', 'İ': 'i'})
    return text.translate(turkish_capitals).lower()

def filter_turkish_text(articles):
    """
    Filters and processes Turkish text articles.

    Each article text is lowercased with turkish_lower, reduced to the words
    matched by the token pattern (two or more letters from a-z and ğüşıöç,
    optionally followed by an apostrophe and more such letters), and stripped
    of the stop words listed in data/turkce-stop-words.txt under drive_path.

    Args:
        articles (list): A list of dictionaries, where each dictionary represents an article with a "text" key.

    Returns:
        list: A list of processed text strings from the articles, tokens joined by single spaces.
    """
    # Regex pattern for Turkish characters, excluding single characters
    token_pattern = re.compile(r'\b[a-zğüşıöç]{2,}(?:\'[a-zğüşıöç]+)?\b')

    # A set makes the per-token membership check constant time.
    stop_words = set(import_stop_words(drive_path + 'data/turkce-stop-words.txt'))

    processed_articles = []
    for article in articles:
        text = turkish_lower(article['text'])
        # Build a new list instead of calling remove() on the list being
        # iterated: removing in place shifts the remaining items, so the token
        # right after a removed stop word was never checked.
        tokens = [token for token in token_pattern.findall(text) if token not in stop_words]
        processed_articles.append(' '.join(tokens))

    # Preview the first article only when there is one.
    if processed_articles:
        print(processed_articles[0])
    print('Articles are processed.')
    return processed_articles

def word_tokenizer(articles):
    """
    Performs word tokenization on Turkish articles.

    Args:
        articles (list): A list of dictionaries, where each dictionary represents an article with a "text" key.

    Returns:
        tuple: The tokenized articles (lists of tokens, rare tokens replaced as
        done by filter_vocabulary) and the vocabulary (set of tokens).
    """
    print('Tokenizing articles...')

    processed_articles = filter_turkish_text(articles)

    # split() without an argument yields [] for an article that has no tokens
    # left; split(' ') would yield [''] and feed an empty-string token into
    # the frequency count.
    tokenized_articles = [article.split() for article in processed_articles]

    # Preview the first article only when there is one.
    if tokenized_articles:
        print(tokenized_articles[0])
    tokenized_articles, vocabulary = filter_vocabulary(tokenized_articles)
    print('Articles are tokenized.')
    return tokenized_articles, vocabulary

def import_stop_words(file_path):
    """
    Loads a stop-word list from a UTF-8 text file.

    Args:
        file_path (str): Path to the text file, one stop word per line.

    Returns:
        list: One entry per line of the file, in file order, with surrounding
        whitespace removed (an empty line yields an empty string).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # strip() drops the trailing newline and any surrounding spaces
        return [raw_line.strip() for raw_line in handle]

def is_stop_word(stop_words, word):
    """
    Tells whether a word belongs to the given stop-word collection.

    Args:
        stop_words (list): The stop words to test against.
        word (str): The candidate word.

    Returns:
        bool: True when the word is contained in stop_words, otherwise False.
    """
    found = word in stop_words
    return found

## Reading the articles
import json

training_set_path = drive_path + "data/trwiki-67/trwiki-67.train.txt"
vocab_save_path = drive_path + 'embeddings/word_tokenizer_vocab.json'

articles = read_articles(training_set_path)
tokenized_articles, vocabulary = word_tokenizer(articles)

# Convert the set of words to a dictionary with indices.
# The iteration order of a set of strings changes between interpreter
# sessions, so the tokens are sorted first: re-running this cell then assigns
# the same index to the same token and stays compatible with saved embeddings.
vocab_dict = {word: i for i, word in enumerate(sorted(vocabulary))}

# json.dump takes care of quoting and separators; ensure_ascii=False keeps
# the Turkish letters readable in the file.
with open(vocab_save_path, 'w', encoding='utf-8') as f:
    json.dump(vocab_dict, f, ensure_ascii=False, indent=4)

print(f"Vocabulary saved at {vocab_save_path}")

Tokenizing articles...


KeyboardInterrupt: 

<h2>1.2. Training the Word Embeddings</h2>
<h4>Content:</h4>
<ul>
    <li>1.2.1. Defining the functions and training model</li>
    <li>1.2.2. Preparing the training and validation dataset</li>
    <li>1.2.3. Training loop</li>
</ul>

<h3>1.2.1. Defining the functions and training model</h3>


In [None]:
import re
from collections import Counter
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD
from tqdm import tqdm
from collections import Counter
from itertools import chain
import json

def read_articles(file_path, max_articles=400000):
    """
    Reads articles from a text file.

    An article starts at a header of the form "== Title == " that is followed
    by an empty line; everything up to the next such header is its text.

    Args:
        file_path (str): Path to the text file containing articles.
        max_articles (int, optional): Maximum number of articles to return. Defaults to 400000.

    Returns:
        list: One dictionary per article with the keys "title" and "text".
    """
    with open(file_path, "r", encoding='utf-8') as fi:
        content = fi.read()

    # Split once with a capturing group so every text stays paired with the
    # header that introduced it. Collecting titles and texts with two
    # different patterns lets a "== x ==" fragment inside a body (for example
    # a section heading) shift all following titles by one position.
    parts = re.split("(== .* ==) \n\n", content)
    titles = parts[1::2]
    texts = parts[2::2]

    return [
        {"title": title.strip("==").strip(), "text": text.strip()}
        for title, text in list(zip(titles, texts))[:max_articles]
    ]

def turkish_lower(text):
    """
    Lowercases text using the Turkish rules for the letter I.

    'I' becomes 'ı' and 'İ' becomes 'i'; every other character is lowercased
    by str.lower().
    """
    # The two capitals are translated first so that str.lower() never sees them.
    turkish_capitals = str.maketrans({'I': 'ı', 'İ': 'i'})
    return text.translate(turkish_capitals).lower()

def filter_turkish_text(articles):
    """
    Cleans article texts and removes stop words.

    Each article text is lowercased with turkish_lower, reduced to the words
    matched by the token pattern (two or more letters from a-z and ğüşıöç,
    optionally followed by an apostrophe and more such letters), and stripped
    of the stop words listed in data/turkce-stop-words.txt under drive_path.

    Args:
        articles (list): Dictionaries that each carry a "text" key.

    Returns:
        list: One string per article, the kept tokens joined by single spaces.
    """
    print('Processing articles...')
    # Regex pattern for Turkish characters, excluding single characters
    token_pattern = re.compile(r'\b[a-zğüşıöç]{2,}(?:\'[a-zğüşıöç]+)?\b')

    # A set makes the per-token membership check constant time.
    stop_words = set(import_stop_words(drive_path + 'data/turkce-stop-words.txt'))

    processed_articles = []
    for article in articles:
        text = turkish_lower(article['text'])
        # Build a new list instead of calling remove() on the list being
        # iterated: removing in place shifts the remaining items, so the token
        # right after a removed stop word was never checked.
        tokens = [token for token in token_pattern.findall(text) if token not in stop_words]
        processed_articles.append(' '.join(tokens))

    # Preview the first article only when there is one.
    if processed_articles:
        print(processed_articles[0])
    print('Articles are processed.')
    return processed_articles

def tokenize_articles(articles, vocabulary):
    """
    Turns articles into lists of word tokens.

    Args:
        articles (list): Dictionaries that each carry a "text" key.
        vocabulary: Not used by this function; kept so existing calls keep working.

    Returns:
        list: One list of tokens per article.
    """
    processed_articles = filter_turkish_text(articles)

    # split() without an argument yields [] for an article that has no tokens
    # left; split(' ') would yield [''], i.e. a spurious empty-string token.
    return [article.split() for article in processed_articles]

def load_vocabulary(vocab_path):
    """
    Loads a vocabulary from a UTF-8 encoded JSON file.

    Args:
        vocab_path (str): Path to the JSON file.

    Returns:
        The parsed JSON content.
    """
    with open(vocab_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

def tokens_to_indices(tokenized_articles, vocabulary):
    """
    Replaces every token by its vocabulary index, in place.

    Args:
        tokenized_articles (list): Lists of tokens; the inner lists are overwritten.
        vocabulary (dict): Maps a token to its index.

    Returns:
        list: The same outer list that was passed in, now holding indices.
        Tokens missing from the vocabulary receive the index stored under
        '<OOV>' (None when that key is absent too).
    """
    fallback_index = vocabulary.get('<OOV>')
    for article in tokenized_articles:
        # Slice assignment keeps the original inner list objects.
        article[:] = [vocabulary.get(token, fallback_index) for token in article]
    return tokenized_articles

def create_tuples_from_articles(indexed_articles, context_size):
    """
    Builds (context, target) training pairs from indexed articles.

    Args:
        indexed_articles (list): Lists of token indices, one list per article.
        context_size (int): Number of tokens taken on each side of the target.

    Returns:
        list: Tuples (context, target) where context is the list of the
        context_size items before and after the target position.
    """
    pairs = []
    for article in indexed_articles:
        # Only positions with a complete window on both sides are used, so the
        # first and last context_size items of an article never become targets.
        end = len(article) - context_size
        for center in range(context_size, end):
            before = article[center - context_size:center]
            after = article[center + 1:center + context_size + 1]
            pairs.append((before + after, article[center]))
    return pairs

def import_stop_words(file_path):
    """
    Loads a stop-word list from a UTF-8 text file.

    Returns one entry per line of the file, in file order, with surrounding
    whitespace removed (an empty line yields an empty string).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # strip() drops the trailing newline and any surrounding spaces
        return [raw_line.strip() for raw_line in handle]

def is_stop_word(stop_words, word):
    """Returns True when word is contained in stop_words, otherwise False."""
    found = word in stop_words
    return found

class CBOW(nn.Module):
    """
    Continuous bag-of-words model: embeds the context ids, averages the
    embeddings and projects the average onto vocabulary-sized scores.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # max_norm=1.0: looked-up rows whose norm exceeds 1.0 are renormalized.
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim, max_norm=1.0
        )
        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, inputs):
        """Returns the raw scores produced by the linear layer for each row of inputs."""
        # The average runs over dimension 1 of the embedded ids.
        context_mean = self.embeddings(inputs).mean(dim=1)
        return self.linear(context_mean)

class CBOWDataset(Dataset):
    """Dataset view over a sequence of (context, target) pairs."""

    def __init__(self, context_target_pairs):
        self.context_target_pairs = context_target_pairs

    def __len__(self):
        return len(self.context_target_pairs)

    def __getitem__(self, idx):
        """Returns the pair at idx as two int64 tensors (context, target)."""
        context_ids, target_id = self.context_target_pairs[idx]
        context_tensor = torch.tensor(context_ids, dtype=torch.long)
        target_tensor = torch.tensor(target_id, dtype=torch.long)
        return context_tensor, target_tensor
    

<h3>1.2.2. Preparing the training and validation dataset</h3>

In [None]:
## Reading the articles
training_set_path = drive_path + "data/trwiki-67/trwiki-67.train.txt"
validation_set_path = drive_path + "data/trwiki-67/trwiki-67.val.txt"
tokenizer_path = drive_path + 'embeddings/word_tokenizer_vocab.json'  # Path to the saved tokenizer

## Reading the tokenizer.
vocabulary = load_vocabulary(tokenizer_path)

## Tokenized training and validation sets. The arrays are indexes, not words.
training_articles = tokens_to_indices(tokenize_articles(read_articles(training_set_path), vocabulary), vocabulary)
validation_articles = tokens_to_indices(tokenize_articles(read_articles(validation_set_path), vocabulary), vocabulary)

# Number of tokens taken on each side of the target.
CONTEXT_SIZE = 2
# From here on the two names hold (context, target) pairs, no longer articles.
training_articles = create_tuples_from_articles(training_articles, CONTEXT_SIZE)
validation_articles = create_tuples_from_articles(validation_articles, CONTEXT_SIZE)

# Create the datasets
train_dataset = CBOWDataset(training_articles)
val_dataset = CBOWDataset(validation_articles)

<h3>1.2.3. Training loop</h3>

In [None]:
# Hyperparameters
embedding_dim = 512
learning_rate = 0.04
batch_size = 128
epochs = 10
# One embedding row and one output score per vocabulary entry.
vocab_size = len(vocabulary.keys())

print('Vocab size : ' + str(vocab_size))

# Create the DataLoaders (only the training data is shuffled)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size)

# Model, Device, DataLoader
neural_net = CBOW(vocab_size, embedding_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
neural_net.to(device)

# Loss Function and Optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = SGD(neural_net.parameters(), lr=learning_rate)

# Training Loop
print('Starting training')
for epoch in range(epochs):
    neural_net.train()
    total_loss = 0.0
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{epochs}')

    for context_vector, target_index in progress_bar:
        context_vector, target_index = context_vector.to(device), target_index.to(device)

        # Clear the gradients left over from the previous batch.
        neural_net.zero_grad()
        # Despite the name, this holds the raw output of the model's linear
        # layer; nn.CrossEntropyLoss is applied to it directly.
        log_probs = neural_net(context_vector)
        loss = loss_function(log_probs, target_index)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    # Mean of the per-batch losses.
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'\nEpoch {epoch+1} Completed - Training Loss: {avg_train_loss:.4f}')

    # Validation
    neural_net.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for context_vector, target_index in val_dataloader:
            context_vector, target_index = context_vector.to(device), target_index.to(device)
            log_probs = neural_net(context_vector)
            loss = loss_function(log_probs, target_index)
            total_val_loss += loss.item()

    # Mean of the per-batch losses; raises ZeroDivisionError when the
    # validation loader yields no batches.
    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f'Validation Loss: {avg_val_loss:.4f}')

    # Save a checkpoint of the weights after every epoch
    model_save_path = drive_path + f'word_model_epoch_{epoch+1}.pth'
    torch.save(neural_net.state_dict(), model_save_path)

# Optionally, save the final model separately
final_model_save_path = drive_path + 'word_model_final.pth'
torch.save(neural_net.state_dict(), final_model_save_path)


<h1>2. Character Trigram Tokenizer</h1>

<h2>2.1. Training the Character Trigram Tokenizer</h2>

In [None]:
import re

def read_articles(file_path, max_articles=400000):
    """
    Reads articles from a text file.

    An article starts at a header of the form "== Title == " that is followed
    by an empty line; everything up to the next such header is its text.

    Args:
        file_path (str): Path to the text file containing articles.
        max_articles (int, optional): Maximum number of articles to return. Defaults to 400000.

    Returns:
        list: One dictionary per article with the keys "title" and "text".
    """
    with open(file_path, "r", encoding='utf-8') as fi:
        content = fi.read()

    # Split once with a capturing group so every text stays paired with the
    # header that introduced it. Collecting titles and texts with two
    # different patterns lets a "== x ==" fragment inside a body (for example
    # a section heading) shift all following titles by one position.
    parts = re.split("(== .* ==) \n\n", content)
    titles = parts[1::2]
    texts = parts[2::2]

    return [
        {"title": title.strip("==").strip(), "text": text.strip()}
        for title, text in list(zip(titles, texts))[:max_articles]
    ]

from collections import Counter
import re

def filter_vocabulary(tokenized_articles, vocabulary_count=50000):
    """
    Keeps the most frequent tokens and replaces every other token with '<OOV>'.

    Args:
        tokenized_articles (list): Lists of tokens, one list per article.
        vocabulary_count (int, optional): Size of the vocabulary including the
            two reserved entries '<OOV>' and '<PAD>'. Defaults to 50000.

    Returns:
        tuple: The filtered articles (new lists) and the set of kept tokens,
        which always contains '<OOV>' and '<PAD>'.
    """
    print('Filtering tokens according to vocabulary count:', vocabulary_count)

    # Tokens are counted in reading order, one article after the other.
    token_freq = Counter()
    for article in tokenized_articles:
        token_freq.update(article)

    # The filtered articles contain '<OOV>', the trigram encoder looks up
    # vocabulary.get('<OOV>') and the training cell reads vocabulary['<PAD>'],
    # so both markers must be part of the vocabulary that gets saved.
    special_tokens = ('<OOV>', '<PAD>')
    keep_count = max(vocabulary_count - len(special_tokens), 0)
    most_frequent_tokens = {token for token, _ in token_freq.most_common(keep_count)}
    most_frequent_tokens.update(special_tokens)

    filtered_articles = [
        [token if token in most_frequent_tokens else '<OOV>' for token in article]
        for article in tokenized_articles
    ]

    print('Filtering finished.')
    return filtered_articles, most_frequent_tokens

def turkish_lower(text):
    """
    Lowercases text using the Turkish rules for the letter I.

    'I' becomes 'ı' and 'İ' becomes 'i'; every other character is lowercased
    by str.lower().
    """
    # The two capitals are translated first so that str.lower() never sees them.
    turkish_capitals = str.maketrans({'I': 'ı', 'İ': 'i'})
    return text.translate(turkish_capitals).lower()

def filter_turkish_text(articles):
    """
    Lowercases each article text and keeps only the words matched by the
    token pattern (two or more letters from a-z and ğüşıöç, optionally
    followed by an apostrophe and more such letters).

    Args:
        articles (list): Dictionaries that each carry a "text" key.

    Returns:
        list: One string per article, the kept words joined by single spaces.
    """
    print('Processing articles...')
    # Regex pattern for Turkish characters, excluding single characters
    word_regex = re.compile(r'\b[a-zğüşıöç]{2,}(?:\'[a-zğüşıöç]+)?\b')

    processed_articles = [
        ' '.join(word_regex.findall(turkish_lower(article['text'])))
        for article in articles
    ]

    # Preview of the first processed article.
    print(processed_articles[0])
    print('Articles are processed.')
    return processed_articles


def trigram_tokenizer(articles):
    """
    Splits every article into character trigrams and filters them by frequency.

    Each word is wrapped as '<word>' before its overlapping three-character
    windows are taken, so the first and last trigram carry a boundary marker.

    Args:
        articles (list): Dictionaries that each carry a "text" key.

    Returns:
        tuple: The filtered trigram lists (one per article) and the vocabulary,
        both as returned by filter_vocabulary.
    """
    print('Tokenizing articles...')

    processed_articles = filter_turkish_text(articles)

    tokenized_articles = []
    for article in processed_articles:
        article_trigrams = []
        for word in article.split(' '):
            marked = '<' + word + '>'
            article_trigrams += [marked[start:start + 3] for start in range(len(marked) - 2)]
        tokenized_articles.append(article_trigrams)

    # Preview of the first tokenized article.
    print(tokenized_articles[0])
    tokenized_articles, vocabulary = filter_vocabulary(tokenized_articles)
    print('Articles are tokenized.')
    return tokenized_articles, vocabulary

## Reading the articles
import json

training_set_path = drive_path + "data/trwiki-67/trwiki-67.train.txt"
vocab_save_path = drive_path + 'embeddings/trigram_tokenizer_vocab.json'

articles = read_articles(training_set_path)
tokenized_articles, vocabulary = trigram_tokenizer(articles)

# Convert the set of trigrams to a dictionary with indices.
# The iteration order of a set of strings changes between interpreter
# sessions, so the tokens are sorted first: re-running this cell then assigns
# the same index to the same token and stays compatible with saved embeddings.
vocab_dict = {word: i for i, word in enumerate(sorted(vocabulary))}

# json.dump takes care of quoting and separators; ensure_ascii=False keeps
# the Turkish letters readable in the file.
with open(vocab_save_path, 'w', encoding='utf-8') as f:
    json.dump(vocab_dict, f, ensure_ascii=False, indent=4)

print(f"Vocabulary saved at {vocab_save_path}")

<h2>2.2. Training the Character Trigram Embeddings</h2>
<h4>Content:</h4>
<ul>
    <li>2.2.1. Defining the functions and training model</li>
    <li>2.2.2. Preparing the training and validation dataset</li>
    <li>2.2.3. Training loop</li>
</ul>


<h3>2.2.1. Defining the functions and training model</h3>


In [None]:
import re
import json
from tokenizers import Tokenizer
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD
from tqdm import tqdm
from collections import Counter
from itertools import chain
import json
from torch.nn.utils.rnn import pad_sequence

def read_articles(file_path, max_articles=100000):
    """
    Reads articles from a text file.

    An article starts at a header of the form "== Title == " that is followed
    by an empty line; everything up to the next such header is its text.

    Args:
        file_path (str): Path to the text file containing articles.
        max_articles (int, optional): Maximum number of articles to return. Defaults to 100000.

    Returns:
        list: One dictionary per article with the keys "title" and "text".
    """
    with open(file_path, "r", encoding='utf-8') as fi:
        content = fi.read()

    # Split once with a capturing group so every text stays paired with the
    # header that introduced it. Collecting titles and texts with two
    # different patterns lets a "== x ==" fragment inside a body (for example
    # a section heading) shift all following titles by one position.
    parts = re.split("(== .* ==) \n\n", content)
    titles = parts[1::2]
    texts = parts[2::2]

    return [
        {"title": title.strip("==").strip(), "text": text.strip()}
        for title, text in list(zip(titles, texts))[:max_articles]
    ]

def turkish_lower(text):
    """
    Lowercases text using the Turkish rules for the letter I.

    'I' becomes 'ı' and 'İ' becomes 'i'; every other character is lowercased
    by str.lower().
    """
    # The two capitals are translated first so that str.lower() never sees them.
    turkish_capitals = str.maketrans({'I': 'ı', 'İ': 'i'})
    return text.translate(turkish_capitals).lower()

def filter_turkish_text(articles):
    """
    Cleans article texts, removes stop words and marks word boundaries.

    Each article text is lowercased with turkish_lower and reduced to the
    words matched by the token pattern (two or more letters from a-z and
    ğüşıöç, optionally followed by an apostrophe and more such letters).
    Stop words from data/turkce-stop-words.txt under drive_path are dropped
    and every remaining word is wrapped as '<word>'.

    Args:
        articles (list): Dictionaries that each carry a "text" key.

    Returns:
        list: One list of '<word>' strings per article.
    """
    print('Processing articles...')
    # Regex pattern for Turkish characters, excluding single characters
    token_pattern = re.compile(r'\b[a-zğüşıöç]{2,}(?:\'[a-zğüşıöç]+)?\b')

    # A set makes the per-word membership check constant time.
    stop_words = set(import_stop_words(drive_path + 'data/turkce-stop-words.txt'))

    processed_articles = []
    for article in articles:
        text = turkish_lower(article['text'])
        # A new list is built for two reasons: calling remove() on the list
        # being iterated skips the word right after each removed stop word,
        # and assigning '<' + word + '>' to the loop variable never changed
        # the list, so the boundary markers were silently lost.
        words = [
            '<' + word + '>'
            for word in token_pattern.findall(text)
            if word not in stop_words
        ]
        processed_articles.append(words)

    # Preview the first article only when there is one.
    if processed_articles:
        print(processed_articles[0])
    print('Articles are processed.')
    return processed_articles

def create_tuples_from_articles(processed_articles, context_size, vocabulary):
    """
    Builds (context, target) training pairs at trigram level.

    For every word that has context_size words on both sides, each of its
    trigram ids becomes a target once. The context of a target is made of the
    trigram ids of the left context words, the other trigrams of the target
    word, and the trigram ids of the right context words.

    Args:
        processed_articles (list): Lists of words, one list per article.
        context_size (int): Number of words taken on each side of the target word.
        vocabulary (dict): Maps a trigram to its id; passed to trigram_encode.

    Returns:
        list: Tuples (context_ids, target_id).
    """
    context_target_pairs = []
    skipped_pairs_count = 0
    added_pairs_count = 0
    for article in processed_articles:
        # Too short to hold a single full window: nothing to encode.
        if len(article) <= 2 * context_size:
            continue

        # Encode every word once instead of once per window it appears in.
        encoded_words = [trigram_encode(word, vocabulary) for word in article]

        for i in range(context_size, len(article) - context_size):
            left_context_tokens = list(chain.from_iterable(encoded_words[i - context_size:i]))
            right_context_tokens = list(chain.from_iterable(encoded_words[i + 1:i + context_size + 1]))
            target_tokens = encoded_words[i]

            # For each token in the target, create a new context-target pair
            for position, target_token in enumerate(target_tokens):
                # Leave out the target by position, so a trigram that occurs
                # twice in the word still appears in the context of its twin.
                other_target_tokens = target_tokens[:position] + target_tokens[position + 1:]
                extended_context = left_context_tokens + other_target_tokens + right_context_tokens
                # Id 0 is a valid target; only a missing id (None, returned
                # when the vocabulary has no '<OOV>' entry) is skipped.
                if extended_context and target_token is not None:
                    context_target_pairs.append((extended_context, target_token))
                    added_pairs_count += 1
                else:
                    skipped_pairs_count += 1
    print(f"Skipped {skipped_pairs_count} pairs.")
    print(f"Added {added_pairs_count} pairs.")
    return context_target_pairs


def import_stop_words(file_path):
    """
    Loads a stop-word list from a UTF-8 text file.

    Returns one entry per line of the file, in file order, with surrounding
    whitespace removed (an empty line yields an empty string).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # strip() drops the trailing newline and any surrounding spaces
        return [raw_line.strip() for raw_line in handle]

def is_stop_word(stop_words, word):
    """Returns True when word is contained in stop_words, otherwise False."""
    found = word in stop_words
    return found

def load_vocabulary(vocab_path):
    """
    Loads a vocabulary from a UTF-8 encoded JSON file.

    Args:
        vocab_path (str): Path to the JSON file.

    Returns:
        The parsed JSON content.
    """
    with open(vocab_path, 'r', encoding='utf-8') as handle:
        return json.load(handle)

def trigram_encode(text, vocabulary):
    """
    Encodes a string as the ids of its overlapping character trigrams.

    Args:
        text (str): The string to encode; fewer than three characters give an empty result.
        vocabulary (dict): Maps a trigram to its id.

    Returns:
        list: One id per trigram, in order. Trigrams missing from the
        vocabulary receive the id stored under '<OOV>' (None when that key is
        absent too).
    """
    fallback_id = vocabulary.get('<OOV>')
    return [
        vocabulary.get(text[start:start + 3], fallback_id)
        for start in range(len(text) - 2)
    ]

def get_collate_fn(pad_token_id):
    """
    Creates a collate function that pads contexts of different lengths.

    Args:
        pad_token_id (int): Value used to fill the shorter contexts.

    Returns:
        callable: Takes a batch of (context_tensor, target_tensor) items and
        returns (padded_contexts, stacked_targets).
    """
    def collate_batch(batch):
        # Separate the batch items into a tuple of contexts and a tuple of targets.
        context_seqs, target_items = zip(*batch)

        # Contexts are padded on the right up to the longest one in the batch.
        padded_contexts = pad_sequence(context_seqs, batch_first=True, padding_value=pad_token_id)

        return padded_contexts, torch.stack(target_items)

    return collate_batch


class CBOW(nn.Module):
    """
    Continuous bag-of-words model: embeds the context ids, averages the
    embeddings and projects the average onto vocabulary-sized scores.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # max_norm=1.0: looked-up rows whose norm exceeds 1.0 are renormalized.
        self.embeddings = nn.Embedding(
            num_embeddings=vocab_size, embedding_dim=embedding_dim, max_norm=1.0
        )
        self.linear = nn.Linear(in_features=embedding_dim, out_features=vocab_size)

    def forward(self, inputs):
        """Returns the raw scores produced by the linear layer for each row of inputs."""
        # Every position along dimension 1 contributes to the average,
        # including any padding ids present in the input.
        context_mean = self.embeddings(inputs).mean(dim=1)
        return self.linear(context_mean)

class CBOWDataset(Dataset):
    """Dataset view over a sequence of (context, target) pairs."""

    def __init__(self, context_target_pairs):
        self.context_target_pairs = context_target_pairs

    def __len__(self):
        return len(self.context_target_pairs)

    def __getitem__(self, idx):
        """Returns the pair at idx as two int64 tensors (context, target)."""
        context_ids, target_id = self.context_target_pairs[idx]
        context_tensor = torch.tensor(context_ids, dtype=torch.long)
        target_tensor = torch.tensor(target_id, dtype=torch.long)
        return context_tensor, target_tensor


<h3>2.2.2. Preparing the training and validation dataset</h3>

In [None]:
## Reading the articles
training_set_path = drive_path + "data/trwiki-67/trwiki-67.train.txt"
validation_set_path = drive_path + "data/trwiki-67/trwiki-67.val.txt"
tokenizer_path = drive_path + 'embeddings/trigram_tokenizer_vocab.json'  # Path to the saved tokenizer

## Reading the tokenizer.
vocabulary = load_vocabulary(tokenizer_path)

# Word lists per article; the trigram encoding happens while the pairs are built.
training_articles = filter_turkish_text(read_articles(training_set_path))
validation_articles = filter_turkish_text(read_articles(validation_set_path))

# Number of words taken on each side of the target word.
CONTEXT_SIZE = 2
# From here on the two names hold (context, target) pairs, no longer articles.
training_articles = create_tuples_from_articles(training_articles, CONTEXT_SIZE, vocabulary)
validation_articles = create_tuples_from_articles(validation_articles, CONTEXT_SIZE, vocabulary)

# Create the datasets
train_dataset = CBOWDataset(training_articles)
val_dataset = CBOWDataset(validation_articles)

<h3>2.2.3. Training loop</h3>

In [None]:
# Hyperparameters
embedding_dim = 512
learning_rate = 0.04
batch_size = 128
epochs = 10
# One embedding row and one output score per vocabulary entry.
vocab_size = len(vocabulary.keys())

# Requires a '<PAD>' entry in the loaded vocabulary; a missing key raises
# KeyError on this line.
pad_token_id = vocabulary['<PAD>']
print(pad_token_id)
# Contexts have different lengths, so each batch is padded by the collate function.
collate_fn = get_collate_fn(pad_token_id)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)

# Model, Device, DataLoader
neural_net = CBOW(vocab_size, embedding_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
neural_net.to(device)

# Loss Function and Optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = SGD(neural_net.parameters(), lr=learning_rate)

# Training Loop
print('Starting training')
for epoch in range(epochs):
    neural_net.train()
    total_loss = 0.0
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{epochs}')

    for context_vector, target_index in progress_bar:
        context_vector, target_index = context_vector.to(device), target_index.to(device)

        # Clear the gradients left over from the previous batch.
        neural_net.zero_grad()
        # Despite the name, this holds the raw output of the model's linear
        # layer; nn.CrossEntropyLoss is applied to it directly.
        log_probs = neural_net(context_vector)
        loss = loss_function(log_probs, target_index)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    # Mean of the per-batch losses.
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'\nEpoch {epoch+1} Completed - Training Loss: {avg_train_loss:.4f}')

    # Validation
    neural_net.eval()
    total_val_loss = 0.0
    with torch.no_grad():
        for context_vector, target_index in val_dataloader:
            context_vector, target_index = context_vector.to(device), target_index.to(device)
            log_probs = neural_net(context_vector)
            loss = loss_function(log_probs, target_index)
            total_val_loss += loss.item()

    # Mean of the per-batch losses; raises ZeroDivisionError when the
    # validation loader yields no batches.
    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f'Validation Loss: {avg_val_loss:.4f}')

    # Save a checkpoint of the weights after every epoch
    model_save_path = drive_path + f'trigram_model_epoch_{epoch+1}.pth'
    torch.save(neural_net.state_dict(), model_save_path)

# Optionally, save the final model separately
final_model_save_path = drive_path + 'trigram_model_final.pth'
torch.save(neural_net.state_dict(), final_model_save_path)

<h1>3. BPE Tokenizer</h1>

<h2>3.1. Training the BPE Tokenizer</h2>

In [None]:
import re
from collections import Counter
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

def read_articles(file_path, max_articles=400000):
    """
    Reads articles from a text file.

    An article starts at a header of the form "== Title == " that is followed
    by an empty line; everything up to the next such header is its text.

    Args:
        file_path (str): Path to the text file containing articles.
        max_articles (int, optional): Maximum number of articles to return. Defaults to 400000.

    Returns:
        list: One dictionary per article with the keys "title" and "text".
    """
    with open(file_path, "r", encoding='utf-8') as fi:
        content = fi.read()

    # Split once with a capturing group so every text stays paired with the
    # header that introduced it. Collecting titles and texts with two
    # different patterns lets a "== x ==" fragment inside a body (for example
    # a section heading) shift all following titles by one position.
    parts = re.split("(== .* ==) \n\n", content)
    titles = parts[1::2]
    texts = parts[2::2]

    return [
        {"title": title.strip("==").strip(), "text": text.strip()}
        for title, text in list(zip(titles, texts))[:max_articles]
    ]

def turkish_lower(text):
    """
    Lowercases text using the Turkish rules for the letter I.

    'I' becomes 'ı' and 'İ' becomes 'i'; every other character is lowercased
    by str.lower().
    """
    # The two capitals are translated first so that str.lower() never sees them.
    turkish_capitals = str.maketrans({'I': 'ı', 'İ': 'i'})
    return text.translate(turkish_capitals).lower()

def filter_turkish_text(articles):
    """
    Lowercases each article text and keeps only the words matched by the
    token pattern (two or more letters from a-z and ğüşıöç, optionally
    followed by an apostrophe and more such letters).

    Args:
        articles (list): Dictionaries that each carry a "text" key.

    Returns:
        list: One string per article, the kept words joined by single spaces.
    """
    print('Processing articles...')
    # Regex pattern for Turkish characters, excluding single characters
    word_regex = re.compile(r'\b[a-zğüşıöç]{2,}(?:\'[a-zğüşıöç]+)?\b')

    processed_articles = [
        ' '.join(word_regex.findall(turkish_lower(article['text'])))
        for article in articles
    ]

    # Preview of the first processed article.
    print(processed_articles[0])
    print('Articles are processed.')
    return processed_articles

## Reading the articles
training_set_path = drive_path + "data/trwiki-67/trwiki-67.train.txt"
## This txt file is the preprocessed articles for the BPE tokenizer train. I did not include it because it is just the processed version of the training data.
bpe_tokenizer_train_path = drive_path + 'bpe_training_text.txt' 

# Read the raw training articles
articles = read_articles(training_set_path)

# One cleaned, space-joined string per article
processed_articles = filter_turkish_text(articles)

# Write the filtered text bodies to a file with <s> and </s>, one article per line
with open(bpe_tokenizer_train_path, 'w', encoding='utf-8') as f:
    for text in processed_articles:
        f.write("<s> " + text + " </s>\n")

# Initialize a tokenizer
tokenizer = Tokenizer(BPE(unk_token="<OOV>"))
tokenizer.pre_tokenizer = Whitespace()

# Create a trainer
trainer = BpeTrainer(special_tokens=["<OOV>", "<s>", "</s>", "<PAD>"], vocab_size = 4000)

# Train the tokenizer
tokenizer.train(files=[bpe_tokenizer_train_path], trainer=trainer)

# NOTE(review): the dataset-preparation cell for the BPE embeddings loads
# 'embeddings/bpe_tokenizer_vocab4000_with_pad.json', which is a different
# file name from the one written here - confirm which one is intended.
tokenizer.save(drive_path + "embeddings/bpe_tokenizer_vocab.json")

<h2>3.2. Training the BPE Embeddings</h2>
<h4>Content:</h4>
<ul>
    <li>3.2.1. Defining the functions and training model</li>
    <li>3.2.2. Preparing the training and validation dataset</li>
    <li>3.2.3. Training loop</li>
</ul>


<h3>3.2.1. Defining the functions and training model</h3>

In [None]:
import re
from collections import Counter
import torch
from torch.nn.utils.rnn import pad_sequence
from tokenizers import Tokenizer
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD
from tqdm import tqdm
from itertools import chain
import json

def read_articles(file_path, max_articles=150000):
    """
    Reads articles from a text file.

    An article starts at a header of the form "== Title == " that is followed
    by an empty line; everything up to the next such header is its text.

    Args:
        file_path (str): Path to the text file containing articles.
        max_articles (int, optional): Maximum number of articles to return. Defaults to 150000.

    Returns:
        list: One dictionary per article with the keys "title" and "text".
    """
    with open(file_path, "r", encoding='utf-8') as fi:
        content = fi.read()

    # Split once with a capturing group so every text stays paired with the
    # header that introduced it. Collecting titles and texts with two
    # different patterns lets a "== x ==" fragment inside a body (for example
    # a section heading) shift all following titles by one position.
    parts = re.split("(== .* ==) \n\n", content)
    titles = parts[1::2]
    texts = parts[2::2]

    return [
        {"title": title.strip("==").strip(), "text": text.strip()}
        for title, text in list(zip(titles, texts))[:max_articles]
    ]

def turkish_lower(text):
    """
    Lowercase text with the Turkish rules for the letter I: 'I' becomes 'ı',
    'İ' becomes 'i', and everything else is lowercased by str.lower().
    """
    # The two capitals are translated first so that str.lower() never sees them.
    turkish_capitals = str.maketrans({'I': 'ı', 'İ': 'i'})
    return text.translate(turkish_capitals).lower()

def filter_turkish_text(articles):
    """
    Cleans article texts and removes stop words.

    Each article text is lowercased with turkish_lower, reduced to the words
    matched by the token pattern (two or more letters from a-z and ğüşıöç,
    optionally followed by an apostrophe and more such letters), and stripped
    of the stop words listed in data/turkce-stop-words.txt under drive_path.

    Args:
        articles (list): Dictionaries that each carry a "text" key.

    Returns:
        list: One list of words per article.
    """
    print('Processing articles...')
    # Regex pattern for Turkish characters, excluding single characters
    token_pattern = re.compile(r'\b[a-zğüşıöç]{2,}(?:\'[a-zğüşıöç]+)?\b')

    # A set makes the per-word membership check constant time.
    stop_words = set(import_stop_words(drive_path + 'data/turkce-stop-words.txt'))

    processed_articles = []
    for article in articles:
        text = turkish_lower(article['text'])
        # Build a new list instead of calling remove() on the list being
        # iterated: removing in place shifts the remaining items, so the word
        # right after a removed stop word was never checked.
        words = [word for word in token_pattern.findall(text) if word not in stop_words]
        processed_articles.append(words)

    # Preview the first article only when there is one.
    if processed_articles:
        print(processed_articles[0])
    print('Articles are processed.')
    return processed_articles

def create_tuples_from_articles(processed_articles, context_size, tokenizer):
    """
    Builds (context, target) training pairs at BPE-token level.

    For every word that has context_size words on both sides, each of its
    token ids becomes a target once. The context of a target is made of the
    token ids of the left context words, the other tokens of the target word,
    and the token ids of the right context words.

    Args:
        processed_articles (list): Lists of words, one list per article.
        context_size (int): Number of words taken on each side of the target word.
        tokenizer: Object whose encode(word) result exposes the token ids as `.ids`.

    Returns:
        list: Tuples (context_ids, target_id).
    """
    context_target_pairs = []
    skipped_pairs_count = 0
    added_pairs_count = 0
    for article in processed_articles:
        # Too short to hold a single full window: nothing to encode.
        if len(article) <= 2 * context_size:
            continue

        # Encode every word once instead of once per window it appears in.
        encoded_words = [list(tokenizer.encode(word).ids) for word in article]

        for i in range(context_size, len(article) - context_size):
            left_context_tokens = list(chain.from_iterable(encoded_words[i - context_size:i]))
            right_context_tokens = list(chain.from_iterable(encoded_words[i + 1:i + context_size + 1]))
            target_tokens = encoded_words[i]

            # For each token in the target, create a new context-target pair
            for position, target_token in enumerate(target_tokens):
                # Leave out the target by position, so a token that occurs
                # twice in the word still appears in the context of its twin.
                other_target_tokens = target_tokens[:position] + target_tokens[position + 1:]
                extended_context = left_context_tokens + other_target_tokens + right_context_tokens

                # Skip pairs without any context. Id 0 is a valid target, so
                # the target is compared against None instead of being tested
                # for truthiness.
                if extended_context and target_token is not None:
                    context_target_pairs.append((extended_context, target_token))
                    added_pairs_count += 1
                else:
                    skipped_pairs_count += 1
    print('Skipped pairs: ' + str(skipped_pairs_count))
    print('Added pairs: ' + str(added_pairs_count))
    return context_target_pairs



def import_stop_words(file_path):
    """
    Loads a stop-word list from a UTF-8 text file.

    Returns one entry per line of the file, in file order, with surrounding
    whitespace removed (an empty line yields an empty string).
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        # strip() drops the trailing newline and any surrounding spaces
        return [raw_line.strip() for raw_line in handle]

def is_stop_word(stop_words, word):
    """Returns True when word is contained in stop_words, otherwise False."""
    found = word in stop_words
    return found

def get_collate_fn(pad_token_id):
    """Build a ``collate_fn`` that pads context tensors with ``pad_token_id``.

    Args:
        pad_token_id: Value used to fill the shorter contexts of a batch.

    Returns:
        callable: Function that takes a batch of ``(context, target)`` tensor
        pairs and returns ``(padded_contexts, stacked_targets)``.
    """
    def collate_batch(batch):
        # Transpose the batch of (context, target) samples into two parallel tuples
        context_tensors, target_tensors = zip(*batch)

        # Contexts may differ in length; pad them into one batch-first tensor
        padded_contexts = pad_sequence(
            context_tensors,
            batch_first=True,
            padding_value=pad_token_id,
        )

        # Targets are stacked along a new leading batch dimension
        return padded_contexts, torch.stack(target_tensors)

    return collate_batch

class CBOW(nn.Module):
    """Continuous bag-of-words model: averaged context embeddings -> vocabulary logits.

    Args:
        vocab_size (int): Number of embedding rows and of output logits.
        embedding_dim (int): Size of each embedding vector.
        padding_idx (int, optional): Token id used to pad contexts in a batch.
            When given, padded positions are excluded from the context average
            and that embedding row is treated as padding by ``nn.Embedding``.
            Defaults to None, which averages over every position.
    """

    def __init__(self, vocab_size, embedding_dim, padding_idx=None):
        super().__init__()
        self.padding_idx = padding_idx
        # max_norm=1.0: rows whose norm exceeds 1 are renormalised when looked up
        self.embeddings = nn.Embedding(
            vocab_size, embedding_dim, max_norm=1.0, padding_idx=padding_idx
        )
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, inputs):
        """Return unnormalised vocabulary scores for a batch of context id tensors.

        Args:
            inputs: Integer tensor of token ids; dimension 1 is the context
                position that gets averaged.

        Returns:
            Tensor of scores with ``vocab_size`` entries per batch row.
        """
        embeds = self.embeddings(inputs)
        if self.padding_idx is None:
            embeds_mean = embeds.mean(dim=1)
        else:
            # Average only over real tokens so padding does not dilute the context
            keep = (inputs != self.padding_idx).unsqueeze(-1).to(embeds.dtype)
            # clamp avoids a 0/0 division for a row made only of padding
            embeds_mean = (embeds * keep).sum(dim=1) / keep.sum(dim=1).clamp(min=1.0)
        out = self.linear(embeds_mean)
        return out

class CBOWDataset(Dataset):
    """Dataset wrapper around a sequence of ``(context_ids, target_id)`` pairs.

    Args:
        context_target_pairs: Indexable collection of two-item pairs; it is
            stored as given and converted to tensors one item at a time.
    """

    def __init__(self, context_target_pairs):
        self.context_target_pairs = context_target_pairs

    def __len__(self):
        """Return the number of stored pairs."""
        return len(self.context_target_pairs)

    def __getitem__(self, idx):
        """Return the pair at ``idx`` as two ``torch.long`` tensors."""
        context_ids, target_id = self.context_target_pairs[idx]
        context_tensor = torch.tensor(context_ids, dtype=torch.long)
        target_tensor = torch.tensor(target_id, dtype=torch.long)
        return context_tensor, target_tensor

<h3>3.2.2. Preparing the training and validation dataset</h3>

In [None]:
## Input locations, all relative to drive_path (set in the first cell of the notebook)
training_set_path = drive_path + "data/trwiki-67/trwiki-67.train.txt"
validation_set_path = drive_path + "data/trwiki-67/trwiki-67.val.txt"
tokenizer_path = drive_path + 'embeddings/bpe_tokenizer_vocab4000_with_pad.json'  # Path to the saved tokenizer

## Reading the tokenizer.
bpe_tokenizer = Tokenizer.from_file(tokenizer_path)

## Reading the articles and passing them through filter_turkish_text
## (that helper is defined outside this section).
training_articles = filter_turkish_text(read_articles(training_set_path))
validation_articles = filter_turkish_text(read_articles(validation_set_path))
# Number of neighbouring words taken on each side of a target word
CONTEXT_SIZE = 2
# NOTE: from here on training_articles / validation_articles no longer hold articles;
# they are rebound to the lists of (context, target) pairs built from them.
training_articles = create_tuples_from_articles(training_articles, CONTEXT_SIZE, bpe_tokenizer)
validation_articles = create_tuples_from_articles(validation_articles, CONTEXT_SIZE, bpe_tokenizer)

# Create the datasets
train_dataset = CBOWDataset(training_articles)
val_dataset = CBOWDataset(validation_articles)

<h3>3.2.3. Training loop</h3>

In [None]:
# Hyperparameters
embedding_dim = 512
learning_rate = 0.08
batch_size = 128
epochs = 10
# The tokenizer vocabulary size fixes both the embedding table and the output layer size
vocab_size = len(bpe_tokenizer.get_vocab())

print('Vocab size : ' + str(vocab_size))

# Contexts in a batch can differ in length, so the collate function pads them with <PAD>
pad_token_id = bpe_tokenizer.token_to_id("<PAD>")
collate_fn = get_collate_fn(pad_token_id)
# Only the training loader is shuffled; validation keeps dataset order
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn)

# Model, Device, DataLoader
# NOTE(review): the model is built without any padding information, so the <PAD>
# positions added by collate_fn are embedded and averaged like ordinary tokens.
neural_net = CBOW(vocab_size, embedding_dim)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
neural_net.to(device)

# Loss Function and Optimizer
loss_function = nn.CrossEntropyLoss()
optimizer = SGD(neural_net.parameters(), lr=learning_rate)

# Training Loop
print('Starting training')
for epoch in range(epochs):
    neural_net.train()
    total_loss = 0.0
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{epochs}')

    for context_vector, target_index in progress_bar:
        context_vector, target_index = context_vector.to(device), target_index.to(device)

        # Clear gradients left over from the previous batch
        neural_net.zero_grad()
        # Despite the name, log_probs holds the raw outputs of the model's linear layer;
        # nn.CrossEntropyLoss expects such unnormalised scores.
        log_probs = neural_net(context_vector)
        loss = loss_function(log_probs, target_index)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        progress_bar.set_postfix(loss=loss.item())

    # Mean of the per-batch losses (every batch weighted equally)
    avg_train_loss = total_loss / len(train_dataloader)
    print(f'\nEpoch {epoch+1} Completed - Training Loss: {avg_train_loss:.4f}')

    # Validation
    neural_net.eval()
    total_val_loss = 0.0
    # No gradients are needed while measuring validation loss
    with torch.no_grad():
        for context_vector, target_index in val_dataloader:
            context_vector, target_index = context_vector.to(device), target_index.to(device)
            log_probs = neural_net(context_vector)
            loss = loss_function(log_probs, target_index)
            total_val_loss += loss.item()

    avg_val_loss = total_val_loss / len(val_dataloader)
    print(f'Validation Loss: {avg_val_loss:.4f}')

    # Save a checkpoint of the weights after every epoch
    model_save_path = drive_path + f'bpe_model_epoch_{epoch+1}.pth'
    torch.save(neural_net.state_dict(), model_save_path)

# Optionally, save the final model separately (same weights as the last epoch checkpoint)
final_model_save_path = drive_path + 'bpe_model_final.pth'
torch.save(neural_net.state_dict(), final_model_save_path)