In [None]:
import json
import re
import torch
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.nn.functional as F

In [None]:
def load_data(file_path):
  """Read a UTF-8 encoded JSON file and return its parsed contents.

  Args:
    file_path: Path of the JSON file to read.

  Returns:
    Whatever object ``json.load`` produces for the file.
  """
  with open(file_path, 'r', encoding='utf-8') as handle:
    return json.load(handle)

def clean_text(text):
  """Normalize a sentence for tokenization.

  Drops every character that is not an ASCII letter, a digit, or whitespace,
  collapses each run of whitespace into one space, trims both ends, and
  lower-cases the result.

  Args:
    text: The raw sentence.

  Returns:
    The cleaned, lower-cased sentence.
  """
  # Removed characters are deleted outright (not replaced by a space), so
  # "a,b" becomes "ab".
  kept = re.sub(r"[^a-zA-Z0-9\s]", "", text)
  # Splitting on whitespace and re-joining collapses runs and trims the ends.
  collapsed = " ".join(kept.split())
  return collapsed.lower()

def tokenize_text(text):
  """Split a sentence into tokens on runs of whitespace.

  Args:
    text: The sentence to split.

  Returns:
    A list of tokens; empty when ``text`` has no non-whitespace characters.
  """
  tokens = text.split()
  return tokens

def preprocess_data(data):
  """Clean and tokenize both sides of every translation pair.

  Args:
    data: Iterable of mappings, each with an ``'rn'`` entry and an ``'en'``
      entry holding the two sentences of a pair.

  Returns:
    A list of ``(rn_tokens, en_tokens)`` tuples in the same order as ``data``.
  """
  token_pairs = []
  for entry in data:
    # Look up both sides first so a missing key fails before any cleaning.
    source_text, target_text = entry['rn'], entry['en']
    source_tokens = tokenize_text(clean_text(source_text))
    target_tokens = tokenize_text(clean_text(target_text))
    token_pairs.append((source_tokens, target_tokens))
  return token_pairs

def pad_sequences(sequences, pad_token):
    """Right-pad every sequence to the length of the longest one.

    Args:
        sequences: A list (or other re-iterable collection) of sequences.
        pad_token: The value appended to shorter sequences.

    Returns:
        A ``(padded_sequences, max_len)`` tuple. ``padded_sequences`` is a list
        of new lists, all of length ``max_len``. For empty input the result is
        ``([], 0)``.
    """
    # default=0 keeps an empty collection from raising ValueError in max().
    max_len = max((len(seq) for seq in sequences), default=0)
    # list(seq) always yields a fresh list and also accepts tuples.
    padded_sequences = [
        list(seq) + [pad_token] * (max_len - len(seq))
        for seq in sequences
    ]
    return padded_sequences, max_len

In [None]:
# Location of the parallel corpus (a relative path, resolved against the
# current working directory)
file_path = "../ikirundi-english.json"
# Parse the JSON file, then clean and tokenize both sides of every pair
raw_data = load_data(file_path)
preprocessed_data = preprocess_data(raw_data)

In [None]:
# Special token used to pad every sequence to a common length
PAD_TOKEN = "<pad>"

# Define tokenizers for Kirundi and English
# NOTE: the sentences in preprocessed_data are already token lists, and
# numericalize_tokens below never calls the tokenizer it receives; these are
# kept only so existing references to the names keep working.
kirundi_tokenizer = get_tokenizer("basic_english")
english_tokenizer = get_tokenizer("basic_english")

# Extract Kirundi and English sentences from preprocessed data
kirundi_sentences = [pair[0] for pair in preprocessed_data]
english_sentences = [pair[1] for pair in preprocessed_data]


def yield_tokens(sentences):
    """Yield each token list in turn for build_vocab_from_iterator."""
    for sentence in sentences:
        yield sentence


# Build each vocabulary exactly once, with the padding token included, BEFORE
# any sentence is converted to ids. Building a vocab without the special,
# numericalizing, and then rebuilding with the special shifts every token id,
# which leaves the stored ids out of sync with the final vocab and makes the
# pad id collide with a real token.
kirundi_vocab = build_vocab_from_iterator(yield_tokens(kirundi_sentences), specials=[PAD_TOKEN])
english_vocab = build_vocab_from_iterator(yield_tokens(english_sentences), specials=[PAD_TOKEN])


def numericalize_tokens(tokenizer, vocab, sentences):
    """Map every token of every sentence to its id in ``vocab``.

    Args:
        tokenizer: Unused; kept for signature compatibility.
        vocab: Mapping from token to integer id (indexed as ``vocab[token]``).
        sentences: Iterable of token lists.

    Returns:
        A list of id lists, parallel to ``sentences``.
    """
    return [[vocab[token] for token in sentence] for sentence in sentences]


# Numericalize tokens with the final vocabularies
kirundi_numericalized = numericalize_tokens(kirundi_tokenizer, kirundi_vocab, kirundi_sentences)
english_numericalized = numericalize_tokens(english_tokenizer, english_vocab, english_sentences)

# NOTE(review): no start/end-of-sentence markers are added to the sequences.

# Pad sequences with the id that the same vocabulary assigns to PAD_TOKEN
kirundi_numericalized_padded, kirundi_max_len = pad_sequences(kirundi_numericalized, kirundi_vocab[PAD_TOKEN])
english_numericalized_padded, english_max_len = pad_sequences(english_numericalized, english_vocab[PAD_TOKEN])

# Convert to PyTorch tensors
kirundi_tensor = torch.tensor(kirundi_numericalized_padded)
english_tensor = torch.tensor(english_numericalized_padded)

# Store original (unpadded) lengths of sequences
kirundi_lengths = torch.tensor([len(seq) for seq in kirundi_numericalized])
english_lengths = torch.tensor([len(seq) for seq in english_numericalized])



In [None]:
import random

def custom_train_val_test_split(total_size, train_ratio=0.8, val_ratio=0.1, random_state=None):
    """Shuffle ``range(total_size)`` and cut it into train/val/test index lists.

    Args:
        total_size: Number of examples; indices ``0 .. total_size - 1`` are split.
        train_ratio: Fraction of the indices assigned to the training split.
        val_ratio: Fraction assigned to the validation split. Whatever is left
            after train and validation becomes the test split.
        random_state: Optional seed. When given (including ``0``), the global
            ``random`` generator is seeded with it before shuffling, so the
            split is repeatable.

    Returns:
        A ``(train_indices, val_indices, test_indices)`` tuple of lists.

    The ratios are not validated; if they sum to more than 1 the test split is
    empty.
    """
    # Compare against None so that a seed of 0 is honoured rather than
    # silently treated as "no seed".
    if random_state is not None:
        random.seed(random_state)

    indices = list(range(total_size))
    random.shuffle(indices)

    # Cut points are truncated to whole indices.
    train_end = int(total_size * train_ratio)
    val_end = int(total_size * (train_ratio + val_ratio))

    train_indices = indices[:train_end]
    val_indices = indices[train_end:val_end]
    test_indices = indices[val_end:]

    return train_indices, val_indices, test_indices

# Split the indices for training, validation, and test sets
# (80% train, 10% validation, the remainder test; seeded with 42 so the
# shuffle is repeatable)
train_idx, val_idx, test_idx = custom_train_val_test_split(len(kirundi_tensor), train_ratio=0.8, val_ratio=0.1, random_state=42)


In [None]:
# Split the indices for training, validation, and test sets.
# custom_train_val_test_split takes the dataset size (not a list of indices),
# has no `test_size` parameter, and returns three index lists, so a single
# call yields all three splits.
train_idx, val_idx, test_idx = custom_train_val_test_split(
    len(kirundi_tensor), train_ratio=0.8, val_ratio=0.1, random_state=42
)
# Every example must land in exactly one split.
assert len(train_idx) + len(val_idx) + len(test_idx) == len(kirundi_tensor)

In [None]:
class TranslationDataset(Dataset):
    """Dataset that pairs each source sequence with the target at the same index."""

    def __init__(self, src_tensor, tgt_tensor):
        """Store the two parallel collections after checking they align.

        Args:
            src_tensor: Indexable collection of source sequences.
            tgt_tensor: Indexable collection of target sequences, same length.
        """
        same_size = len(src_tensor) == len(tgt_tensor)
        assert same_size, "Number of source and target sequences must be equal"
        self.src_tensor, self.tgt_tensor = src_tensor, tgt_tensor

    def __len__(self):
        """Return the number of sequence pairs."""
        return len(self.src_tensor)

    def __getitem__(self, index):
        """Return the ``(source, target)`` pair stored at ``index``."""
        return self.src_tensor[index], self.tgt_tensor[index]

# Build one dataset per split by selecting the same rows from both tensors
train_dataset, val_dataset, test_dataset = (
    TranslationDataset(kirundi_tensor[split_idx], english_tensor[split_idx])
    for split_idx in (train_idx, val_idx, test_idx)
)



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Transformer(nn.Module):
    """Encoder-decoder Transformer producing target-vocabulary logits.

    Wraps ``nn.Transformer`` with separate source and target embedding tables
    and a final linear projection. All inputs and outputs are batch-first.

    Args:
        src_vocab_size: Number of rows in the source embedding table.
        tgt_vocab_size: Number of rows in the target embedding table and size
            of the output logits.
        embed_size: Embedding width, also used as the Transformer's ``d_model``.
        num_heads: Number of attention heads.
        num_layers: Number of encoder layers and of decoder layers.
        dropout: Dropout probability passed to ``nn.Transformer``.

    NOTE(review): no positional encoding is added to the embeddings and no
    padding masks are passed to the Transformer.
    """

    def __init__(self, src_vocab_size, tgt_vocab_size, embed_size=256, num_heads=8, num_layers=6, dropout=0.1):
        super(Transformer, self).__init__()

        self.src_embedding = nn.Embedding(src_vocab_size, embed_size)
        self.tgt_embedding = nn.Embedding(tgt_vocab_size, embed_size)

        self.transformer = nn.Transformer(
            d_model=embed_size,
            nhead=num_heads,
            num_encoder_layers=num_layers,
            num_decoder_layers=num_layers,
            dim_feedforward=512,
            dropout=dropout,
            # Inputs arrive as (batch, seq); without this flag nn.Transformer
            # reads dim 0 as the sequence axis and rejects src/tgt whose
            # sequence lengths differ.
            batch_first=True,
        )

        self.fc = nn.Linear(embed_size, tgt_vocab_size)

    def forward(self, src, tgt):
        """Compute logits for every target position.

        Args:
            src: Integer tensor of source token ids, shape ``(batch, src_len)``.
            tgt: Integer tensor of target token ids, shape ``(batch, tgt_len)``.

        Returns:
            Float tensor of shape ``(batch, tgt_len, tgt_vocab_size)``.
        """
        # Causal mask: True above the diagonal blocks attention to later
        # positions, so position i cannot see the tokens it is trained to
        # predict.
        tgt_len = tgt.size(1)
        causal_mask = torch.triu(
            torch.ones(tgt_len, tgt_len, dtype=torch.bool, device=tgt.device),
            diagonal=1,
        )

        src = self.src_embedding(src)
        tgt = self.tgt_embedding(tgt)

        out = self.transformer(src, tgt, tgt_mask=causal_mask)
        out = self.fc(out)

        return out

In [None]:
# Number of sentence pairs per batch
batch_size = 64

# Training batches are reshuffled on every pass; validation order stays fixed
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)


In [None]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One embedding row per vocabulary entry on each side
model = Transformer(len(kirundi_vocab), len(english_vocab))
model = model.to(device)

# Target positions holding the padding id are excluded from the loss
criterion = nn.CrossEntropyLoss(ignore_index=english_vocab[PAD_TOKEN])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 10
for epoch in range(num_epochs):
    # ---- Training pass ----
    model.train()
    total_loss = 0
    for src_batch, tgt_batch in train_dataloader:
        src_batch = src_batch.to(device)
        tgt_batch = tgt_batch.to(device)

        optimizer.zero_grad()

        # Decoder input is the target without its last position
        output = model(src_batch, tgt_batch[:, :-1])
        output_dim = output.size(-1)

        # Flatten to (positions, vocab) vs (positions,); the labels are the
        # target without its first position
        output = output.reshape(-1, output_dim)
        tgt_batch = tgt_batch[:, 1:].reshape(-1)

        loss = criterion(output, tgt_batch)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # ---- Validation pass (no gradient tracking, no parameter updates) ----
    model.eval()
    val_loss = 0
    with torch.no_grad():
        for src_batch, tgt_batch in val_dataloader:
            src_batch = src_batch.to(device)
            tgt_batch = tgt_batch.to(device)

            output = model(src_batch, tgt_batch[:, :-1])
            output_dim = output.size(-1)

            output = output.reshape(-1, output_dim)
            tgt_batch = tgt_batch[:, 1:].reshape(-1)

            loss = criterion(output, tgt_batch)
            val_loss += loss.item()

    # Report the mean per-batch loss of each pass
    print(f"Epoch [{epoch+1}/{num_epochs}], Train Loss: {total_loss/len(train_dataloader)}, Val Loss: {val_loss/len(val_dataloader)}")
