In [1]:
import os
import json
from datasets import load_dataset
import numpy as np
import math
from collections import Counter

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torchtext.vocab import build_vocab_from_iterator, Vocab
import torch.optim as optim
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

import spacy
from spacy.tokens import DocBin

import fasttext

#### one gla: 11727303 sets 14153438 tokens

In [2]:
# Load the fastText binary model from a path relative to the notebook.
# `ft` is used further down to look up one vector per vocabulary token.
ft = fasttext.load_model('../cc.gd.300.bin')

In [3]:
# Blank "gd" pipeline (tokenizer only) plus a sentencizer so that `doc.sents`
# can be iterated in get_gaelic_sentences().
nlp = spacy.blank("gd")
nlp.add_pipe("sentencizer")
# Raised length limit: whole file contents are passed to `nlp` in a single call.
nlp.max_length = 7000000

In [4]:
# Build the set of known Gaelic words used to decide which sentences to keep.
# Entries are lower-cased because get_gaelic_sentences() compares them against
# lower-cased tokens; blank lines and the "word" header row are skipped.
with open("gla_dictionary.txt", "r", encoding="utf-8") as f:
    gaelic_words = set()
    for line in f:
        entry = line.strip()
        if not entry or entry == "word":
            continue
        gaelic_words.add(entry.lower())

In [5]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Bare expression: displays the selected device as the cell output.
device

device(type='cpu')

In [6]:
def get_txt_file_paths(folder):
    """Recursively collect the paths of all `.txt` files below `folder`.

    Args:
        folder: Directory to walk.

    Returns:
        A lexicographically sorted list of file paths. Sorting makes the
        result independent of the order in which the filesystem lists
        entries, so anything built from the list is reproducible.
        A non-existent folder yields an empty list.
    """
    txt_files = [
        os.path.join(root, name)
        for root, _, files in os.walk(folder)
        for name in files
        if name.endswith(".txt")
    ]
    return sorted(txt_files)

# Collect every .txt file under the "gla_books" folder and report how many were found.
file_paths = get_txt_file_paths("gla_books")
print(f"Total .txt files found: {len(file_paths)}")

Total .txt files found: 151


In [7]:
def tokenization(data):
    """Tokenize `data` with the module-level `nlp` pipeline.

    Returns the lower-cased text of every token, leaving out tokens that
    spaCy flags as whitespace (`token.is_space`).
    """
    lowered = []
    for tok in nlp(data):
        if tok.is_space:
            continue
        lowered.append(tok.text.lower())
    return lowered

In [8]:
def get_gaelic_sentences(text, gaelic_words, min_length=2, excluded_words=None):
    """Return the sentences of `text` that contain at least one known Gaelic word.

    A sentence is kept when any of its alphabetic tokens, lower-cased, is in
    `gaelic_words` and not in `excluded_words`.

    Args:
        text: Raw text; it is split into sentences by the module-level `nlp`.
        gaelic_words: Container of words checked with `in`.
        min_length: Accepted but not used by this function.
        excluded_words: Words that never count as evidence. Defaults to
            {"a", "i", "an", "is", "do", "so"}.

    Returns:
        List of the kept sentences' stripped text, in document order.
    """
    if excluded_words is None:
        skip = {"a", "i", "an", "is", "do", "so"}
    else:
        skip = excluded_words

    kept = []
    for sentence in nlp(text).sents:
        words = (tok.text.lower() for tok in sentence if tok.is_alpha)
        if any(word in gaelic_words and word not in skip for word in words):
            kept.append(sentence.text.strip())
    return kept

In [9]:
class GlaDataset(Dataset):
    """Dataset of (context, target) next-word pairs extracted from text files.

    Each file is read as UTF-8, filtered with get_gaelic_sentences() and
    tokenized with tokenization(). Every sentence with more than
    `context_size` tokens contributes one pair per position: the
    `context_size` preceding tokens and the token that follows them.
    """

    def __init__(self, file_paths, gaelic_words, context_size=5):
        self.pairs = []

        for path in file_paths:
            with open(path, encoding='utf-8') as handle:
                raw_text = handle.read()

            for sentence in get_gaelic_sentences(raw_text, gaelic_words):
                words = tokenization(sentence)
                if len(words) <= context_size:
                    # Too short to give a full context window plus a target.
                    continue
                self.pairs.extend(
                    (words[end - context_size:end], words[end])
                    for end in range(context_size, len(words))
                )

    def __len__(self):
        """Number of stored (context, target) pairs."""
        return len(self.pairs)

    def __getitem__(self, index):
        """Return the (context_tokens, target_token) pair at `index`."""
        return self.pairs[index]

In [10]:
# Load the stored pairs: one JSON object per line with "context" and "target" keys.
with open("one_million.jsonl", "r", encoding="utf-8") as f:
    dataset = [
        (record["context"], record["target"])
        for record in map(json.loads, f)
    ]

In [11]:
# Sanity check: number of (context, target) pairs loaded from the JSONL file.
print("Total Pairs",len(dataset))

Total Pairs 1000000


In [12]:
# Peek at the first ten pairs; each is (context value, target value) as stored in the file.
dataset[0:10]

[(['feadh', 'na', 'dùthcha', ',', 'air'], 'nach'),
 (["'n", 'a', 'choinnimh', ',', 'agus'], 'ghabh'),
 (['am', 'feum', 'sònruichte', 'a', 'dheanadh'], 'dàn'),
 (['agus', '12', 'an', 'déigh', 'so'], 'chaidh'),
 (['-', 'eigin', 'ann an', 'coslas', 'an'], 'duine'),
 (['eadar', 'am', 'bile', "'s", 'an'], 'deoch'),
 (['sibh', 'mar an ceudna', 'gu', 'dùth', '-'], 'ri'),
 ([':', 'le', 'gloinead', 'a', 'h'], '-'),
 (['the', 'gift', 'of', 'fred', 'norris'], 'robinson'),
 ([',', 'or', 'foinneamh', ',', '-an'], ',')]

In [13]:
# Frequency of every token across the data, counting both the context tokens
# and the target of each pair (context tokens first, then the target).
counter = Counter(
    token
    for tokens, target in dataset
    for token in (*tokens, target)
)

In [14]:
# Keep only the N most frequent tokens; they are passed to
# yield_tokens_limited() as the allowed set when the vocabulary is built.
N = 10000
most_common_tokens = [token for token, _ in counter.most_common(N)]

In [15]:
def yield_tokens_limited(dataset, allowed_tokens):
    """Yield, for each (context, target) pair, its tokens that are in `allowed_tokens`.

    The target token is included along with the context tokens. The allowed
    tokens are chosen from counts over both contexts and targets, so leaving
    targets out could drop an allowed token that only ever appears as a
    target, and it would then map to the default index.

    Args:
        dataset: Iterable of (context, target) pairs, where context is an
            iterable of tokens and target is a single token.
        allowed_tokens: Iterable of tokens permitted in the output.

    Yields:
        One list per pair: the allowed context tokens in their original
        order, followed by the target if it is allowed.
    """
    allowed_set = set(allowed_tokens)
    for context, target in dataset:
        kept = [token for token in context if token in allowed_set]
        if target in allowed_set:
            kept.append(target)
        yield kept

In [16]:
# Build the vocabulary from the allowed tokens only, with "<unk>" as a special token.
vocab =build_vocab_from_iterator(
    yield_tokens_limited(dataset, most_common_tokens), specials=["<unk>"])
# Any token missing from the vocabulary resolves to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])

In [17]:
# Vocabulary size, including the "<unk>" special token.
len(vocab)

10001

In [18]:
def collate_batch(batch):
    """Turn a list of (context, target) pairs into two stacked index tensors.

    Tokens are mapped to indices through the module-level `vocab`. The
    contexts are stacked with `torch.stack`, so every context in the batch
    must have the same length.

    Returns:
        (context_batch, target_batch): int64 tensors; the first has one row
        per sample, the second one scalar entry per sample.
    """
    context_rows = [
        torch.tensor([vocab[token] for token in context], dtype=torch.long)
        for context, _ in batch
    ]
    target_ids = [
        torch.tensor(vocab[target], dtype=torch.long)
        for _, target in batch
    ]
    return torch.stack(context_rows), torch.stack(target_ids)

In [19]:
# Hold out 20% of all pairs for testing, then 20% of the remainder for validation
# (roughly 64% train / 16% validation / 20% test). The seed is fixed for repeatable splits.
train_dataset, test_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
split_train, split_valid = train_test_split(train_dataset, test_size=0.2, random_state=42)

In [20]:
BATCH_SIZE = 32
# Only the training loader is shuffled. The validation and test loaders are
# used purely for evaluation, whose metrics are sums over all samples, so a
# fixed order keeps those passes deterministic.
train_dataloader = DataLoader(split_train, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_dataloader = DataLoader(split_valid, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)

In [21]:
# One row per vocabulary entry, filled with the fastText vector for that token.
embedding_dim = 300
vocab_size = len(vocab)
embedding_matrix = np.zeros((vocab_size, embedding_dim))

for idx, token in enumerate(vocab.get_itos()):
    # The "<unk>" row keeps the all-zero vector the matrix was initialised with.
    if token != "<unk>":
        embedding_matrix[idx] = ft.get_word_vector(token)

In [22]:
class NextWordPredictor(nn.Module):
    """LSTM next-word model: embedding -> LSTM -> dropout -> linear over the vocabulary.

    Args:
        vocab_size: Number of output classes of the final linear layer, and
            the number of embedding rows when no pretrained weights are used.
        embed_dim: Embedding width, used only when no pretrained weights are
            available. With pretrained weights the width is taken from them,
            so the LSTM input size always matches the embedding.
        hidden_dim: LSTM hidden size.
        dropout: Dropout probability applied to the last hidden state.
        pretrained_embeddings: Optional 2-D array or tensor of initial
            embedding weights (one row per vocabulary index). When omitted,
            the notebook-level `embedding_matrix` is used if it exists, which
            keeps existing call sites working. Otherwise a randomly
            initialised embedding is created.
    """

    def __init__(self, vocab_size, embed_dim=128, hidden_dim=256, dropout=0.3,
                 pretrained_embeddings=None):
        super().__init__()

        if pretrained_embeddings is None:
            # Backward compatibility with code that relies on the global matrix.
            pretrained_embeddings = globals().get("embedding_matrix")

        if pretrained_embeddings is None:
            self.embedding = nn.Embedding(vocab_size, embed_dim)
        else:
            # Copy so that fine-tuning never writes into the caller's array or tensor.
            embedding_tensor = (
                torch.as_tensor(pretrained_embeddings, dtype=torch.float32).detach().clone()
            )
            self.embedding = nn.Embedding.from_pretrained(embedding_tensor, freeze=False)

        # Size the LSTM from the actual embedding width rather than `embed_dim`,
        # so a mismatching `embed_dim` cannot break the forward pass.
        self.lstm = nn.LSTM(self.embedding.embedding_dim, hidden_dim, batch_first=True)
        self.dropout = nn.Dropout(dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, input):
        """Return unnormalised scores over the vocabulary for each input sequence.

        Args:
            input: Integer tensor of token indices, batch dimension first
                (the LSTM is built with `batch_first=True`).
        """
        embedding = self.embedding(input)
        _, (hidden_state, _) = self.lstm(embedding)
        # Final hidden state of the last LSTM layer summarises the context.
        last_hidden = hidden_state[-1]
        dropped = self.dropout(last_hidden)
        return self.fc(dropped)

In [23]:
# Build the model with the embedding width used for `embedding_matrix`
# and move it to the selected device.
model = NextWordPredictor(
    vocab_size = len(vocab),
    embed_dim=embedding_dim,
    hidden_dim=256,
).to(device)

# Bare expression: displays the module summary as the cell output.
model

NextWordPredictor(
  (embedding): Embedding(10001, 300)
  (lstm): LSTM(300, 256, batch_first=True)
  (dropout): Dropout(p=0.3, inplace=False)
  (fc): Linear(in_features=256, out_features=10001, bias=True)
)

In [24]:
# NOTE(review): redundant — the model was already moved to `device` when it was constructed.
model = model.to(device)

In [25]:
# Cross-entropy over the vocabulary scores; Adam updates all model parameters,
# including the embedding layer, which was created with freeze=False.
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

In [26]:
def evaluate(model, dataloader, loss_function, device, topk=5):
    """Score `model` on every batch of `dataloader` without tracking gradients.

    The model is switched to eval mode and is left in eval mode on return.

    Args:
        model: Module called as `model(context_batch)`, returning one row of
            class scores per sample.
        dataloader: Iterable of (context_batch, target_batch) tensor pairs.
        loss_function: Callable `(outputs, targets) -> scalar tensor`. Each
            batch loss is weighted by the batch size, i.e. it is treated as
            a per-sample mean.
        device: Device the batches are moved to.
        topk: Number of highest-scoring classes counted as a hit. It is
            capped at the number of classes, so a small output layer no
            longer makes `torch.topk` raise.

    Returns:
        (avg_loss, accuracy, topk_accuracy, perplexity), where perplexity is
        exp(avg_loss), or infinity when that overflows a float.

    Raises:
        ZeroDivisionError: If the dataloader yields no samples.
    """
    model.eval()
    total_loss = 0.0
    total_count = 0
    top1_correct = 0
    topk_correct = 0

    with torch.no_grad():
        for context_batch, target_batch in dataloader:
            context_batch = context_batch.to(device)
            target_batch = target_batch.to(device)
            batch_size = target_batch.size(0)

            outputs = model(context_batch)
            loss = loss_function(outputs, target_batch)
            total_loss += loss.item() * batch_size

            preds = outputs.argmax(dim=1)
            top1_correct += (preds == target_batch).sum().item()

            # Never ask for more candidates than there are classes.
            k = min(topk, outputs.size(1))
            topk_preds = torch.topk(outputs, k, dim=1).indices
            hits = (topk_preds == target_batch.view(-1, 1)).any(dim=1)
            topk_correct += hits.sum().item()

            total_count += batch_size

    avg_loss = total_loss / total_count
    accuracy = top1_correct / total_count
    topk_accuracy = topk_correct / total_count
    try:
        perplexity = math.exp(avg_loss)
    except OverflowError:
        # A very large loss still has a meaningful (infinite) perplexity.
        perplexity = float("inf")

    return avg_loss, accuracy, topk_accuracy, perplexity

In [27]:
def top_k_accuracy(output, target, k=5):
    """Fraction of rows whose target index is among the `k` highest scores.

    Args:
        output: 2-D tensor of scores, one row per sample.
        target: Tensor with one class index per sample.
        k: Number of top-scoring classes to consider per row.

    Returns:
        The hit rate as a Python float.
    """
    best_k = torch.topk(output, k, dim=1).indices
    wanted = target.view(-1, 1)
    row_hit = (best_k == wanted).any(dim=1)
    return row_hit.float().mean().item()

In [28]:
# Training driver: runs a fixed number of epochs, evaluates after each one and
# saves the weights whenever the validation loss improves.
epochs = 6
best_val_loss = float("inf")

# Per-epoch history of the metrics collected below.
train_losses = []
val_losses = []
train_accuracies = []
val_accuracies = []
val_perplexities = []

for epoch in range(epochs):
    # evaluate() leaves the model in eval mode, so training mode is re-enabled every epoch.
    model.train()
    total_train_loss = 0.0

    for context_batch, target_batch in train_dataloader:
        context_batch = context_batch.to(device)
        target_batch = target_batch.to(device)

        optimizer.zero_grad()
        outputs = model(context_batch)
        loss = loss_function(outputs, target_batch)
        loss.backward()
        optimizer.step()

        # Weight each batch loss by its size so the epoch value is a per-sample average.
        total_train_loss += loss.item() * target_batch.size(0)

    # Running training loss: measured in train mode while the weights were still changing.
    train_loss = total_train_loss / len(train_dataloader.dataset)

    # Full evaluation passes over the training and validation data.
    # `train_loss_eval` and `train_ppl` are computed here but not used afterwards.
    train_loss_eval, train_acc, train_topk, train_ppl = evaluate(model, train_dataloader, loss_function, device, topk=5)
    val_loss, val_acc, val_topk, val_ppl = evaluate(model, valid_dataloader, loss_function, device, topk=5)

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_accuracies.append(train_acc)
    val_accuracies.append(val_acc)
    val_perplexities.append(val_ppl)

    # Checkpoint only when the validation loss reaches a new minimum.
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), "best_nextword_model.pt")

    # The printed train loss is the running one; the train accuracies come from the evaluation pass.
    print(f"Epoch {epoch + 1}:")
    print(f"  Train Loss: {train_loss:.4f}, Top-1 Accuracy: {train_acc:.4f}, Top-5 Accuracy: {train_topk:.4f}")
    print(f"  Val   Loss: {val_loss:.4f}, Top-1 Accuracy: {val_acc:.4f}, Top-5 Accuracy: {val_topk:.4f}, Perplexity: {val_ppl:.4f}")

Epoch 1:
  Train Loss: 5.3582, Top-1 Accuracy: 0.2081, Top-5 Accuracy: 0.4244
  Val   Loss: 5.0187, Top-1 Accuracy: 0.2001, Top-5 Accuracy: 0.4102, Perplexity: 151.2133
Epoch 2:
  Train Loss: 4.9244, Top-1 Accuracy: 0.2265, Top-5 Accuracy: 0.4515
  Val   Loss: 4.8897, Top-1 Accuracy: 0.2091, Top-5 Accuracy: 0.4262, Perplexity: 132.9072
Epoch 3:
  Train Loss: 4.7626, Top-1 Accuracy: 0.2404, Top-5 Accuracy: 0.4704
  Val   Loss: 4.8558, Top-1 Accuracy: 0.2136, Top-5 Accuracy: 0.4328, Perplexity: 128.4846
Epoch 4:
  Train Loss: 4.6408, Top-1 Accuracy: 0.2550, Top-5 Accuracy: 0.4896
  Val   Loss: 4.8373, Top-1 Accuracy: 0.2140, Top-5 Accuracy: 0.4361, Perplexity: 126.1235
Epoch 5:
  Train Loss: 4.5349, Top-1 Accuracy: 0.2666, Top-5 Accuracy: 0.5064
  Val   Loss: 4.8412, Top-1 Accuracy: 0.2157, Top-5 Accuracy: 0.4375, Perplexity: 126.6221
Epoch 6:
  Train Loss: 4.4385, Top-1 Accuracy: 0.2813, Top-5 Accuracy: 0.5229
  Val   Loss: 4.8549, Top-1 Accuracy: 0.2164, Top-5 Accuracy: 0.4359, Perplex

In [29]:
# Restore the checkpoint with the lowest validation loss and score it on the test set.
# map_location=device remaps the stored tensors onto the current device, so a
# checkpoint saved on a GPU machine still loads on a CPU-only one.
model.load_state_dict(torch.load("best_nextword_model.pt", map_location=device))
_, test_acc, test_topk, test_ppl = evaluate(model, test_dataloader, loss_function, device)
print(f"Top-1 Accuracy: {test_acc:.4f}")
print(f"Top-5 Accuracy: {test_topk:.4f}")
print(f"Test Perplexity: {test_ppl:.4f}")

Top-1 Accuracy: 0.2140
Top-5 Accuracy: 0.4356
Test Perplexity: 127.1386
