In [6]:
import os
import json
from datasets import load_dataset
import numpy as np
import math
import pickle
from collections import Counter

from sklearn.model_selection import train_test_split

import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torchtext.vocab import build_vocab_from_iterator, Vocab
import torch.optim as optim
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

import spacy
from spacy.tokens import DocBin

import fasttext

# Corpus size (gla): 11727303 sets, 14153438 tokens

In [7]:
# Load the fastText binary model that sits one directory above the notebook.
FASTTEXT_MODEL_PATH = '../cc.gd.300.bin'
ft = fasttext.load_model(FASTTEXT_MODEL_PATH)

In [8]:
# Blank spaCy pipeline for language code "gd"; the only component added is the
# rule-based sentencizer, which is what makes `doc.sents` available below.
MAX_DOC_CHARS = 7000000
nlp = spacy.blank("gd")
nlp.add_pipe("sentencizer")
# Per-document length limit applied by spaCy (presumably raised so that whole
# book files can be passed to `nlp` in one call - confirm against file sizes).
nlp.max_length = MAX_DOC_CHARS

In [10]:
# Load the word list used to recognise Gaelic sentences.
# - Lines equal to "word" (the column header) are skipped, as before.
# - Blank lines are skipped so the empty string never ends up in the set.
# - Entries are lower-cased because every lookup against this set is done with
#   lower-cased tokens; a capitalised entry could otherwise never match.
with open("gla_dictionary.txt", "r", encoding="utf-8") as f:
    stripped_lines = (line.strip() for line in f)
    gaelic_words = {
        entry.lower()
        for entry in stripped_lines
        if entry and entry != "word"
    }

In [11]:
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
cuda_available = torch.cuda.is_available()
device = torch.device("cuda" if cuda_available else "cpu")
device

device(type='cpu')

In [12]:
def get_txt_file_paths(folder):
    """Recursively collect the paths of all ``.txt`` files under ``folder``.

    Args:
        folder: Directory to search; every subdirectory is visited as well.

    Returns:
        A lexicographically sorted list of file paths (each one is the walked
        directory joined with the file name). The list is empty when nothing
        matches, including when ``folder`` does not exist, because ``os.walk``
        ignores listing errors by default.
    """
    txt_files = []
    for root, _, files in os.walk(folder):
        txt_files.extend(
            os.path.join(root, name) for name in files if name.endswith(".txt")
        )
    # os.walk reports entries in whatever order the OS lists them; sorting
    # makes the result (and anything indexed by file order) stable across
    # machines and runs.
    txt_files.sort()
    return txt_files

# Gather every .txt book under the corpus folder and report how many were found.
BOOKS_DIR = "gla_books"
file_paths = get_txt_file_paths(BOOKS_DIR)
txt_file_count = len(file_paths)
print(f"Total .txt files found: {txt_file_count}")

Total .txt files found: 151


In [13]:
def tokenization(data):
    """Split ``data`` into lower-cased tokens using the module-level ``nlp``.

    Args:
        data: Text to run through the spaCy pipeline.

    Returns:
        A list with the lower-cased text of every token, in document order,
        leaving out tokens that spaCy flags as ``is_space``.
    """
    return [tok.text.lower() for tok in nlp(data) if not tok.is_space]

In [14]:
def get_gaelic_sentences(text, gaelic_words, min_length=2, excluded_words=None):
    """Return the sentences of ``text`` that contain a recognised Gaelic word.

    A sentence is kept when at least one of its alphabetic tokens, lower-cased,
    is in ``gaelic_words`` and is not in ``excluded_words``.

    Args:
        text: Raw text; it is parsed with the module-level ``nlp`` pipeline
            and split into sentences via ``doc.sents``.
        gaelic_words: Collection of words that mark a sentence as Gaelic.
        min_length: Accepted but not used anywhere in the body.
            NOTE(review): looks like an unfinished length filter - confirm
            the intent before relying on it.
        excluded_words: Words that never count as evidence on their own.
            Defaults to {"a", "i", "an", "is", "do", "so"}.

    Returns:
        A list of the matching sentences' stripped text, in document order.
    """
    if excluded_words is None:
        excluded_words = {"a", "i", "an", "is", "do", "so"}

    def counts_as_gaelic(word):
        # A dictionary hit only counts when the word is not on the exclusion list.
        return word in gaelic_words and word not in excluded_words

    return [
        span.text.strip()
        for span in nlp(text).sents
        if any(counts_as_gaelic(tok.text.lower()) for tok in span if tok.is_alpha)
    ]

In [15]:
class GlaDataset(Dataset):
    """Sliding-window (context, target) pairs built from Gaelic sentences.

    Every item is a tuple ``(context, target)`` where ``context`` is a list of
    ``context_size`` consecutive tokens from one sentence and ``target`` is the
    token that directly follows them in that sentence.
    """

    def __init__(self, file_paths, gaelic_words, context_size=5):
        """Read every file and build all pairs eagerly into ``self.pairs``.

        Args:
            file_paths: Iterable of paths to UTF-8 encoded text files.
            gaelic_words: Word collection passed on to ``get_gaelic_sentences``.
            context_size: Number of tokens in each context window.
        """
        self.pairs = []

        for path in file_paths:
            with open(path, encoding='utf-8') as handle:
                raw_text = handle.read()
                for sentence in get_gaelic_sentences(raw_text, gaelic_words):
                    words = tokenization(sentence)
                    # A sentence needs more tokens than the window is wide,
                    # otherwise there is no token left to serve as the target.
                    if len(words) <= context_size:
                        continue
                    self.pairs.extend(
                        (words[end - context_size:end], words[end])
                        for end in range(context_size, len(words))
                    )

    def __len__(self):
        """Return the number of stored (context, target) pairs."""
        return len(self.pairs)

    def __getitem__(self, index):
        """Return the pair at ``index``.

        ``self.pairs`` is a plain list, so a slice is accepted too and yields
        a list of pairs.
        """
        return self.pairs[index]

In [16]:
# Build every (context, target) pair from the book files; this reads and
# tokenizes all files up front, using the class's default context size.
dataset = GlaDataset(file_paths=file_paths, gaelic_words=gaelic_words)

In [17]:
# Draw a random subset of (context, target) pairs from the full dataset.
SPLIT_SEED = 42

# Never ask for more pairs than exist, so `remaining` cannot become negative
# when the corpus holds fewer than one million pairs.
subset_size = min(1_000_000, len(dataset))
remaining = len(dataset) - subset_size

# A dedicated, seeded generator makes the selection repeatable for a given
# dataset ordering, without touching torch's global RNG state.
one_million, _ = random_split(
    dataset,
    [subset_size, remaining],
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)

In [18]:
# Export the sampled pairs as JSON Lines: one {"context", "target"} object per line.
with open("one_million.jsonl", "w", encoding="utf-8") as f:
    # `one_million.indices` holds positions into the full dataset, so each
    # pair is looked up there.
    for context, target in (dataset[idx] for idx in one_million.indices):
        record = {"context": context, "target": target}
        f.write(json.dumps(record) + "\n")

In [19]:
# Sanity check: size of the sampled subset.
sampled_pair_count = len(one_million)
print("Total Pairs", sampled_pair_count)

Total Pairs 1000000


In [20]:
# Count the tokens of every sentence recognised as Gaelic, across all files.
# Note: this re-reads and re-tokenizes the whole corpus.
total_gaelic_tokens = 0

for path in file_paths:
    with open(path, encoding='utf-8') as file:
        sentences = get_gaelic_sentences(file.read(), gaelic_words)
        total_gaelic_tokens += sum(len(tokenization(sent)) for sent in sentences)

print("Total Gaelic tokens:", total_gaelic_tokens)

Total Gaelic tokens: 14153438


In [21]:
# Peek at the first ten (context, target) pairs.
dataset[:10]

[(['celt', '3193', '2.2', 'widener', 'hn'], 'zr1y'),
 (['3193', '2.2', 'widener', 'hn', 'zr1y'], '8'),
 (['2.2', 'widener', 'hn', 'zr1y', '8'], 'an'),
 (['widener', 'hn', 'zr1y', '8', 'an'], 'comh'),
 (['hn', 'zr1y', '8', 'an', 'comh'], '-'),
 (['zr1y', '8', 'an', 'comh', '-'], 'threoraiche'),
 (['8', 'an', 'comh', '-', 'threoraiche'], 'leaburan'),
 (['an', 'comh', '-', 'threoraiche', 'leaburan'], 'seou'),
 (['comh', '-', 'threoraiche', 'leaburan', 'seou'], 'a'),
 (['-', 'threoraiche', 'leaburan', 'seou', 'a'], 'chum')]