In [1]:
import torch
from transformers import (
    BertConfig, 
    BertForMaskedLM, 
    DataCollatorForLanguageModeling, 
    TrainingArguments, 
    Trainer,
    PreTrainedTokenizerFast
)
# Assuming your functions are in these files
from clean_data import build_greek_corpus
from train_tokenizer import train_tokenizer
from load_datasets import load_datasets


  from .autonotebook import tqdm as notebook_tqdm


In [3]:

# 1. Prepare Data & Tokenizer
# ---------------------------
whitelist = [
    "tlg0527", # Septuagint
    "tlg0526", # Josephus
    "tlg1216", # Ignatius of Antioch
    "tlg1311", # Polycarp
    "tlg1434", # Clement of Rome (1 Clement)
    "tlg0646", # Justin Martyr
    "tlg1443", # Irenaeus of Lyons
    "tlg0555", # Clement of Alexandria
    "tlg2042", # Origen
    "tlg2018", # Eusebius of Caesarea
    "tlg2035", # Athanasius
    "tlg1220", # Barnabas (Epistle)
    "tlg1386", # Hippolytus
    "tlg1601", # Papias
]

sentences = build_greek_corpus()
print(f"total sentences: {len(sentences)}")
raw_tokenizer = train_tokenizer(sentences)

# We need the path for the load_dataset call in your function
corpus_path = "data/greek_corpus.txt" 
train_dataset = load_datasets(raw_tokenizer, corpus_path)


Found 2618 total XML files. Filtering by whitelist...


Added additional text from data/combined_text_NT.txt
total sentences: 186279





Map: 395954 examples [00:05, 37970.25 examples/s]          


In [7]:
import os

# 1. Save and Reload (The "Reset" trick)
# ---------------------------
# Create a temporary directory to store tokenizer files
os.makedirs("temp_tokenizer", exist_ok=True)
raw_tokenizer.save("temp_tokenizer/tokenizer.json")

# Reload it using the standard Transformers 'from_pretrained' logic
# This creates a 'clean' fast_tokenizer that doesn't have the direction bug
fast_tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="temp_tokenizer/tokenizer.json",
    bos_token="<s>",
    eos_token="</s>",
    unk_token ="<unk>",
    pad_token="<pad>",
    mask_token ="<mask;"
)

# Manually set tokens just to be safe
fast_tokenizer.pad_token = "<pad>"
fast_tokenizer.mask_token = "<mask>"

# 2. Setup Collator
# ---------------------------
data_collator = DataCollatorForLanguageModeling(
    tokenizer=fast_tokenizer, 
    mlm=True, 
    mlm_probability=0.15
)

vocab_size = len(fast_tokenizer)

# 3. Setup Model
# ---------------------------
config = BertConfig(
    vocab_size=vocab_size,
    max_position_embeddings=512,
    num_hidden_layers=6,
    num_attention_heads=12,
    intermediate_size=3072
)
model = BertForMaskedLM(config=config)


In [6]:
import torch
print(f"Is CUDA available? {torch.cuda.is_available()}")
print(f"Device Name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")
# 3. Training Arguments
# ---------------------------
training_args = TrainingArguments(
    output_dir="./GreekBERT",
    num_train_epochs=5,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    fp16=torch.cuda.is_available(), # Auto-detect GPU capability
    dataloader_num_workers=4,
    report_to="none",
)

# 4. Initialize and Run Trainer
# ---------------------------
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

trainer.train()

# 5. Save the final model
trainer.save_model("./GreekBERT_v3")

Is CUDA available? True
Device Name: NVIDIA GeForce RTX 3060


Step,Training Loss
500,7.393856


/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [5385,0,0], thread: [64,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [5385,0,0], thread: [65,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [5385,0,0], thread: [66,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [5385,0,0], thread: [67,0,0] Assertion `ind >=0 && ind < ind_dim_size && "vectorized gather kernel index out of bounds"` failed.
/pytorch/aten/src/ATen/native/cuda/IndexKernelUtils.cu:16: vectorized_gather_kernel: block: [5385,0,0], thread: [68,

AcceleratorError: CUDA error: device-side assert triggered
Search for `cudaErrorAssert' in https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html for more information.
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
