# Cleaning

In [10]:
import os
import glob
from lxml import etree

whitelist_authors = [
    "tlg0543", # Polybius
    "tlg0527", # Septuagint
    "tlg0007", # Plutarch
    "tlg0557", # Epictetus
    "tlg0062", # Lucian
    "tlg0525", # Pausanius
    "tlg0057", # Galen
    "tlg0551", # Appian
    "tlg0526", # Josephus
]

def extract_clean_text(repo_path):
    # 1. Expand the ~ to /home/user or /Users/user
    repo_path = os.path.expanduser(repo_path)
    all_text = []
    
    # 2. Use a recursive glob to find all xml files
    search_pattern = os.path.join(repo_path, "data", "**", "*.xml")
    files = glob.glob(search_pattern, recursive=True)
    
    print(f"Found {len(files)} total XML files. Filtering by whitelist...")

    for xml_file in files:
        # 3. Get the folder name that contains the work (usually the author ID)
        # First1KGreek structure: data/tlg0007/tlg001/file.xml
        parts = xml_file.split(os.sep)
        
        # We look for the folder immediately following 'data'
        if 'data' in parts:
            data_index = parts.index('data')
            author_id = parts[data_index + 1]
        else:
            continue
        
        if author_id in whitelist_authors:
            try:
                tree = etree.parse(xml_file)
                root = tree.getroot()
                # Use a wildcard namespace if the URI varies
                # and target the <body> to avoid metadata/header text
                text_content = " ".join(root.xpath("//*[local-name()='body']//text()"))
                
                # Basic cleaning: remove extra whitespace
                clean_str = " ".join(text_content.split())
                if clean_str:
                    all_text.append(clean_str)
            except Exception as e:
                print(f"Error parsing {xml_file}: {e}")
                
    return all_text

path_to_greek_repo = "~/Documents/codespace/projects/First1KGreek"
all_text = extract_clean_text(path_to_greek_repo)
print(f"Extracted {len(all_text)} texts from the whitelist.")

Found 2618 total XML files. Filtering by whitelist...
Extracted 169 texts from the whitelist.


In [None]:
with open('data/combined_text_NT.txt', 'r', encoding='utf-8') as f:
    nt_greek = f.read()

In [12]:
all_text.append(nt_greek)

In [13]:
import re

def split_into_sentences(text_list):
    sentences = []
    # This regex looks for Greek punctuation: . ; (high dot) and ?
    # It also handles the Greek question mark (;) and high dot (·)
    terminal_punctuation = re.compile(r'([.;·?])\s+')
    
    for text in text_list:
        # Split by punctuation and keep the punctuation
        chunks = terminal_punctuation.split(text)
        # Re-join the punctuation with the sentence
        for i in range(0, len(chunks)-1, 2):
            sent = chunks[i] + chunks[i+1]
            if len(sent.strip()) > 5: # Ignore tiny fragments
                sentences.append(sent.strip())
    return sentences

# Process your list
all_sentences = split_into_sentences(all_text)
print(f"Total sentences: {len(all_sentences)}")

Total sentences: 197976


In [14]:
with open('data/greek_corpus.txt', 'w', encoding='utf-8') as f:
    f.write("\n".join(all_sentences))

In [15]:
len("\n".join(all_text))

23377805

# Tokenization

In [4]:
import torch

In [5]:
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer()

In [10]:
corpus_path = 'data/greek_corpus.txt'

tokenizer.train(
    files=[corpus_path], 
    vocab_size=30_000, 
    min_frequency=2, 
    show_progress=True,
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"]
)
tokenizer.save_model('.')






['./vocab.json', './merges.txt']

In [11]:
test_sentence = "ἐν ἀρχῇ ἦν ὁ λόγος"
encoded = tokenizer.encode(test_sentence)

print(f"Tokens: {encoded.tokens}")
print(f"IDs: {encoded.ids}")

Tokens: ['á¼ĲÎ½', 'Ġá¼ĢÏģÏĩá¿ĩ', 'Ġá¼¦Î½', 'Ġá½ģ', 'ĠÎ»ÏĮÎ³Î¿ÏĤ']
IDs: [994, 3797, 946, 398, 4121]


# Embeddings

In [12]:
from transformers import RobertaConfig, RobertaForMaskedLM

# Configure the architecture
config = RobertaConfig(
    vocab_size=30000,       # Must match your tokenizer vocab size
    max_position_embeddings=514, 
    num_attention_heads=12,
    num_hidden_layers=6,    # Half of BERT-base for speed/efficiency
    type_vocab_size=1,
)

# Initialize the model with random weights
# This is a "Fresh" model - it knows nothing yet!
model = RobertaForMaskedLM(config=config)

In [13]:
from transformers import RobertaTokenizer

# Try the slow tokenizer first to debug
tokenizer = RobertaTokenizer.from_pretrained(
    ".",
    max_len=512,
    bos_token="<s>",
    eos_token="</s>",
    sep_token="</s>",
    cls_token="<s>",
    unk_token="<unk>",
    pad_token="<pad>",
    mask_token="<mask>" # Fixed the semicolon here
)

# If that works, save it to generate the proper 'tokenizer.json' 
# which the Fast version prefers
tokenizer.save_pretrained("./fixed_tokenizer")

# Now load the Fast version
from transformers import RobertaTokenizerFast
tokenizer_fast = RobertaTokenizerFast.from_pretrained("./fixed_tokenizer")

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=True, 
    mlm_probability=0.15
)

In [14]:
from datasets import load_dataset

# 1. Load the raw text file
# This treats each line in your txt file as one 'row' in the dataset
raw_dataset = load_dataset("text", data_files={"train": "data/greek_corpus.txt"})

# 2. Tokenize the data
def tokenize_function(examples):
    # This handles the padding and truncation for each line
    return tokenizer(
        examples["text"], 
        truncation=True, 
        max_length=128, 
        padding="max_length"
    )

# map() runs the tokenization in parallel across your CPU cores
tokenized_datasets = raw_dataset.map(
    tokenize_function, 
    batched=True, 
    remove_columns=["text"],
    load_from_cache_file=False
)

# 3. Get the train split
train_dataset = tokenized_datasets["train"]

Generating train split: 197977 examples [00:00, 582986.55 examples/s]
Map: 100%|██████████| 197977/197977 [00:06<00:00, 31489.90 examples/s]


In [15]:
import torch
print(f"Is CUDA available? {torch.cuda.is_available()}")
print(f"Device Name: {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'None'}")

Is CUDA available? True
Device Name: NVIDIA GeForce RTX 3060


In [15]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./GreekBERT",
    num_train_epochs=5,
    per_device_train_batch_size=32, # Increase this if you have high VRAM (e.g., 12GB+)
    save_steps=10_000,
    save_total_limit=2,
    prediction_loss_only=True,
    
    # GPU Optimizations
    fp16=True,                # Uses half-precision for massive speedup
    dataloader_num_workers=4, # Uses multiple CPU cores to feed the GPU
    report_to="none",         # Keeps things simple, no external logging
)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator, # This still uses the mask_token logic from before
    train_dataset=train_dataset,
)

trainer.train()

Step,Training Loss
500,8.166827
1000,7.522499
1500,7.3268
2000,7.117297
2500,6.983081
3000,6.883788
3500,6.763104
4000,6.697385
4500,6.590288
5000,6.510223


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.43it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.54it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.28it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.25it/s]


TrainOutput(global_step=30935, training_loss=5.475901098699895, metrics={'train_runtime': 3952.0835, 'train_samples_per_second': 250.472, 'train_steps_per_second': 7.828, 'total_flos': 3.280467815958528e+16, 'train_loss': 5.475901098699895, 'epoch': 5.0})

In [16]:
# Save the model weights and configuration
trainer.save_model("./GreekBERT")

# Save the tokenizer specifically into the same folder
tokenizer.save_pretrained("./GreekBERT")

print("Model saved to ././GreekBERT")

Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.49it/s]

Model saved to ././GreekBERT





In [16]:
# Check this in your trainer output
print(f"Total steps expected: {trainer.state.max_steps}")

NameError: name 'trainer' is not defined

In [18]:
print(f"Number of sentences in list: {len(all_sentences)}")
print(f"Number of rows in train_dataset: {len(train_dataset)}")

Number of sentences in list: 197976
Number of rows in train_dataset: 197977


The loss is pretty high. Let's see if we can add more data.

In [25]:
results = fill_mask("ἐν <mask> ἦν ὁ λόγος")

print("Top 5 Greek word predictions (ignoring punctuation):")
count = 0
for res in results:
    token = res['token_str'].strip()
    # Skip common punctuation marks
    if token in [".", "·", ";", ",", "»", "«", "’", "΄", "·", '"']:
        continue
    print(f"Score: {res['score']:.4f} | Token: {token}")
    count += 1
    if count == 5: break

Top 5 Greek word predictions (ignoring punctuation):
Score: 0.0072 | Token: ὼν
Score: 0.0056 | Token: ὸς
Score: 0.0044 | Token: ὴς
Score: 0.0043 | Token: οῦσιν


In [28]:
from sentence_transformers import util

model.to("cuda")

# Helper to get the average embedding (Mean Pooling)
def get_embedding(text, model, tokenizer):
    device = next(model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to("cuda")
    with torch.no_grad():
        outputs = model.roberta(**inputs) # Use .roberta to get hidden states
    # Mean across the token dimension
    return outputs.last_hidden_state.mean(dim=1)

# Compare two words
emb1 = get_embedding("θεός", model, tokenizer)
emb2 = get_embedding("κύριος", model, tokenizer)
emb3 = get_embedding("μάχαιρα", model, tokenizer)

cos_sim_holy = util.cos_sim(emb1, emb2)
cos_sim_war = util.cos_sim(emb1, emb3)

print(f"Similarity (God/Lord): {cos_sim_holy.item():.4f}")
print(f"Similarity (God/Sword): {cos_sim_war.item():.4f}")

Similarity (God/Lord): 0.8566
Similarity (God/Sword): 0.8424


# Iteration


In [None]:
import torch
import os
from transformers import (
    RobertaConfig, RobertaForMaskedLM, RobertaTokenizer, 
    RobertaTokenizerFast, DataCollatorForLanguageModeling, 
    TrainingArguments, Trainer, pipeline
)
from datasets import load_dataset
from sentence_transformers import util

def run_greek_experiment(experiment_name, corpus_path, num_epochs=5, vocab_size=30000):
    print(f"--- Starting Experiment: {experiment_name} ---")
    
    # 1. Setup Architecture
    config = RobertaConfig(
        vocab_size=vocab_size,
        max_position_embeddings=514,
        num_attention_heads=12,
        num_hidden_layers=6,
        type_vocab_size=1,
    )
    model = RobertaForMaskedLM(config=config)
    
    # 2. Load Tokenizer (from your local files)
    tokenizer = RobertaTokenizer.from_pretrained(".", max_len=512)
    
    # 3. Prepare Dataset
    raw_dataset = load_dataset("text", data_files={"train": corpus_path})
    
    def tokenize_function(examples):
        return tokenizer(examples["text"], truncation=True, max_length=128, padding="max_length")
    
    tokenized_datasets = raw_dataset.map(
        tokenize_function, batched=True, remove_columns=["text"], load_from_cache_file=False
    )
    
    # 4. Collator & Training Args
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
    
    training_args = TrainingArguments(
        output_dir=f"./{experiment_name}",
        num_train_epochs=num_epochs,
        per_device_train_batch_size=32,
        save_total_limit=1,
        prediction_loss_only=True,
        fp16=True,
        dataloader_num_workers=4,
        report_to="none",
        logging_steps=500
    )
    
    # 5. Train
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    
    trainer = Trainer(
        model=model, args=training_args, 
        data_collator=data_collator, train_dataset=tokenized_datasets["train"]
    )
    
    trainer.train()
    
    # 6. Save
    model_path = f"./models/{experiment_name}"
    trainer.save_model(model_path)
    tokenizer.save_pretrained(model_path)
    
    return model, tokenizer, model_path

# --- UTILITY FOR TESTING ---
def test_semantic_domains(model, tokenizer):
    model.eval()
    def get_emb(text):
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True).to("cuda")
        with torch.no_grad():
            outputs = model.roberta(**inputs)
        return outputs.last_hidden_state.mean(dim=1)

    god = get_emb("θεός")
    lord = get_emb("κύριος")
    sword = get_emb("μάχαιρα")
    
    print(f"Similarity (God/Lord): {util.cos_sim(god, lord).item():.4f}")
    print(f"Similarity (God/Sword): {util.cos_sim(god, sword).item():.4f}")

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Experiment 1: The original data
model_v1, tok_v1, path_v1 = run_greek_experiment("Greek_v1_Base", "data/greek_corpus.txt", num_epochs=5)
test_semantic_domains(model_v1, tok_v1)


--- Starting Experiment: Greek_v1_Base ---


Map: 395954 examples [00:06, 29966.35 examples/s]          


Step,Training Loss
500,8.189638
1000,7.539497
1500,7.333592
2000,7.121885
2500,6.987069
3000,6.882024
3500,6.771735
4000,6.708227
4500,6.600252
5000,6.531779


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.45it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.59it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.43it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.46it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.43it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.45it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.42it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.46it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.44it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.45it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.42it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.47it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.37it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.36it/s]
Writing model shards: 100%|██████████| 1/1 [00:0

Similarity (God/Lord): 0.6471
Similarity (God/Sword): 0.7097
--- Starting Experiment: Greek_v2_Expanded ---


FileNotFoundError: Unable to find '/home/jon/Documents/codespace/projects/semantic_domain_for_biblical_greek/data/greek_corpus_large.txt'

In [3]:

# Experiment 2: More data + more epochs
# Let's say you saved a bigger file called 'data/greek_corpus_large.txt'
model_v2, tok_v2, path_v2 = run_greek_experiment("Greek_v2_Expanded", "data/greek_corpus.txt", num_epochs=15)
test_semantic_domains(model_v2, tok_v2)

--- Starting Experiment: Greek_v2_Expanded ---


Map: 395954 examples [00:06, 28966.81 examples/s]          


Step,Training Loss
500,8.178635
1000,7.532491
1500,7.326483
2000,7.111935
2500,6.973551
3000,6.87316
3500,6.751871
4000,6.683003
4500,6.566755
5000,6.486512


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.48it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.78it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.74it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.78it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.78it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.75it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.74it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.79it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.74it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.80it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.73it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.80it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.55it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.82it/s]
Writing model shards: 100%|██████████| 1/1 [00:0

Similarity (God/Lord): 0.6973
Similarity (God/Sword): 0.6839



