In [1]:
import pandas as pd

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import Counter
import re


In [4]:
print("CUDA available?       ", torch.cuda.is_available())
print("CUDA version:         ", torch.version.cuda)
print("Number of GPUs:       ", torch.cuda.device_count())
if torch.cuda.is_available():
    print("Current GPU:         ", torch.cuda.get_device_name(0))

CUDA available?        True
CUDA version:          12.8
Number of GPUs:        1
Current GPU:          NVIDIA GeForce RTX 3060


In [12]:
from cltk.data.fetch import FetchCorpus
corpus_downloader = FetchCorpus(language="grc")  # grc = ancient Greek
corpus_downloader.import_corpus("grc_models_cltk")

  return re.sub("\W+", "", string)
  """


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

file_path = 'data/combined_text.txt'

# Read and preprocess the text
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()

# Preprocess: lowercase, remove non-Greek characters (keeping Greek letters and spaces),
# then split into words.
text = text.lower()
text = re.sub(r'[^\u0370-\u03ff\s]', '', text)  # Keep Greek Unicode range
words = re.split(r'\s+', text.strip())

# Build vocabulary and word frequencies
word_counts = Counter(words)
vocab = sorted(word_counts.keys())
vocab_size = len(vocab)
word_to_ix = {word: i for i, word in enumerate(vocab)}
ix_to_word = {i: word for word, i in word_to_ix.items()}

# Prepare unigram distribution for negative sampling (raised to 0.75 power)
freq = np.array([word_counts[word] for word in vocab]) ** 0.75
prob = freq / freq.sum()

# Generate skip-gram pairs (center word as target, surrounding as context)
def generate_pairs(words, window_size=2):
    pairs = []
    for i in range(len(words)):
        for j in range(1, window_size + 1):
            if i - j >= 0:
                pairs.append((word_to_ix[words[i]], word_to_ix[words[i - j]]))
            if i + j < len(words):
                pairs.append((word_to_ix[words[i]], word_to_ix[words[i + j]]))
    return pairs

pairs = generate_pairs(words)

# Skip-Gram model with negative sampling
class SkipGramNeg(nn.Module):
    def __init__(self, vocab_size, embed_dim):
        super(SkipGramNeg, self).__init__()
        self.in_embed = nn.Embedding(vocab_size, embed_dim)
        self.out_embed = nn.Embedding(vocab_size, embed_dim)
        # Initialize embeddings
        self.in_embed.weight.data.uniform_(-1, 1)
        self.out_embed.weight.data.uniform_(-1, 1)

    def forward(self, target, context, noise):
        emb_t = self.in_embed(target)  # (batch_size, embed_dim)
        emb_c = self.out_embed(context)  # (batch_size, embed_dim)
        emb_n = self.out_embed(noise)  # (batch_size, num_neg, embed_dim)
        
        # Positive score: dot product
        score = torch.sum(emb_t * emb_c, dim=1)  # (batch_size)
        
        # Negative scores: dot products
        neg_score = torch.sum(emb_t.unsqueeze(1) * emb_n, dim=2)  # (batch_size, num_neg)
        
        return score, neg_score

# Training parameters
embed_dim = 100  # Embedding dimension (adjust as needed)
num_neg = 5  # Number of negative samples per positive
batch_size = 512  # Batch size
epochs = 5  # Number of epochs (increase for better results)
learning_rate = 0.001

# Initialize model, optimizer
model = SkipGramNeg(vocab_size, embed_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Batch generator
def generate_batches(pairs, batch_size):
    for i in range(0, len(pairs), batch_size):
        yield pairs[i:i + batch_size]

# Training loop
for epoch in range(epochs):
    total_loss = 0
    num_batches = 0
    for batch in generate_batches(pairs, batch_size):
        if len(batch) == 0:
            continue
        targets = torch.tensor([p[0] for p in batch], dtype=torch.long).to(device)
        contexts = torch.tensor([p[1] for p in batch], dtype=torch.long).to(device)
        noises   = torch.tensor(np.random.choice(vocab_size, size=(len(batch), num_neg), p=prob), 
                        dtype=torch.long).to(device)
        
        optimizer.zero_grad()
        score, neg_score = model(targets, contexts, noises)
        
        # Loss calculation (negative log likelihood)
        pos_loss = torch.log(torch.sigmoid(score))
        neg_loss = torch.sum(torch.log(torch.sigmoid(-neg_score)), dim=1)
        loss = -torch.mean(pos_loss + neg_loss)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        num_batches += 1
    
    avg_loss = total_loss / num_batches if num_batches > 0 else 0
    print(f"Epoch {epoch + 1}/{epochs}, Average Loss: {avg_loss:.4f}")

# After training, the input embeddings are the word vectors
embeddings = model.in_embed.weight.data.cpu().numpy()

# Example: Get embedding for a word (replace 'θεος' with an actual Greek word from your vocab)
example_word = 'θεος'  # Greek for 'God'
if example_word in word_to_ix:
    word_idx = word_to_ix[example_word]
    word_embedding = embeddings[word_idx]
    print(f"Embedding for '{example_word}': {word_embedding[:10]}...")  # Print first 10 dimensions
else:
    print(f"'{example_word}' not in vocabulary.")

# To save embeddings
np.save('greek_nt_embeddings.npy', embeddings)
with open('vocab.txt', 'w', encoding='utf-8') as f:
    for word in vocab:
        f.write(word + '\n')

Using device: cuda
Epoch 1/5, Average Loss: 7.5845
Epoch 2/5, Average Loss: 5.8092
Epoch 3/5, Average Loss: 4.7247
Epoch 4/5, Average Loss: 4.0308
Epoch 5/5, Average Loss: 3.5785
Embedding for 'θεος': [ 0.32356393 -0.55579644 -0.18890837 -0.7147516   0.08426335 -0.38437507
  0.49991208 -0.57248574 -0.7918278  -0.22177766]...


In [11]:
import numpy as np
from scipy.spatial.distance import cosine

# Load
embeddings = np.load('greek_nt_embeddings.npy')
with open('vocab.txt', encoding='utf-8') as f:
    vocab = [line.strip() for line in f]
word_to_ix = {w: i for i, w in enumerate(vocab)}

def most_similar(word, topn=4):
    if word not in word_to_ix:
        # Return an empty list so the loop just skips it, 
        # or [(f"'{word}' not found", 0.0)] to show an error in the loop
        print(f"Warning: '{word}' not found in vocabulary.")
        return [] 
    
    vec = embeddings[word_to_ix[word]]
    sims = [1 - cosine(vec, embeddings[i]) for i in range(len(embeddings))]
    
    most_sim_idx = np.argsort(sims)[::-1][1:topn+1]
    return [(vocab[i], sims[i]) for i in most_sim_idx]

# Try some theologically interesting words
print("Most similar to θεός (God):")
for w, sim in most_similar('θεος'):
    print(f"  {w:12} {sim:.4f}")

print("\nMost similar to ιησους (Jesus):")
for w, sim in most_similar('ιησους'):
    print(f"  {w:12} {sim:.4f}")

print("\nMost similar to πνευμα (spirit):")
for w, sim in most_similar('πνευμα'):
    print(f"  {w:12} {sim:.4f}")

Most similar to θεός (God):
  μηδ          0.4098
  πίνουσιν     0.3933
  ξελθόντος    0.3762
  παιδίσκην    0.3754

Most similar to ιησους (Jesus):

Most similar to πνευμα (spirit):
