In [49]:
import os
import warnings
from datetime import datetime
from time import time

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

warnings.filterwarnings("ignore")

In [69]:
# Record the wall-clock start time; the last cell subtracts it from time() to
# report the elapsed minutes.
# NOTE(review): the reported duration is only meaningful if this cell runs
# before the rest of the notebook (Restart Kernel -> Run All).
time_start = time()

# Preprocessing

## Data Loading

In [63]:
def load_epigraph_data(file_path):
    """Read the tab-separated epigraph dataset into a DataFrame.

    Parameters
    ----------
    file_path : str or path-like
        Location of the tab-separated file.

    Returns
    -------
    pandas.DataFrame or None
        The parsed table, or ``None`` when the file does not exist (a message
        is printed in that case instead of raising).
    """
    try:
        frame = pd.read_csv(file_path, sep='\t')
    except FileNotFoundError:
        # Best-effort loader: report the problem and let the caller decide.
        print(f"File not found: {file_path}")
        frame = None
    return frame

# Location of the tab-separated epigraph dataset.
# Set the EPIGRAPH_DATA_PATH environment variable to point at your own copy;
# the original author's path is kept as the fallback so existing setups keep
# working without any change.
file_path = os.environ.get(
    'EPIGRAPH_DATA_PATH',
    '/Users/ioannisvelgakis/Desktop/EpigraphRestoration/data.csv',
)

df = load_epigraph_data(file_path)

# load_epigraph_data returns None (after printing the path) when the file is
# missing, so report the outcome explicitly before later cells use `df`.
if df is not None:
    print("Data loaded successfully")
else:
    print("Data loading failed")

Data loaded successfully


In [64]:
# Preview the first rows to check that the columns were parsed as expected.
df.head()

Unnamed: 0,id,text,metadata,region_main_id,region_main,region_sub_id,region_sub,date_str,date_min,date_max,date_circa
0,315181,[φ]ιλεταιρος ευμενου περγαμευς μουσαις. καφισι...,Boiotia — Thespiai — mid-3rd c. BC — BCH 26 (1...,1698,Central Greece (IG VII-IX),1691,"Megaris, Oropia, and Boiotia (IG VII)",mid-3rd c. BC,-275.0,-226.0,0.0
1,201686,μαλκοιδων ηρωνος.,"Crete, W. — Tarrha — 1st-3rd c. AD — IC II xxi...",1699,"Aegean Islands, incl. Crete (IG XI-[XIII])",474,Crete,1st-3rd c. AD,1.0,300.0,0.0
2,153178,βασιλικος.,Makedonia (Bottiaia) — Pella — 3rd/2nd c. BC —...,1692,Northern Greece (IG X),1485,Macedonia,3rd/2nd c. BC,-300.0,-101.0,0.0
3,28582,αισκλαπιει μ [ανεθεκε --].,Epidauria — Epidauros — sinistr. — 6th/5th c. BC,1690,Peloponnesos (IG IV-[VI]),1643,"Epidauria (IG IV²,1)",6th/5th c. BC,-600.0,-401.0,0.0
4,333620,[---]ος αν[εθηκε δαματρι].,Italia — Herakleia (Policoro) — late 4th/early...,1696,"Sicily, Italy, and the West (IG XIV)",1689,"Italy, incl. Magna Graecia",late 4th/early 3rd c. BC,-350.0,-251.0,0.0


In [65]:
def filter_epigraphs_by_region(df, region_id):
    """Return the rows of ``df`` whose ``region_main_id`` equals ``region_id``.

    The original index labels are preserved; the input frame is not modified.
    """
    in_region = df['region_main_id'].eq(region_id)
    return df.loc[in_region]
# Keep only the inscriptions whose region_main_id is 1683.
# NOTE(review): the region this id stands for is not shown in the notebook —
# TODO name it here or move the id to a constant.
filtered_epigraphs = filter_epigraphs_by_region(df, 1683)

In [66]:
# Inscription texts of the selected region as a plain Python list.
epigraph_texts = filtered_epigraphs['text'].tolist()
# Damaged inscription to restore; "[---]" presumably marks the missing text on
# either side of the surviving words (same bracket notation as the corpus).
target_epigraph = "[---] αλεξανδρε ουδις [---]"
# Show the target next to the first five corpus texts as a quick sanity check.
print(f'Target epigraph: {target_epigraph}\n epigraph samples: {epigraph_texts[:5]}')

Target epigraph: [---] αλεξανδρε ουδις [---]
 epigraph samples: ['[οσον] επαρκ[ει].', 'α[---]υς και [γυ]ν[η] φω[κ]αρια χαιρετε.', '[αφ]ροδιτηι συριηι μητρω.', 'αγαθηι τυχηι. δημοκρατην αριστογενους προεδρον [μεν εισηγη]σαμενον ενδοξως [και δημηγ]ορησαντα δις δ α[ρξαντα την] [λαμπρο]τατην α[ρχην και πλεονακις] πρεσβευ[σ]αν[τα παρα τους] [σεβα]στους προικα υπερ τω[ν συμφεροντ]ων της πατριδος και τε[ιμηθε]ντα υπο της πολεως αιδιω[ι] [αναγ]ορευσει θιασαρχησαν[τα και] [πασ]αν λειτουργιαν τελεσαν[τα] [πολ]ειτευσαμενον εν πασιν αγνως τον κτιστην και ασυνκ[ρι]τον και φιλοπατριν η κρατιστη βουλη και ο ιερωτατος δημος ευνοιας χαριν.', '[---------- εν τω] ετει υπ[ερ -----] κατεσκε[υασε --- τον πυργ]γον τη π[ολει].']


In [67]:
# Number of inscriptions left after the region filter.
len(epigraph_texts)

101

## Text Encoding (Tokenization / Vectorization)

In [59]:
# (min_n, max_n) passed to the vectorizer as `ngram_range`; (1, 1) keeps single tokens only.
NGRAM_RANGE = (1, 1) 
# Upper bound on the vocabulary size, passed to the vectorizer as `max_features`.
TOP_K = 1678
# Passed to the vectorizer as `analyzer`: features are built from words.
TOKEN_MODE = 'word'



In [60]:
def vectorize_texts(train_texts):
    """Fit a TF-IDF vectorizer on ``train_texts`` and build a token index.

    The vectorizer is configured from the module-level constants
    ``NGRAM_RANGE``, ``TOKEN_MODE`` and ``TOP_K``.

    Parameters
    ----------
    train_texts : iterable of str
        Texts used to fit the vocabulary and the IDF weights.

    Returns
    -------
    tuple
        ``(dataOut, tokens, vectorizer, token_to_int)`` where ``dataOut`` is
        the TF-IDF matrix of ``train_texts``, ``tokens`` the fitted feature
        names, ``vectorizer`` the fitted ``TfidfVectorizer`` and
        ``token_to_int`` a dict mapping each token to a 1-based integer id.
    """
    vectorizer = TfidfVectorizer(
        ngram_range=NGRAM_RANGE,
        dtype='float32',
        strip_accents='unicode',
        decode_error='replace',
        analyzer=TOKEN_MODE,
        max_df=0.9,
        norm='l2',
        max_features=TOP_K,
    )
    dataOut = vectorizer.fit_transform(train_texts)
    tokens = vectorizer.get_feature_names_out()

    # Ids are 1-based and run from 1 to len(tokens), which is at most TOP_K.
    token_to_int = dict(zip(tokens, range(1, len(tokens) + 1)))

    return dataOut, tokens, vectorizer, token_to_int

In [61]:
# Fit the TF-IDF vectorizer on the region's inscriptions.
train_texts = epigraph_texts
dataOut, tokens, vectorizer, token_to_int = vectorize_texts(train_texts)

# Print the mapping of tokens to integers
# NOTE(review): this dumps the whole vocabulary dict into the cell output.
print(token_to_int)

{'αγ': 1, 'αγα': 2, 'αγαθ': 3, 'αγαθαι': 4, 'αγαθανγελ': 5, 'αγαθη': 6, 'αγαθηι': 7, 'αγαθον': 8, 'αγαθων': 9, 'αγαλματι': 10, 'αγαμηστορι': 11, 'αγε': 12, 'αγεν': 13, 'αγεπολ': 14, 'αγεπολεως': 15, 'αγνως': 16, 'αγοραν': 17, 'αγορασαντας': 18, 'αγορασαντος': 19, 'αγορασαντων': 20, 'αγοσ': 21, 'αγουσι': 22, 'αγρωνος': 23, 'αγυαιο': 24, 'αδελφ': 25, 'αδελφεος': 26, 'αεαυτο': 27, 'αθαναιος': 28, 'αθηνα': 29, 'αθηναιος': 30, 'αθηνο': 31, 'αθροων': 32, 'αθυμως': 33, 'αι': 34, 'αιδιω': 35, 'αιμ': 36, 'αιμιλιανος': 37, 'αιο': 38, 'αιρεθεις': 39, 'ακρ': 40, 'ακροπο': 41, 'ακυλου': 42, 'αλαι': 43, 'αλεξανδρος': 44, 'αληθειας': 45, 'αλλ': 46, 'αλλα': 47, 'αλλους': 48, 'αλωτον': 49, 'αμ': 50, 'αμα': 51, 'αμαξιτογ': 52, 'αμβιωνος': 53, 'αμενου': 54, 'αμφοτερα': 55, 'αμφοτερους': 56, 'αν': 57, 'ανα': 58, 'αναγ': 59, 'αναγινωσκοντων': 60, 'αναγραφαμεν': 61, 'αναγραψαι': 62, 'αναζευξιν': 63, 'αναλωμα': 64, 'αναλωματων': 65, 'αναλωσας': 66, 'αναπραθεισης': 67, 'αναστροφην': 68, 'ανδ': 69, 'ανε': 70, 

In [62]:
# Show the TF-IDF matrix returned by vectorize_texts; the printed entries are
# (row, column) pairs followed by the weight.
print(dataOut)

  (0, 361)	0.5144092
  (0, 417)	0.6317904
  (0, 855)	0.57984835
  (1, 1315)	0.37497544
  (1, 120)	0.46053976
  (1, 1311)	0.46053976
  (1, 245)	0.42267695
  (1, 613)	0.21082458
  (1, 1276)	0.46053976
  (2, 757)	0.52095616
  (2, 1147)	0.52095616
  (2, 1070)	0.47812623
  (2, 181)	0.47812623
  (3, 1319)	0.09098094
  (3, 471)	0.107432716
  (3, 288)	0.100604616
  (3, 582)	0.11705639
  (3, 217)	0.107432716
  (3, 663)	0.11705639
  (3, 1298)	0.11705639
  (3, 1061)	0.11705639
  (3, 155)	0.11705639
  (3, 666)	0.11705639
  (3, 1219)	0.17464438
  (3, 15)	0.11705639
  :	:
  (98, 463)	0.20940278
  (98, 842)	0.18937165
  (98, 1218)	0.2236151
  (98, 137)	0.2236151
  (98, 215)	0.20940278
  (98, 71)	0.2236151
  (98, 201)	0.2236151
  (98, 609)	0.16934052
  (98, 271)	0.18175617
  (98, 1367)	0.17515934
  (98, 613)	0.22307135
  (99, 1148)	0.70710677
  (99, 916)	0.70710677
  (100, 117)	0.3095699
  (100, 600)	0.3095699
  (100, 656)	0.3095699
  (100, 895)	0.3095699
  (100, 488)	0.3095699
  (100, 890)	0.3095699


# Genetic Algorithm

In [None]:
def create_initial_population(dictionary_size, population_size, individual_size=2):
    """Create the random starting population for the genetic algorithm.

    Each individual is a list of ``individual_size`` distinct token ids drawn
    uniformly from ``1..dictionary_size`` (ids are 1-based).

    Parameters
    ----------
    dictionary_size : int
        Number of tokens in the vocabulary.
    population_size : int
        Number of individuals to create.
    individual_size : int, optional
        Number of token ids per individual. Defaults to 2, the length that
        was previously hard-coded.

    Returns
    -------
    list of list of int
        The generated population.

    Raises
    ------
    ValueError
        If individuals are requested but the vocabulary holds fewer than
        ``individual_size`` tokens, so distinct ids cannot be drawn.
    """
    if population_size > 0 and dictionary_size < individual_size:
        raise ValueError(
            f"dictionary_size ({dictionary_size}) must be at least "
            f"individual_size ({individual_size}) to draw distinct token ids"
        )

    token_ids = range(1, dictionary_size + 1)
    population = [
        # replace=False keeps the ids within one individual distinct.
        np.random.choice(token_ids, individual_size, replace=False).tolist()
        for _ in range(population_size)
    ]
    print("Initial Population:", population)  # Debug
    return population


def fitness_function(individual, target_vector, vectorizer, tokens):
    """Score an individual by its cosine similarity to the target epigraph.

    Parameters
    ----------
    individual : sequence of int
        1-based token ids.
    target_vector
        Vector of the target epigraph produced by ``vectorizer.transform``.
    vectorizer
        Fitted vectorizer used to embed the candidate words.
    tokens
        Sequence of vocabulary tokens; id ``k`` refers to ``tokens[k - 1]``.

    Returns
    -------
    The cosine similarity between the target vector and the vector of the
    candidate words joined by single spaces.
    """
    # Ids are 1-based, the token sequence is 0-based.
    words = [tokens[gene - 1] for gene in individual]
    candidate_vector = vectorizer.transform([" ".join(words)])
    fitness = cosine_similarity(target_vector, candidate_vector)[0][0]
    print(f"Individual: {individual}, Words: {words}, Fitness: {fitness}")
    return fitness

def selection(population, fitnesses):
    """Pick the parents of the next generation by fitness-proportionate sampling.

    ``len(population)`` individuals are drawn with replacement; each one is
    chosen with probability proportional to its fitness. When no usable
    weighting exists (every fitness is zero, or the total is not finite) the
    draw falls back to a uniform one instead of failing on NaN probabilities.

    Parameters
    ----------
    population : list
        Current individuals.
    fitnesses : sequence of float
        Fitness of each individual, aligned with ``population``.

    Returns
    -------
    list
        The selected individuals (the same objects as in ``population``;
        one individual may appear several times).
    """
    if len(population) == 0:
        selected = []
        print(selected)
        return selected

    # Negative weights are not valid probabilities; clamp them to zero.
    weights = np.clip(np.asarray(fitnesses, dtype=float), 0.0, None)
    total = weights.sum()

    if np.isfinite(total) and total > 0:
        probabilities = weights / total
    else:
        # All-zero (or NaN/inf) fitnesses: p=None makes the draw uniform.
        probabilities = None

    selected_indices = np.random.choice(len(population), len(population), p=probabilities)
    selected = [population[i] for i in selected_indices]
    print(selected)
    return selected

def crossover(parent1, parent2):
    """Single-point crossover of two parents.

    A cut index is drawn uniformly from ``0 .. n - 1`` where ``n`` is the
    length of the shorter parent; the children swap everything from the cut
    onward. For the 2-gene individuals used in this notebook this is the same
    draw as before (cut 0 or 1), but longer individuals can now be cut at any
    position instead of only at index 0 or 1.

    Parameters
    ----------
    parent1, parent2 : list
        Parent individuals; they are not modified.

    Returns
    -------
    tuple of list
        ``(child1, child2)`` as new lists.
    """
    length = min(len(parent1), len(parent2))
    if length == 0:
        # Nothing to exchange; return copies so callers can mutate them freely.
        return list(parent1), list(parent2)

    crossover_point = np.random.randint(0, length)
    child1 = parent1[:crossover_point] + parent2[crossover_point:]
    child2 = parent2[:crossover_point] + parent1[crossover_point:]
    return child1, child2

def mutate(individual, dictionary_size, mutation_rate):
    """Randomly replace genes of ``individual`` in place.

    Each position is independently replaced, with probability
    ``mutation_rate``, by a random token id in ``1..dictionary_size``.

    Parameters
    ----------
    individual : list of int
        Individual to mutate; it is modified in place.
    dictionary_size : int
        Largest valid token id.
    mutation_rate : float
        Per-gene probability of replacement.

    Returns
    -------
    list of int
        The same list object that was passed in.
    """
    exclusive_upper = dictionary_size + 1
    for position, _ in enumerate(individual):
        if not (np.random.rand() < mutation_rate):
            continue
        individual[position] = np.random.randint(1, exclusive_upper)
    return individual

def genetic_algorithm(epigraphs, target_epigraph, population_size=100, generations=1000, mutation_rate=0.01):
    """Search for the vocabulary words that best match a damaged epigraph.

    The corpus is vectorized with ``vectorize_texts``; individuals are lists
    of 1-based token ids and are scored with ``fitness_function`` against the
    vector of ``target_epigraph``. Each generation applies ``selection``,
    ``crossover`` and ``mutate``.

    Parameters
    ----------
    epigraphs : list of str
        Corpus used to fit the vectorizer and define the vocabulary.
    target_epigraph : str
        Text of the epigraph to restore.
    population_size : int, optional
        Number of individuals per generation.
    generations : int, optional
        Maximum number of generations to run.
    mutation_rate : float, optional
        Per-gene mutation probability.

    Returns
    -------
    list
        Words of the fittest individual evaluated in any generation, or an
        empty list when nothing was evaluated (``generations <= 0`` or an
        empty population).
    """
    # vectorize_texts returns four values; the matrix and the token->id dict
    # are not needed here.
    _, tokens, vectorizer, _ = vectorize_texts(epigraphs)
    target_vector = vectorizer.transform([target_epigraph])
    dictionary_size = len(tokens)

    population = create_initial_population(dictionary_size, population_size)

    # Track the best individual together with the fitness it was scored with,
    # so the result never pairs one generation's scores with another
    # generation's individuals.
    best_individual = None
    best_fitness = float("-inf")

    for generation in range(generations):
        if not population:
            break

        fitnesses = [fitness_function(individual, target_vector, vectorizer, tokens) for individual in population]
        print(f"Generation {generation}: Fitnesses = {fitnesses}")

        generation_best = int(np.argmax(fitnesses))
        if fitnesses[generation_best] > best_fitness:
            best_fitness = fitnesses[generation_best]
            # Copy: mutate() edits individuals in place.
            best_individual = list(population[generation_best])

        # Tolerance instead of == 1.0: float rounding can leave a perfect
        # match slightly below 1.
        if best_fitness >= 1.0 - 1e-6:
            break

        parents = selection(population, fitnesses)
        next_population = []

        # Pair parents two by two; stop before an unpaired last parent.
        for i in range(0, len(parents) - 1, 2):
            child1, child2 = crossover(parents[i], parents[i + 1])
            next_population.extend([mutate(child1, dictionary_size, mutation_rate), mutate(child2, dictionary_size, mutation_rate)])

        if len(parents) % 2 == 1:
            # Odd population size: carry the leftover parent over (as a copy)
            # with mutation only, keeping the population size constant.
            next_population.append(mutate(list(parents[-1]), dictionary_size, mutation_rate))

        population = next_population

    if best_individual is None:
        return []

    best_words = [tokens[i - 1] for i in best_individual]
    return best_words

In [None]:
if __name__ == "__main__":
    # End-to-end run: load the corpus, keep one region, and let the genetic
    # algorithm propose words for the damaged target epigraph.
    file_path = 'data.csv'
    df = load_epigraph_data(file_path)
    if df is None:
        # load_epigraph_data returns None for a missing file; fail here with
        # a clear message instead of a TypeError inside the region filter.
        raise FileNotFoundError(f"Cannot run the genetic algorithm: could not load {file_path!r}")

    filtered_epigraphs = filter_epigraphs_by_region(df, 1683)
    epigraph_texts = filtered_epigraphs['text'].tolist()
    if not epigraph_texts:
        raise ValueError("No epigraphs found for region_main_id 1683; nothing to fit the vectorizer on")

    target_epigraph = "[...] αλεξανδρε ουδις [...]"

    restored_words = genetic_algorithm(epigraph_texts, target_epigraph)
    print("Restored words:", restored_words)

# Evaluation

In [70]:
# Report the wall-clock time elapsed since `time_start` was set, in minutes.
print(f'Took {(time() - time_start) / 60:.2f} minutes!')

Took 0.05 minutes!
