In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import random
import pandas as pd
import numpy as np
from pathlib import Path

In [3]:
# fix seeds, make everything reproducible, etc (at least try to).
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

DATADIR = "../data"

MAX_WORDS = 4

In [4]:
# !pip install aicrowd-cli
%load_ext aicrowd.magic

In [5]:
%aicrowd login

[32mAPI Key valid[0m
[33mGitlab oauth token invalid or absent.
It is highly recommended to simply run `aicrowd login` without passing the API Key.[0m
[32mSaved details successfully![0m


In [6]:
import re
if Path(DATADIR).exists():
  !rm -rf $DATADIR
!mkdir $DATADIR
%aicrowd ds dl -c htrec-2022 -o $DATADIR

train.csv:   0%|          | 0.00/395k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/45.5k [00:00<?, ?B/s]

In [7]:
import pywer
train_df = pd.read_csv( f"{DATADIR}/train.csv")
test_df = pd.read_csv(f"{DATADIR}/test.csv")

word_regex = re.compile("\W+")
word_regex2 = re.compile("(\W+)")

train_df.head()

Unnamed: 0,HUMAN_TRANSCRIPTION,SYSTEM_TRANSCRIPTION,CENTURY,IMAGE_PATH,TEXT_LINE_NUM
0,ἐγγινομένα πάθη μὴ σβεννύντες ἀλλὰ τῆ εκλύσει,ἐγγενομεναπαδημησμεννωτες ἀλλατῆε κλησει,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,1
1,τοῦ βίου τοῦ καθ ΄ εαυτοὺς πολλὰ γίνεσθαι συγχ...,του β ου του καλεαυτοὺς πολλαγινεσθαι συγχωρ όν,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,2
2,τες ἐμπυρίζουσι τὸν ἀμπελῶνα ἀλλὰ καὶ ὁ διὰ,τες εμπυριζου σιμαμπελῶνα ἀλλακαι ὅδξα,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,3
3,τῆς ἡδεῖας πλεονεξίας πολλοὺς εἰς τὴν τῶν ἀλλ,της ἐδίας πλσον ἐξιας πολλους ἐις τὴν τῶν ἀλ,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,4
4,οτρίων ἐπιθυμίαν προκαλούμενος ἐμπυρί,λοτρλων ἐπιθυμιαν προκαλουμένος ἐμπυρι,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,5


In [8]:
# !pip install nltk

In [9]:
for lib_dir in ["..", "../src"]:
    if not lib_dir in sys.path:
        sys.path.append(lib_dir)
from lm_utils import *

In [10]:
language_model = make_lm(train_df.HUMAN_TRANSCRIPTION.sum())

In [11]:
from tqdm.auto import tqdm
from datastruct import *
from common import *
from space_fixer import SpaceFixer

# for ht_line, mt_line in tqdm(train_df[["HUMAN_TRANSCRIPTION", "SYSTEM_TRANSCRIPTION"]].values[7:]):
#     ht_words = word_regex.split(ht_line)
#     mt_words = word_regex.split(mt_line)
#     #words_ = [remove_cap(word) for word in words]
#     #vocab.add_sentence(words)
#     dmatrix = build_path_matrix(mt_words, vocabs)
#     finished_paths = extract_paths(dmatrix)
#     for k in resplit_paths(finished_paths, mt_words):
#         variant = " ".join(k)
#         print(variant)
#     break
CUTOFF = 3

In [12]:
def extract_spaces_dict(x):
    result = dict()
    last_index = 0
    for w in word_regex2.split(x):
        if not word_regex.match(w):
            last_index += 1
        else:
            result[last_index] = w
    if not last_index in result:
        result[last_index] = ""
    return result
    

In [13]:

train_df["SYSTEM_TRANSCRIPTION_raw"] = train_df.SYSTEM_TRANSCRIPTION
train_df["SYSTEM_TRANSCRIPTION"] = train_df.SYSTEM_TRANSCRIPTION.apply(lambda x: lmr(x, lm=language_model))

ht_sequences_train = train_df.HUMAN_TRANSCRIPTION.apply(lambda x: word_regex.split(x)).values
mt_sequences_train = train_df.SYSTEM_TRANSCRIPTION.apply(lambda x: word_regex.split(x)).values
mt_spaces_train = train_df.SYSTEM_TRANSCRIPTION.apply(extract_spaces_dict).values

mt_texts_train = train_df.SYSTEM_TRANSCRIPTION.values
fixer = SpaceFixer(MAX_WORDS)
fixer.fill(ht_sequences_train)

  0%|          | 0/1875 [00:00<?, ?it/s]

In [14]:
additional_corpus = pd.read_csv("../mgc.csv")
additional_corpus.head()
additional_corpus.loc[additional_corpus.text.isnull()]


Unnamed: 0,filename,text,genre
18,Greek_Medieval_Corpus/Ποιητικά-Λογοτεχνικά/poe...,,Ποιητικά-Λογοτεχνικά/poetry
25,Greek_Medieval_Corpus/Ποιητικά-Λογοτεχνικά/poe...,,Ποιητικά-Λογοτεχνικά/poetry


In [15]:
sep_regex = re.compile("<[^>]+>")

sentence_regex = re.compile("[.!?]")
def split_to_sentences(text):
    if len(set(sep_regex.findall(text)) | {"<NEWLINE>", "<NEWPARAGRAPH>"}) > 2:
        return
    for text_block in sep_regex.split(text):
        for sentence in sentence_regex.split(text_block):
            words = [w for w in word_regex.split(sentence) if len(w) > 0]
            if len(words) < 1:
                continue
            if len(words) > 20:
                continue
            has_short_words = np.any([len(w1) == 1 and len(w2) == 1 for w1, w2 in zip(words, words[1:])])
            if has_short_words:
                continue
            yield words
additional_corpus = additional_corpus.loc[~additional_corpus.text.isnull()]

# set([x for xs in additional_corpus.text.apply(lambda x: sep_regex.findall(x)).values for x in xs])
additional_corpus['cleaned_sentences'] = additional_corpus.text.apply(lambda x: list(split_to_sentences(x)))
#additional_corpus.text.isnull().mean()
flatten_sentences = [sentence for sentences in additional_corpus.cleaned_sentences.values for sentence in sentences if len(sentence) > 0]

fixer.fill(flatten_sentences)

  0%|          | 0/51745 [00:00<?, ?it/s]

In [16]:
# additional_corpus.text.apply(lambda x: set(sep_regex.findall(x)) )
len(fixer.vocabs[1].words)

80973

In [17]:
flatten_sentences[:3]

[['ΜΕΤΑ', 'ΓΟΥΝ', 'ΑΛΛΑ', 'ΤΑ', 'ΠΟΛΛΑ', 'ΤΩΝ', 'ΕΡΩΤΟΧΑΡΙΤΩΝ'],
 ['ΟΣΑ', 'ΜΑΝΘΑΝΕΙ', 'ΦΥΣΙΚΩΣ', 'ΕΡΩΤΙΚΗ', 'ΚΑΡΔΙΑ'],
 ['ΕΣΕΒΗΣΑΝ', 'ΕΙΣ', 'ΤΟ', 'ΛΟΥΤΡΟΝ', 'ΕΛΟΥΣΘΗΣΑΝ', 'ΕΚΕΙΝΟΙ']]

In [18]:
from common import correct_sigmas_in_word
from common import fix_accent_diphthong

def join_if_tuple(s):
    if isinstance(s, str):
        return s
    return " ".join([(x) for x in s])

def postprocess_spaces(spaces_after):
    #return spaces_after
    if len(spaces_after) < 2:
        return spaces_after
    if spaces_after[-1]!=".":
        spaces_after[-1] = ''
    return spaces_after


corrected_texts = []
# mt_sequences_train = mt_sequences_train[-1:]
# mt_spaces_train = mt_spaces_train[-1:]
# mt_texts_train = mt_texts_train[-1:]
# ht_sequences_train = ht_sequences_train[-1:]
corrected_count = 0

train_iter = zip(
    mt_sequences_train, 
    mt_spaces_train, 
    mt_texts_train, 
    ht_sequences_train)
for i, (mt_words, line_spaces, mt_orig, ht_orig) in enumerate(tqdm(train_iter, total=len(mt_sequences_train))):
    # print(line_spaces)
    # print(mt_words)
    # mt_orig = " ".join(mt_words)
    best = mt_orig
    # temporary disable the following lines
    replacements = [
        [join_if_tuple(w) + s for w, s in zip(mt_split, postprocess_spaces(spaces_after))]
        for mt_split, refs, spaces_after in fixer.split_words(mt_words, line_spaces, cutoff=0)
    ]
    replacements = ["".join(words) for words in replacements]

    #replacements = [
    #    ("".join([w + s for w, s in zip(mt_split, spaces_after)]), spaces_after) 
    #    
    #]
    # replacements, spaces_after = list(zip(*replacements))
    # replacements = list(replacements)
    if len(replacements) > 2:
        print(i, len(replacements))
        print("mt orig", mt_orig)
        print(replacements)
        # print(spaces_after)
        print(line_spaces)
        # break
    
    best, is_corrected = lm_score(mt_orig, replacements, lm=language_model, return_corrected=True)
    if is_corrected:
        corrected_count += 1
    corrected_texts.append(best)
    # break

  0%|          | 0/1875 [00:00<?, ?it/s]

703 3
mt orig δων δὲοις ότι επισυντρες
['δων δὲ οις ότι επισυντρες', 'δων δὲ ο ις ότι επισυντρες', 'δων δὲ οι ς ότι επισυντρες']
{1: ' ', 2: ' ', 3: ' ', 4: ''}
848 3
mt orig φερουσιν ιδων δεοις ἡγα
['φερουσιν ιδων δε οις ἡγα', 'φερουσιν ιδων δε ο ις ἡγα', 'φερουσιν ιδων δε οι ς ἡγα']
{1: ' ', 2: ' ', 3: ' ', 4: ''}


In [19]:
# for mt_split, refs, spaces_after in fixer.split_words(mt_words, line_spaces, cutoff=5):
#     print(mt_split, refs, spaces_after)
# print(ht_orig)

In [20]:
unicodedata.name("σ"), unicodedata.name("ς")
corrected_count
# fixer.split_matrix
# best, is_corrected = lm_score(mt_orig, replacements, lm=language_model, return_corrected=True)
# best, ht_orig

494

In [21]:
K = 6
train_iter = zip(
    mt_sequences_train[K:],
    mt_spaces_train[K:],
    mt_texts_train[K:],
    ht_sequences_train[K:]
)

for i, (mt_words, line_spaces, mt_orig, ht_orig) in enumerate(tqdm(train_iter, total=len(mt_sequences_train[K:]))):
    break
print("mt_original: ", mt_words)
print("ht:", ht_orig)
for mt_split, refs, spaces_after in fixer.split_words(mt_words, line_spaces, cutoff=0):
    print(mt_split)



  0%|          | 0/1869 [00:00<?, ?it/s]

mt_original:  ['γη', 'του', 'πτωχουεντοις', 'οικοιςχμῶν', 'ωστεκακα']
ht: ['γὴ', 'τοῦ', 'πτωχοῦ', 'ἐν', 'τοῖς', 'οἴκοις', 'ὑμῶν', 'ὥστε', 'καὶ', 'ἄ']
(('γη',), ('του',), ('πτωχου', 'εν', 'τοις'), 'οικοιςχμῶν', 'ωστεκακα')


In [22]:
# for mt_split, refs, spaces_after in fixer.resplit(mt_words, line_spaces):
#     print(mt_split, spaces_after, line_spaces)

# ht_orig, mt_orig

    

# [postprocess_sigmas(t) for t in corrected_texts]

In [23]:
# print(ht_sequences_train[-1])
#print("mt:", mt_words)
#for k, v, spaces_after in fixer.split_words(mt_words, line_spaces):
#    print(k, v, spaces_after)

In [24]:
# fixer.dmatrix
#fixer.split_matrix

In [25]:
from metrics import compute_metrics

In [26]:
ht_texts = train_df.HUMAN_TRANSCRIPTION.values
mt_texts = train_df.SYSTEM_TRANSCRIPTION_raw.values
ct = [lmr(t, lm=language_model) for t in corrected_texts]
ct2 = [fix_accent_diphthong(t) for t in ct]
ct2 = [lmr(t, word=" δ ", replacements=["δ ", " δ", " "], lm=language_model) for t in ct]
print("Unmodified:")
cerr_values_ht, cerr_values = compute_metrics(ht_texts, mt_texts, corrected_texts)
print("Corrected sigmas:")
cerr_values_ht1, cerr_values1 = compute_metrics(ht_texts, mt_texts, ct)
print("corrected deltas:")
cerr_values_ht2, cerr_values2 = compute_metrics(ht_texts, mt_texts, ct2)

Unmodified:
Candidate CER: 33.5209896862459
Candidate CERR: 0.7340852910467
Corrected sigmas:
Candidate CER: 33.5209896862459
Candidate CERR: 0.7340852910467
corrected deltas:
Candidate CER: 33.52234104599725
Candidate CERR: 0.7327339312953401


In [27]:
diff_cer = (cerr_values_ht - cerr_values)
idx = np.argsort(diff_cer)
for i in idx[:5]:
    print(diff_cer[i])
    print(repr(ct[i]))
    print(repr(ht_texts[i]), repr(mt_texts[i]))

-5.555555555555557
'τοῦ δυσιν εσαιλόγος σξωην ἐκρ'
'τοῦ Θῦ γίνεται Λόγος, ὡς ζωή νεκροῖς' 'τοῦ δυσιν εσαιλόγος ς ξωην ἐκρ'
-4.761904761904763
'ουτων πα δι ων δεξησιου'
'ουτων παιδιων δεξηται' 'ουτων πα διων δεξησιου'
-4.347826086956523
'ματιωμοι ωσ α ταιαὐτόντ ει ο κρα τοςμον τρος εεφη'
'ματι ὁμοιώσατε αὐτόν; Τὸ κράτος μου πρὸς σὲ φυ' 'ματιωμοι ωσα ταιαὐτόντ ειο κρα τοςμον τρος εεφη'
-3.846153846153843
'με του σιαθεοφιλους τορψης'
'μετουσία θεοφιλοῦς ἑορτῆς.' 'μετου σιαθεοφιλους τορψης'
-3.571428571428573
'δε τα της αλιλαιας, καὶ οὐκ'
'διὰ τῆς Γαλιλαίας, καὶ οὐκ ἤ' 'δετατης αλιλαιας, καὶ οὐκ'


In [28]:
test_df["SYSTEM_TRANSCRIPTION_raw"] = test_df.SYSTEM_TRANSCRIPTION
test_df["SYSTEM_TRANSCRIPTION"] = test_df.SYSTEM_TRANSCRIPTION.apply(lambda x: lmr(x, lm=language_model))

In [29]:
mt_orig_test = test_df.SYSTEM_TRANSCRIPTION.values
mt_sequences_test = test_df.SYSTEM_TRANSCRIPTION.apply(lambda x: word_regex.split(x)).values
mt_spaces_test = test_df.SYSTEM_TRANSCRIPTION.apply(extract_spaces_dict).values
corrected_texts_test = []
for i, (mt_words, line_spaces, mt_orig) in enumerate(tqdm(zip(mt_sequences_test, mt_spaces_test, mt_orig_test), total=len(mt_sequences_test))):
    # print(i)
    # print(mt_words)
    # mt_orig = " ".join(mt_words)
    # temporary disable the following lines
    best = mt_orig
    # replacements = ["".join([w+s for w, s in zip(mt_split, spaces_after)]) for mt_split, refs, spaces_after in fixer.split_words(mt_words, mt_spaces)]
    replacements = [
        [join_if_tuple(w) + s for w, s in zip(mt_split, postprocess_spaces(spaces_after))]
        for mt_split, refs, spaces_after in fixer.split_words(mt_words, line_spaces, cutoff=0)
    ]
    replacements = ["".join(words) for words in replacements]
    best = lm_score(mt_orig, replacements, lm=language_model)
    corrected_texts_test.append(best)

  0%|          | 0/338 [00:00<?, ?it/s]

In [30]:
corrected_texts_test1 = [lmr(t, lm=language_model) for t in corrected_texts_test]
print(np.sum([t1!= t for t1, t in zip(corrected_texts_test1, corrected_texts_test)]))
# corrected_texts_test2 = [
#     lmr(t, word=" δ ", replacements=["δ ", " δ", " "], lm=language_model)
#     for t in corrected_texts_test1
# ]
corrected_texts_test2 = [fix_accent_diphthong(t) for t in corrected_texts_test1]
print(np.sum([t1 != t for t1, t in zip(corrected_texts_test2, corrected_texts_test)]))
print(np.sum([t1 != t for t1, t in zip(corrected_texts_test2, mt_orig_test)]))

0
2
49


In [31]:
submission = pd.DataFrame(
    zip(test_df.IMAGE_PATH, corrected_texts_test2),
    columns=["ImageID", "Transcriptions"]
)
submission.sample()

Unnamed: 0,ImageID,Transcriptions
110,105 Bodleian-Library-MS-Barocci-59_00085_fol-4...,τὲ πρμον ην τὴν αν θησιν τὴν σρ


In [32]:
submission.to_csv("submission.csv", index=False)

In [33]:
%aicrowd submission create -c htrec-2022 -f submission.csv

{'submission_id': 191913, 'created_at': '2022-07-01T21:29:57.726Z'}


In [None]:
#test_spaces = test_df.SYSTEM_TRANSCRIPTION.apply(lambda x: [w for w in word_regex2.split(x) if word_regex.match(w)]).values
#train_spaces1 = train_df.SYSTEM_TRANSCRIPTION.apply(lambda x: [w for w in word_regex2.split(x) if word_regex.match(w)]).values
#train_spaces2 = train_df.HUMAN_TRANSCRIPTION.apply(lambda x: [w for w in word_regex2.split(x) if word_regex.match(w)]).values

In [None]:
#from collections import Counter
#Counter([x for xs in test_spaces for xss in xs for x in xss])

In [None]:
#Counter([x for xs in train_spaces1 for xss in xs for x in xss])

In [None]:
# Counter([x for xs in train_spaces2 for xss in xs for x in xss])

In [152]:
# n_corrected = 0
# for corrected, mt_orig, ht_orig in zip(
#         corrected_texts, mt_texts_train, ht_sequences_train):
#     if mt_orig==corrected:
#         print(repr(mt_orig))
#         print( repr(corrected))
#         print(ht_orig)
#         print("---")
#         n_corrected+=1

In [151]:
# def has_accents(text):
#     # print(text)
#     accents = [unicodedata.name(x).find(" WITH ") > 0 for word in text for x in word]
#     return np.any(accents)
# def extract_short_words(text, n=3):
#     # print(text)
#     return [word for word in text if len(word) == n]
# info = []
# for mt_text, ht_orig in zip(mt_texts_train, ht_sequences_train):
#     mt_orig = word_regex.split(mt_text)
#     info.append({
#         "mt_accent": has_accents(mt_orig),
#         "ht_accent": has_accents(ht_orig),
#         "mt_short3": extract_short_words(mt_orig, 3),
#         "ht_short3": extract_short_words(ht_orig, 3)
#     })
#     # print(info)
#     #break
# df = pd.DataFrame(info)

In [104]:
# df[["mt_accent", "ht_accent"]].corr()

In [103]:
# train_df['accent'] = df.ht_accent

In [102]:
# train_df.groupby("CENTURY").mean()["accent"]

In [149]:

# from common import remove_caps
# df.groupby("ht_accent").mean()["mt_accent"]
# ht_words = [x for xs in df[df["ht_accent"]].ht_short3.values for x in xs 
#  if remove_cap(x) == remove_cap("μου")
# ]

In [150]:
# from collections import Counter
# sorted(Counter(ht_words).items(), key=lambda x: -x[1])

In [148]:
# Counter([x for xs in df.mt_short3.values for x in xs if remove_cap(x) == remove_cap("μου")])