In [15]:
# Enable IPython's autoreload extension in mode 2, so edited project modules
# are re-imported automatically before each cell runs.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
import random
import re
import sys
import unicodedata
from pathlib import Path

import numpy as np
import pandas as pd

In [17]:
# Notebook-wide configuration.
SEED = 42            # one seed shared by every random number generator below
DATADIR = "../data"  # directory the challenge data is downloaded into
MAX_WORDS = 4        # handed to SpaceFixer(...) further down

# Best-effort reproducibility: seed the stdlib generator and NumPy's legacy
# global generator with the same value.
random.seed(SEED)
np.random.seed(SEED)

In [18]:
# One-time setup if the extension is missing: !pip install aicrowd-cli
# Loads the extension that provides the %aicrowd magic used in later cells.
%load_ext aicrowd.magic

The aicrowd.magic extension is already loaded. To reload it, use:
  %reload_ext aicrowd.magic


In [19]:
# Log in to AIcrowd; the stored output of this cell reports that the API key
# was accepted and the details were saved.
%aicrowd login

[32mAPI Key valid[0m
[33mGitlab oauth token invalid or absent.
It is highly recommended to simply run `aicrowd login` without passing the API Key.[0m
[32mSaved details successfully![0m


In [20]:
import re
if Path(DATADIR).exists():
  !rm -rf $DATADIR
!mkdir $DATADIR
%aicrowd ds dl -c htrec-2022 -o $DATADIR

train.csv:   0%|          | 0.00/395k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/45.5k [00:00<?, ?B/s]

In [21]:
import pywer

# Read the train/test tables downloaded into DATADIR.
train_df = pd.read_csv(f"{DATADIR}/train.csv")
test_df = pd.read_csv(f"{DATADIR}/test.csv")

# Raw strings are required here: "\W" inside a plain string literal is an
# invalid escape sequence (DeprecationWarning since Python 3.6, SyntaxWarning
# from Python 3.12). The compiled patterns are unchanged.
word_regex = re.compile(r"\W+")     # one run of non-word characters
word_regex2 = re.compile(r"(\W+)")  # same run, captured so re.split keeps it

train_df.head()

Unnamed: 0,HUMAN_TRANSCRIPTION,SYSTEM_TRANSCRIPTION,CENTURY,IMAGE_PATH,TEXT_LINE_NUM
0,ἐγγινομένα πάθη μὴ σβεννύντες ἀλλὰ τῆ εκλύσει,ἐγγενομεναπαδημησμεννωτες ἀλλατῆε κλησει,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,1
1,τοῦ βίου τοῦ καθ ΄ εαυτοὺς πολλὰ γίνεσθαι συγχ...,του β ου του καλεαυτοὺς πολλαγινεσθαι συγχωρ όν,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,2
2,τες ἐμπυρίζουσι τὸν ἀμπελῶνα ἀλλὰ καὶ ὁ διὰ,τες εμπυριζου σιμαμπελῶνα ἀλλακαι ὅδξα,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,3
3,τῆς ἡδεῖας πλεονεξίας πολλοὺς εἰς τὴν τῶν ἀλλ,της ἐδίας πλσον ἐξιας πολλους ἐις τὴν τῶν ἀλ,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,4
4,οτρίων ἐπιθυμίαν προκαλούμενος ἐμπυρί,λοτρλων ἐπιθυμιαν προκαλουμένος ἐμπυρι,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,5


In [22]:
# !pip install nltk

In [23]:
# Put the parent directory and ../src on the import path (once each) so the
# project modules imported below can be found.
for lib_dir in ("..", "../src"):
    if lib_dir in sys.path:
        continue
    sys.path.append(lib_dir)
from lm_utils import *

In [24]:
# Build the language model from all human transcriptions. Series.sum() on a
# string column concatenates the lines back to back, with no separator added.
language_model = make_lm(train_df["HUMAN_TRANSCRIPTION"].sum())

In [25]:
from tqdm.auto import tqdm
from datastruct import *
from common import *
from space_fixer import SpaceFixer

# for ht_line, mt_line in tqdm(train_df[["HUMAN_TRANSCRIPTION", "SYSTEM_TRANSCRIPTION"]].values[7:]):
#     ht_words = word_regex.split(ht_line)
#     mt_words = word_regex.split(mt_line)
#     #words_ = [remove_cap(word) for word in words]
#     #vocab.add_sentence(words)
#     dmatrix = build_path_matrix(mt_words, vocabs)
#     finished_paths = extract_paths(dmatrix)
#     for k in resplit_paths(finished_paths, mt_words):
#         variant = " ".join(k)
#         print(variant)
#     break


In [26]:
def extract_spaces_dict(x):
    """Map 1-based word positions of ``x`` to the separator following each word.

    ``x`` is split with ``word_regex2``, which keeps the separators in the
    result. Every piece that ``word_regex`` does not match advances a word
    counter; every piece it does match is stored under the current counter
    value. If no separator was stored for the final counter value, that key
    is added with an empty string.

    Note: an empty piece (produced when ``x`` starts or ends with a
    separator) is not matched by ``word_regex`` and therefore advances the
    counter like a word.

    Returns:
        dict mapping int position -> separator string.
    """
    separators = {}
    words_seen = 0
    for token in word_regex2.split(x):
        if word_regex.match(token) is None:
            # Not a separator: count it as the next word.
            words_seen += 1
            continue
        separators[words_seen] = token
    # Make sure the last counted position always has an entry.
    separators.setdefault(words_seen, "")
    return separators
    

In [51]:

# Raw string: "\w" in a plain literal is an invalid escape sequence.
fullword_regex = re.compile(r"\w+")

# Keep the untouched system output exactly once. Without the guard, running
# this cell a second time would copy the already-normalised text into the
# *_raw column and normalise it again, silently changing the baseline that the
# metrics cell reads from SYSTEM_TRANSCRIPTION_raw.
if "SYSTEM_TRANSCRIPTION_raw" not in train_df.columns:
    train_df["SYSTEM_TRANSCRIPTION_raw"] = train_df["SYSTEM_TRANSCRIPTION"]
# Always derive the working column from the raw one, so re-runs are idempotent.
train_df["SYSTEM_TRANSCRIPTION"] = train_df["SYSTEM_TRANSCRIPTION_raw"].apply(
    lambda x: lmr(x, lm=language_model)
)

# Human reference lines split on non-word runs.
ht_sequences_train = train_df.HUMAN_TRANSCRIPTION.apply(
    lambda x: word_regex.split(x)
).values
# System lines as lists of word tokens, plus the separators following them.
mt_sequences_train = train_df.SYSTEM_TRANSCRIPTION.apply(
    lambda x: fullword_regex.findall(x)
).values
mt_spaces_train = train_df.SYSTEM_TRANSCRIPTION.apply(extract_spaces_dict).values

mt_texts_train = train_df.SYSTEM_TRANSCRIPTION.values
# The fixer is filled from the human word sequences only.
fixer = SpaceFixer(MAX_WORDS)
fixer.fill(ht_sequences_train)

  0%|          | 0/1875 [00:00<?, ?it/s]

In [62]:
# Offset into the training arrays; raise it to process only a suffix while
# debugging (0 = every line).
K = 0
corrected_texts = []
train_iter = zip(
    mt_sequences_train[K:],
    mt_spaces_train[K:],
    mt_texts_train[K:],
    ht_sequences_train[K:],
)
for i, (mt_words, line_spaces, mt_orig, ht_orig) in enumerate(
    tqdm(train_iter, total=len(mt_sequences_train[K:]))
):
    # Fallback value, used as-is when the re-splitting below is commented out.
    best = mt_orig
    # Turn every split proposed by the fixer back into text by gluing each
    # word to the separator that follows it.
    replacements = [
        "".join(word + sep for word, sep in zip(candidate, seps))
        for candidate, _refs, seps in fixer.resplit(mt_words, line_spaces, cutoff=1)
    ]
    # lm_score returns the text to keep for this line (presumably the variant
    # the language model rates best; see lm_utils).
    best = lm_score(mt_orig, replacements, lm=language_model)
    corrected_texts.append(best)

  0%|          | 0/1875 [00:00<?, ?it/s]

In [70]:
# Scratch inspection cell. It reads mt_orig, ht_orig, mt_words and line_spaces
# as left behind by the most recently executed correction loop.
# Only the final expression (corrected_texts[:2]) is displayed; the tuple on
# the next line is evaluated and then discarded.
unicodedata.name("σ"), unicodedata.name("ς")
# fixer.dmatrix
# corrected_texts, mt_words, line_spaces, mt_orig, ht_orig
#for mt_split, refs, spaces_after in fixer.resplit(mt_words, line_spaces, cutoff=1):
#    print(mt_split, spaces_after)
# corrected_texts[0].split()
print(mt_orig, ht_orig)
print(mt_words)
# NOTE(review): the loop prints mt_words (the unsplit input) on every
# iteration rather than mt_split — confirm this is intended.
for mt_split, refs, spaces_after in fixer.resplit(mt_words, line_spaces, cutoff=3):
    print(mt_words, refs, spaces_after)
# wr = re.compile("\w+")
# re.compile("\W+")
# wr.findall(train_df.SYSTEM_TRANSCRIPTION.values[305])
corrected_texts[:2]

δομενους της καταρας ελευθερωσηψβοα ['δομενους', 'της', 'καταρας', 'ελευθερωση', 'και', 'βοα']
['δομενους', 'της', 'καταρας', 'ελευθερωσηψβοα']
['δομενους', 'της', 'καταρας', 'ελευθερωσηψβοα'] [('δομενους',), ('της', 'τῆς'), ('καταρας',), ('ελευθερωσηψβοα',)] [' ', ' ', ' ', '']


['ἐγγενομεναπαδημησμεννωτες ἀλλα τῆ εκλησει',
 'του β ου του καλ εαυτοὺς πολλα γινεσθαι συγχωρ όν']

In [71]:
# for mt_split, refs, spaces_after in fixer.resplit(mt_words, line_spaces):
#     print(mt_split, spaces_after, line_spaces)

# ht_orig, mt_orig
def postprocess_sigmas(sentence):
    """Normalise sigma forms inside every word of ``sentence``.

    The sentence is split with ``word_regex2`` (separators are kept and
    returned unchanged). Inside each word:

    * a final-form sigma that is *not* the last character is replaced by the
      character whose Unicode name lacks ``"FINAL "``;
    * a non-final sigma that *is* the last character is replaced by the
      character whose Unicode name has ``"FINAL SIGMA"`` instead of
      ``"SIGMA"``.

    Characters for which no such counterpart exists (for example the capital
    sigma, which has no final form) are left as they are.

    Args:
        sentence: text to normalise.

    Returns:
        The text with sigma forms corrected; separators are untouched.
    """

    def rename_char(name, old, new, fallback):
        # Look up the character whose name is `name` with `old` replaced by
        # `new`; unicodedata.lookup raises KeyError when it does not exist.
        try:
            return unicodedata.lookup(name.replace(old, new))
        except KeyError:
            return fallback

    def correct_word(word):
        # Separators and the empty pieces produced by the split pass through.
        if not word or word_regex.match(word):
            return word

        fixed = []
        for char in word[:-1]:
            # Default "" keeps unnamed code points from raising ValueError.
            name = unicodedata.name(char, "")
            if "FINAL SIGMA" in name:
                char = rename_char(name, "FINAL ", "", char)
            fixed.append(char)

        last = word[-1]
        last_name = unicodedata.name(last, "")
        if "SIGMA" in last_name and "FINAL SIGMA" not in last_name:
            last = rename_char(last_name, "SIGMA", "FINAL SIGMA", last)
        # Append the converted character; the previous version appended the
        # original one, so a word-final sigma was never corrected.
        fixed.append(last)
        return "".join(fixed)

    return "".join(correct_word(piece) for piece in word_regex2.split(sentence))
    

# [postprocess_sigmas(t) for t in corrected_texts]

In [72]:
# print(ht_sequences_train[-1])
# Scratch check on the line currently held in mt_words / line_spaces (left
# over from an earlier loop): print each result yielded by fixer.resplit and
# the text obtained by joining every word with the separator that follows it.
print("mt:", mt_words)
for k, v, spaces_after in fixer.resplit(mt_words, line_spaces):
    print(mt_words, line_spaces)
    print(k, v, spaces_after)
    # repr() makes leading/trailing whitespace in the rebuilt line visible.
    print(repr("".join([w+s for w, s in zip(k, spaces_after)])))

mt: ['δομενους', 'της', 'καταρας', 'ελευθερωσηψβοα']
['δομενους', 'της', 'καταρας', 'ελευθερωσηψβοα'] {1: ' ', 2: ' ', 3: ' ', 4: ''}
('δομενους', 'της', 'καταρας', 'ελευθερωσηψβοα') [('δομενους',), ('της', 'τῆς'), ('καταρας',), ('ελευθερωσηψβοα',)] [' ', ' ', ' ', '']
'δομενους της καταρας ελευθερωσηψβοα'


In [73]:
# fixer.dmatrix
# Number of lines whose corrected text differs from the system transcription
# column the correction started from.
np.sum([
    corrected != system
    for corrected, system in zip(corrected_texts, train_df["SYSTEM_TRANSCRIPTION"].values)
])

1069

In [74]:
from metrics import compute_metrics

In [75]:
# References and the untouched system output used as the baseline.
ht_texts = train_df["HUMAN_TRANSCRIPTION"].values
mt_texts = train_df["SYSTEM_TRANSCRIPTION_raw"].values

# Post-processing variants built with lmr on top of the corrected lines.
ct = [lmr(text, lm=language_model) for text in corrected_texts]
# ct2 = [postprocess_sigmas(t) for t in ct]
ct3 = [
    lmr(text, word=" δ ", replacements=["δ ", " δ", " "], lm=language_model)
    for text in ct
]
# Only referenced by the commented-out "Sigma orig" evaluation below.
ct_ = [
    lmr(text, word=" ς ", replacements=["ς ", " "], lm=language_model)
    for text in corrected_texts
]

# Each call prints its own scores; the returned pair is kept per variant.
print("Unmodified:")
cerr_values_ht, cerr_values = compute_metrics(ht_texts, mt_texts, corrected_texts)
print("Sigma:")
cerr_values_ht, cerr_values = compute_metrics(ht_texts, mt_texts, ct)
# print("Sigma orig:")
# cerr_values_ht2, cerr_values2 = compute_metrics(ht_texts, mt_texts, ct_)
print("sigma+delta, no postprocess sigma:")
cerr_values_ht3, cerr_values3 = compute_metrics(ht_texts, mt_texts, ct3)


Unmodified:
Candidate CER: 32.34161879035121
Candidate CERR: 1.9029783584867797
Sigma:
Candidate CER: 32.32737233172925
Candidate CERR: 1.9172248171087445
sigma+delta, no postprocess sigma:
Candidate CER: 32.32353275846639
Candidate CERR: 1.9210643903716018


In [76]:
# Element-wise difference of the two arrays returned by compute_metrics for
# ct3; argsort lists the most negative differences first.
diff_cer = cerr_values_ht3 - cerr_values3
idx = np.argsort(diff_cer)
# Show the five lines with the smallest difference: index and value, then the
# candidate text, then the human and raw system texts.
for i in idx[:5]:
    print(i, diff_cer[i])
    print(repr(ct3[i]))
    print(repr(ht_texts[i]), repr(mt_texts[i]))

538 -10.344827586206904
'α με έ ρ ρό τοτον ὸ γαρ ὑπεεινόν'
'ἀμέρσας βίον. τὸ γὰρ ὑπέγγυον' 'αμεέρ ρότοτον ὸ γαρ ὑπεεινόν'
437 -10.34482758620689
'εγω δε τοι τὸν πλοικαμόν  ἀν αδή τ ς'
'ἐγὼ δὲ τον πλόκαμον ἀναδέτοις' 'εγω δετοι τὸν πλοικαμόν ἀναδήτ ς'
315 -8.823529411764703
'μομος κόμῶταρτου τες η τό μετα ς·'
'Νόμος νόμῳ γὰρ τοὺς θεοὺς ἡγούμεθα' 'μομος κόμῶταρ του τες η τόμετας·'
425 -8.695652173913047
'πυριῶν καταρ δ᾽ ἀν τα λ καπν'
'πύργων, κατὰ δ᾽ αἰθάλου' 'πυριῶν καταρ δ᾽ ἀνταλ καπν'
330 -8.333333333333336
'μολθουμθν ὼρχλὶ παὐ τας κ μαστενό μη'
'μοχθοῦμεν ὡς χρὴ πάντα καὶ ματεύομεν' 'μολθουμθν ὼρ χλὶ παὐτας κ μαστενόμη'


In [77]:
# Keep the untouched system output exactly once. Without the guard, a re-run
# of this cell would overwrite the *_raw column with already-normalised text
# and then normalise it a second time.
if "SYSTEM_TRANSCRIPTION_raw" not in test_df.columns:
    test_df["SYSTEM_TRANSCRIPTION_raw"] = test_df["SYSTEM_TRANSCRIPTION"]
# Always derive the working column from the raw one, so re-runs are idempotent.
test_df["SYSTEM_TRANSCRIPTION"] = test_df["SYSTEM_TRANSCRIPTION_raw"].apply(
    lambda x: lmr(x, lm=language_model)
)

In [78]:
mt_orig_test = test_df["SYSTEM_TRANSCRIPTION"].values
# NOTE(review): the training cell tokenises system lines with
# fullword_regex.findall, whereas this cell splits the stripped line with
# word_regex — confirm the difference is intended.
mt_sequences_test = test_df["SYSTEM_TRANSCRIPTION"].apply(
    lambda line: word_regex.split(line.strip())
).values
mt_spaces_test = test_df["SYSTEM_TRANSCRIPTION"].apply(extract_spaces_dict).values

corrected_texts_test = []
for i, (mt_words, mt_spaces, mt_orig) in enumerate(
    tqdm(
        zip(mt_sequences_test, mt_spaces_test, mt_orig_test),
        total=len(mt_sequences_test),
    )
):
    # Fallback value, used as-is when the re-splitting below is commented out.
    best = mt_orig
    # Rebuild each proposed split as text: every word followed by its separator.
    replacements = [
        "".join(word + sep for word, sep in zip(candidate, seps))
        for candidate, _refs, seps in fixer.resplit(mt_words, mt_spaces, cutoff=1)
    ]
    best = lm_score(mt_orig, replacements, lm=language_model)
    corrected_texts_test.append(best)

  0%|          | 0/338 [00:00<?, ?it/s]

In [79]:
from space_fixer import extract_paths, resplit_paths
# paths = extract_paths(fixer.dmatrix)

# # resplit_dict, spaces_dict = resplit_paths(paths, mt_words, mt_spaces) 
# #for path in paths:
# #    for a, b, c, segments in path:
# #        print(segments)
# #    print("---")
# print(mt_words)
# paths[1]
# mt_orig

In [80]:
# Same two lmr passes that the training evaluation applies to build ct and ct3.
corrected_texts_test1 = [
    lmr(text, lm=language_model) for text in corrected_texts_test
]
# ct = [lmr(t, lm=language_model) for t in corrected_texts]
# ct2 = [postprocess_sigmas(t) for t in ct]
corrected_texts_test2 = [
    lmr(text, word=" δ ", replacements=["δ ", " δ", " "], lm=language_model)
    for text in corrected_texts_test1
]

# corrected_texts_test2 = [postprocess_sigmas(t) for t in corrected_texts_test1]

In [81]:
# One row per test line: the image path as ImageID next to the final text.
submission = pd.DataFrame(
    zip(test_df["IMAGE_PATH"], corrected_texts_test2),
    columns=["ImageID", "Transcriptions"],
)
# Display one randomly chosen row as a quick sanity check.
submission.sample()

Unnamed: 0,ImageID,Transcriptions
110,105 Bodleian-Library-MS-Barocci-59_00085_fol-4...,τὲ πρμον ην τὴν αν θησιν τὴν σρ


In [82]:
# Write the submission to submission.csv in the working directory;
# index=False leaves the DataFrame index out of the file.
submission.to_csv("submission.csv", index=False)

In [83]:
# Submit submission.csv to the htrec-2022 challenge through the %aicrowd magic.
%aicrowd submission create -c htrec-2022 -f submission.csv

{'submission_id': 191686, 'created_at': '2022-06-30T21:39:03.683Z'}


In [None]:
#test_spaces = test_df.SYSTEM_TRANSCRIPTION.apply(lambda x: [w for w in word_regex2.split(x) if word_regex.match(w)]).values
#train_spaces1 = train_df.SYSTEM_TRANSCRIPTION.apply(lambda x: [w for w in word_regex2.split(x) if word_regex.match(w)]).values
#train_spaces2 = train_df.HUMAN_TRANSCRIPTION.apply(lambda x: [w for w in word_regex2.split(x) if word_regex.match(w)]).values

In [None]:
#from collections import Counter
#Counter([x for xs in test_spaces for xss in xs for x in xss])

In [None]:
#Counter([x for xs in train_spaces1 for xss in xs for x in xss])

In [None]:
# Counter([x for xs in train_spaces2 for xss in xs for x in xss])