In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import random
import pandas as pd
import numpy as np
from pathlib import Path

In [3]:
# fix seeds, make everything reproducible, etc (at least try to).
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

DATADIR = "../data"

MAX_WORDS = 4

In [4]:
# !pip install aicrowd-cli
%load_ext aicrowd.magic

In [5]:
%aicrowd login

[32mAPI Key valid[0m
[33mGitlab oauth token invalid or absent.
It is highly recommended to simply run `aicrowd login` without passing the API Key.[0m
[32mSaved details successfully![0m


In [6]:
import re
if Path(DATADIR).exists():
  !rm -rf $DATADIR
!mkdir $DATADIR
%aicrowd ds dl -c htrec-2022 -o $DATADIR

train.csv:   0%|          | 0.00/395k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/45.5k [00:00<?, ?B/s]

In [7]:
import pywer
train_df = pd.read_csv( f"{DATADIR}/train.csv")
test_df = pd.read_csv(f"{DATADIR}/test.csv")

word_regex = re.compile("\W+")
word_regex2 = re.compile("(\W+)")

train_df.head()

Unnamed: 0,HUMAN_TRANSCRIPTION,SYSTEM_TRANSCRIPTION,CENTURY,IMAGE_PATH,TEXT_LINE_NUM
0,ἐγγινομένα πάθη μὴ σβεννύντες ἀλλὰ τῆ εκλύσει,ἐγγενομεναπαδημησμεννωτες ἀλλατῆε κλησει,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,1
1,τοῦ βίου τοῦ καθ ΄ εαυτοὺς πολλὰ γίνεσθαι συγχ...,του β ου του καλεαυτοὺς πολλαγινεσθαι συγχωρ όν,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,2
2,τες ἐμπυρίζουσι τὸν ἀμπελῶνα ἀλλὰ καὶ ὁ διὰ,τες εμπυριζου σιμαμπελῶνα ἀλλακαι ὅδξα,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,3
3,τῆς ἡδεῖας πλεονεξίας πολλοὺς εἰς τὴν τῶν ἀλλ,της ἐδίας πλσον ἐξιας πολλους ἐις τὴν τῶν ἀλ,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,4
4,οτρίων ἐπιθυμίαν προκαλούμενος ἐμπυρί,λοτρλων ἐπιθυμιαν προκαλουμένος ἐμπυρι,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,5


In [8]:
# !pip install nltk

In [9]:
for lib_dir in ["..", "../src"]:
    if not lib_dir in sys.path:
        sys.path.append(lib_dir)
from lm_utils import *

In [10]:
language_model = make_lm(train_df.HUMAN_TRANSCRIPTION.sum())

In [11]:
from tqdm.auto import tqdm
from datastruct import *
from common import *
from space_fixer import SpaceFixer

# for ht_line, mt_line in tqdm(train_df[["HUMAN_TRANSCRIPTION", "SYSTEM_TRANSCRIPTION"]].values[7:]):
#     ht_words = word_regex.split(ht_line)
#     mt_words = word_regex.split(mt_line)
#     #words_ = [remove_cap(word) for word in words]
#     #vocab.add_sentence(words)
#     dmatrix = build_path_matrix(mt_words, vocabs)
#     finished_paths = extract_paths(dmatrix)
#     for k in resplit_paths(finished_paths, mt_words):
#         variant = " ".join(k)
#         print(variant)
#     break


In [12]:
def extract_spaces_dict(x):
    result = dict()
    last_index = 0
    for w in word_regex2.split(x):
        if not word_regex.match(w):
            last_index += 1
        else:
            result[last_index] = w
    return result
    

In [24]:

ht_sequences_train = train_df.HUMAN_TRANSCRIPTION.apply(lambda x: word_regex.split(x)).values
mt_sequences_train = train_df.SYSTEM_TRANSCRIPTION.apply(lambda x: word_regex.split(x)).values
mt_spaces_train = train_df.SYSTEM_TRANSCRIPTION.apply(extract_spaces_dict).values

mt_texts_train = train_df.SYSTEM_TRANSCRIPTION.values
fixer = SpaceFixer(MAX_WORDS)
fixer.fill(ht_sequences_train)

  0%|          | 0/1875 [00:00<?, ?it/s]

In [25]:
corrected_texts = []
train_iter = zip(mt_sequences_train, mt_spaces_train, mt_texts_train, ht_sequences_train)
for i, (mt_words, line_spaces, mt_orig, ht_orig) in enumerate(tqdm(train_iter, total=len(mt_sequences_train))):
    # print(line_spaces)
    # print(mt_words)
    # mt_orig = " ".join(mt_words)
    replacements = [
        "".join([w + s for w, s in zip(mt_split, spaces_after)]) 
        for mt_split, refs, spaces_after in fixer.resplit(mt_words, line_spaces)
    ]
    best = lm_score(mt_orig, replacements, lm=language_model)
    corrected_texts.append(best)

  0%|          | 0/1875 [00:00<?, ?it/s]

In [26]:
# print(ht_sequences_train[-1])
print("mt:", mt_words)
for k, v, spaces_after in fixer.resplit(mt_words, line_spaces):
    print(k, v, spaces_after)

mt: ['δομενους', 'της', 'καταρας', 'ελευθερωσηψβοα']
('δομενους', 'της', 'καταρας', 'ελευθερωση', 'ψ', 'βοα') [('δομενους',), ('τῆς', 'της'), ('καταρας',), ('ελευθερωση',), ('και',), ('βοα',)] [' ', ' ', ' ', ' ']


In [27]:
from metrics import compute_metrics

In [28]:
ht_texts = train_df.HUMAN_TRANSCRIPTION.values
mt_texts = train_df.SYSTEM_TRANSCRIPTION.values
compute_metrics(ht_texts, mt_texts, corrected_texts)

Candidate CER: 42.89912163907974
Candidate CERR: -8.644046661787147


In [30]:
mt_orig_test = test_df.SYSTEM_TRANSCRIPTION.values
mt_sequences_test = test_df.SYSTEM_TRANSCRIPTION.apply(lambda x: word_regex.split(x)).values
mt_spaces_test = test_df.SYSTEM_TRANSCRIPTION.apply(extract_spaces_dict).values
corrected_texts_test = []
for i, (mt_words, mt_spaces, mt_orig) in enumerate(tqdm(zip(mt_sequences_test, mt_spaces_test, mt_orig_test), total=len(mt_sequences_test))):
    # print(i)
    # print(mt_words)
    # mt_orig = " ".join(mt_words)
    replacements = ["".join([w+s for w, s in zip(mt_split, spaces_after)]) for mt_split, refs, spaces_after in fixer.resplit(mt_words, mt_spaces)]
    best = lm_score(mt_orig, replacements, lm=language_model)
    corrected_texts_test.append(best)

  0%|          | 0/338 [00:00<?, ?it/s]

In [31]:
submission = pd.DataFrame(zip(test_df.IMAGE_PATH, corrected_texts_test), columns=["ImageID", "Transcriptions"])
submission.sample()

Unnamed: 0,ImageID,Transcriptions
251,12th Bodleian-Library-MS-Barocci-132_00631_fol...,"φυλάκην ον πλεμω πληροῦντες, τὴν"


In [32]:
submission.to_csv("submission.csv", index=False)

In [33]:
%aicrowd submission create -c htrec-2022 -f submission.csv

{'submission_id': 189598, 'created_at': '2022-06-19T16:10:50.026Z'}


In [34]:
#test_spaces = test_df.SYSTEM_TRANSCRIPTION.apply(lambda x: [w for w in word_regex2.split(x) if word_regex.match(w)]).values
#train_spaces1 = train_df.SYSTEM_TRANSCRIPTION.apply(lambda x: [w for w in word_regex2.split(x) if word_regex.match(w)]).values
#train_spaces2 = train_df.HUMAN_TRANSCRIPTION.apply(lambda x: [w for w in word_regex2.split(x) if word_regex.match(w)]).values

In [35]:
#from collections import Counter
#Counter([x for xs in test_spaces for xss in xs for x in xss])

In [36]:
#Counter([x for xs in train_spaces1 for xss in xs for x in xss])

In [37]:
# Counter([x for xs in train_spaces2 for xss in xs for x in xss])