In [12]:
# Enable IPython's autoreload extension; mode 2 reloads all imported modules
# before each cell executes, so edits to imported .py files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:
import random
import pandas as pd
import numpy as np
from pathlib import Path

In [30]:
# Notebook-wide settings.
SEED = 42            # single seed shared by every generator seeded below
DATADIR = "../data"  # directory the challenge CSV files are downloaded into
MAX_WORDS = 4        # handed to SpaceFixer when it is constructed

# Best-effort reproducibility: seed Python's and NumPy's global generators.
random.seed(SEED)
np.random.seed(SEED)

In [15]:
# The aicrowd.magic extension ships with the aicrowd-cli package; uncomment
# the install line below if the package is missing from the environment.
# !pip install aicrowd-cli
# Loading the extension makes the `%aicrowd` line magic available to the
# login, dataset-download and submission cells further down.
%load_ext aicrowd.magic

The aicrowd.magic extension is already loaded. To reload it, use:
  %reload_ext aicrowd.magic


In [16]:
# Authenticate the AIcrowd CLI for this session.
# NOTE(review): presumably required before the `%aicrowd ds dl` and
# `%aicrowd submission create` cells can succeed — confirm.
%aicrowd login

[32mAPI Key valid[0m
[33mGitlab oauth token invalid or absent.
It is highly recommended to simply run `aicrowd login` without passing the API Key.[0m
[32mSaved details successfully![0m


In [17]:
import re
if Path(DATADIR).exists():
  !rm -rf $DATADIR
!mkdir $DATADIR
%aicrowd ds dl -c htrec-2022 -o $DATADIR

train.csv:   0%|          | 0.00/395k [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/45.5k [00:00<?, ?B/s]

In [18]:
import pywer  # NOTE(review): not referenced in the visible cells — confirm it is still needed

# Load the challenge splits that were downloaded into DATADIR.
train_df = pd.read_csv(f"{DATADIR}/train.csv")
test_df = pd.read_csv(f"{DATADIR}/test.csv")

# Raw string literals are required here: "\W" inside a normal string is an
# invalid escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
# Both patterns match runs of non-word characters.
word_regex = re.compile(r"\W+")     # split() discards the separators
word_regex2 = re.compile(r"(\W+)")  # capture group: split() keeps the separators

train_df.head()

Unnamed: 0,HUMAN_TRANSCRIPTION,SYSTEM_TRANSCRIPTION,CENTURY,IMAGE_PATH,TEXT_LINE_NUM
0,ἐγγινομένα πάθη μὴ σβεννύντες ἀλλὰ τῆ εκλύσει,ἐγγενομεναπαδημησμεννωτες ἀλλατῆε κλησει,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,1
1,τοῦ βίου τοῦ καθ ΄ εαυτοὺς πολλὰ γίνεσθαι συγχ...,του β ου του καλεαυτοὺς πολλαγινεσθαι συγχωρ όν,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,2
2,τες ἐμπυρίζουσι τὸν ἀμπελῶνα ἀλλὰ καὶ ὁ διὰ,τες εμπυριζου σιμαμπελῶνα ἀλλακαι ὅδξα,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,3
3,τῆς ἡδεῖας πλεονεξίας πολλοὺς εἰς τὴν τῶν ἀλλ,της ἐδίας πλσον ἐξιας πολλους ἐις τὴν τῶν ἀλ,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,4
4,οτρίων ἐπιθυμίαν προκαλούμενος ἐμπυρί,λοτρλων ἐπιθυμιαν προκαλουμένος ἐμπυρι,11,1 Bodleian-Library-MS-Barocci-102_00157_fol-75...,5


In [19]:
# !pip install nltk

In [20]:
# `sys` is not imported anywhere else in the notebook, so import it here;
# without it this cell raises NameError on a fresh kernel.
import sys

# Add the parent directory and its src/ folder to the import path (once each,
# so re-running the cell does not append duplicates).
for lib_dir in ("..", "../src"):
    if lib_dir not in sys.path:
        sys.path.append(lib_dir)

# NOTE(review): star import — `make_lm` and `lm_score` used in later cells
# presumably come from this module; confirm and consider importing them by name.
from lm_utils import *

In [21]:
# Build the language model from all human transcriptions joined into one string.
# NOTE(review): Series.sum() concatenates the lines with no separator, so the
# last token of one line is fused with the first token of the next — confirm
# this is what make_lm expects.
language_model = make_lm(train_df["HUMAN_TRANSCRIPTION"].sum())

In [27]:
from tqdm.auto import tqdm
from datastruct import *
from common import *
from space_fixer import SpaceFixer

# Debug probe: run the path-matrix re-splitting on a single training line and
# print every candidate it produces. The slice starts at row 7 and the
# unconditional `break` at the bottom stops after the first iteration, so only
# that one row is ever processed.
# NOTE(review): `vocabs`, `build_path_matrix`, `extract_paths` and
# `resplit_paths` are not defined in the visible cells; presumably they come
# from the star imports at the top of this cell — confirm, otherwise this cell
# fails on a fresh kernel.
for ht_line, mt_line in tqdm(train_df[["HUMAN_TRANSCRIPTION", "SYSTEM_TRANSCRIPTION"]].values[7:]):
    # Tokenise both versions of the line; `ht_words` is not used further down.
    ht_words = word_regex.split(ht_line)
    mt_words = word_regex.split(mt_line)
    # Leftover experiment (disabled):
    #words_ = [remove_cap(word) for word in words]
    #vocab.add_sentence(words)
    dmatrix = build_path_matrix(mt_words, vocabs)
    finished_paths = extract_paths(dmatrix)
    # Each item yielded by resplit_paths is joined with spaces and printed.
    for k in resplit_paths(finished_paths, mt_words):
        variant = " ".join(k)
        print(variant)
    # Only the first row of the slice is inspected.
    break


  0%|          | 0/1868 [00:00<?, ?it/s]

In [40]:

# Tokenise every line of both transcriptions by splitting on runs of
# non-word characters; each array element is one line's list of tokens.
ht_sequences = train_df["HUMAN_TRANSCRIPTION"].apply(word_regex.split).values
mt_sequences = train_df["SYSTEM_TRANSCRIPTION"].apply(word_regex.split).values

# Populate the fixer from the human-transcribed token sequences only.
fixer = SpaceFixer(MAX_WORDS)
fixer.fill(ht_sequences)

  0%|          | 0/1875 [00:00<?, ?it/s]

In [60]:
# For every system-transcribed training line, generate the re-split candidates
# and keep whatever lm_score returns for (original line, candidates).
corrected_texts = []
# The loop variable keeps the name `mt_words` because the inspection cell that
# calls `fixer.resplit(mt_words)` reads it after this loop has run.
for mt_words in tqdm(mt_sequences):
    mt_orig = " ".join(mt_words)
    # fixer.resplit yields (split, refs) pairs; only the split is needed here.
    replacements = [" ".join(mt_split) for mt_split, _refs in fixer.resplit(mt_words)]
    best = lm_score(mt_orig, replacements, lm=language_model)
    corrected_texts.append(best)

  0%|          | 0/1875 [00:00<?, ?it/s]

In [58]:
# Inspect what the fixer proposes for `mt_words`, i.e. whichever line the most
# recently executed loop left in that variable.
for mt_split, refs in fixer.resplit(mt_words):
    print(mt_split, refs)

('λοτρλων', 'ἐπιθυμιαν', 'προκαλουμένος', 'ἐμπυρι') [('λοτρλων',), ('επιθυμιαν', 'ἐπιθυμίαν'), ('προκαλούμενος',), ('ἐμπυρί',)]


In [61]:
from metrics import compute_metrics

In [65]:
# Evaluate on the training split: compute_metrics receives the human reference
# lines, the raw system lines and the corrected lines, in that order.
mt_texts = train_df["SYSTEM_TRANSCRIPTION"].values
ht_texts = train_df["HUMAN_TRANSCRIPTION"].values
compute_metrics(ht_texts, mt_texts, corrected_texts)

Candidate CER: 31.750845474841352
Candidate CERR: -2.5042295024512384


In [66]:
# Tokenise the system transcriptions of the test split the same way as the
# training data (split on runs of non-word characters).
mt_sequences_test = test_df["SYSTEM_TRANSCRIPTION"].apply(word_regex.split).values

# For every test line, generate the re-split candidates and keep whatever
# lm_score returns for (original line, candidates).
corrected_texts_test = []
for mt_words in tqdm(mt_sequences_test):
    mt_orig = " ".join(mt_words)
    # fixer.resplit yields (split, refs) pairs; only the split is needed here.
    replacements = [" ".join(mt_split) for mt_split, _refs in fixer.resplit(mt_words)]
    best = lm_score(mt_orig, replacements, lm=language_model)
    corrected_texts_test.append(best)

  0%|          | 0/338 [00:00<?, ?it/s]

In [67]:
# Pair each test image path with its corrected line, in test_df row order.
submission = pd.DataFrame(
    zip(test_df["IMAGE_PATH"], corrected_texts_test),
    columns=["ImageID", "Transcriptions"],
)
# Display one randomly chosen row as a quick sanity check.
submission.sample()

Unnamed: 0,ImageID,Transcriptions
110,105 Bodleian-Library-MS-Barocci-59_00085_fol-4...,τὲ πρμον ην τὴν αν θησιν τὴν σρ


In [68]:
# Write the submission table to submission.csv in the working directory,
# leaving out the DataFrame index column.
submission.to_csv("submission.csv", index=False)

In [69]:
# Submit the submission.csv file written above to the htrec-2022 challenge
# through the AIcrowd CLI magic.
%aicrowd submission create -c htrec-2022 -f submission.csv

{'submission_id': 189577, 'created_at': '2022-06-19T12:54:46.876Z'}
