In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [3]:
from pathlib import Path

data_base_dir = Path('/Users/janneke/Documents/Documents – Janneke’s MacBook/data/ocrpostcorrection')
data_base_dir.mkdir(exist_ok=True, parents=True)

In [4]:
test_data_dir = Path('/Users/janneke/Documents/Documents – Janneke’s MacBook/data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_evaluation_4M_without_Finnish')

In [5]:
from ocrpostcorrection.icdar_data import generate_data

data_test, md_test = generate_data(test_data_dir)

11it [00:07,  1.55it/s]


In [6]:
from ocrpostcorrection.utils import create_perfect_icdar_output

output = create_perfect_icdar_output(data_test)

In [7]:
# in: results json (detected ocr mistakes) + dict of Text objects
# out: input for task 2: df with columns: ocr, gs, start
# derived columns: len_ocr, language, subset, dataset, len_gs, diff

from typing import Any

from ocrpostcorrection.icdar_data import Text


def icdar_output2correction_dataset(output: dict[str, dict[str, dict]], data: dict[str, 'Text'], dataset: str = 'test') -> pd.DataFrame:
    """Turn detected OCR mistakes (ICDAR task 1 output) into a task 2 input table.

    Args:
        output: Maps a text key of the form 'language/subset/file' to a dict whose
            keys identify the detected mistakes as '<start>:<n>' strings. Only the
            start offset before the ':' is used.
        data: Maps the same text keys to objects exposing a `tokens` sequence; every
            token must provide the attributes `ocr`, `gs`, `start` and `len_ocr`.
        dataset: Label written to the 'dataset' column of every row.

    Returns:
        DataFrame with one row per detected mistake and the columns ocr, gs, start,
        text, len_ocr, len_gs, language, subset and dataset.

    Raises:
        ValueError: If no token of the text starts at the start offset of a mistake.
    """
    samples = []
    for key, mistakes in output.items():
        # Index the tokens once per text instead of rescanning them for every mistake.
        # If several tokens share a start offset the last one wins, which matches a
        # linear scan that does not stop at the first hit.
        tokens_by_start = {aligned.start: aligned for aligned in data[key].tokens}
        key_parts = key.split('/')

        for token in mistakes:
            start_idx = int(token.split(':')[0])
            match = tokens_by_start.get(start_idx)
            if match is None:
                raise ValueError(f'No token found for {key}, start index: {start_idx}')

            samples.append({
                'ocr': match.ocr,
                'gs': match.gs,
                'start': match.start,
                'text': key,
                'len_ocr': match.len_ocr,
                'len_gs': len(match.gs),
                'language': key_parts[0],
                'subset': key_parts[1],
                'dataset': dataset,
            })
    return pd.DataFrame(samples)

task2_input = icdar_output2correction_dataset(output, data_test)

In [8]:
task2_input

Unnamed: 0,ocr,gs,start,text,len_ocr,len_gs,language,subset,dataset
0,Sab ftojn. 4 Sha li ti. S$ le. J She lje. Si d...,,0,SL/SL1/29.txt,527,0,SL,SL1,test
1,Zha I,Zha,528,SL/SL1/29.txt,5,3,SL,SL1,test
2,tefts,227 tests,0,SL/SL1/15.txt,5,10,SL,SL1,test
3,je -,je,18,SL/SL1/15.txt,4,2,SL,SL1,test
4,she.viija,shevina,23,SL/SL1/15.txt,9,7,SL,SL1,test
...,...,...,...,...,...,...,...,...,...
385360,’Tis,'Tis,1479,EN/EN1/24.txt,4,4,EN,EN1,test
385361,keeper.,keeper. 'Tis,1513,EN/EN1/24.txt,7,12,EN,EN1,test
385362,’Tis,"TRISTRAM SHANDY, Gent. 13 I hope, continued th...",1521,EN/EN1/24.txt,4,297,EN,EN1,test
385363,HINDU,HINDU,0,EN/EN1/30.txt,5,6,EN,EN1,test


In [9]:
# A 'perfect' task 2 output predicts the gold standard text for every detected mistake.
# set_index returns a new frame, so task2_input itself is left untouched.
task2_output = task2_input.set_index('text').assign(pred=lambda frame: frame['gs'])
task2_output

Unnamed: 0_level_0,ocr,gs,start,len_ocr,len_gs,language,subset,dataset,pred
text,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
SL/SL1/29.txt,Sab ftojn. 4 Sha li ti. S$ le. J She lje. Si d...,,0,527,0,SL,SL1,test,
SL/SL1/29.txt,Zha I,Zha,528,5,3,SL,SL1,test,Zha
SL/SL1/15.txt,tefts,227 tests,0,5,10,SL,SL1,test,227 tests
SL/SL1/15.txt,je -,je,18,4,2,SL,SL1,test,je
SL/SL1/15.txt,she.viija,shevina,23,9,7,SL,SL1,test,shevina
...,...,...,...,...,...,...,...,...,...
EN/EN1/24.txt,’Tis,'Tis,1479,4,4,EN,EN1,test,'Tis
EN/EN1/24.txt,keeper.,keeper. 'Tis,1513,7,12,EN,EN1,test,keeper. 'Tis
EN/EN1/24.txt,’Tis,"TRISTRAM SHANDY, Gent. 13 I hope, continued th...",1521,4,297,EN,EN1,test,"TRISTRAM SHANDY, Gent. 13 I hope, continued th..."
EN/EN1/30.txt,HINDU,HINDU,0,5,6,EN,EN1,test,HINDU


In [10]:
# Add the gold standard text as the single correction candidate (value 1.0) of every
# detected mistake, updating `output` in place.
for key, mistakes in output.items():
    if not mistakes:
        # A text without mistakes has no rows in task2_output, so .loc would raise KeyError.
        continue
    # Selecting with a list of labels always yields a DataFrame, also when the text
    # has only one erroneous token.
    samples = task2_output.loc[[key]]
    # Mistakes and rows are paired by position.
    for token, gs in zip(mistakes, samples['gs']):
        mistakes[token][gs] = 1.0
    

In [11]:
import json

# Write the 'perfect' task 1 + 2 results to a JSON file below ./data
out_dir = Path('data')
out_dir.mkdir(parents=True, exist_ok=True)
out_json = out_dir / 'results_task1+2_perfect.json'

with out_json.open('w') as f:
    f.write(json.dumps(output))

In [12]:
def read_results(csv_file):
    """Load an evalTool result file and add the language and subset of every text.

    Args:
        csv_file: Path or file-like object of a ';'-separated CSV with a `File`
            column holding paths of the form 'language/subset/name',
            e.g. 'SL/SL1/29.txt'.

    Returns:
        DataFrame with the CSV contents plus the columns `language` and `subset`.

    Raises:
        IndexError: If a `File` value has fewer than two '/'-separated components.
    """
    data = pd.read_csv(csv_file, sep=';')

    # Take both columns from the path components instead of assuming that the
    # language code is exactly two characters long.
    path_parts = data['File'].map(lambda file_name: file_name.split('/'))
    data['language'] = path_parts.map(lambda parts: parts[0])
    data['subset'] = path_parts.map(lambda parts: parts[1])

    return data

## Result for 'perfect' output with original version of evalTool

In [13]:
out_dir = Path('data')
in_json = out_dir/'results_task1+2_perfect.json'
out_csv = out_dir/'results_task1+2_perfect_original_evalTool.csv'

In [14]:
!python evalTool_ICDAR2017.py "{test_data_dir}" {out_json} {out_csv}

Using:
datasetDirPath: /Users/janneke/Documents/Documents – Janneke’s MacBook/data/ICDAR2019_POCR_competition_dataset/ICDAR2019_POCR_competition_evaluation_4M_without_Finnish 
pathInputJsonErrorsCorrections: data/results_task1+2_perfect.json 
pathOutputCsv: data/results_task1+2_perfect_original_evalTool.csv

File	NbTokens	NbErroneousTokens	NbSymbolsConsidered	T1_Precision	T1_Recall	T1_Fmesure	T2_AvgLVDistOriginal	T2_AvgLVDistCorrected
SL/SL1/29.txt	2	2	532	1.00	1.00	1.00	0.99	0.00
SL/SL1/15.txt	83	28	404	1.00	1.00	1.00	0.15	0.00
SL/SL1/14.txt	115	50	624	1.00	1.00	1.00	0.16	0.00
SL/SL1/28.txt	273	21	1270	1.00	1.00	1.00	0.03	0.00
SL/SL1/16.txt	1004	182	4838	1.00	1.00	1.00	0.08	0.00
SL/SL1/17.txt	1009	333	4929	1.00	1.00	1.00	0.19	0.00
SL/SL1/13.txt	139	49	715	1.00	1.00	1.00	0.09	0.00
SL/SL1/12.txt	333	30	1491	1.00	1.00	1.00	0.03	0.00
SL/SL1/10.txt	314	36	1278	1.00	1.00	1.00	0.05	0.00
SL/SL1/38.txt	405	33	1995	1.00	1.00	1.00	0.03	0.00
SL/SL1/39.txt	143	73	694	1.00	1.00	1.00	0.15	0.00
SL/SL

In [15]:
data_orig = read_results(out_csv)

In [16]:
data_orig[data_orig.T1_Precision != 1.0].shape[0]

1

In [17]:
data_orig[data_orig.T1_Recall != 1.0].shape[0]

2079

In [18]:
print(data_orig.groupby('language').mean(numeric_only=True)[['T1_Precision', 'T1_Recall', 'T1_Fmesure', 'T2_AvgLVDistCorrected']].to_markdown())

| language   |   T1_Precision |   T1_Recall |   T1_Fmesure |   T2_AvgLVDistCorrected |
|:-----------|---------------:|------------:|-------------:|------------------------:|
| BG         |         1      |    1        |     1        |             0           |
| CZ         |         1      |    1        |     1        |             0           |
| DE         |         1      |    0.990685 |     0.99678  |             6.41026e-05 |
| EN         |         1      |    1        |     1        |             0           |
| ES         |         1      |    1        |     1        |             0           |
| FI         |         1      |    0.931875 |     0.962    |             0.011625    |
| FR         |         1      |    0.920084 |     0.954595 |             0.000371622 |
| NL         |         1      |    1        |     1        |             0           |
| PL         |         0.9998 |    1        |     0.9998   |             0           |
| SL         |         1      |    0.999583

## Result for 'perfect' output with improved version of evalTool

In [19]:
out_dir = Path('data')
in_json = out_dir/'results_task1+2_perfect.json'
out_csv = out_dir/'results_task1+2_perfect_improved_evalTool.csv'

In [20]:
from ocrpostcorrection.utils import runEvaluation

runEvaluation(test_data_dir, out_json, out_csv)

File	NbTokens	NbErroneousTokens	NbSymbolsConsidered	T1_Precision	T1_Recall	T1_Fmesure	T2_AvgLVDistOriginal	T2_AvgLVDistCorrected
SL/SL1/29.txt	2	2	532	1.00	1.00	1.00	0.99	0.00
SL/SL1/15.txt	83	28	404	1.00	1.00	1.00	0.15	0.00
SL/SL1/14.txt	115	50	624	1.00	1.00	1.00	0.16	0.00
SL/SL1/28.txt	273	21	1270	1.00	1.00	1.00	0.03	0.00
SL/SL1/16.txt	1004	182	4838	1.00	1.00	1.00	0.08	0.00
SL/SL1/17.txt	1009	333	4929	1.00	1.00	1.00	0.19	0.00
SL/SL1/13.txt	139	49	715	1.00	1.00	1.00	0.09	0.00
SL/SL1/12.txt	333	30	1491	1.00	1.00	1.00	0.03	0.00
SL/SL1/10.txt	314	36	1278	1.00	1.00	1.00	0.05	0.00
SL/SL1/38.txt	405	33	1995	1.00	1.00	1.00	0.03	0.00
SL/SL1/39.txt	143	73	694	1.00	1.00	1.00	0.15	0.00
SL/SL1/11.txt	238	19	1127	1.00	1.00	1.00	0.03	0.00
SL/SL1/9.txt	214	100	1047	1.00	1.00	1.00	0.17	0.00
SL/SL1/8.txt	1140	99	5725	1.00	1.00	1.00	0.04	0.00
SL/SL1/5.txt	263	15	1041	1.00	1.00	1.00	0.02	0.00
SL/SL1/43.txt	172	56	883	1.00	1.00	1.00	0.09	0.00
SL/SL1/42.txt	70	31	296	1.00	1.00	1.00	0.20	0.00
SL/SL1/4.txt	

In [21]:
data_impr = read_results(out_csv)

In [22]:
data_impr[data_impr.T1_Precision != 1.0].shape[0]

1

In [23]:
data_impr[data_impr.T1_Recall != 1.0].shape[0]

469

In [37]:
print(data_impr.groupby('language').mean(numeric_only=True)[['T1_Precision', 'T1_Recall', 'T1_Fmesure', 'T2_AvgLVDistCorrected']].to_markdown(floatfmt='.6f'))

| language   |   T1_Precision |   T1_Recall |   T1_Fmesure |   T2_AvgLVDistCorrected |
|:-----------|---------------:|------------:|-------------:|------------------------:|
| BG         |       1.000000 |    1.000000 |     1.000000 |                0.000000 |
| CZ         |       1.000000 |    1.000000 |     1.000000 |                0.000000 |
| DE         |       1.000000 |    0.998003 |     0.999418 |                0.000039 |
| EN         |       1.000000 |    1.000000 |     1.000000 |                0.000000 |
| ES         |       1.000000 |    1.000000 |     1.000000 |                0.000000 |
| FI         |       1.000000 |    0.932000 |     0.962125 |                0.011625 |
| FR         |       1.000000 |    0.997399 |     0.998598 |                0.000152 |
| NL         |       1.000000 |    1.000000 |     1.000000 |                0.000000 |
| PL         |       0.999800 |    1.000000 |     0.999800 |                0.000000 |
| SL         |       1.000000 |    0.999583

## Comparison

In [25]:
def comparison_table(data_orig, data_impr, column):
    """Compare the per-language mean of one metric between the two evalTool runs.

    Args:
        data_orig: Results of the original evalTool; needs a `language` column.
        data_impr: Results of the improved evalTool; needs a `language` column.
        column: Name of the metric column to average per language.

    Returns:
        DataFrame with the rows 'Original evalTool', 'Improved evalTool' and
        'Difference' (improved minus original) and one column per language.
    """
    def mean_per_language(results, label):
        # Mean of the metric per language, labelled for use as a table row
        return results.groupby('language')[column].mean().rename(label)

    original_means = mean_per_language(data_orig, 'Original evalTool')
    improved_means = mean_per_language(data_impr, 'Improved evalTool')
    difference = (improved_means - original_means).rename('Difference')

    return pd.DataFrame([original_means, improved_means, difference])

In [26]:
print(comparison_table(data_orig, data_impr, 'T1_Precision').to_markdown())

|                   |   BG |   CZ |   DE |   EN |   ES |   FI |   FR |   NL |     PL |   SL |
|:------------------|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-----:|-------:|-----:|
| Original evalTool |    1 |    1 |    1 |    1 |    1 |    1 |    1 |    1 | 0.9998 |    1 |
| Improved evalTool |    1 |    1 |    1 |    1 |    1 |    1 |    1 |    1 | 0.9998 |    1 |
| Difference        |    0 |    0 |    0 |    0 |    0 |    0 |    0 |    0 | 0      |    0 |


In [27]:
print(comparison_table(data_orig, data_impr, 'T1_Recall').to_markdown())

|                   |   BG |   CZ |         DE |   EN |   ES |       FI |        FR |   NL |   PL |       SL |
|:------------------|-----:|-----:|-----------:|-----:|-----:|---------:|----------:|-----:|-----:|---------:|
| Original evalTool |    1 |    1 | 0.990685   |    1 |    1 | 0.931875 | 0.920084  |    1 |    1 | 0.999583 |
| Improved evalTool |    1 |    1 | 0.998003   |    1 |    1 | 0.932    | 0.997399  |    1 |    1 | 0.999583 |
| Difference        |    0 |    0 | 0.00731755 |    0 |    0 | 0.000125 | 0.0773142 |    0 |    0 | 0        |


In [28]:
print(comparison_table(data_orig, data_impr, 'T1_Fmesure').to_markdown())

|                   |   BG |   CZ |         DE |   EN |   ES |       FI |        FR |   NL |     PL |       SL |
|:------------------|-----:|-----:|-----------:|-----:|-----:|---------:|----------:|-----:|-------:|---------:|
| Original evalTool |    1 |    1 | 0.99678    |    1 |    1 | 0.962    | 0.954595  |    1 | 0.9998 | 0.999792 |
| Improved evalTool |    1 |    1 | 0.999418   |    1 |    1 | 0.962125 | 0.998598  |    1 | 0.9998 | 0.999792 |
| Difference        |    0 |    0 | 0.00263807 |    0 |    0 | 0.000125 | 0.0440034 |    0 | 0      | 0        |


In [29]:
print(comparison_table(data_orig, data_impr, 'T2_AvgLVDistCorrected').to_markdown(floatfmt='.6f'))

|                   |       BG |       CZ |        DE |       EN |       ES |       FI |        FR |       NL |       PL |       SL |
|:------------------|---------:|---------:|----------:|---------:|---------:|---------:|----------:|---------:|---------:|---------:|
| Original evalTool | 0.000000 | 0.000000 |  0.000064 | 0.000000 | 0.000000 | 0.011625 |  0.000372 | 0.000000 | 0.000000 | 0.000000 |
| Improved evalTool | 0.000000 | 0.000000 |  0.000039 | 0.000000 | 0.000000 | 0.011625 |  0.000152 | 0.000000 | 0.000000 | 0.000000 |
| Difference        | 0.000000 | 0.000000 | -0.000025 | 0.000000 | 0.000000 | 0.000000 | -0.000220 | 0.000000 | 0.000000 | 0.000000 |


## Why is precision < 1.0?

In [30]:
data_orig[data_orig.T1_Precision != 1.0]

Unnamed: 0,File,NbTokens,NbErroneousTokens,NbSymbolsConsidered,T1_Precision,T1_Recall,T1_Fmesure,T2_AvgLVDistOriginal,T2_AvgLVDistCorrected,language,subset
69,PL/PL1/6.txt,312,136,1659,0.99,1.0,0.99,0.14,0.0,PL,PL1


In [31]:
from ocrpostcorrection.utils import EvalContext, reshape_input_errors, create_perfect_icdar_output
from ocrpostcorrection.icdar_data import process_text

# Re-run the task 1 evaluation for a single file to inspect why its precision is below 1.0
in_file = str(test_data_dir)+'/PL/PL1/6.txt'

eval_context = EvalContext(in_file, verbose=True)
text = process_text(in_file)
# create_perfect_icdar_output takes a dict of texts, so wrap the single text under a dummy key
test_input = {'key': text}
error_input = create_perfect_icdar_output(test_input)
reshaped_errors = reshape_input_errors(error_input['key'], eval_context)

# print_sets=True makes the evaluation print the sets it compares (see the cell output)
prec, recall, _fmeasure = eval_context.task1_eval(reshaped_errors, print_sets=True)

print(prec)


#########---- Print Sorted 1) detectedErrPosUnfolded ----######### 
0:[' O ', {}]
3:['Zwierzɇtach ', {}]
15:['Historya ', {}]
24:['Naturalna. 281 ', {}]
39:['mysz ', {}]
44:['kot@', {}]
48:['na ', {}]
59:['w naczyniu ', {}]
81:['iagły ', {}]
95:['cza‑ ', {}]
100:['ffe ', {}]
115:['@mysząt ', {}]
123:['120. ', {}]
142:['ffę ', {}]
158:['lu‑ ', {}]
167:['szkodę. ', {}]
184:['ffę ', {}]
197:['większe ', {}]
209:['nasze ', {}]
256:['pisze ', {}]
262:['Gellius ', {}]
273:['Kardyna‑ ', {}]
304:['11. ', {}]
312:['my@', {}]
315:['szy ', {}]
321:['wyspy ', {}]
359:['mowiłem ', {}]
376:['traktuiąc ', {}]
400:['K. ', {}]
414:['Pismie ', {}]
440:['Pańskiey ', {}]
452:['ffebie ', {}]
462:['myszow ', {}]
476:['1. ', {}]
479:['@Regum ', {}]
491:['5. ', {}]
512:['dziecmi, ', {}]
521:['Xiążęcia ', {}]
530:['Polskiego ', {}]
542:['Kruszwicy ', {}]
555:['potrucie ', {}]
564:['stryiow ', {}]
572:['swych ', {}]
581:['myszy ', {}]
596:['trupow ', {}]
613:['Hatto ', {}]
648:['iż ', {}]
651:['stodołę ', {}]


In [32]:
# Inspect the input tokens of the file whose precision is below 1.0
text = data_test['PL/PL1/6.txt']

# NOTE(review): every token is printed, and tokens starting at 1634 or 1438 are printed
# a second time. If only those two tokens are of interest, the unconditional print can
# probably be removed — confirm intent.
for token in text.input_tokens:
    print(token)
    if token.start in (1634, 1438):
        print(token)

InputToken(ocr='O', gs=' O', start=0, len_ocr=1, label=1)
InputToken(ocr='Zwierzętach', gs='Zwierzɇtach', start=2, len_ocr=11, label=1)
InputToken(ocr='Hiflorya', gs='Historya', start=14, len_ocr=8, label=1)
InputToken(ocr='Naturalna.', gs='Naturalna. 281', start=23, len_ocr=10, label=1)
InputToken(ocr='myfz', gs='mysz', start=34, len_ocr=4, label=1)
InputToken(ocr='kot', gs='kotna', start=39, len_ocr=3, label=1)
InputToken(ocr='na', gs='', start=43, len_ocr=2, label=2)
InputToken(ocr='zaparta', gs='zaparta', start=46, len_ocr=7, label=0)
InputToken(ocr='wnaczyniu', gs='w naczyniu', start=54, len_ocr=9, label=1)
InputToken(ocr='tym,', gs='tym,', start=64, len_ocr=4, label=0)
InputToken(ocr='gdzie', gs='gdzie', start=69, len_ocr=5, label=0)
InputToken(ocr='iagly', gs='iagły', start=75, len_ocr=5, label=1)
InputToken(ocr='były,', gs='były,', start=81, len_ocr=5, label=0)
InputToken(ocr='w', gs='w', start=87, len_ocr=1, label=0)
InputToken(ocr='cza*', gs='cza‑', start=89, len_ocr=4, label

In [33]:
# Show the detected mistakes that start exactly at the offsets under investigation.
# Mistake keys begin with the start offset followed by ':'; comparing that part exactly
# avoids prefix matches such as '16340:1' for offset 1634.
target_starts = {str(idx) for idx in (1634, 1438)}
for mistake in output['PL/PL1/6.txt']:
    if mistake.split(':')[0] in target_starts:
        print(mistake)

## Why is recall < 1.0?

In [34]:
data_impr[data_impr.T1_Recall != 1.0].File.to_list()

['SL/SL1/0.txt',
 'DE/DE6/129.txt',
 'DE/DE6/146.txt',
 'DE/DE6/121.txt',
 'DE/DE6/123.txt',
 'DE/DE6/19.txt',
 'DE/DE6/124.txt',
 'DE/DE7/62.txt',
 'DE/DE7/77.txt',
 'DE/DE7/61.txt',
 'DE/DE7/73.txt',
 'DE/DE7/4.txt',
 'DE/DE5/16.txt',
 'DE/DE5/103.txt',
 'DE/DE5/85.txt',
 'DE/DE4/29.txt',
 'DE/DE4/12.txt',
 'DE/DE4/60.txt',
 'DE/DE4/58.txt',
 'DE/DE4/8.txt',
 'DE/DE4/53.txt',
 'DE/DE4/21.txt',
 'DE/DE4/31.txt',
 'DE/DE3/289.txt',
 'DE/DE3/1090.txt',
 'DE/DE3/1292.txt',
 'DE/DE3/1286.txt',
 'DE/DE3/1443.txt',
 'DE/DE3/1325.txt',
 'DE/DE3/869.txt',
 'DE/DE3/15.txt',
 'DE/DE3/1319.txt',
 'DE/DE3/841.txt',
 'DE/DE3/100.txt',
 'DE/DE3/1494.txt',
 'DE/DE3/1127.txt',
 'DE/DE3/101.txt',
 'DE/DE3/673.txt',
 'DE/DE3/1456.txt',
 'DE/DE3/1442.txt',
 'DE/DE3/28.txt',
 'DE/DE3/1293.txt',
 'DE/DE3/1087.txt',
 'DE/DE3/1508.txt',
 'DE/DE3/1332.txt',
 'DE/DE3/1326.txt',
 'DE/DE3/1468.txt',
 'DE/DE3/1497.txt',
 'DE/DE3/1483.txt',
 'DE/DE3/1118.txt',
 'DE/DE3/466.txt',
 'DE/DE3/300.txt',
 'DE/DE3/1119.t

In [35]:
from ocrpostcorrection.utils import EvalContext, reshape_input_errors, create_perfect_icdar_output
from ocrpostcorrection.icdar_data import process_text

# Re-run the task 1 evaluation for a single file to inspect why its recall is below 1.0
in_file = str(test_data_dir)+'/SL/SL1/0.txt'

eval_context = EvalContext(in_file, verbose=True)
text = process_text(in_file)
# create_perfect_icdar_output takes a dict of texts, so wrap the single text under a dummy key
test_input = {'key': text}
error_input = create_perfect_icdar_output(test_input)
reshaped_errors = reshape_input_errors(error_input['key'], eval_context)

# print_sets=True makes the evaluation print the sets it compares (see the cell output)
prec, recall, _fmeasure = eval_context.task1_eval(reshaped_errors, print_sets=True)

print(recall)


#########---- Print Sorted 1) detectedErrPosUnfolded ----######### 
0:[' 161 ', {}]
29:['tôžil, ', {}]
80:['drugoč ', {}]
87:['posredka ', {}]
96:['lo- ', {}]
163:['per@hal ', {}]
171:['  ', {}]
173:['perhčal. ', {}]
241:['predrekav@al, ', {}]
288:['  ', {}]
321:['če@m ', {}]
343:['  ', {}]
352:['  ', {}]
361:['O@', {}]
363:['ča. ', {}]
390:['ore@hami. ', {}]
400:['Krot- ', {}]
421:['naimer tude ', {}]
467:['blizo ', {}]
503:['počinol, ', {}]
523:['ino ', {}]
583:['@miseljo ', {}]
648:['otrok ', {}]
654:['  ', {}]
685:['  ', {}]
827:['  ', {}]
870:['pamet ni ', {}]
902:['  ', {}]
924:['razgovarja ', {}]
935:['  ', {}]
962:['papi- ', {}]
1104:['človečji ', {}]
1118:['čuje; ', {}]
1164:['papigo@', {}]
1171:[', ', {}]
1216:['  ', {}]
1230:['esá ', {}]
1234:['  ', {}]
1236:['etá ', {}]
1249:['senja ', {}]
1266:['trajala   ', {}]
1364:['upametoval, ', {}]
1389:['vzdehnol: ', {}]
1405:['Robinson@', {}]
1414:['! ', {}]
1419:['dv@úju   ', {}]
1441:['popoludne.', {}]

#########---- Print Sorte

In [36]:
# Inspect the input tokens of a file whose recall is below 1.0
text = data_test['SL/SL1/0.txt']

# NOTE(review): every token is printed, and the token starting at 1452 is printed a
# second time. If only that token is of interest, the unconditional print can probably
# be removed — confirm intent.
for token in text.input_tokens:
    print(token)
    if token.start in (1452,):
        print(token)
    

InputToken(ocr='161', gs=' 161', start=0, len_ocr=3, label=1)
InputToken(ocr='menje', gs='menje', start=4, len_ocr=5, label=0)
InputToken(ocr='je', gs='je', start=10, len_ocr=2, label=0)
InputToken(ocr='predse', gs='predse', start=13, len_ocr=6, label=0)
InputToken(ocr='po', gs='po', start=20, len_ocr=2, label=0)
InputToken(ocr='mesu', gs='mesu', start=23, len_ocr=4, label=0)
InputToken(ocr='tožil,', gs='tôžil,', start=28, len_ocr=6, label=1)
InputToken(ocr='ino', gs='ino', start=35, len_ocr=3, label=0)
InputToken(ocr='naposled', gs='naposled', start=39, len_ocr=8, label=0)
InputToken(ocr='se', gs='se', start=48, len_ocr=2, label=0)
InputToken(ocr='ni', gs='ni', start=51, len_ocr=2, label=0)
InputToken(ocr='mogel', gs='mogel', start=54, len_ocr=5, label=0)
InputToken(ocr='zderžati,', gs='zderžati,', start=60, len_ocr=9, label=0)
InputToken(ocr='da', gs='da', start=70, len_ocr=2, label=0)
InputToken(ocr='se', gs='se', start=73, len_ocr=2, label=0)
InputToken(ocr='je', gs='je', start=76,