In [1]:
from collections import defaultdict

import pandas as pd
from tqdm.notebook import tqdm
from loguru import logger

from pympi.Elan import Eaf
import json
from jsonlines import jsonlines

from src.settings import ORKPJM_ANN_DIR, GLOSS_ANNOTATIONS_FPATH, POLISH_ANNOTATIONS_FPATH
from src.data.dataset import GlossSeq2PolishDataset
from src.data.dataset.gloss2polish import dump_paired_dataset
from src.data.utils import iter_dto_as_dicts

## Tiers:

### Data structure
_TimeStamp: int
> - in `TIME_UNITS`, miliseconds

_Content: str
> - `glosa`: utf-8,
> - `glosa_druga_reka`: utf-8,
> - `HamNoSys`: unicode (font HamNoSys)
> - `tlumaczenie_publikacja`: utf-8

Tier results:
> ```python
> List[Tuple[ts1: _TimeStamp, ts2: _TimeStamp, content: _Content]]
> ```

In [2]:
from src.data.dto import GlossAnnotationRecord, PolishAnnotationRecord

In [3]:
tier_records = defaultdict(list)
for task_dir_path in tqdm(list(ORKPJM_ANN_DIR.iterdir()), desc="Tasks"):
    if not task_dir_path.is_dir():
        continue
    task_label = task_dir_path.name
    for eaf_doc_filepath in task_dir_path.glob("*.eaf"):
        eac_doc_rel_filepath = eaf_doc_filepath.as_posix().replace(ORKPJM_ANN_DIR.as_posix(), "")
        eaf_doc = Eaf(eaf_doc_filepath)
        if len(eaf_doc.media_descriptors) > 1:
            logger.warning(f"More than one media descriptors! Getting the first one...\n{eaf_doc.media_descriptors=}")
        media_descriptor = eaf_doc.media_descriptors[0]
        vid_fname = media_descriptor["MEDIA_URL"]
        time_units = eaf_doc.header["TIME_UNITS"]
        for tier_name in eaf_doc.get_tier_names():
            if "tlumaczenie_publikacja" == tier_name:
                dto_model = PolishAnnotationRecord
            elif "glosa" in tier_name:
                dto_model = GlossAnnotationRecord
            else:
                continue
            tier_data = eaf_doc.get_annotation_data_for_tier(tier_name)
            for tier_record in tier_data:
                start, end, text = tier_record
                ann_record_dto = dto_model(
                    task_label=task_label,
                    doc_filepath=eac_doc_rel_filepath,
                    video_filename=vid_fname,
                    start=start,
                    end=end,
                    text=text,
                )
                if tier_name.endswith("_druga_reka"):
                    ann_record_dto.dominant_hand = False

                tier_records[tier_name].append(ann_record_dto)

Tasks:   0%|          | 0/7 [00:00<?, ?it/s]

In [4]:
gloss_records = list(iter_dto_as_dicts(tier_records["glosa"]))
with jsonlines.open(GLOSS_ANNOTATIONS_FPATH, mode="w") as f:
    f.write_all(gloss_records)

In [5]:
gloss_df = pd.DataFrame.from_records(gloss_records)
gloss_df

Unnamed: 0,start,end,text,doc_filepath,video_filename,task_label,dominant_hand
0,29960,31200,%,/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15,True
1,38560,39040,%,/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15,True
2,39040,39440,MYŚLEĆ 2.1 P:Z;L:Z (NA PRZEMIAN),/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15,True
3,39440,39640,JAK 1.2 P:I;L:Ø,/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15,True
4,39640,39920,POMYSŁ 1.3 P:AZ;L:Ø,/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15,True
...,...,...,...,...,...,...,...
160671,353480,354480,NIEDŹWIEDŹ/MIŚ 1.2 P:B;L:Ø,/13/K17BF13-26_13_13_comics.eaf,K17BF13-26.mp4,13,True
160672,354680,356440,PRZESTRASZYĆ 1.1 P:5A;L:5A (JAK ZŁAPAĆ),/13/K17BF13-26_13_13_comics.eaf,K17BF13-26.mp4,13,True
160673,359240,360120,BIUSTONOSZ 1.2. P:5;L:5 (STANIK/STRÓJ KĄPIELOWY),/13/K17BF13-26_13_13_comics.eaf,K17BF13-26.mp4,13,True
160674,360120,360960,DZIECKO 2.2 P:I1;LØ,/13/K17BF13-26_13_13_comics.eaf,K17BF13-26.mp4,13,True


In [6]:
polish_records = list(map(lambda dto: dto.model_dump(), tier_records["tlumaczenie_publikacja"]))
with jsonlines.open(POLISH_ANNOTATIONS_FPATH, mode="w") as f:
    f.write_all(polish_records)

In [7]:
polish_df = pd.DataFrame.from_records(polish_records)
polish_df

Unnamed: 0,start,end,text,doc_filepath,video_filename,task_label
0,39040,41120,"Myślę, że mam inny pomysł, można?",/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15
1,41120,49680,"Chyba ten znak mówi, że jak ktoś będzie spacer...",/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15
2,49680,55280,Trzeba przejść łukiem obok leżącego i o tym zn...,/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15
3,61280,66840,"Okrągły znak pomaga nam, mówi, że są pasy na u...",/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15
4,66840,73200,"Jak ktoś zobaczy, ale zignoruje ten znak, to m...",/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15
...,...,...,...,...,...,...
40350,248960,250320,[uderzanie w coś],/13/K17BF13-26_13_13_comics.eaf,K17BF13-26.mp4,13
40351,279560,291400,Kot zobaczył w akwarium rybkę. Podszedł i dał ...,/13/K17BF13-26_13_13_comics.eaf,K17BF13-26.mp4,13
40352,291400,347120,"Zjadłam, zjadłam.",/13/K17BF13-26_13_13_comics.eaf,K17BF13-26.mp4,13
40353,347120,356440,"Zając biegnie, zobaczył wiszące pranie. Wskocz...",/13/K17BF13-26_13_13_comics.eaf,K17BF13-26.mp4,13


In [8]:
sample_pl_annotation = polish_df.iloc[2].to_dict()
print(json.dumps(sample_pl_annotation, indent=2, ensure_ascii=False) + "\n")

sample_gloss_annotations = gloss_df[
    (gloss_df["start"] >= sample_pl_annotation["start"])
    & (gloss_df["end"] <= sample_pl_annotation["end"])
    & (gloss_df["video_filename"] == sample_pl_annotation["video_filename"])
    # & (gloss_df["task_label"] == sample_pl_annotation["task_label"])
]

# assert sample_gloss_annotations["start"].nunique() == sample_gloss_annotations.shape[0]
assert sample_gloss_annotations["video_filename"].nunique() == 1
# assert sample_gloss_annotations["task_label"].nunique() == 1

# print(sample_gloss_annotations.info())
sample_gloss_annotations

{
  "start": 49680,
  "end": 55280,
  "text": "Trzeba przejść łukiem obok leżącego i o tym znak nas informuje.",
  "doc_filepath": "/15/K66BF13-26_15_15_signsNO.eaf",
  "video_filename": "K66BF13-26.mp4",
  "task_label": "15"
}



Unnamed: 0,start,end,text,doc_filepath,video_filename,task_label,dominant_hand
20,49680,51040,IŚĆ 1.3 P:1;L:1 (ROZDZIELNIE),/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15,True
21,51040,51400,WSKAZ: Z WSZYSTKIE KIERUNKI,/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15,True
22,51400,51840,LEŻEĆ 1.2 P:V;L:Ø,/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15,True
23,51840,52280,WSKAZ: Z WSZYSTKIE KIERUNKI,/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15,True
24,52280,52760,ALARM 2.1 P:A5;L:A5 (POGOTOWIE/SYGNAŁ),/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15,True
25,52760,53640,%,/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15,True
26,53640,54320,ZNAK/ZNACZYĆ 1.1 P:Z;L:Z,/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15,True
27,54320,54840,POMAGAĆ/POMOC 1.1 P:B;L:B (DO KOGOŚ),/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15,True
28,54840,55280,%,/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,15,True
61606,50240,50720,%,/24/K66BF13-26_24_24_alarm.eaf,K66BF13-26.mp4,24,True


## Collect correspondive gloss annotations for every Polish annotation

In [9]:
dump_paired_dataset(callback=tqdm)

  0%|          | 0/40355 [00:00<?, ?it/s]

## `GlossSeq2PolishDataset` usage

In [10]:
dataset = GlossSeq2PolishDataset()

In [11]:
len(dataset)

39623

In [12]:
from pprint import pprint

pprint(dataset[0])

GlossSeq2PolishRecord(gloss_sequence=[GlossAnnotationRecord(start=39040, end=39440, text='MYŚLEĆ 2.1 P:Z;L:Z (NA PRZEMIAN)', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='K66BF13-26.mp4', task_label='15', dominant_hand=True), GlossAnnotationRecord(start=39440, end=39640, text='JAK 1.2 P:I;L:Ø', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='K66BF13-26.mp4', task_label='15', dominant_hand=True), GlossAnnotationRecord(start=39640, end=39920, text='POMYSŁ 1.3 P:AZ;L:Ø', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='K66BF13-26.mp4', task_label='15', dominant_hand=True), GlossAnnotationRecord(start=39920, end=40200, text='WSKAZ-JA 1.1 P:L;L;Ø (Z WIDOCZNYM KCIUKIEM)', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='K66BF13-26.mp4', task_label='15', dominant_hand=True), GlossAnnotationRecord(start=40200, end=41120, text='MOŻNA 1.1 P:B;L:B  (MOŻE/MOŻLIWE)', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='

In [13]:
for dto in dataset:
    pprint(dto.polish_annotation)
    print()
    pprint(dto.gloss_sequence)
    break

PolishAnnotationRecord(start=39040, end=41120, text='Myślę, że mam inny pomysł, można?', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='K66BF13-26.mp4', task_label='15')

[GlossAnnotationRecord(start=39040, end=39440, text='MYŚLEĆ 2.1 P:Z;L:Z (NA PRZEMIAN)', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='K66BF13-26.mp4', task_label='15', dominant_hand=True),
 GlossAnnotationRecord(start=39440, end=39640, text='JAK 1.2 P:I;L:Ø', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='K66BF13-26.mp4', task_label='15', dominant_hand=True),
 GlossAnnotationRecord(start=39640, end=39920, text='POMYSŁ 1.3 P:AZ;L:Ø', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='K66BF13-26.mp4', task_label='15', dominant_hand=True),
 GlossAnnotationRecord(start=39920, end=40200, text='WSKAZ-JA 1.1 P:L;L;Ø (Z WIDOCZNYM KCIUKIEM)', doc_filepath='/15/K66BF13-26_15_15_signsNO.eaf', video_filename='K66BF13-26.mp4', task_label='15', dominant_hand=True)