In [31]:
import pandas as pd
import numpy as np
from pathlib import Path
import re
from copy import deepcopy
from loguru import logger
from typing import Dict, Tuple

from src.settings import PREPROCESSED_DIR, POLISH_ANNOTATIONS_FPATH, TASK_LABEL_NAME_MAP

In [32]:
LANDMARKS_DIR = PREPROCESSED_DIR / "landmarks_all"

In [33]:
polish_annotations_df = pd.read_json(POLISH_ANNOTATIONS_FPATH, lines=True)

polish_annotations_df["task_num"] = polish_annotations_df["task_label"].copy()
polish_annotations_df["task_label"] = polish_annotations_df["task_label"].astype("str").map(TASK_LABEL_NAME_MAP)

polish_annotations_df

Unnamed: 0,start,end,text,doc_filepath,video_filename,task_label,task_num
0,39040,41120,"Myślę, że mam inny pomysł, można?",/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,Znaki zakazu,15
1,41120,49680,"Chyba ten znak mówi, że jak ktoś będzie spacer...",/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,Znaki zakazu,15
2,49680,55280,Trzeba przejść łukiem obok leżącego i o tym zn...,/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,Znaki zakazu,15
3,61280,66840,"Okrągły znak pomaga nam, mówi, że są pasy na u...",/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,Znaki zakazu,15
4,66840,73200,"Jak ktoś zobaczy, ale zignoruje ten znak, to m...",/15/K66BF13-26_15_15_signsNO.eaf,K66BF13-26.mp4,Znaki zakazu,15
...,...,...,...,...,...,...,...
40350,248960,250320,[uderzanie w coś],/13/K17BF13-26_13_13_comics.eaf,K17BF13-26.mp4,Komisky,13
40351,279560,291400,Kot zobaczył w akwarium rybkę. Podszedł i dał ...,/13/K17BF13-26_13_13_comics.eaf,K17BF13-26.mp4,Komisky,13
40352,291400,347120,"Zjadłam, zjadłam.",/13/K17BF13-26_13_13_comics.eaf,K17BF13-26.mp4,Komisky,13
40353,347120,356440,"Zając biegnie, zobaczył wiszące pranie. Wskocz...",/13/K17BF13-26_13_13_comics.eaf,K17BF13-26.mp4,Komisky,13


In [4]:
polish_annotations_df["task_label"].unique(), polish_annotations_df["task_num"].unique()

(array(['Znaki zakazu', 'Alarm', 'Kalendarz', 'Gruszki / Chaplin',
        'Sylwester / Żaba', 'Komisky'], dtype=object),
 array([15, 24,  7, 17,  8, 13]))

In [5]:
id2label = {i: label for i, label in enumerate(TASK_LABEL_NAME_MAP.values())}

id2label

{0: 'Kalendarz',
 1: 'Sylwester / Żaba',
 2: 'Komisky',
 3: 'Znaki zakazu',
 4: 'Gruszki / Chaplin',
 5: 'Alarm'}

In [6]:
unique_videos = polish_annotations_df.video_filename.unique()
unique_videos.shape, unique_videos[:3]

((281,),
 array(['K66BF13-26.mp4', 'K32BF01-16.mp4', 'K75AF13-26.mp4'], dtype=object))

In [7]:
import jsonlines


def get_clips_data(video_dir_path: str) -> Tuple[Path, Dict[str, int], pd.DataFrame]:
    for clip_landmarks_csv in video_dir_path.iterdir():
        if not clip_landmarks_csv.exists():
            logger.warning(f"Clip data `{clip_landmarks_csv}` does not exist!")
            continue

        clip_landmarks_df = pd.read_csv(clip_landmarks_csv).drop(columns=["Unnamed: 0"])
        if clip_landmarks_df.empty:
            logger.warning(f"`{clip_landmarks_csv} is empty!`")
            continue

        clip_task_num, clip_start, clip_end = map(int, re.search(r"(\d{2})_(\d+)\-(\d+)\.csv$", clip_landmarks_csv.name).groups())
        clip_metadata = {"task_num": clip_task_num, "start": clip_start, "end": clip_end}
        yield clip_landmarks_csv, clip_metadata, clip_landmarks_df


def gen_clips_data():
    writer = jsonlines.open(PREPROCESSED_DIR / "clips_dataset.jsonl", mode="a")
    for video_filename in unique_videos:
        video_dir_path = LANDMARKS_DIR / video_filename.split(".")[0]
        if not video_dir_path.exists():
            logger.warning(f"Video data does not exist! `{video_dir_path}`")
            continue

        for clip_csv_fpath, clip_metadata, clip_df in get_clips_data(video_dir_path):
            num_frames = clip_df.FrameNumber.max()
            assert isinstance(num_frames, int), f"Number of frames is not an integer! {num_frames} `{clip_csv_fpath}`"

            clip_dataset_record = {
                "VideoFilename": video_filename,
                "NumFrames": num_frames,
                "Start": clip_metadata["start"],
                "End": clip_metadata["end"],
                "TaskNum": clip_metadata["task_num"],
            }

            clip_pl_annotations = polish_annotations_df[
                (polish_annotations_df.video_filename == video_filename)
                & (polish_annotations_df.task_num == clip_metadata["task_num"])
                & (polish_annotations_df.start == clip_metadata["start"])
                & (polish_annotations_df.end == clip_metadata["end"])
            ]
            assert len(clip_pl_annotations) == 1

            clip_pl_annotation_data = clip_pl_annotations.iloc[0].to_dict()
            clip_dataset_record["PolishAnnotation"] = clip_pl_annotation_data["text"]
            clip_dataset_record["TaskLabel"] = clip_pl_annotation_data["task_label"]
            clip_dataset_record["EafDocFilepath"] = clip_pl_annotation_data["doc_filepath"]

            clip_frames_landmarks_coords = []
            for frame_number in range(num_frames):
                frame_df = clip_df[clip_df.FrameNumber == frame_number]
                if frame_df.empty:
                    logger.warning(f"Frame {frame_number} does not exist! {clip_csv_fpath}")
                    continue

                frame_raw_landmarks_coords = frame_df[['X', 'Y', 'Z']].to_numpy()  # 33x3
                assert frame_raw_landmarks_coords.shape == (33, 3), (f"Landmarks coordinates matrixs hape mismatched, got ({frame_raw_landmarks_coords.shape}) instead of (33x3) for frame {frame_number}! {clip_csv_fpath}")

                clip_frames_landmarks_coords.append(frame_raw_landmarks_coords)

            clip_dataset_record["FramesLandmarksCoords"] = np.array(clip_frames_landmarks_coords)

            writable_clip_dataset_record = deepcopy(clip_dataset_record)
            writable_clip_dataset_record["FramesLandmarksCoords"] = writable_clip_dataset_record["FramesLandmarksCoords"].tolist()          
            writer.write(writable_clip_dataset_record)

            yield clip_dataset_record

    writer.close()


In [8]:
clips_df = pd.DataFrame.from_records(gen_clips_data())
clips_df



Unnamed: 0,VideoFilename,NumFrames,Start,End,TaskNum,PolishAnnotation,TaskLabel,EafDocFilepath,FramesLandmarksCoords
0,K66BF13-26.mp4,77,525840,528880,15,Ktoś jedzie sportowym rowerem i z tyłu koń bie...,Znaki zakazu,/15/K66BF13-26_15_15_signsNO.eaf,"[[[0.5882737636566162, 0.3140367865562439, -0...."
1,K66BF13-26.mp4,74,112120,115040,15,"Nie wolno odkrywać ciała, znak przypomina nam ...",Znaki zakazu,/15/K66BF13-26_15_15_signsNO.eaf,"[[[0.5860795974731445, 0.2357127368450164, -0...."
2,K66BF13-26.mp4,44,361560,363280,15,Nigdy czegoś takiego nie widziałem.,Znaki zakazu,/15/K66BF13-26_15_15_signsNO.eaf,"[[[0.6027265787124634, 0.2272534072399139, -0...."
3,K66BF13-26.mp4,197,529960,537800,15,"Przyczepa ok, łyżwy ok, ale hulajnoga? Nie da ...",Znaki zakazu,/15/K66BF13-26_15_15_signsNO.eaf,"[[[0.6342988610267639, 0.3123383522033691, -0...."
4,K66BF13-26.mp4,34,589680,591000,15,Dorośli tak.,Znaki zakazu,/15/K66BF13-26_15_15_signsNO.eaf,"[[[0.6094659566879272, 0.2442431151866912, -0...."
...,...,...,...,...,...,...,...,...,...
19498,K33BF01-14.mp4,36,195640,197040,13,Dobrze.,Komisky,/13/K33BF01-14_13_13_comics.eaf,"[[[0.4911873936653137, 0.193398430943489, -0.8..."
19499,K33BF01-14.mp4,9,138160,138480,13,Ładne.,Komisky,/13/K33BF01-14_13_13_comics.eaf,"[[[0.4703335762023926, 0.2234214097261428, -0...."
19500,K33BF01-14.mp4,87,147840,151320,8,Ptaszek wziął metalową kulę i zrzucił ją wpros...,Sylwester / Żaba,/08/K33BF01-14_08_tweety_08_tweety_tweety.eaf,"[[[0.4666298925876617, 0.2872171700000763, -0...."
19501,K33BF01-14.mp4,8,198360,198640,13,Dobrze.,Komisky,/13/K33BF01-14_13_13_comics.eaf,"[[[0.4809567928314209, 0.199927270412445, -0.8..."


In [9]:
clips_df.VideoFilename.unique().shape

(113,)

In [10]:
clips_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19503 entries, 0 to 19502
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   VideoFilename          19503 non-null  object
 1   NumFrames              19503 non-null  int64 
 2   Start                  19503 non-null  int64 
 3   End                    19503 non-null  int64 
 4   TaskNum                19503 non-null  int64 
 5   PolishAnnotation       19503 non-null  object
 6   TaskLabel              19503 non-null  object
 7   EafDocFilepath         19503 non-null  object
 8   FramesLandmarksCoords  19503 non-null  object
dtypes: int64(4), object(5)
memory usage: 1.3+ MB


In [11]:
clips_df.NumFrames.describe()

count    19503.000000
mean        58.988822
std         51.192794
min          3.000000
25%         25.000000
50%         44.000000
75%         76.000000
max        778.000000
Name: NumFrames, dtype: float64

## HerBERT

In [1]:
import torch
from transformers import AutoTokenizer, AutoModel
from jsonlines import jsonlines
from tqdm.notebook import tqdm

In [2]:
herbert_tokenizer = AutoTokenizer.from_pretrained("allegro/herbert-base-cased")
herbert = AutoModel.from_pretrained("allegro/herbert-base-cased").to("cuda:0")

Some weights of the model checkpoint at allegro/herbert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.sso.sso_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.sso.sso_relationship.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [5]:
def get_herbert_embedding(text: str, device: str = "cuda") -> np.ndarray:
    return herbert(
        **herbert_tokenizer.encode_plus(text,
            padding='longest',
            add_special_tokens=True,
            return_tensors='pt'
        ).to(device)
    ).pooler_output.to("cpu").detach().numpy()[0]

get_herbert_embedding("Ala ma kota").shape

(768,)

In [34]:
max_tokens = -1
clips_dataset_fpath = PREPROCESSED_DIR / "clips_dataset.jsonl"
with jsonlines.open(clips_dataset_fpath) as reader:
    for record in tqdm(reader, total=19_503):
        num_tokens = len(herbert_tokenizer.tokenize(record["PolishAnnotation"]))
        if num_tokens > max_tokens:
            max_tokens = num_tokens

print(f"{max_tokens=}")

  0%|          | 0/19503 [00:00<?, ?it/s]

max_tokens=66


In [35]:
MAX_TOKENS = max_tokens

In [38]:
def get_herbert_token_ids(text: str) -> torch.Tensor:
    return herbert_tokenizer.encode(text,
        # padding='longest',
        padding='max_length',
        max_length=MAX_TOKENS,
        add_special_tokens=True,
        return_tensors='pt'
    )[0]

print(herbert_tokenizer.decode(token_ids=1))
print(get_herbert_token_ids("Ala ma kota"))
print(get_herbert_token_ids("Kot ma Ale"))
print(get_herbert_token_ids("Kolejna Kolejne Kolejny"))
print(get_herbert_token_ids("ale są różne długości `token_ids` bo są różne długości `text`. Kolejne zdanie dlatego jest w środku `0` (od <s>, special_token)"))

<pad>
tensor([    0,    37,  2121,  2185, 24112,     2,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1])
tensor([    0, 22513,  2185,  2894,     2,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
       

In [39]:
clips_dataset_fpath = PREPROCESSED_DIR / "clips_dataset.jsonl"
extended_dataset_fpath = clips_dataset_fpath.as_posix().replace(".jsonl", "_wth_herbert_token_ids.jsonl")
with jsonlines.open(clips_dataset_fpath) as reader, jsonlines.open(extended_dataset_fpath, mode="w") as writer:
    for record in tqdm(reader, total=19_503):
        record["PolishAnnotationTokenIds"] = get_herbert_token_ids(record["PolishAnnotation"]).tolist()
        writer.write(record)

  0%|          | 0/19503 [00:00<?, ?it/s]