In [13]:
import pandas as pd
import seaborn as sns
import os
import json
from typing import List, Optional

#For visualization
import matplotlib.pyplot as plt
import numpy as np

#Törlendők
import tensorflow as tf

In [15]:
DATA_PATH = "../data/anklealign/anklealign"
consensus_neptuns = ['D6AE9F','ECSGGY','FO6K58']
classes = ['1_Pronacio', '2_Neutralis', '3_Szupinacio']

In [14]:
class AnnotationTask:
    def __init__(self, raw: dict, annotator: str):
        self.annotator = annotator
        self.studentImage = None
        self.raw = raw

        self.id: int = raw.get("id")
        self.image_path: str = raw.get("data", {}).get("image")
        if len(self.image_path.split('/')) == 5: 
            self.image_name = self.image_path.split('/')[4].split('-')[1]
        else:
             self.image_name = self.image_path.split('/')[1].split('-')[1]
             self.studentImage =  self.image_path.split('/')[0]
        # Extract label (if available)
        annotations = raw.get("annotations", [])
        if annotations and "result" in annotations[0]:
            if len(annotations[0]["result"]) != 0:
                result = annotations[0]["result"][0]
                self.label: Optional[str] = result["value"]["choices"][0]
            else:
                self.label = None

        # Full metadata
        self.meta = raw.get("meta", {})
        self.created_at = raw.get("created_at")
        self.updated_at = raw.get("updated_at")

    def __repr__(self):
        return f"AnnotationTask(id={self.id}, label={self.label}, image_path='{self.image_path}')"

    def get_infos(self):
        return {
            "annotator" : self.annotator, 
            "owner_image" : self.studentImage,
            "image_name" : self.image_name, 
            "label" : self.label,
        }

def load_annotation_file(path: str, student: str) -> List[AnnotationTask]:
    """Load a Label Studio JSON export and convert to Python objects."""
    with open(path, "r", encoding="utf-8") as f:
        items = json.load(f)

    tasks = [AnnotationTask(item, student) for item in items]
    return tasks


In [16]:
def labelsRepairing(df):
    df = df.copy()

    # 1. None sorok eltávolítása
    df = df[df['label'].notna()]

    # 2. Komplett egységesítési táblázat
    mapping = {
        # Pronáció
        'pronation': '1_Pronacio',
        '1_Pronacio': '1_Pronacio',
        '1_Pronáló': '1_Pronacio',
        '1_Pronáló ': '1_Pronacio',

        # Neutral
        'neutral': '2_Neutralis',
        '2_Neutralis': '2_Neutralis',
        '2_Neutrális': '2_Neutralis',

        # Szupináció
        'supination': '3_Szupinacio',
        '3_Szupinacio': '3_Szupinacio',
        '3_Szupináló': '3_Szupinacio',
        '3_Szupináló ': '3_Szupinacio'
    }

    # 3. Replace
    df['label'] =  df['label'].replace(mapping)
    return df


In [17]:
def consensusReparing(df):# 1) owner_image kitöltése
    df['owner_image'] = None
    for neptun in consensus_neptuns:
        mask = df['image_name'].str.contains(neptun)
        df.loc[mask, 'owner_image'] = neptun

    # 2) clean_image_name létrehozása és tisztítása
    clean_names = df['image_name']

    # NEPTUN-ok eltávolítása bárhol
    for neptun in consensus_neptuns:
        clean_names = clean_names.str.replace(
            rf"{neptun}_?|_{neptun}", 
            "",
            regex=True
        )
    ## resztvevo -> reszvevo javítás
    #clean_names = clean_names.str.replace("resztvevo", "reszvevo", regex=True)

    # dupla _ javítása és elejéről/végéről _ eltávolítása
    clean_names = clean_names.str.replace("__", "_", regex=False).str.strip("_")

    df['clean_image_name'] = clean_names

    #3.) Extra

    mask_extra = df['image_name'].str.contains('[0-9]_D6AE9F.jpg', regex=True)
    df.loc[mask_extra, 'owner_image'] = 'OJHGS8'

    mask_extra = df['annotator'].str.contains('GK1XQ4')
    df.loc[mask_extra, 'owner_image'] = 'D6AE9F'

    mask_extra = df['image_name'].str.contains('resztvevo')
    df.loc[mask_extra, 'owner_image'] = 'FO6K58'
    #4.) Extra 2: Az én elnevezési hibám miatt ki tudtam deríteni, hogy bizonyos én fájlaimhoz melyik címkék tartoznak: sajat_resztvevo_03, _04, és _05.
    #Ellenőriztem is, hogy a mások nem követtek el ilyen hibát.
    mask_extra = df['image_name'].str.contains('sajat_reszvevo_0[3-5]_*', regex=True)
    df.loc[mask_extra, 'owner_image'] = 'ECSGGY'

    return df


In [18]:
def consensusFiltering(df):
    neptuns_as_filter = df['owner_image'].unique()[df['owner_image'].unique()!=None]
    pattern = "|".join(neptuns_as_filter)
    return df[df['image_name'].str.contains(pattern, regex=True)]

In [19]:
exclude = {'anklealign-consensus.txt', 'DPMC6W.json', 'ITWQ3V.json'}

consensus_dir = f"{DATA_PATH}/consensus"
files = [
    f for f in os.listdir(consensus_dir)
    if f not in exclude
]

annotators_tasks = []

for filename in files:
    annotator = filename.rsplit('.', 1)[0].upper()
    tasks = load_annotation_file(f"{consensus_dir}/{filename}", annotator)
    annotators_tasks.extend([t.get_infos() for t in tasks])

df_consensus = pd.DataFrame(annotators_tasks)
df_consensus = consensusReparing(df_consensus)
df_consensus = consensusFiltering(df_consensus)
df_consensus = labelsRepairing(df_consensus)

In [21]:
# 3 címke darabszámának összegzése adott neptun és kép szerint
vote_counts = (
    df_consensus.groupby(['owner_image', 'image_name', 'clean_image_name', 'label'])
      .size()
      .reset_index(name='votes')
)

In [None]:
images = vote_counts['clean_image_name'].unique()

# --- beállítjuk hányszor hányszor legyenek ---
cols = 3
rows = int(np.ceil(len(images) / cols))

plt.figure(figsize=(12, 4 * rows))  # kisebb, kompaktabb ábrák

for i, img in enumerate(images, 1):
    sub = vote_counts[
        (vote_counts['owner_image'] == consensus_neptuns[0]) &
        (vote_counts['clean_image_name'] == img)
    ]

    plt.subplot(rows, cols, i)
    plt.bar(sub['label'], sub['votes'])
    plt.title(img, fontsize=10)
    plt.xticks(rotation=45, fontsize=8)
    plt.yticks(fontsize=8)
    plt.tight_layout()

plt.show()


In [22]:
EXCLUDE_DIRS = {'consensus', 'sample', 'NC1O2T', 'ECSGGY', 'GI9Y8B'}

neptuns = [
    d for d in os.listdir(DATA_PATH)
    if os.path.isdir(os.path.join(DATA_PATH, d)) and d not in EXCLUDE_DIRS
]

annotators_tasks = []

for neptun in neptuns:
    folder = os.path.join(DATA_PATH, neptun)
    json_files = [f for f in os.listdir(folder) if f.endswith('.json')]

    if not json_files:
        print(f"Nincs JSON fájl: {neptun}")
        continue

    json_path = os.path.join(folder, json_files[0])

    tasks = load_annotation_file(json_path, neptun)
    annotators_tasks.extend(t.get_infos() for t in tasks)

# --- DataFrame ---
df_own_neptun = pd.DataFrame(annotators_tasks)
df_own_neptun['owner_image'] = df_own_neptun['annotator']
df_own_neptun = labelsRepairing(df_own_neptun)
df_own_neptun['clean_image_name'] = df_own_neptun['image_name']
df_own_neptun = df_own_neptun.drop(['annotator','image_name'], axis= 1) 


In [23]:
len(df_own_neptun)

304

In [254]:
final_df = []

for neptun in vote_counts['owner_image'].unique():
    idx = (
        vote_counts[vote_counts['owner_image'] == neptun]
        .groupby('clean_image_name')['votes']
        .idxmax()
    )
    top_labels = vote_counts.loc[idx]
    final_df.append(top_labels)

final_df = pd.concat(final_df, ignore_index=True)
final_df = final_df.drop(['votes', 'image_name'], axis=1)
final_df = pd.concat([final_df, df_own_neptun])

# Konklúzió:
A címkézési munkafolyamat nagyrésze nagyon inkonzisztens volt, mivel az egyiknél rosszul lett elnevezve, míg a konzszus könyvtárban nem volt egyértelműen megnevezve, hogy adott hallgató (sajátján kívül, de a consuses mappában a sajátját beleértve) milyen címkét adtt adott fájlhoz (képhez). A rendkivüli inkonzisztencia miatt a fenti kódot nem lehetett jobban egyszerűsíteni, mivel más-más esetre más kodot kellett implementálnom, amit a py formátumban került használatra (src/01-data-preprocessing).

In [189]:
img = tf.io.read_file(f"{DATA_PATH}/{final_df['owner_image'].iloc[0]}/{final_df['clean_image_name'].iloc[0]}")
img = tf.image.decode_jpeg(img, channels=3)
img = tf.image.convert_image_dtype(img, tf.float32)
img = tf.image.resize(img, [256, 256])

W0000 00:00:1765459075.292359     646 gpu_device.cc:2342] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [200]:
final_df

Unnamed: 0,owner_image,clean_image_name,label
0,D6AE9F,sajat_reszvevo01_01.jpg,3_Szupinacio
1,D6AE9F,sajat_reszvevo01_02.jpg,3_Szupinacio
2,D6AE9F,sajat_reszvevo01_03.jpg,3_Szupinacio
3,D6AE9F,sajat_reszvevo02_01.jpg,2_Neutralis
4,D6AE9F,sajat_reszvevo03_01.jpg,3_Szupinacio
...,...,...,...
300,XV0M8Z,sajat_resztvevo13_02.jpg,2_Neutralis
301,XV0M8Z,sajat_resztvevo14_01.jpg,2_Neutralis
302,XV0M8Z,sajat_resztvevo14_02.jpg,2_Neutralis
303,XV0M8Z,sajat_resztvevo14_03.jpg,2_Neutralis


In [245]:
import tensorflow as tf
import os

def load_image(path, target_size=(256, 256)):
    # próbálkozás JPEG dekódolással
    try:
        img = tf.io.read_file(path)
        img = tf.image.decode_jpeg(img, channels=3)
    except tf.errors.InvalidArgumentError:
        # ha nem JPEG, próbáljuk PNG-vel
        try:
            img = tf.io.read_file(path)
            img = tf.image.decode_png(img, channels=3)
        except tf.errors.InvalidArgumentError:
            print(f"Nem sikerült betölteni a képet: {path}")
            return None
    
    img = tf.image.convert_image_dtype(img, tf.float32)
    img = tf.image.resize(img, target_size)
    return img

images = []
owners = []
names = []
labels = []
for index in range(len(final_df)):
    owner = final_df['owner_image'].iloc[index]
    img_name = final_df['clean_image_name'].iloc[index]
    label = final_df['label'].iloc[index]
    # próbáljuk .jpg-t és .jpeg-t is
    possible_paths = [
        os.path.join(DATA_PATH, owner, img_name),
        os.path.join(DATA_PATH, owner, img_name.replace('.jpg', '.jpeg')),
        os.path.join(DATA_PATH, owner, img_name.replace('.jpeg', '.jpg'))
    ]
    
    for path in possible_paths:
        if os.path.exists(path):
            img = load_image(path)
            if img is not None:
                images.append(img)
                owners.append(owner)
                names.append(img_name)
                labels.append(label)
            break
            
# Egységesítés
images_clean = []
for img in images:
    # Ha van extra dimenzió (pl. [1,256,256,3]), azt eltávolítjuk
    if len(img.shape) == 4 and img.shape[0] == 1:
        img = tf.squeeze(img, axis=0)
    images_clean.append(img)

Corrupt JPEG data: 2 extraneous bytes before marker 0xd7
Corrupt JPEG data: 3 extraneous bytes before marker 0xd7
Corrupt JPEG data: 3 extraneous bytes before marker 0xd0
Corrupt JPEG data: 5 extraneous bytes before marker 0xd7


Nem sikerült betölteni a képet: ../data/anklealign/anklealign/C6037J/internet_freepik_02.jpg
Nem sikerült betölteni a képet: ../data/anklealign/anklealign/C6037J/internet_hss.edu_01.jpg


2025-12-11 14:26:43.617179: I tensorflow/core/framework/local_rendezvous.cc:407] Local rendezvous is aborting with status: INVALID_ARGUMENT: Trying to decode BMP format using a wrong op. Use `decode_bmp` or `decode_image` instead. Op used: DecodeJpeg


In [225]:
owners_ = pd.DataFrame(owners)
names_ = pd.DataFrame(names)

In [None]:
dataset = tf.data.Dataset.from_tensor_slices((images_clean, labels, owners_+"/"+names_))

<_TensorSliceDataset element_spec=(TensorSpec(shape=(256, 256, 3), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(1,), dtype=tf.string, name=None))>

In [24]:
dataset = tf.data.Dataset.load(DATA_PATH+"/dataset")

W0000 00:00:1765473928.228261   90897 gpu_device.cc:2342] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [27]:
dataset

<_LoadDataset element_spec=(TensorSpec(shape=(256, 256, 3), dtype=tf.float32, name=None), TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(1,), dtype=tf.string, name=None))>