In [2]:
import os
import json
import pandas as pd
from datetime import datetime
import hopsworks

def parse_annotation_file(annotation_file, label, base_dir_images=""):
    """
    Parse a WIDER Face-style annotation file into one row per image.

    The file is read as a sequence of records, blank lines ignored::

        <image path>
        <number of bounding boxes N>
        <N lines of whitespace-separated integers, one line per box>

    A record that declares 0 boxes may be followed by a single all-numeric
    placeholder line (e.g. ``0 0 0 0 0 0 0 0 0 0``). That placeholder is
    skipped so it is not mistaken for the next image path.

    Args:
        annotation_file (str): path to the annotation text file.
        label (str): value stored in the ``label`` column of every row
            (e.g. 'train' or 'val').
        base_dir_images (str): directory that the image paths are joined onto
            before being made absolute. Defaults to "".

    Returns a DataFrame with columns:
      - file_path (str): absolute path to image file
      - file_timestamp (datetime or None): file modification time as returned
        by ``datetime.fromtimestamp`` (naive, local time); None if the file
        is not on disk
      - file_size_mb (float or None): size in MiB; None if the file is not on disk
      - num_bboxes (int): box count declared in the annotation file
      - bboxes (str): JSON-encoded list[list[int]]
      - label (str): the ``label`` argument
      - ingested_at (datetime): naive UTC time, identical for all rows of a call

    An annotation file without records yields an empty DataFrame that still
    has these columns.

    Raises:
        ValueError: if a box count or a box value is not an integer.
        IndexError: if the file ends directly after an image path.
    """
    # Local import: the file-level import only brings in `datetime`.
    from datetime import timezone

    columns = [
        "file_path",
        "file_timestamp",
        "file_size_mb",
        "num_bboxes",
        "bboxes",
        "label",
        "ingested_at",
    ]

    with open(annotation_file, "r") as f:
        # Keep only non-empty lines
        lines = [line.strip() for line in f if line.strip()]

    # Same naive-UTC value that datetime.utcnow() produced, without the
    # deprecated call. NOTE(review): file_timestamp below is local time, so the
    # two datetime columns use different clocks unless the host runs in UTC.
    now = datetime.now(timezone.utc).replace(tzinfo=None)

    rows = []
    i = 0
    while i < len(lines):
        filename = lines[i]
        num_bboxes = int(lines[i + 1])
        first_bbox = i + 2
        bbox_lines = lines[first_bbox : first_bbox + num_bboxes]
        consumed = 2 + num_bboxes

        # Images with no faces are still followed by one dummy bbox line.
        # Consume it here, otherwise it would be read as the next filename and
        # the following int() would fail on the real filename.
        if num_bboxes == 0 and first_bbox < len(lines):
            tokens = lines[first_bbox].split()
            if len(tokens) > 1 and all(t.lstrip("-").isdigit() for t in tokens):
                consumed += 1

        # Convert bbox lines into list[list[int]]
        bboxes_list = [list(map(int, l.split())) for l in bbox_lines]

        # Build full absolute path for the image file.
        # os.path.join drops base_dir_images when filename is already absolute.
        full_path = os.path.abspath(os.path.join(base_dir_images, filename))

        # File metadata (graceful if file doesn't exist on disk)
        if os.path.exists(full_path):
            size_mb = os.path.getsize(full_path) / (1024 * 1024)
            ts = datetime.fromtimestamp(os.path.getmtime(full_path))
        else:
            size_mb = None
            ts = None

        rows.append(
            {
                "file_path": full_path,
                "file_timestamp": ts,
                "file_size_mb": size_mb,
                "num_bboxes": num_bboxes,
                "bboxes": json.dumps(bboxes_list),  # Store as JSON string
                "label": label,
                "ingested_at": now,
            }
        )

        i += consumed

    # Passing the column list keeps the schema stable even when rows is empty.
    return pd.DataFrame(rows, columns=columns)


In [3]:
def build_dataset_and_write_to_hopsworks(
    df_train,
    df_val,
    feature_group_name="wider_face_files",
    feature_group_version=1,
    primary_key=("file_path",),
    event_time="file_timestamp",
    description="WIDER Face annotations with file metadata and bounding boxes (JSON).",
):
    """
    Concatenate already-parsed train/val frames and insert them into a
    Hopsworks Feature Group.

    Args:
        df_train (pd.DataFrame): rows for the training split.
        df_val (pd.DataFrame): rows for the validation split.
        feature_group_name (str): name passed to ``get_or_create_feature_group``.
        feature_group_version (int): version passed to ``get_or_create_feature_group``.
        primary_key (str or iterable of str): primary-key column name(s). A
            single column may be given as a plain string.
        event_time (str): name of the event-time column.
        description (str): feature group description.

    Returns:
        tuple: ``(df, fg)``, the concatenated DataFrame (index reset) and the
        feature group object the frame was inserted into.

    Side effects:
        Calls ``hopsworks.login()``, gets or creates the feature group with
        ``online_enabled=False`` and inserts ``df`` into it.
    """
    # A bare string is itself iterable: list("file_path") gives single
    # characters, not ["file_path"]. Note that ("file_path") without a trailing
    # comma is just such a string, so normalise before handing it over.
    if isinstance(primary_key, str):
        primary_key_cols = [primary_key]
    else:
        primary_key_cols = list(primary_key)

    df = pd.concat([df_train, df_val], ignore_index=True)

    project = hopsworks.login()
    fs = project.get_feature_store()

    fg = fs.get_or_create_feature_group(
        name=feature_group_name,
        version=feature_group_version,
        primary_key=primary_key_cols,
        event_time=event_time,
        description=description,
        online_enabled=False,
    )

    # NOTE(review): rows whose image was missing on disk carry a null
    # file_timestamp; confirm the feature store accepts nulls in the
    # event-time column.
    fg.insert(df)
    return df, fg


In [4]:
# Root directory of the WIDER Face data. Every path below is derived from it,
# so relocating the dataset only requires changing this one line.
DATA_ROOT = "/hopsfs/Jupyter/yolov8-face/data"

# Source annotation files
TRAIN_FILE = f"{DATA_ROOT}/wider_face_split/wider_face_train_bbx_gt.txt"
VAL_FILE = f"{DATA_ROOT}/wider_face_split/wider_face_val_bbx_gt.txt"

# Base directories where the actual image files live. The image paths from the
# annotation files are joined onto these so size/timestamp can be read from disk.
# Use an empty string if the paths in the annotation files are usable as-is.
TRAIN_DIR_IMAGES = f"{DATA_ROOT}/WIDER_train/images"
VAL_DIR_IMAGES = f"{DATA_ROOT}/WIDER_val/images"

df_train = parse_annotation_file(TRAIN_FILE, label="train", base_dir_images=TRAIN_DIR_IMAGES)
df_val = parse_annotation_file(VAL_FILE, label="val", base_dir_images=VAL_DIR_IMAGES)

df_train

Unnamed: 0,file_path,file_timestamp,file_size_mb,num_bboxes,bboxes,label,ingested_at
0,/hopsfs/Jupyter/yolov8-face/data/WIDER_train/i...,2025-08-21 08:01:40,0.116430,1,"[[467, 95, 112, 150, 0, 0, 0, 0, 0, 0]]",train,2025-08-25 15:59:58.722887
1,/hopsfs/Jupyter/yolov8-face/data/WIDER_train/i...,2025-08-21 08:01:39,0.093248,2,"[[222, 106, 70, 122, 0, 0, 0, 0, 0, 0], [646, ...",train,2025-08-25 15:59:58.722887
2,/hopsfs/Jupyter/yolov8-face/data/WIDER_train/i...,2025-08-21 08:01:38,0.046742,1,"[[336, 82, 112, 162, 0, 0, 0, 0, 0, 0]]",train,2025-08-25 15:59:58.722887
3,/hopsfs/Jupyter/yolov8-face/data/WIDER_train/i...,2025-08-21 08:01:49,0.091596,1,"[[271, 426, 237, 256, 0, 0, 0, 0, 0, 0]]",train,2025-08-25 15:59:58.722887
4,/hopsfs/Jupyter/yolov8-face/data/WIDER_train/i...,2025-08-21 08:01:49,0.185308,3,"[[364, 894, 87, 99, 1, 0, 1, 0, 0, 0], [545, 7...",train,2025-08-25 15:59:58.722887
...,...,...,...,...,...,...,...
2459,/hopsfs/Jupyter/yolov8-face/data/WIDER_train/i...,2025-08-21 08:06:46,0.087589,2,"[[530, 270, 112, 174, 0, 0, 0, 0, 0, 0], [638,...",train,2025-08-25 15:59:58.722887
2460,/hopsfs/Jupyter/yolov8-face/data/WIDER_train/i...,2025-08-21 08:07:08,0.083886,6,"[[642, 162, 182, 268, 0, 0, 0, 0, 0, 0], [305,...",train,2025-08-25 15:59:58.722887
2461,/hopsfs/Jupyter/yolov8-face/data/WIDER_train/i...,2025-08-21 08:07:00,0.152492,2,"[[172, 313, 337, 373, 0, 0, 0, 0, 1, 0], [404,...",train,2025-08-25 15:59:58.722887
2462,/hopsfs/Jupyter/yolov8-face/data/WIDER_train/i...,2025-08-21 08:07:02,0.053181,1,"[[416, 22, 326, 466, 0, 0, 0, 0, 0, 0]]",train,2025-08-25 15:59:58.722887


In [5]:
# The primary key is passed as a real list: ("file_path") without a trailing
# comma is just the string "file_path", not a sequence of column names.
df, fg = build_dataset_and_write_to_hopsworks(
    df_train,
    df_val,
    primary_key=["file_path"],
)

print(f"Wrote {len(df)} rows to Feature Group: {fg.name}, v{fg.version}")

# Last expression of the cell, so the frame is shown with the notebook's rich
# table display instead of a wrapped plain-text print.
df.head()


2025-08-21 21:51:25,019 INFO: Python Engine initialized.

Logged in to project, explore it here https://snurran.devnet.hops.works/p/120
Feature Group created successfully, explore it at 
https://snurran.devnet.hops.works/p/120/fs/67/fg/15


Uploading Dataframe: 100.00% |██████████| Rows 37941/37941 | Elapsed Time: 00:01 | Remaining Time: 00:00


Launching job: face_bboxes_1_offline_fg_materialization
Job started successfully, you can follow the progress at 
https://snurran.devnet.hops.works/p/120/jobs/named/face_bboxes_1_offline_fg_materialization/executions
Wrote 37941 rows to Feature Group: face_bboxes, v1
                                           file_path      file_timestamp  \
0  /hopsfs/Jupyter/yolov8-face/data/WIDER_train/i... 2025-08-21 08:01:40   
1  /hopsfs/Jupyter/yolov8-face/data/WIDER_train/i... 2025-08-21 08:01:39   
2  /hopsfs/Jupyter/yolov8-face/data/WIDER_train/i... 2025-08-21 08:01:39   
3  /hopsfs/Jupyter/yolov8-face/data/WIDER_train/i... 2025-08-21 08:01:38   
4  /hopsfs/Jupyter/yolov8-face/data/WIDER_train/i... 2025-08-21 08:01:49   

   file_size_mb  num_bboxes  bbox_index  \
0      0.116430           1           0   
1      0.093248           2           0   
2      0.093248           2           1   
3      0.046742           1           0   
4      0.091596           1           0   

                