In [3]:
import pandas as pd
import json
import numpy as np
from pathlib import Path

# Input locations — adjust them as needed for your environment
sandbox_dir = Path("sandbox")
annotations_path = sandbox_dir.joinpath("train_sample_annotations.csv")
labels_path = sandbox_dir.joinpath("label_descriptions.json")

# Echo the resolved paths so a wrong location is easy to spot.
# Note: this only prints them; it does not check that the files exist.
for description, location in (
    ("Annotations CSV:", annotations_path),
    ("Labels JSON:", labels_path),
):
    print(description, location)


Annotations CSV: sandbox/train_sample_annotations.csv
Labels JSON: sandbox/label_descriptions.json


In [None]:
# Load the annotations CSV (which includes columns like ImageId, AttributesIds, etc.)
ann_df = pd.read_csv(annotations_path)
print("Total annotation rows:", len(ann_df))

# Load the label descriptions. JSON text is UTF-8, so the encoding is passed
# explicitly instead of relying on the platform's locale default (which can
# mis-decode or reject non-ASCII label text on non-UTF-8 systems).
with open(labels_path, "r", encoding="utf-8") as f:
    label_data = json.load(f)

# The "attributes" entry is the list of attribute definitions; its length
# fixes the size of the per-image attribute vector built later.
attributes_list = label_data["attributes"]
num_attributes = len(attributes_list)
print("Number of attributes defined:", num_attributes)


Total annotation rows: 14765
Number of attributes defined: 294


In [5]:
# Give every attribute id a position in the attribute vector, following the
# order in which the attributes appear in the label file
attr_ids = [entry["id"] for entry in attributes_list]
attr_to_index = dict(zip(attr_ids, range(len(attr_ids))))
preview = list(attr_to_index.items())[:5]
print("Attribute mapping (first 5):", dict(preview))


Attribute mapping (first 5): {0: 0, 1: 1, 2: 2, 3: 3, 4: 4}


In [6]:
# Group the annotations by ImageId and create a binary vector for each image
def _attribute_ids(raw_value):
    """Yield the integer attribute ids encoded in one ``AttributesIds`` cell.

    Accepted cell values:
      * a comma-separated string such as ``"1, 5,12"`` (whitespace is ignored),
      * a plain integer,
      * a non-negative integral float such as ``12.0`` — what a single id looks
        like when the column was inferred as float because it also contains
        missing values,
      * a missing value (NaN), which yields nothing.

    Tokens that are not plain digit strings are skipped.
    """
    if isinstance(raw_value, (float, np.floating)):
        # NaN / inf carry no id; an integral float is exactly one id.
        if np.isfinite(raw_value) and raw_value >= 0 and float(raw_value).is_integer():
            yield int(raw_value)
        return
    for token in str(raw_value).split(","):
        token = token.strip()
        if token.isdigit():
            yield int(token)


training_data = []

# Group annotations by image id
grouped = ann_df.groupby("ImageId")

# Only the AttributesIds column is needed, so iterate over its values per image
# instead of materialising every row with iterrows() (slow, and it does not
# preserve dtypes across a row).
for img_id, raw_values in grouped["AttributesIds"]:
    vec = np.zeros(num_attributes, dtype=int)
    for raw_value in raw_values:
        for attr_id in _attribute_ids(raw_value):
            # Ids that are not defined in the label file are ignored
            if attr_id in attr_to_index:
                vec[attr_to_index[attr_id]] = 1
    training_data.append({"image_id": img_id, "attributes": vec.tolist()})

# Create a DataFrame; naming the columns keeps the schema stable even when
# there are no annotations at all
train_attr_df = pd.DataFrame(training_data, columns=["image_id", "attributes"])
print(train_attr_df.head())


                           image_id  \
0  0000fe7c9191fba733c8a69cfaf962b7   
1  001a66b16b12f12dc45e2bba40e04683   
2  00382465705798a714595f1d043a24e6   
3  003ad8a37d2190bd944a8968fb0906e2   
4  0040e5863c5e6197cd264509bc2fbb1c   

                                          attributes  
0  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  


In [7]:
# Persist one row per image: the image id plus its attribute list.
# The list column is written in its string form (e.g. "[1, 0, ...]"), so
# anything reading this CSV has to parse that column back into a list.
output_path = sandbox_dir.joinpath("train_attribute_data.csv")
train_attr_df.to_csv(path_or_buf=output_path, index=False)
print(f"Multi-label attribute training data saved to: {output_path}")


Multi-label attribute training data saved to: sandbox/train_attribute_data.csv
