In [None]:
import os
import json
from collections import Counter
import pandas as pd
from sklearn.model_selection import train_test_split

## Loading all 100k images

In [49]:
image_dir = "data/bdd100k_images_100k/100k"
label_dir = "data/bdd100k_labels/100k"

target_classes = {'car', 'traffic sign', 'traffic light', 'person'}

In [53]:
# testing on a single file

test_image = "data/bdd100k_labels/100k/test/cabc30fc-e7726578.json"

with open(test_image, 'r') as f:
            data = json.load(f)
            # BDD100K JSONs usually have a 'labels' list
            frames = data.get('frames')
            for frame in frames:
                for obj in frame.get('objects', []):
                    category = obj.get('category')
                    # print(category)
                    if category in target_classes:
                        print(category)

traffic sign
traffic sign
car
car
car
car
car
car
person


In [54]:
# Calculating statistics

stats = Counter()
label_path = os.path.join(label_dir, 'train')

for file in os.listdir(label_path):
    if file.endswith(".json"):
        with open(os.path.join(label_path, file), 'r') as f:
            data = json.load(f)
            frames = data.get('frames')

            for frame in frames:
                for obj in frame.get('objects', []):
                    category = obj.get('category')
                    if category in target_classes:
                        stats[category] += 1

display(stats)


Counter({'car': 102625,
         'traffic sign': 34478,
         'traffic light': 26922,
         'person': 13080})

In [None]:
objects_labels = []

# for file in os.listdir(label_path)[:1000]:
for file in os.listdir(label_path):
    if file.endswith(".json"):
        with open(os.path.join(label_path, file), 'r') as f:
            objects = []

            data = json.load(f)
            frames = data.get('frames')
            for frame in frames:
                for obj in frame.get('objects', []):
                    category = obj.get('category')
                    if category in target_classes:
                        if "box2d" in obj:
                            objects.append({
                                "category": category,
                                "box2d": obj["box2d"]
                            })
            objects_labels.append({
                "name": file,
                "labels": objects
            })


In [59]:
objects_labels

[{'name': '0000f77c-6257be58.json',
  'labels': [{'category': 'traffic light',
    'box2d': {'x1': 1125.902264,
     'y1': 133.184488,
     'x2': 1156.978645,
     'y2': 210.875445}},
   {'category': 'traffic light',
    'box2d': {'x1': 1156.978645,
     'y1': 136.637417,
     'x2': 1191.50796,
     'y2': 210.875443}},
   {'category': 'traffic sign',
    'box2d': {'x1': 1101.731743,
     'y1': 211.122087,
     'x2': 1170.79037,
     'y2': 233.566141}},
   {'category': 'traffic sign',
    'box2d': {'x1': 0, 'y1': 0.246631, 'x2': 100.381647, 'y2': 122.825696}},
   {'category': 'car',
    'box2d': {'x1': 45.240919,
     'y1': 254.530367,
     'x2': 357.805838,
     'y2': 487.906215}},
   {'category': 'car',
    'box2d': {'x1': 507.82755,
     'y1': 221.727518,
     'x2': 908.367588,
     'y2': 442.715126}},
   {'category': 'traffic sign',
    'box2d': {'x1': 0.156955,
     'y1': 0.809282,
     'x2': 102.417429,
     'y2': 133.411856}}]},
 {'name': '0000f77c-62c2a288.json',
  'labels': [{'

In [60]:
rows = []
for lbl in objects_labels:
    img = lbl["name"]
    for obj in lbl["labels"]:
        x1, y1, x2, y2 = obj["box2d"].values()
        rows.append([img, obj["category"], x1, y1, x2, y2])

df = pd.DataFrame(rows, columns=["image", "category", "x1", "y1", "x2", "y2"])

print("Total annotations:", len(df))

Total annotations: 1231818


In [91]:
df.head()

Unnamed: 0,image,category,x1,y1,x2,y2
0,0000f77c-6257be58.json,traffic light,1125.9,133.18,1156.98,210.88
1,0000f77c-6257be58.json,traffic light,1156.98,136.64,1191.51,210.88
2,0000f77c-6257be58.json,traffic sign,1101.73,211.12,1170.79,233.57
3,0000f77c-6257be58.json,traffic sign,0.0,0.25,100.38,122.83
4,0000f77c-6257be58.json,car,45.24,254.53,357.81,487.91


In [62]:
# Objects per category
objs_per_cat = df["category"].value_counts()
print("Objects per category:")
print(objs_per_cat)

Objects per category:
category
car              714121
traffic sign     239961
traffic light    186301
person            91435
Name: count, dtype: int64


## Limiting to 10k

### Random Selection

In [100]:
unique_images = df['image'].unique()
print(len(unique_images))

# Randomly select 10,000 unique images
random_10k_images, _ = train_test_split(unique_images, train_size=10000, random_state=42)

# df -> train+val (9k) / test(1k)
train_val_imgs, test_imgs = train_test_split(random_10k_images, train_size=9000, random_state=42)
# train+val (9k) -> train (7k) and Val (2k)
train_imgs, val_imgs = train_test_split(train_val_imgs, train_size=7000, random_state=42)

train_df_random = df[df['image'].isin(train_imgs)]
val_df_random = df[df['image'].isin(val_imgs)]
test_df_random = df[df['image'].isin(test_imgs)]

print(f"Train: {len(train_df_random)}, Val: {len(val_df_random)}, Test: {len(test_df_random)}")

train_df_random['image'] = train_df_random['image'].str.replace('.json', '', regex=False)
val_df_random['image'] = val_df_random['image'].str.replace('.json', '', regex=False)
test_df_random['image'] = test_df_random['image'].str.replace('.json', '', regex=False)

total_random_10k = pd.concat([train_df_random, val_df_random, test_df_random])

objs_per_cat_total_random_10k = total_random_10k["category"].value_counts()
print("Objects per category:")
print(objs_per_cat_total_random_10k)

unique_images = total_random_10k['image'].unique()
print(len(unique_images))


69989
Train: 122857, Val: 34529, Test: 17629
Objects per category:
category
car              101039
traffic sign      34201
traffic light     26648
person            13127
Name: count, dtype: int64
10000


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df_random['image'] = train_df_random['image'].str.replace('.json', '', regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  val_df_random['image'] = val_df_random['image'].str.replace('.json', '', regex=False)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df_random['image'] = tes

In [101]:
total_random_10k.head()

Unnamed: 0,image,category,x1,y1,x2,y2
13,0000f77c-cb820c98,car,581.94,301.36,641.32,350.35
14,0000f77c-cb820c98,car,697.73,299.88,722.97,331.05
15,0000f77c-cb820c98,car,715.55,302.85,768.99,344.41
16,0000f77c-cb820c98,car,760.08,292.45,864.0,375.59
17,0000f77c-cb820c98,car,801.65,299.88,896.66,393.4


### Selection by the class hierarcy (to include more samples of the rarest category)
Person (is the rarest category) > Traffic Light > Traffic Sign > Car (Common category).

In [None]:
# Aggregate all classes for each image
image_counts = df.groupby('image')['category'].apply(list).reset_index()

def get_stratify_group(categories):
    # Hierarchy: Person > Light > Sign > Car
    if 'person' in categories:
        return 'person'
    elif 'traffic light' in categories:
        return 'traffic light'
    elif 'traffic sign' in categories:
        return 'traffic sign'
    else:
        return 'car'

image_counts['stratify_group'] = image_counts['category'].apply(get_stratify_group)

# Select 10k images total
df_10k, _ = train_test_split(
    image_counts, 
    train_size=10000, 
    stratify=image_counts['stratify_group'], 
    random_state=42
)

# Split the 10k subset into Train/Val/Test 70/20/10
# df -> train+val (9k) / test(1k)
train_val_df, test_df = train_test_split(
    df_10k, 
    train_size=9000, 
    stratify=df_10k['stratify_group'], 
    random_state=42
)

# train+val (9k) -> train (7k) and Val (2k)
train_df, val_df = train_test_split(
    train_val_df, 
    train_size=7000, 
    stratify=train_val_df['stratify_group'], 
    random_state=42
)

print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")


In [None]:
# Filter our original df to keep only these images
final_train = df[df['image'].isin(train_df['image'])].copy()
final_val = df[df['image'].isin(val_df['image'])].copy()
final_test = df[df['image'].isin(test_df['image'])].copy()

total_10k = pd.concat([final_train, final_val, final_test])

objs_per_cat_total_10k = total_10k["category"].value_counts()
print("Objects per category:")
print(objs_per_cat_total_10k)

Objects per category:
category
car              102589
traffic sign      34319
traffic light     26695
person            13151
Name: count, dtype: int64


## Saving annotations

In [None]:
# final_train.to_csv("train_annotations.csv", index=False)
# final_val.to_csv("val_annotations.csv", index=False)
# final_test.to_csv("test_annotations.csv", index=False)

In [102]:
train_df_random.to_csv("data/labels/train_labels.csv", index=False)
val_df_random.to_csv("data/labels/val_labels.csv", index=False)
test_df_random.to_csv("data/labels/test_labels.csv", index=False)