# Data preparation for YOLOv5 model training from the preselected and preprocessed Open Images data

Load the previously processed data

In [1]:
import pandas as pd

# Forward slashes are accepted on Windows, Linux and macOS alike, so the
# notebook no longer depends on Windows-only "\\" separators.
df_train = pd.read_csv("yolo_model/sel_train_bb_10k.csv")
df_test = pd.read_csv("yolo_model/sel_test_bb_10k.csv")
df_validation = pd.read_csv("yolo_model/sel_validation_bb_10k.csv")

Check the data is complete

In [2]:
# (rows, columns) of the training split and the number of distinct images in it
(df_train.shape, df_train.ImageID.nunique())

((32989, 21), 7041)

In [3]:
# (rows, columns) of the test split and the number of distinct images in it
(df_test.shape, df_test.ImageID.nunique())

((1066, 21), 281)

In [4]:
# (rows, columns) of the validation split and the number of distinct images in it
(df_validation.shape, df_validation.ImageID.nunique())

((383, 21), 104)

## Label Mapping
YOLOv5 expects integer class IDs, so the label names are mapped to integers.

In [5]:
# Distinct label names present in the training split, in order of first appearance
selected_labels = pd.unique(df_train["LabelName"]).tolist()
len(selected_labels)

6

In [6]:
# Class ids are the positions of the label names in sorted order, so the
# mapping is deterministic regardless of the row order in the CSV.
label_map = {label: i for i, label in enumerate(sorted(selected_labels))}

for split_name, split_df in (
    ("train", df_train),
    ("test", df_test),
    ("validation", df_validation),
):
    # label_map is built from the training split only. Series.map would
    # silently produce NaN (and a float class_id column) for any label it does
    # not contain, so fail loudly before writing a corrupt class_id.
    unknown_labels = set(split_df["LabelName"]).difference(label_map)
    if unknown_labels:
        raise ValueError(
            f"{split_name} split contains labels missing from label_map: "
            f"{sorted(map(str, unknown_labels))}"
        )
    # split_df is the same object as df_train / df_test / df_validation, so
    # this adds the column to the original frame.
    split_df["class_id"] = split_df["LabelName"].map(label_map)

## YOLO annotations and file structure
Creation of the YOLO annotation file and organization of data in the proper directory structure

Copy the images to their respective split folders

In [7]:
import shutil
from pathlib import Path

def move_images(ids, src_folder, dst_folder):
    """Copy the ``<id>.jpg`` file of every given image ID into ``dst_folder``.

    Despite the name, files are copied with ``shutil.copy``; the source folder
    is left untouched. Images already present in the destination are not
    overwritten.

    Parameters
    ----------
    ids : iterable
        Image identifiers; each one is looked up as ``<id>.jpg`` in ``src_folder``.
    src_folder : str or Path
        Folder holding the source images.
    dst_folder : str or Path
        Destination folder; created (with parents) if it does not exist.

    Prints a summary of how many IDs were processed, copied and not found,
    plus how many were skipped because they already existed at the destination.
    """
    src = Path(src_folder)
    dst = Path(dst_folder)
    dst.mkdir(parents=True, exist_ok=True)

    moved = 0
    not_found = 0
    already_present = 0
    for img_id in ids:
        src_img = src / f"{img_id}.jpg"
        if not src_img.exists():
            print("\nImage", src_img, "is not in the source folder!")
            not_found += 1
            continue

        dest_img = dst / f"{img_id}.jpg"
        if dest_img.exists():
            # Previously these were skipped without being counted, which made
            # "Processed" undercount on every re-run of the cell.
            already_present += 1
            continue

        shutil.copy(src_img, dest_img)
        moved += 1

    processed = moved + not_found + already_present
    print("Processed: ", processed, "\nmoved: ", moved, "\nnot found: ", not_found)
    if already_present:
        print("already present: ", already_present)


In [8]:
# Distinct image IDs of each split, in order of first appearance
train_imgs_IDs = pd.unique(df_train["ImageID"]).tolist()
test_imgs_IDs = pd.unique(df_test["ImageID"]).tolist()
validation_imgs_IDs = pd.unique(df_validation["ImageID"]).tolist()

In [13]:
# Forward-slash paths work on every OS; the previous mix of "\\" and "/" only
# resolved correctly on Windows.
move_images(train_imgs_IDs, "yolo_model/data", "yolo_model/dataset/images/train")
move_images(test_imgs_IDs, "yolo_model/data", "yolo_model/dataset/images/test")
move_images(validation_imgs_IDs, "yolo_model/data", "yolo_model/dataset/images/val")

Processed:  7041 
moved:  7041 
not found:  0
Processed:  281 
moved:  281 
not found:  0
Processed:  104 
moved:  104 
not found:  0


Creation of label files for the splits

In [9]:
from pathlib import Path

def create_yolo_labels(df, labels_dir):
    """Write one YOLO-format label file per image in ``df``.

    For every distinct ``ImageID`` a file ``<ImageID>.txt`` is written to
    ``labels_dir`` with one line per bounding box:
    ``class_id x_center y_center width height``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``ImageID``, ``class_id``, ``XMin``, ``XMax``,
        ``YMin`` and ``YMax``. Coordinates are written as given, with no
        rescaling or clipping (assumes they are already normalised to [0, 1]
        as YOLO expects; confirm against the upstream processing).
    labels_dir : str or Path
        Output folder; created (with parents) if it does not exist.

    Raises
    ------
    ValueError
        If any ``class_id`` is missing, e.g. because a label was absent from
        the label map when the column was built.
    """
    labels_dir = Path(labels_dir)
    labels_dir.mkdir(parents=True, exist_ok=True)

    # Check before writing anything, so a bad frame cannot leave a
    # half-written label folder behind.
    if df["class_id"].isna().any():
        raise ValueError("class_id contains missing values; cannot write YOLO labels")

    box_columns = ["ImageID", "class_id", "XMin", "XMax", "YMin", "YMax"]
    for image_id, boxes in df[box_columns].groupby("ImageID"):
        lines = []

        # itertuples keeps each column's own dtype and is much faster than
        # iterrows, which builds a Series for every row.
        for box in boxes.itertuples(index=False):
            xc = (box.XMin + box.XMax) / 2
            yc = (box.YMin + box.YMax) / 2
            w = box.XMax - box.XMin
            h = box.YMax - box.YMin

            # int() guarantees an integer class index even when the column is
            # stored as float (previously written as e.g. "1.0").
            lines.append(f"{int(box.class_id)} {xc} {yc} {w} {h}")

        with open(labels_dir / f"{image_id}.txt", "w") as f:
            f.write("\n".join(lines))


In [15]:
# Forward-slash paths are portable across operating systems; "\\" separators
# only work on Windows.
create_yolo_labels(df_train, "yolo_model/dataset/labels/train")
create_yolo_labels(df_test, "yolo_model/dataset/labels/test")
create_yolo_labels(df_validation, "yolo_model/dataset/labels/val")

Conversion of the labels in the dataset to their display names

In [10]:
# Open Images class descriptions: maps each LabelName to its DisplayName.
# The forward-slash path works on every OS.
df_class_description = pd.read_csv("data/oidv7-class-descriptions-boxable.csv")

df_class_description.head()

Unnamed: 0,LabelName,DisplayName
0,/m/0mkg,Accordion
1,/m/03m3vtv,Adhesive tape
2,/m/0k5j,Aircraft
3,/m/046dlr,Alarm clock
4,/m/0pcr,Alpaca


In [11]:
# Keep only the description rows whose LabelName is one of the selected labels
selected_displayNames = df_class_description.loc[
    df_class_description["LabelName"].isin(selected_labels)
]
selected_displayNames

Unnamed: 0,LabelName,DisplayName
169,/m/01d40f,Dress
185,/m/0463sg,Fashion accessory
204,/m/09j5n,Footwear
218,/m/0jyfg,Glasses
283,/m/0fly7,Jeans
502,/m/01xyhv,Suit


In [19]:
# Example lookup: display name of a single label
selected_displayNames.loc[selected_displayNames["LabelName"] == '/m/01d40f', "DisplayName"].values[0]

'Dress'

In [20]:
# Display name of every label, in the same order as the keys of label_map
lis_disp_names = [
    selected_displayNames.loc[selected_displayNames["LabelName"] == label_name, "DisplayName"].values[0]
    for label_name in label_map
]
# class id -> display name
names = {class_id: display_name for class_id, display_name in zip(label_map.values(), lis_disp_names)}
names

{0: 'Dress',
 1: 'Suit',
 2: 'Fashion accessory',
 3: 'Footwear',
 4: 'Jeans',
 5: 'Glasses'}

Rename entries containing spaces so that they do not cause errors when passed to the models.

In [21]:
# Replace spaces with underscores in every class name rather than patching a
# hard-coded class index, which would rename the wrong class if the selected
# label set ever changes. For the current labels only "Fashion accessory" is
# affected, giving "Fashion_accessory" as before. Safe to re-run.
names = {class_id: display_name.replace(" ", "_") for class_id, display_name in names.items()}

Creation of the data.yaml file

In [23]:
import yaml
from pathlib import Path

# Derive the absolute dataset root from the relative folder used by the rest of
# the notebook, instead of a hard-coded user-specific path (which also
# contained a mis-encoded character). Assumes the notebook runs from the
# project root, as the other relative paths already require.
dataset_dir = Path("yolo_model/dataset").resolve()

data = {
    "path": str(dataset_dir),
    "train": "images/train",
    "val": "images/val",
    "test": "images/test",
    "names": names
}

with open(dataset_dir / "data.yaml", "w") as f:
    yaml.dump(data, f)
