In [4]:
#Imports
import os
import pandas as pd
import random
from pycocotools.coco import COCO
from sklearn.model_selection import train_test_split

In [2]:
# Dataset locations: COCO image folders and the annotations directory
TRAIN_PATH = "D:/Download/JDownloader/MSCOCO/images/train2017"
VAL_PATH = "D:/Download/JDownloader/MSCOCO/images/val2017"
ANNOTATIONS_PATH = "D:/Download/JDownloader/MSCOCO/annotations"

# Project directories: WORKING_DIR receives the generated CSV files
WORKING_DIR = "D:/Projetos/Mestrado/2024_Topicos_Esp_Sist_Informacao/ARTIGO_FINAL/object_detection_model_compare/working"
VAL_MODEL_IMG = "D:/Projetos/Mestrado/2024_Topicos_Esp_Sist_Informacao/ARTIGO_FINAL/object_detection_model_compare/val_model_img"

# COCO category names kept when filtering the annotations
FILTERED_CATEGORIES = ["person", "cat", "dog"]

In [None]:
# Load COCO annotations (train2017 instances) and build the pycocotools index
annotations_file = os.path.join(ANNOTATIONS_PATH, 'instances_train2017.json')
coco = COCO(annotations_file)

# Get category IDs for the selected categories
category_ids = coco.getCatIds(catNms=FILTERED_CATEGORIES)

# Fail fast on unknown or misspelled names: getCatIds only returns the IDs it
# could match, so without this check later cells would silently run with
# fewer classes than requested.
found_category_names = {category['name'] for category in coco.loadCats(category_ids)}
missing_category_names = [
    name for name in FILTERED_CATEGORIES if name not in found_category_names
]
if missing_category_names:
    raise ValueError(
        f"Categories not found in {annotations_file}: {missing_category_names}"
    )

# Display the resolved IDs as the cell output
category_ids

loading annotations into memory...
Done (t=8.89s)
creating index...
index created!


[1, 17, 18]

In [5]:
# Get total image count per category
category_image_counts = {}

# Take each name from the category record that belongs to the ID instead of
# pairing FILTERED_CATEGORIES with category_ids by position: getCatIds returns
# IDs in the annotation file's order, which is not guaranteed to match the
# order (or length) of the requested names.
for category_info in coco.loadCats(category_ids):
    category_id = category_info['id']
    category_name = category_info['name']

    # Get all annotation IDs for the category
    ann_ids = coco.getAnnIds(catIds=[category_id])

    # Load annotations and extract unique image IDs
    anns = coco.loadAnns(ann_ids)
    image_ids = {ann['image_id'] for ann in anns}  # Use a set to ensure uniqueness

    # Count unique images (an image with several annotations is counted once)
    category_image_counts[category_name] = len(image_ids)

# Print results
print("Total image count per category:")
for category, count in category_image_counts.items():
    print(f"{category}: {count}")

Total image count per category:
person: 64115
cat: 4114
dog: 4385


In [6]:
# Generate filtered dataset to train the model

# Sampling settings. The seed is fixed so the same annotations are drawn on
# every run, matching the seeded train/test split performed on this CSV.
TRAIN_SAMPLES_PER_CATEGORY = 1000
TRAIN_SAMPLING_SEED = 42
# Private generator: keeps the draw reproducible without touching the global
# `random` state used elsewhere.
train_sampling_rng = random.Random(TRAIN_SAMPLING_SEED)

# Map category IDs to category names
categories = coco.loadCats(category_ids)
category_id_to_name = {category['id']: category['name'] for category in categories}

# Collect up to TRAIN_SAMPLES_PER_CATEGORY annotations per category.
# Each row is one annotation (one bounding box), not one image, so the same
# image can appear in several rows.
filtered_data = []
for category_id in category_ids:
    ann_ids = coco.getAnnIds(catIds=[category_id])
    anns = coco.loadAnns(ann_ids)
    # min() guards against categories with fewer annotations than requested
    selected_anns = train_sampling_rng.sample(
        anns, min(TRAIN_SAMPLES_PER_CATEGORY, len(anns))
    )
    for ann in selected_anns:
        image_info = coco.loadImgs(ann['image_id'])[0]
        filtered_data.append({
            "image_id": ann['image_id'],
            "image": image_info['file_name'],
            "category_id": ann['category_id'],
            "bbox": ann['bbox'],
            "label": category_id_to_name[ann['category_id']],
        })

# Save filtered data to CSV (create the working directory on first run)
os.makedirs(WORKING_DIR, exist_ok=True)
filtered_csv_path = os.path.join(WORKING_DIR, 'filtered_coco.csv')
filtered_df = pd.DataFrame(filtered_data)
filtered_df.to_csv(filtered_csv_path, index=False)

print(f"Filtered dataset saved to {os.path.abspath(filtered_csv_path)}")

Filtered dataset saved to D:\Projetos\Mestrado\2024_Topicos_Esp_Sist_Informacao\ARTIGO_FINAL\object_detection_model_compare\working\filtered_coco.csv


In [7]:
# Split the filtered dataset
# Read back the CSV produced by the filtering step
data = pd.read_csv(filtered_csv_path)


def _print_category_counts(heading, counts):
    """Print a heading, the per-category counts, and a trailing blank line."""
    print(heading)
    print(counts)
    print("")


# Rows per category_id before splitting
category_counts = data['category_id'].value_counts()
_print_category_counts("Record count per category_id:", category_counts)

# 80/20 train/test split with a fixed seed
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Rows per category_id in each resulting split
category_train_counts = train_data['category_id'].value_counts()
_print_category_counts("Train - Record count per category_id:", category_train_counts)

category_test_counts = test_data['category_id'].value_counts()
_print_category_counts("Test - Record count per category_id:", category_test_counts)

Record count per category_id:
category_id
1     1000
17    1000
18    1000
Name: count, dtype: int64

Train - Record count per category_id:
category_id
18    814
17    803
1     783
Name: count, dtype: int64

Test - Record count per category_id:
category_id
1     217
17    197
18    186
Name: count, dtype: int64



In [8]:
# Save the split datasets
# Destination files inside the working directory
train_csv_path, test_csv_path = (
    os.path.join(WORKING_DIR, csv_name)
    for csv_name in ('train_data.csv', 'test_data.csv')
)

# Write each split without the DataFrame index column
for split_frame, split_path in ((train_data, train_csv_path), (test_data, test_csv_path)):
    split_frame.to_csv(split_path, index=False)

# Report where the files were written
print(f"Training dataset saved to {os.path.abspath(train_csv_path)}")
print(f"Testing dataset saved to {os.path.abspath(test_csv_path)}")

Training dataset saved to D:\Projetos\Mestrado\2024_Topicos_Esp_Sist_Informacao\ARTIGO_FINAL\object_detection_model_compare\working\train_data.csv
Testing dataset saved to D:\Projetos\Mestrado\2024_Topicos_Esp_Sist_Informacao\ARTIGO_FINAL\object_detection_model_compare\working\test_data.csv


In [9]:
# Load split datasets
# Rebuild the CSV locations from the working directory
train_csv_path = os.path.join(WORKING_DIR, 'train_data.csv')
test_csv_path = os.path.join(WORKING_DIR, 'test_data.csv')

# Read both splits back from disk, train first and then test
train_data, test_data = (
    pd.read_csv(split_csv) for split_csv in (train_csv_path, test_csv_path)
)

In [10]:
# Generate the validation dataset

# Sampling settings. The seed is fixed so the same validation annotations are
# drawn on every run.
VAL_SAMPLES_PER_CATEGORY = 200
VAL_SAMPLING_SEED = 42
# Private generator: keeps the draw reproducible without touching the global
# `random` state used elsewhere.
val_sampling_rng = random.Random(VAL_SAMPLING_SEED)

# Load COCO annotations from validation set
val_annotations_file = os.path.join(ANNOTATIONS_PATH, 'instances_val2017.json')
coco_val = COCO(val_annotations_file)

# Collect up to VAL_SAMPLES_PER_CATEGORY annotations per category.
# Each row is one annotation (one bounding box), not one image.
val_filtered_data = []
for category_id in category_ids:
    ann_ids = coco_val.getAnnIds(catIds=[category_id])
    anns = coco_val.loadAnns(ann_ids)
    # min() guards against categories with fewer annotations than requested
    selected_anns = val_sampling_rng.sample(
        anns, min(VAL_SAMPLES_PER_CATEGORY, len(anns))
    )
    for ann in selected_anns:
        image_info = coco_val.loadImgs(ann['image_id'])[0]
        val_filtered_data.append({
            "image_id": ann['image_id'],
            "image": image_info['file_name'],
            "category_id": ann['category_id'],
            "bbox": ann['bbox'],
            "label": category_id_to_name[ann['category_id']],
        })

# Save filtered data to CSV (create the working directory if it is missing)
os.makedirs(WORKING_DIR, exist_ok=True)
val_filtered_csv_path = os.path.join(WORKING_DIR, 'val_filtered_coco.csv')
val_filtered_df = pd.DataFrame(val_filtered_data)
val_filtered_df.to_csv(val_filtered_csv_path, index=False)

print(f"Validation Filtered dataset saved to {os.path.abspath(val_filtered_csv_path)}")


# Load the filtered CSV
val_data = pd.read_csv(val_filtered_csv_path)

# Display record count per category_id to validate
val_category_counts = val_data['category_id'].value_counts()
print("Record count per category_id:")
print(val_category_counts)
print("")

loading annotations into memory...
Done (t=0.39s)
creating index...
index created!
Validation Filtered dataset saved to D:\Projetos\Mestrado\2024_Topicos_Esp_Sist_Informacao\ARTIGO_FINAL\object_detection_model_compare\working\val_filtered_coco.csv
Record count per category_id:
category_id
1     200
17    200
18    200
Name: count, dtype: int64

