# Data Exploration of the COCO Dataset

In [1]:
import os
import json
import matplotlib.pyplot as plt
from PIL import Image


# Location of the COCO dataset relative to this notebook, and the JSON file
# holding the train2017 instance annotations.
data_dir = '../data/coco'
annotations_file = os.path.join(data_dir, 'annotations', 'instances_train2017.json')


# JSON text is UTF-8 by specification; pass the encoding explicitly so the
# load does not depend on the platform's locale default.
with open(annotations_file, encoding='utf-8') as f:
    annotations = json.load(f)


# Quick size summary of the loaded annotation file.
num_images = len(annotations['images'])
num_categories = len(annotations['categories'])
print(f'Number of images: {num_images}')
print(f'Number of categories: {num_categories}')

In [2]:

def display_sample_images(num_samples=5):
    """Show the first ``num_samples`` images listed in the annotations.

    Reads the module-level ``annotations`` and ``data_dir``; each image is
    opened from ``<data_dir>/train2017/<file_name>`` and displayed with
    matplotlib, titled with its image ID.

    Args:
        num_samples: Number of entries to take from the start of
            ``annotations['images']``.
    """
    for img in annotations['images'][:num_samples]:
        img_path = os.path.join(data_dir, 'train2017', img['file_name'])
        # The context manager closes the underlying file once the image has
        # been shown instead of leaving the handle open.
        with Image.open(img_path) as image:
            plt.imshow(image)
            plt.axis('off')
            # Double-quoted key: reusing the outer quote character inside an
            # f-string is a SyntaxError before Python 3.12.
            plt.title(f'Image ID: {img["id"]}')
            plt.show()


display_sample_images()

In [3]:

from collections import Counter

# Tally how many annotated instances belong to each category ID.
category_counter = Counter(
    entry['category_id'] for entry in annotations['annotations']
)

# Bar chart of instance counts, one bar per category ID.
category_ids = category_counter.keys()
instance_counts = category_counter.values()

plt.figure(figsize=(12, 6))
plt.bar(category_ids, instance_counts)
plt.xlabel('Category ID')
plt.ylabel('Number of Instances')
plt.title('Distribution of Categories in COCO Dataset')
plt.show()