In [None]:
import requests
import zipfile
import os

# Create folder for dataset
os.makedirs("coco_dataset/annotations", exist_ok=True)

# COCO Annotations URL (Captions)
annotations_url = "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
annotations_path = "coco_dataset/annotations_trainval2017.zip"

# Download the annotations archive to disk.
print("Downloading COCO annotations...")
# timeout=(connect, read): without it a stalled connection would hang the cell forever.
# The context manager releases the streamed connection even if writing fails.
with requests.get(annotations_url, stream=True, timeout=(10, 60)) as response:
    # Fail here on 4xx/5xx instead of saving an error page that only blows up
    # later as a confusing BadZipFile during extraction.
    response.raise_for_status()
    with open(annotations_path, "wb") as file:
        # 1 MiB chunks: far fewer Python-level iterations than 1 KiB chunks.
        for chunk in response.iter_content(chunk_size=1024 * 1024):
            file.write(chunk)

# Extract annotations
print("Extracting annotations...")
with zipfile.ZipFile(annotations_path, "r") as zip_ref:
    zip_ref.extractall("coco_dataset/annotations")

# Remove zip file (only reached when download and extraction both succeeded)
os.remove(annotations_path)
print("COCO Annotations Downloaded & Extracted!")


Downloading COCO annotations...
Extracting annotations...
COCO Annotations Downloaded & Extracted!


In [8]:
import json
import random

# Load COCO captions JSON.
# Relative to the working directory, like the paths used by the download cell,
# which extracts the archive into "coco_dataset/annotations" (the archive itself
# contributes the second "annotations" folder). Forward slashes work on Windows too.
annotations_file = "coco_dataset/annotations/annotations/captions_train2017.json"

# JSON is UTF-8; do not rely on the platform's default text encoding.
with open(annotations_file, "r", encoding="utf-8") as f:
    coco_data = json.load(f)

# Select a limited number of images (lower this value for a quicker trial run).
num_images_to_download = 10000
available_images = coco_data["images"]

# random.sample raises ValueError when asked for more items than exist, so clamp.
sample_size = min(num_images_to_download, len(available_images))

# A dedicated, seeded generator makes the subset reproducible: re-running this
# cell (e.g. after an interrupted download) selects the same images again, and
# the global `random` state is left untouched.
sample_seed = 42
selected_images = random.Random(sample_seed).sample(available_images, sample_size)

# Map each selected image id to its file name
selected_image_ids = {img["id"]: img["file_name"] for img in selected_images}

print(f"Selected {len(selected_image_ids)} images for download.")


Selected 10000 images for download.


In [11]:
import requests
import os
from concurrent.futures import ThreadPoolExecutor

# Make sure the local target directory for the image subset is present
# before any download is attempted (no error if it already exists).
os.makedirs(name="coco_dataset/train2017_subset", exist_ok=True)

# Prefix under which the train2017 images are served; a file name is
# appended to it to form the full image URL.
coco_images_url = "http://images.cocodataset.org/train2017/"

def download_image(file_name):
    """Download a single image into ``coco_dataset/train2017_subset``.

    The image is streamed to a temporary ``.part`` file and only renamed to its
    final name once the transfer has completed, so a failed transfer never
    leaves a truncated file under the final image name.

    Args:
        file_name: Image file name; it is appended to ``coco_images_url`` to
            build the URL and used as the local file name.

    Returns:
        True if the image was downloaded and saved, False otherwise. Failures
        are reported on stdout rather than raised, so one bad image does not
        stop the other downloads.
    """
    image_url = coco_images_url + file_name
    image_path = os.path.join("coco_dataset/train2017_subset", file_name)
    partial_path = image_path + ".part"

    try:
        # The context manager returns the streamed connection to the pool
        # even when an error interrupts the transfer.
        with requests.get(image_url, stream=True, timeout=10) as response:
            response.raise_for_status()  # Raise an error for bad responses (4xx, 5xx)
            with open(partial_path, "wb") as img_file:
                for chunk in response.iter_content(chunk_size=8192):
                    img_file.write(chunk)
        # Publish the finished file under its real name in a single step.
        os.replace(partial_path, image_path)
    except (requests.exceptions.RequestException, OSError) as e:
        # Best-effort cleanup of the incomplete temporary file; a cleanup
        # failure must not mask the original error reported below.
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass
        print(f"❌ Failed to download {file_name}: {e}")
        return False

    print(f"✅ Downloaded: {file_name}")
    return True

# Use ThreadPoolExecutor for parallel downloads
num_threads = 10  # You can increase this to 20-30 depending on your internet speed

print("🚀 Starting parallel downloads...")
with ThreadPoolExecutor(max_workers=num_threads) as executor:
    # Keep every future so that exceptions raised inside worker threads can be
    # inspected afterwards instead of being silently discarded.
    download_jobs = [
        (file_name, executor.submit(download_image, file_name))
        for file_name in selected_image_ids.values()
    ]
# Leaving the `with` block waits for all submitted downloads to finish.

failed_files = []
for file_name, job in download_jobs:
    error = job.exception()
    if error is not None:
        # An exception that download_image did not handle itself.
        print(f"❌ Unexpected error for {file_name}: {error}")
        failed_files.append(file_name)
    elif not os.path.isfile(os.path.join("coco_dataset/train2017_subset", file_name)):
        # The download was reported as failed and no file was saved.
        failed_files.append(file_name)

# Only claim success when every selected image is actually on disk.
if failed_files:
    print(f"⚠️ {len(failed_files)} of {len(download_jobs)} images could not be downloaded.")
else:
    print("🎉 All selected images downloaded successfully!")


🚀 Starting parallel downloads...
✅ Downloaded: 000000426300.jpg
✅ Downloaded: 000000550572.jpg
✅ Downloaded: 000000433921.jpg
✅ Downloaded: 000000105904.jpg
✅ Downloaded: 000000378030.jpg
✅ Downloaded: 000000226076.jpg
✅ Downloaded: 000000229024.jpg
✅ Downloaded: 000000101575.jpg
✅ Downloaded: 000000255326.jpg
✅ Downloaded: 000000376407.jpg
✅ Downloaded: 000000315036.jpg
✅ Downloaded: 000000531883.jpg
✅ Downloaded: 000000444804.jpg
✅ Downloaded: 000000468337.jpg
✅ Downloaded: 000000008789.jpg
✅ Downloaded: 000000443397.jpg
✅ Downloaded: 000000218850.jpg
✅ Downloaded: 000000076518.jpg
✅ Downloaded: 000000096732.jpg
✅ Downloaded: 000000412094.jpg
✅ Downloaded: 000000568974.jpg
✅ Downloaded: 000000578108.jpg
✅ Downloaded: 000000364044.jpg
✅ Downloaded: 000000387369.jpg
✅ Downloaded: 000000553541.jpg
✅ Downloaded: 000000007455.jpg
✅ Downloaded: 000000122724.jpg
✅ Downloaded: 000000096997.jpg
✅ Downloaded: 000000544260.jpg
✅ Downloaded: 000000560908.jpg
✅ Downloaded: 000000267774.jpg
✅ Down

KeyboardInterrupt: 

✅ Downloaded: 000000357386.jpg


✅ Downloaded: 000000096854.jpg
