In [2]:
# Mount Google Drive into the Colab filesystem at /content/drive.
# NOTE(review): the paths configured in the visible cells live under /content,
# not /content/drive — confirm this mount is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import requests
import os
import time
from urllib.request import urlretrieve
import zipfile
import json

# Configuration for the Unsplash download run
CLIENT_ID = ""  # Unsplash access key; fill in before running
SEASON = "fall"  # search keyword, also used as the top-level output folder name
IMAGE_COUNT = dict(train=10_000, validation=2_500, test=2_500)  # images wanted per dataset part
PER_PAGE = 10  # results requested per search page
SECONDS_BETWEEN_REQUESTS = 1.2  # pause between page requests to stay under the rate limit
BASE_PATH = "/content/project"  # root directory the images are saved under
PROGRESS_FILE = "/content/download_progress.json"  # JSON file recording the next page per season/part

In [4]:
def get_unsplash_photos(keyword, client_id, page, per_page=PER_PAGE):
    """Return the 'regular'-size image URLs for one page of an Unsplash photo search.

    Args:
        keyword: Search term sent as the ``query`` parameter.
        client_id: Unsplash access key sent as the ``client_id`` parameter.
        page: Result page number to request.
        per_page: Number of results requested per page.

    Returns:
        A list of URL strings taken from ``urls.regular`` of each result, or
        an empty list when the response status is not 200 (a message is
        printed in that case).

    Raises:
        requests.exceptions.Timeout: If the server does not answer within
            the timeout.
    """
    url = "https://api.unsplash.com/search/photos"
    params = {"query": keyword, "client_id": client_id, "per_page": per_page, "page": page}
    request_timeout = 30  # seconds

    # Without a timeout, requests.get can block indefinitely on a stalled
    # connection and freeze the whole download loop.
    response = requests.get(url, params=params, timeout=request_timeout)
    if response.status_code != 200:
        print(f"Failed to fetch photos for {keyword}, page {page}: {response.status_code}")
        return []

    return [photo['urls']['regular'] for photo in response.json()['results']]
def download_and_save_images(image_urls, folder_path, image_name, start_index):
    """Download each URL into ``folder_path`` as ``<image_name>_<index>.jpg``.

    Args:
        image_urls: Iterable of image URLs to fetch.
        folder_path: Destination directory; created (with parents) if missing.
        image_name: Filename prefix for every saved image.
        start_index: Number given to the first image; later images count up
            from it.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(folder_path, exist_ok=True)
    for i, url in enumerate(image_urls, start=start_index):
        filename = f"{folder_path}/{image_name}_{i}.jpg"
        urlretrieve(url, filename)

def download_season_images(season, part, target_count, progress):
    """Download images for one season / dataset part until ``target_count`` is reached.

    Starts from the page stored in ``progress[season][part]`` (page 1 when
    absent) and, after every downloaded page, records the next page to fetch
    through ``save_progress``. Stops early when a page comes back empty.

    Args:
        season: Search keyword; also the first folder level under BASE_PATH
            and (capitalized) the filename prefix.
        part: Dataset part name, used as the second folder level.
        target_count: Number of images wanted for this part.
        progress: Mapping of ``{season: {part: next_page}}`` as returned by
            ``load_progress``.
    """
    # NOTE(review): every part starts from page 1 of the same query, so
    # different parts likely receive overlapping images — confirm intended.
    folder_name = os.path.join(BASE_PATH, season, part)
    current_page = progress.get(season, {}).get(part, 1)
    # Resume bookkeeping assumes each earlier page delivered PER_PAGE images.
    downloaded_count = (current_page - 1) * PER_PAGE

    while downloaded_count < target_count:
        image_urls = get_unsplash_photos(season, CLIENT_ID, current_page)
        if not image_urls:
            # An empty page means the request failed or the results ran out.
            # Stop without advancing the saved page so a rerun retries it,
            # instead of looping forever and skipping pages.
            print(f"No photos returned for {season}/{part} on page {current_page}; "
                  f"stopping at {downloaded_count} of {target_count}")
            break

        # Do not fetch more images than are still needed for this part.
        image_urls = image_urls[:target_count - downloaded_count]
        download_and_save_images(image_urls, folder_name, season.capitalize(), downloaded_count)
        downloaded_count += len(image_urls)
        # Report success only after the files have actually been saved.
        print(f"Successfully downloaded {downloaded_count} of {target_count} photos (page {current_page})")

        current_page += 1
        save_progress(season, part, current_page)
        if downloaded_count < target_count:
            time.sleep(SECONDS_BETWEEN_REQUESTS)  # Throttle requests

def save_progress(season, part, page):
    """Record ``page`` as the next page to fetch for ``season``/``part``.

    Reads the existing JSON in PROGRESS_FILE (if any), updates the single
    ``progress[season][part]`` entry, and writes the result back.

    Args:
        season: Top-level key in the progress mapping.
        part: Key under ``season``.
        page: Page number to store.
    """
    progress = {}
    if os.path.exists(PROGRESS_FILE):
        with open(PROGRESS_FILE, 'r') as file:
            progress = json.load(file)

    progress.setdefault(season, {})[part] = page

    # Write to a sibling temp file and swap it in, so an interrupted run never
    # leaves a truncated PROGRESS_FILE that later json.load calls choke on.
    temp_path = f"{PROGRESS_FILE}.tmp"
    with open(temp_path, 'w') as file:
        json.dump(progress, file)
    os.replace(temp_path, PROGRESS_FILE)

def load_progress():
    """Return the saved progress mapping, or an empty dict if none exists.

    The mapping is whatever JSON content PROGRESS_FILE holds.
    """
    if not os.path.exists(PROGRESS_FILE):
        return {}

    with open(PROGRESS_FILE, 'r') as handle:
        saved_state = json.load(handle)
    return saved_state

def zip_folders():
    """Archive every file under BASE_PATH into '/content/project.zip'.

    Entries are DEFLATE-compressed and stored under their path relative to
    BASE_PATH. An existing archive at that location is overwritten.
    """
    with zipfile.ZipFile('/content/project.zip', 'w', zipfile.ZIP_DEFLATED) as archive:
        for current_dir, _subdirs, filenames in os.walk(BASE_PATH):
            for name in filenames:
                source_path = os.path.join(current_dir, name)
                archive.write(source_path, os.path.relpath(source_path, BASE_PATH))

In [None]:
# Download every dataset part listed in IMAGE_COUNT for SEASON, resuming from
# any progress saved by an earlier run, then bundle the files into one zip.
progress = load_progress()
for part, count in IMAGE_COUNT.items():
    download_season_images(SEASON, part, count, progress)

zip_folders()
print("Zipping complete. Download '/content/project.zip'")

In [None]:
# Variant of the download loop that skips the 'train' part and fetches only
# the remaining parts, then rewrites the zip archive.
progress = load_progress()
for part, count in IMAGE_COUNT.items():
    if part != 'train':  # Skip the training data
        download_season_images(SEASON, part, count, progress)

zip_folders()
print("Zipping complete. Download '/content/project.zip'")