In [2]:
# find flickr user's id
# Resolves the Flickr NSID for USERNAME with people.findByUsername and prints it.
import os

import flickrapi

# Credentials are taken from the environment when FLICKR_API_KEY /
# FLICKR_API_SECRET are set; the literals below are kept only as a fallback so
# the cell still runs unchanged.
# NOTE(review): the fallback values are stored in the notebook in plain text —
# rotate them and remove the literals once the environment variables are in use.
API_KEY = os.environ.get("FLICKR_API_KEY") or "90955b22fa72536d02e0da876d635760"
API_SECRET = os.environ.get("FLICKR_API_SECRET") or "6eefed0dc7a85c9d"
USERNAME = "pokeplushproject"

flickr = flickrapi.FlickrAPI(API_KEY, API_SECRET, format='parsed-json')
response = flickr.people.findByUsername(username=USERNAME)
# The NSID is read from response['user']['nsid'].
USER_ID = response['user']['nsid']

print("User NSID:", USER_ID)

User NSID: 23662195@N06


In [15]:
# --- Step 0: User chooses mode ---
# The menu is shown and the answer validated before Drive is mounted, so an
# invalid choice stops the cell immediately.
for menu_line in (
    "Choose mode:",
    "1: Download images from URL file only",
    "2: Check API for missing image URLs and download all",
):
    print(menu_line)

mode = input("Enter 1 or 2: ").strip()
if mode != "1" and mode != "2":
    raise ValueError("Invalid mode! Enter 1 or 2.")

# --- Step 1: Mount Google Drive ---
# All paths used further down live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

import os
import requests
import flickrapi
from concurrent.futures import ThreadPoolExecutor, as_completed
import shutil
from datetime import datetime
import time

# --- Step 2: Flickr API setup ---
# Credentials are taken from the environment when FLICKR_API_KEY /
# FLICKR_API_SECRET are set; the literals below are kept only as a fallback so
# the cell still runs unchanged.
# NOTE(review): the fallback values are stored in the notebook in plain text —
# rotate them and remove the literals once the environment variables are in use.
API_KEY = os.environ.get("FLICKR_API_KEY") or "90955b22fa72536d02e0da876d635760"
API_SECRET = os.environ.get("FLICKR_API_SECRET") or "6eefed0dc7a85c9d"
# NSID of the account whose albums and photostream are scanned.
USER_ID = "23662195@N06"

# Client configured to return parsed JSON (plain dicts/lists) for every call.
flickr = flickrapi.FlickrAPI(API_KEY, API_SECRET, format='parsed-json')

# --- Step 3: Paths and logging ---
# Root folder on the mounted Drive; album folders and per-photo folders are
# created beneath it, and it is created here if it does not exist yet.
BASE_DIR = "/content/drive/MyDrive/IRL Pokedex/training images (plushies)" # this is the destination folder for all downloads
os.makedirs(BASE_DIR, exist_ok=True)

# Text log appended to by log().
LOG_FILE = os.path.join(BASE_DIR, "flickr_downloads_log.txt")
# Tab-separated records (photo id, URL, target path) appended to by save_url()
# and read back in Step 4.
URL_FILE = os.path.join(BASE_DIR, "flickr_downloads_urls.txt")

api_call_count = 0  # global counter, incremented by api_call()

def log(message):
    """Print a timestamped message and append the same line to LOG_FILE.

    Args:
        message: Text to record; it is prefixed with the current local time
            formatted as ``[YYYY-MM-DD HH:MM:SS]``.
    """
    line = f"[{datetime.now().strftime('%Y-%m-%d %H:%M:%S')}] {message}"
    print(line)
    # Explicit UTF-8: log messages contain non-ASCII text (for example the
    # "✅" status line), which would fail to encode on a runtime whose default
    # text encoding is not UTF-8.
    with open(LOG_FILE, "a", encoding="utf-8") as f:
        f.write(line + "\n")

def save_url(photo_id, url, filepath):
    """Append one tab-separated record (photo id, URL, target path) to URL_FILE."""
    record = f"{photo_id}\t{url}\t{filepath}\n"
    with open(URL_FILE, "a") as url_log:
        url_log.write(record)

def safe_filename(name):
    """Return ``name`` with every character that is neither alphanumeric nor
    one of space, dot, underscore or hyphen replaced by an underscore."""
    allowed_punctuation = " ._-"
    cleaned = []
    for ch in name:
        keep = ch.isalnum() or ch in allowed_punctuation
        cleaned.append(ch if keep else "_")
    return "".join(cleaned)

def api_call(func, *args, **kwargs):
    """Invoke ``func(*args, **kwargs)``, then count, log and throttle.

    The global ``api_call_count`` is bumped and the log line written only
    after ``func`` has returned, so a call that raises is neither counted nor
    followed by the pause. A one-second sleep follows every successful call.

    Returns:
        Whatever ``func`` returned.
    """
    global api_call_count
    outcome = func(*args, **kwargs)
    api_call_count = api_call_count + 1
    log(f"API call #{api_call_count}: {func.__name__}")
    time.sleep(1)
    return outcome

def get_original_url(photo_id):
    """Return the source URL of the size labelled "Original" for a photo.

    Makes one throttled ``photos.getSizes`` request through ``api_call``.
    When no entry carries the label "Original", the source of the last entry
    in the returned size list is used instead.
    """
    available = api_call(flickr.photos.getSizes, photo_id=photo_id)['sizes']['size']
    original = next((entry for entry in available if entry['label'] == "Original"), None)
    if original is not None:
        return original['source']
    return available[-1]['source']

# --- Step 4: Load existing URL file ---
# Rebuild the photo id -> (url, filepath) index written by earlier runs.
# Lines that do not split into exactly three tab-separated fields are ignored;
# when an id occurs more than once, the last record wins.
existing_urls = {}
if os.path.exists(URL_FILE):
    with open(URL_FILE, "r") as url_file:
        for record in url_file:
            fields = record.strip().split("\t")
            if len(fields) != 3:
                continue
            recorded_id, recorded_url, recorded_path = fields
            existing_urls[recorded_id] = (recorded_url, recorded_path)

# (url, filepath) pairs still to be fetched; filled by the steps below.
download_jobs = []

# --- Step 5: Mode 2: Check API for missing URLs (both album/photostream) ---
# Walks every album, then the whole photostream, and resolves a download URL
# for each photo that is not yet recorded in URL_FILE. Every newly resolved
# photo is appended to URL_FILE and, when its target file is absent, queued in
# download_jobs.
if mode == "2":
    # Starts as the ids already present in the URL file and grows with every
    # album photo seen, so the photostream pass only handles photos that are
    # in no album and not yet recorded.
    album_photo_ids = set(existing_urls.keys())
    album_photo_count = 0
    photostream_photo_count = 0

    # NOTE(review): only a single getList response is read here; if the account
    # has more albums than Flickr returns in one page, the rest are not
    # scanned — confirm against the API documentation.
    photosets = api_call(flickr.photosets.getList, user_id=USER_ID)
    album_count = len(photosets['photosets']['photoset'])
    log(f"Found {album_count} albums.")

    for album in photosets['photosets']['photoset']:
        # Fall back to an id-based folder name when the sanitized title is empty.
        album_title = safe_filename(album['title']['_content']) or f"album_{album['id']}"
        album_dir = os.path.join(BASE_DIR, album_title)
        os.makedirs(album_dir, exist_ok=True)

        log(f"Scanning album: {album_title}")
        # photosets.getPhotos is paginated; request every page so albums larger
        # than one page are not silently truncated.
        album_page = 1
        while True:
            photos = api_call(
                flickr.photosets.getPhotos,
                photoset_id=album['id'],
                user_id=USER_ID,
                per_page=500,
                page=album_page,
            )

            for photo in photos['photoset']['photo']:
                photo_id = photo['id']
                album_photo_ids.add(photo_id)
                if photo_id not in existing_urls:
                    album_photo_count += 1
                    url = get_original_url(photo_id)
                    filename = f"{safe_filename(photo['title'])}_{photo_id}.jpg"
                    filepath = os.path.join(album_dir, filename)
                    if not os.path.exists(filepath):
                        download_jobs.append((url, filepath))
                    save_url(photo_id, url, filepath)

            # int() tolerates the page count arriving as a string; a missing
            # field is treated as a single page.
            if album_page >= int(photos['photoset'].get('pages', 1)):
                break
            album_page += 1

    log(f"Collected {album_photo_count} new photos from albums.")

    # Photostream
    log("Scanning photostream...")
    page = 1
    while True:
        photostream = api_call(flickr.people.getPhotos, user_id=USER_ID, per_page=500, page=page)
        photos = photostream['photos']['photo']

        if not photos:
            log(f"No photos returned on page {page}, stopping.")
            break

        # Process photos
        for photo in photos:
            photo_id = photo['id']
            # The second test is already implied by the first (album_photo_ids
            # was seeded from existing_urls); it is kept as an explicit guard.
            if photo_id not in album_photo_ids and photo_id not in existing_urls:
                photostream_photo_count += 1
                url = get_original_url(photo_id)
                # Photos outside any album each get their own folder.
                folder = os.path.join(BASE_DIR, f"photo_{photo_id}")
                os.makedirs(folder, exist_ok=True)
                filename = f"{safe_filename(photo['title'])}_{photo_id}.jpg"
                filepath = os.path.join(folder, filename)
                if not os.path.exists(filepath):
                    download_jobs.append((url, filepath))
                save_url(photo_id, url, filepath)

        if page >= photostream['photos']['pages']:
            log(f"Reached last page ({page}), stopping photostream scan.")
            break

        page += 1

    log(f"Collected {photostream_photo_count} new photos from photostream.")
    log(f"Total new photos to download: {len(download_jobs)}")

# --- Step 6: Mode 1 or resume: Download from URL file ---
# When nothing new was queued, fall back to the records loaded from the URL
# file and queue every entry whose target file is not on disk.
if mode in ("1", "2") and not download_jobs:
    log("No new downloads found. Checking URL file for existing downloads...")
    for photo_id, (url, filepath) in existing_urls.items():
        if os.path.exists(filepath):
            continue
        download_jobs.append((url, filepath))
    log(f"{len(download_jobs)} photos need downloading from URL file.")

# --- Step 7: Parallel Download ---
def download_file(job):
    """Download one ``(url, filepath)`` job and report the outcome as text.

    The body is streamed into ``<filepath>.part`` and renamed to ``filepath``
    only after the transfer finished, so an interrupted download never leaves
    a truncated file that the ``os.path.exists`` resume checks would mistake
    for a completed one.

    Args:
        job: Tuple ``(url, filepath)``.

    Returns:
        ``"Downloaded: <filepath>"`` on success, otherwise
        ``"Failed <filepath>: <error>"``. Exceptions are not propagated.
    """
    url, filepath = job
    partial_path = f"{filepath}.part"
    try:
        # Context manager releases the streamed connection even on errors.
        with requests.get(url, stream=True, timeout=30) as resp:
            resp.raise_for_status()
            os.makedirs(os.path.dirname(filepath), exist_ok=True)
            with open(partial_path, "wb") as f:
                for chunk in resp.iter_content(1024):
                    f.write(chunk)
        # Publish the finished file in one step.
        os.replace(partial_path, filepath)
        time.sleep(0.3)  # short pause to reduce 429 risk
        return f"Downloaded: {filepath}"
    except Exception as e:
        # Best-effort cleanup of the partial file; it may not exist at all.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return f"Failed {filepath}: {e}"

# Run the queued jobs on two worker threads and log each outcome as soon as
# its download finishes (completion order, not submission order).
if not download_jobs:
    log("✅ No photos to download.")
else:
    log("Starting downloads...")
    with ThreadPoolExecutor(max_workers=2) as pool:
        futures = [pool.submit(download_file, job) for job in download_jobs]
        for finished in as_completed(futures):
            log(finished.result())

Choose mode:
1: Download images from URL file only
2: Check API for missing image URLs and download all
Enter 1 or 2: 2
Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
[2025-08-24 22:50:29] Scanning photostream...
[2025-08-24 22:50:29] API call #1: getPhotos
[2025-08-24 22:50:31] API call #2: getPhotos
[2025-08-24 22:50:33] API call #3: getPhotos
[2025-08-24 22:50:34] API call #4: getPhotos
[2025-08-24 22:50:36] API call #5: getPhotos
[2025-08-24 22:50:37] API call #6: getPhotos
[2025-08-24 22:50:39] API call #7: getPhotos
[2025-08-24 22:50:40] API call #8: getPhotos
[2025-08-24 22:50:41] Reached last page (8), stopping photostream scan.
[2025-08-24 22:50:41] Collected 0 new photos from photostream.
[2025-08-24 22:50:41] Total new photos to download: 0
[2025-08-24 22:50:41] No new downloads found. Checking URL file for existing downloads...
[2025-08-24 22:50:42] 0 photos need downloading from URL file.
[20