### Data collection

In [1]:
import os
import csv
import io
import sys
from bing_image_downloader import downloader
from contextlib import redirect_stdout

#
# HELPER FUNCTION
# ---------------
# Given the captured logs (as a single string), separates invalid and valid URLs,
# renames downloaded images in the order they were saved, and writes [image_file_name, URL] to a CSV.
#
def process_download_logs(
    logs_str,              # entire captured log as a string
    download_folder,       # e.g. "centipede/centipede"
    csv_file,              # e.g. "centipede_urls.csv"
    rename_suffix          # e.g. "centipede" for "image_1_centipede.jpg"
):
    """
    Pair downloaded images with their source URLs, rename them, and export a CSV.

    Every log line containing "http" contributes a URL (the text from "http"
    to the end of the line, stripped). URLs that appear on a failure line
    ("[Error]Invalid image" or "Issue getting:") are discarded everywhere.
    The remaining URLs, in log order, are matched one-to-one with the regular
    files in ``download_folder`` ordered by creation time. Each matched file
    is renamed to ``image_<n>_<rename_suffix><ext>`` and a
    ``[image file name, URL]`` row is written to ``csv_file``.

    Args:
        logs_str: Captured downloader output as one string.
        download_folder: Directory holding the downloaded images.
        csv_file: Path of the CSV file to (over)write.
        rename_suffix: Text placed after the running index in the new names.

    Returns:
        None. Progress and problems are reported with ``print``. Nothing is
        renamed or written when the folder is missing or when a rename would
        overwrite a file that is not itself being renamed.
    """
    failure_markers = ("[Error]Invalid image", "Issue getting:")

    # Single pass over the log: remember every URL in order of appearance and
    # collect the ones that were reported on a failure line.
    seen_urls = []
    invalid_urls = set()
    for line in logs_str.splitlines():
        url_start = line.find("http")
        if url_start == -1:
            continue
        url = line[url_start:].strip()
        seen_urls.append(url)
        if any(marker in line for marker in failure_markers):
            invalid_urls.add(url)
    valid_urls = [url for url in seen_urls if url not in invalid_urls]

    # Make sure the download folder exists
    if not os.path.isdir(download_folder):
        print(f"[ERROR] Download folder '{download_folder}' not found.")
        return

    def creation_order(name):
        # Creation time first. On a timestamp tie fall back to the name, with
        # shorter stems first so that "Image_2" sorts before "Image_10".
        stem = os.path.splitext(name)[0]
        created = os.path.getctime(os.path.join(download_folder, name))
        return (created, len(stem), stem)

    # Only regular files can be images; sub-directories are ignored.
    downloaded_files = sorted(
        (
            name
            for name in os.listdir(download_folder)
            if os.path.isfile(os.path.join(download_folder, name))
        ),
        key=creation_order,
    )

    if len(downloaded_files) != len(valid_urls):
        pair_count = min(len(downloaded_files), len(valid_urls))
        print(
            f"[WARN] Found {len(downloaded_files)} files but {len(valid_urls)} "
            f"valid URLs; only the first {pair_count} pairs are processed."
        )

    # Work out every rename up front: (current name, temporary name, final name, URL).
    plan = []
    for index, (filename, url) in enumerate(zip(downloaded_files, valid_urls), start=1):
        file_extension = os.path.splitext(filename)[1]
        # new file name: e.g. "image_1_centipede.jpg"
        new_name = f"image_{index}_{rename_suffix}{file_extension}"
        temp_name = f".rename_tmp_{os.getpid()}_{index}{file_extension}"
        plan.append((filename, temp_name, new_name, url))

    # Refuse to proceed if a final or temporary name is already taken by
    # something that is not part of the plan; renaming onto it would destroy it.
    planned_sources = {filename for filename, _, _, _ in plan}
    blocked = []
    for filename, temp_name, new_name, _ in plan:
        if filename == new_name:
            continue
        if new_name not in planned_sources and os.path.exists(
            os.path.join(download_folder, new_name)
        ):
            blocked.append(new_name)
        if os.path.exists(os.path.join(download_folder, temp_name)):
            blocked.append(temp_name)
    if blocked:
        print(
            f"[ERROR] Refusing to overwrite existing file(s) in "
            f"'{download_folder}': {', '.join(blocked)}"
        )
        return

    # Rename files and write CSV
    with open(csv_file, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["image file name", "URL"])

        # Stage 1: park every file that needs a new name under a temporary
        # name, so no final name is still occupied by a file awaiting its turn.
        for filename, temp_name, new_name, _ in plan:
            if filename != new_name:
                os.rename(
                    os.path.join(download_folder, filename),
                    os.path.join(download_folder, temp_name),
                )

        # Stage 2: move the parked files to their final names and record them.
        for filename, temp_name, new_name, url in plan:
            if filename != new_name:
                os.rename(
                    os.path.join(download_folder, temp_name),
                    os.path.join(download_folder, new_name),
                )
            writer.writerow([new_name, url])

    print(f"[INFO] Renamed images and saved URLs to '{csv_file}'.")


def download_images_and_capture_logs(query, limit, output_dir, adult_filter_off=True, force_replace=False, timeout=60):
    """
    Run a bing_image_downloader download and collect everything it prints.

    All arguments are forwarded unchanged, by keyword, to
    ``downloader.download``. While the download runs, stdout is redirected
    into an in-memory buffer.

    Returns:
        The text written to stdout during the download, as a single string.
    """
    download_options = {
        "query": query,
        "limit": limit,
        "output_dir": output_dir,
        "adult_filter_off": adult_filter_off,
        "force_replace": force_replace,
        "timeout": timeout,
    }

    # redirect_stdout yields the replacement stream, so the in-memory buffer
    # can be created and bound in one step.
    with redirect_stdout(io.StringIO()) as captured:
        downloader.download(**download_options)

    return captured.getvalue()


# ------------------- MAIN SCRIPT ------------------- #
if __name__ == "__main__":
    # Search terms to collect images for. Each term is also used as the
    # output directory, the CSV file prefix, and the suffix of renamed images.
    search_queries = ("house centipede", "silverfish", "bedbug")
    images_per_query = 150

    for position, query in enumerate(search_queries):
        # Sections are separated by a blank line, except before the first one.
        separator = "\n" if position else ""
        print(f"{separator}=== Downloading {images_per_query} images of '{query}' ===")

        captured_logs = download_images_and_capture_logs(
            query=query,
            limit=images_per_query,
            output_dir=query,  # main folder
            adult_filter_off=True,
            force_replace=False,
            timeout=60
        )

        # Bing Image Downloader typically creates "<output_dir>/<query>",
        # e.g. "house centipede/house centipede".
        process_download_logs(
            logs_str=captured_logs,
            download_folder=os.path.join(query, query),
            csv_file=f"{query}_urls.csv",
            rename_suffix=query
        )

    print("\nAll downloads and CSV exports completed successfully!")


=== Downloading 150 images of 'house centipede' ===
[INFO] Renamed images and saved URLs to 'house centipede_urls.csv'.

=== Downloading 150 images of 'silverfish' ===
[INFO] Renamed images and saved URLs to 'silverfish_urls.csv'.

=== Downloading 150 images of 'bedbug' ===
[INFO] Renamed images and saved URLs to 'bedbug_urls.csv'.

All downloads and CSV exports completed successfully!
