### Data collection

In [3]:
# FILE DESCRIPTION: -------------------------------------------------------

# This file scrapes images from the web via Bing. It includes two functions that capture the downloader's log as a
# string, separate invalid and valid URLs, rename the downloaded images in the order they were saved, and write
# [image_file_name, URL] rows to a CSV.

# --------------------------------------------------------------------------



# ----------- IMPORTS ----------------

import csv
import io
import os
import re
import sys
from contextlib import redirect_stdout

import numpy as np
import pandas as pd
from bing_image_downloader import downloader



# ----------- CONSTANTS ----------------

# Image download
IMAGE_LIMIT = 150  # number of images requested per query
TIMEOUT = 60  # download timeout (in seconds) passed to the downloader

# Search terms, in the order they are processed
_QUERY_TERMS = (
    "house centipede",
    "silverfish",
    "bedbug",
    "fleas",
    "ticks",
    "carpenter ant",
    "american house spider",
    "cellar spider",
    "brown stink bug",
    "rice weevil",
    "subterranean termite",
)

# Query dictionary: search term -> CSV file name ("<term with underscores>_urls.csv")
QUERIES = {
    term: f"{term.replace(' ', '_')}_urls.csv"
    for term in _QUERY_TERMS
}

# Directory structure
OUTPUT_DIR = "DATA/IMAGES"
CSV_DIR = "DATA/CSV"

# Both output folders are created up front; existing folders are left as they are
os.makedirs(OUTPUT_DIR, exist_ok=True)
os.makedirs(CSV_DIR, exist_ok=True)



def _natural_sort_key(name: str) -> list:
    """
    Splits a file name into text and integer chunks so that numbered names
    compare numerically (e.g. 'Image_2' sorts before 'Image_10')
    """
    return [
        int(chunk) if chunk.isdecimal() else chunk.lower()
        for chunk in re.split(r"(\d+)", name)
    ]


def process_download_logs(logs_str: str, download_folder: str, csv_file: str, rename_suffix: str) -> None:
    """
    Parses the Bing Image Downloader logs to extract valid URLs,
    renames images, and saves URLs to a CSV

    A URL is treated as invalid when it appears on a log line containing
    "[Error]Invalid image" or "Issue getting:"; every other URL found in the
    logs is treated as valid. Valid URLs are paired positionally with the
    regular files in the download folder (oldest first), each file is renamed
    to "image_<n>_<rename_suffix><ext>", and the [new file name, URL] pairs are
    written to the CSV. If the download folder is missing, an error is printed
    and nothing is renamed or written.

    Parameters:
        logs_str (str) - captured log as a string
        download_folder (str) - directory where images are downloaded
        csv_file (str) -  path to save CSV files
        rename_suffix (str) - suffix for renaming images

    Raises:
        TypeError - if any parameter is not a string
    """

    # Error check: checking for correct params (strings) before continuing
    if not all(isinstance(arg, str) for arg in (logs_str, download_folder, csv_file, rename_suffix)):
        raise TypeError("All parameters must be strings.")

    # Splits the logs into lines
    logs = logs_str.splitlines()

    # Identify invalid URLs in the logs to filter them out later
    invalid_urls = set()
    for line in logs:
        if "[Error]Invalid image" in line or "Issue getting:" in line:
            url_start = line.find("http")
            if url_start != -1:
                invalid_urls.add(line[url_start:].strip())

    # Every remaining URL (in log order) is considered a successful download
    valid_urls = []
    for line in logs:
        url_start = line.find("http")
        if url_start == -1:
            continue
        clean_url = line[url_start:].strip()
        if clean_url not in invalid_urls:
            valid_urls.append(clean_url)

    # Error check: the download folder must be an existing directory before continuing
    if not os.path.isdir(download_folder):
        print(f"[ERROR] Download folder '{download_folder}' not found.")
        return

    # Only regular files can be images; sub-directories must not consume a URL
    # or be renamed.
    candidate_files = [
        name for name in os.listdir(download_folder)
        if os.path.isfile(os.path.join(download_folder, name))
    ]

    # Sort files by creation time so earliest are considered first. Files that
    # share a timestamp fall back to natural name order, which keeps the
    # file/URL pairing deterministic.
    downloaded_files = sorted(
        candidate_files,
        key=lambda name: (
            os.path.getctime(os.path.join(download_folder, name)),
            _natural_sort_key(name),
        ),
    )

    # zip() below stops at the shorter sequence, so make a mismatch visible
    if len(downloaded_files) != len(valid_urls):
        print(
            f"[WARN] Found {len(downloaded_files)} file(s) but {len(valid_urls)} valid URL(s) "
            f"for '{download_folder}'; unmatched entries are skipped."
        )

    # Rename files in order and collect [image file name, URL] rows
    rows = []
    for index, (filename, url) in enumerate(zip(downloaded_files, valid_urls), start=1):
        file_extension = os.path.splitext(filename)[1]
        new_name = f"image_{index}_{rename_suffix}{file_extension}"

        os.rename(
            os.path.join(download_folder, filename),
            os.path.join(download_folder, new_name)
        )
        rows.append([new_name, url])

    # Build the dataframe once (instead of growing it row by row) and save it to CSV
    url_df = pd.DataFrame(rows, columns=["image file name", "URL"])
    url_df.to_csv(csv_file, index=False, encoding="utf-8")
    print(f"[INFO] Renamed images and saved URLs to '{csv_file}'.")


def download_images_and_capture_logs(query: str, limit: int = IMAGE_LIMIT, output_dir: str = OUTPUT_DIR,
                                     adult_filter_off: bool = True, force_replace: bool = False,
                                     timeout: int = TIMEOUT) -> str:
    """
    Runs the Bing Image Downloader for one query and returns everything it
    printed to standard output while running

    Parameters:
        query (str) -  search query
        limit (int) -  amount of images to download
        output_dir (str) -  directory to save images
        adult_filter_off (bool) -  forwarded to the downloader's adult_filter_off option
        force_replace (bool) -  forwarded to the downloader's force_replace option
        timeout (int) -  timeout value (in seconds) for downloading images

    Returns:
        (str) - captured logs from the downloader as a string

    Raises:
        TypeError - if query or output_dir is not a string, or limit is not an int
    """

    # Error check: only query, limit and output_dir are type-checked
    expected_types = ((query, str), (limit, int), (output_dir, str))
    if not all(isinstance(value, kind) for value, kind in expected_types):
        raise TypeError("Invalid parameter types.")

    # Options forwarded unchanged to the downloader
    download_options = {
        "query": query,
        "limit": limit,
        "output_dir": output_dir,
        "adult_filter_off": adult_filter_off,
        "force_replace": force_replace,
        "timeout": timeout,
    }

    # Anything the downloader prints while running is collected here
    # instead of reaching the console
    captured = io.StringIO()
    with redirect_stdout(captured):
        downloader.download(**download_options)

    return captured.getvalue()


def main():
    """
    Runs the whole pipeline for every entry in QUERIES: downloads the images,
    then renames them and exports their URLs to the query's CSV file
    """
    for search_term in QUERIES:
        print(f"\n=== Downloading {IMAGE_LIMIT} images of '{search_term}' ===")

        # Folder expected to hold this query's images, and the CSV that lists their URLs
        image_folder = os.path.join(OUTPUT_DIR, search_term)
        csv_path = os.path.join(CSV_DIR, QUERIES[search_term])

        # Download the images; the captured console output feeds the log parser
        captured_logs = download_images_and_capture_logs(query=search_term)

        # Parse the logs, rename the images, and save the URLs to the CSV file
        process_download_logs(
            logs_str=captured_logs,
            download_folder=image_folder,
            csv_file=csv_path,
            rename_suffix=search_term,
        )

    print("\nAll downloads and CSV exports completed successfully!")


    # --------------------------------------------------------------------------
    # TEST CASE / EXPECTED RESULTS when this script is run:

        # 11 CSV files in the CSV folder (path: DATA/CSV)
        # 11 class folders in the IMAGES folder (path: DATA/IMAGES)
        # 150 images per class in the IMAGES folder (path: DATA/IMAGES)

        # time to complete: around 16.5-22 minutes
    # --------------------------------------------------------------------------



if __name__ == "__main__":
    main()



=== Downloading 5 images of 'house centipede' ===
[INFO] Renamed images and saved URLs to 'DATA/CSV/house_centipede_urls.csv'.

=== Downloading 5 images of 'silverfish' ===
[INFO] Renamed images and saved URLs to 'DATA/CSV/silverfish_urls.csv'.

=== Downloading 5 images of 'bedbug' ===
[INFO] Renamed images and saved URLs to 'DATA/CSV/bedbug_urls.csv'.

=== Downloading 5 images of 'fleas' ===
[INFO] Renamed images and saved URLs to 'DATA/CSV/fleas_urls.csv'.

=== Downloading 5 images of 'ticks' ===
[INFO] Renamed images and saved URLs to 'DATA/CSV/ticks_urls.csv'.

=== Downloading 5 images of 'carpenter ant' ===
[INFO] Renamed images and saved URLs to 'DATA/CSV/carpenter_ant_urls.csv'.

=== Downloading 5 images of 'american house spider' ===
[INFO] Renamed images and saved URLs to 'DATA/CSV/american_house_spider_urls.csv'.

=== Downloading 5 images of 'cellar spider' ===
[INFO] Renamed images and saved URLs to 'DATA/CSV/cellar_spider_urls.csv'.

=== Downloading 5 images of 'brown stin