#### Data collection

In [None]:
pip install bing_image_downloader

Defaulting to user installation because normal site-packages is not writeable
You should consider upgrading via the '/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip' command.
Note: you may need to restart the kernel to use updated packages.


In [2]:
import csv
import os
from bing_image_downloader import downloader

In [None]:
%%capture captured_logs
downloader.download("subterranean termite", # search term
                    limit=150, 
                    output_dir="subterranean termite", # folder name
                    force_replace=False)

In [5]:
# Parse the captured downloader output into invalid/valid URL collections,
# then rename the downloaded images and write a CSV that maps every renamed
# image to the URL it is paired with.


def _extract_url(line):
    """Return the text from the first "http" in `line` to its end, stripped.

    Returns None when `line` does not contain "http".
    """
    url_start = line.find("http")
    if url_start == -1:
        return None
    return line[url_start:].strip()


# Gets the logs outputs
logs = captured_logs.stdout.splitlines()

# Invalid urls: taken from the log lines that carry one of the two failure
# markers (e.g., invalid are the urls that couldn't be accessed).
invalid_urls = set()
for line in logs:
    if "[Error]Invalid image" in line or "Issue getting:" in line:
        invalid_url = _extract_url(line)
        if invalid_url is not None:
            invalid_urls.add(invalid_url)

# Valid urls: every other logged url, kept in the order it appears in the log.
valid_urls = []
for line in logs:
    clean_url = _extract_url(line)
    if clean_url is not None and clean_url not in invalid_urls:
        valid_urls.append(clean_url)

# Folder name
download_folder = "subterranean termite/subterranean termite"

# Ensure images correspond to urls in their extraction order
csv_file = "subterranean_termite_urls.csv"
if os.path.exists(download_folder):
    # Keep regular, non-hidden files only: entries such as ".DS_Store" or
    # sub-directories are not downloaded images and would shift the pairing.
    downloaded_files = [
        name
        for name in os.listdir(download_folder)
        if not name.startswith(".")
        and os.path.isfile(os.path.join(download_folder, name))
    ]
    # Order by modification time. Unlike getctime (last metadata change on
    # Unix), it is not altered when a file is renamed, so re-running this cell
    # keeps the same order. The name is a deterministic tie-breaker.
    # NOTE(review): assumes the files were written in the same order as the
    # urls were logged - confirm against the downloader.
    downloaded_files.sort(
        key=lambda name: (os.path.getmtime(os.path.join(download_folder, name)), name)
    )

    # zip() below stops at the shorter sequence; report a mismatch instead of
    # silently leaving files or urls out.
    if len(downloaded_files) != len(valid_urls):
        print(
            f"Warning: found {len(downloaded_files)} image files but "
            f"{len(valid_urls)} valid URLs; only the first "
            f"{min(len(downloaded_files), len(valid_urls))} pairs are processed."
        )

    # Work out every rename before touching the disk.
    rename_plan = []
    for index, (filename, url) in enumerate(zip(downloaded_files, valid_urls), start=1):
        file_extension = os.path.splitext(filename)[1]

        #  Image file name
        new_name = f"image_{index}_subterranean termite{file_extension}"
        rename_plan.append((filename, new_name, url))

    # os.rename can silently replace an existing target, so refuse to run if
    # any target name is already taken by a different file.
    collisions = [
        new_name
        for filename, new_name, _ in rename_plan
        if new_name != filename
        and os.path.exists(os.path.join(download_folder, new_name))
    ]
    if collisions:
        raise FileExistsError(
            f"Refusing to overwrite existing files in '{download_folder}': {collisions}"
        )

    # Explicit UTF-8 so urls with non-ASCII characters are written the same
    # way regardless of the platform's default encoding.
    with open(csv_file, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.writer(file)
        writer.writerow(["image file name", "URL"])

        for filename, new_name, url in rename_plan:
            if new_name != filename:  # already has its final name on a re-run
                original_path = os.path.join(download_folder, filename)
                new_path = os.path.join(download_folder, new_name)
                os.rename(original_path, new_path)
            writer.writerow([new_name, url])

    print(f"Renamed images and saved URLs to {csv_file}")
else:
    print(f"Download folder '{download_folder}' not found.")


Renamed images and saved URLs to subterranean_termite_urls.csv
