In [8]:
import cv2
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup
import base64
import os
import time
import numpy as np
import random
import hashlib

# ---------------------------------------------------------------------------
# Scraper configuration
# ---------------------------------------------------------------------------

# Tesla models to search for; each name is also used as the sub-folder name
# under ``base_download_folder``.
models = ["Model_3", "Model_Y", "Model_X", "Model_S"]
num_of_images = 1000  # Number of images to collect per model
base_download_folder = "dataset"  # Base download folder

# YOLOv4 paths (relative to the working directory the script is run from)
weights_path = "../yolov4/yolov4.weights"
config_path = "../yolov4/yolov4.cfg"
coco_names_path = "../yolov4/coco.names"

# Load the YOLOv4 network and the class labels. Nothing below can work
# without them, so a failure here aborts the script.
try:
    net = cv2.dnn.readNet(weights_path, config_path)
    with open(coco_names_path, "r", encoding="utf-8") as f:
        # splitlines() plus a per-line strip also copes with CRLF files, where
        # splitting on "\n" alone would leave "car\r" and never match "car".
        # Interior blank lines are kept so list indices still match class ids.
        classes = [name.strip() for name in f.read().strip().splitlines()]
except Exception as e:
    print(f"Error loading YOLO files: {e}")
    # exit() is an interactive-shell helper and reports success (status 0);
    # raise SystemExit with a non-zero status so callers can see the failure.
    raise SystemExit(1)

# Label (as written in the names file) that an image must contain to be kept
target_class = "car"

# Query suffixes tried first for every model, and extra keywords used to build
# additional queries once the initial ones are exhausted
initial_queries = [
    "car", "exterior", "interior", "on road", "HD", "front view", "side view", "high quality", "back view"
]
extra_keywords = ["2024", "night view", "in city", "rural", "beach", "mountain", "sunset", "parked", "driveway"]

def initialize_webdriver():
    """Start Chrome with a custom user agent and open Google Images.

    Returns:
        The ready-to-use WebDriver instance, or ``None`` if the browser could
        not be started or the landing page could not be opened. In the failure
        case any browser that was already launched is shut down again.
    """
    options = webdriver.ChromeOptions()
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36")
    driver = None
    try:
        driver = webdriver.Chrome(options=options)  # Ensure ChromeDriver is installed
        driver.maximize_window()
        driver.get("https://images.google.com/")
        return driver
    except Exception as e:
        print(f"Error initializing WebDriver: {e}")
        # The browser may already be running if the failure happened after
        # webdriver.Chrome() returned; quit it so the process is not leaked.
        if driver is not None:
            try:
                driver.quit()
            except Exception as quit_error:
                print(f"Error closing WebDriver after failed initialization: {quit_error}")
        return None

def search_images(driver, model, query):
    """Submit ``query`` in the page's search box and scroll to load results.

    Args:
        driver: Selenium WebDriver currently showing a page with a ``q`` input.
        model: Model name the query belongs to (not used by this function).
        query: Text to type into the search box.

    Any exception raised while searching or scrolling is printed and
    swallowed, so the caller continues with whatever the page contains.
    """
    scroll_to_bottom = "window.scrollTo(0, document.body.scrollHeight);"
    scroll_rounds = 10  # Increased scrolls to maximize image load

    try:
        # Type the query followed by a newline to submit it
        query_field = driver.find_element(By.NAME, "q")
        query_field.clear()
        submitted_text = query + "\n"
        query_field.send_keys(submitted_text)

        # Random delay to mimic human interaction
        time.sleep(random.uniform(2, 4))

        # Keep jumping to the bottom of the page so more thumbnails load
        remaining = scroll_rounds
        while remaining > 0:
            driver.execute_script(scroll_to_bottom)
            time.sleep(random.uniform(2, 4))
            remaining -= 1

    except Exception as err:
        print(f"Error during search with query '{query}': {err}")

def contains_car(image_data):
    """Check whether the encoded image contains the target class using YOLOv4.

    Args:
        image_data: Raw encoded image bytes (e.g. JPEG/PNG file content).

    Returns:
        ``True`` if any detection's best-scoring class is ``target_class``
        with a score above 0.5; ``False`` otherwise, including when the data
        is empty or cannot be decoded as an image.
    """
    # An empty buffer makes cv2.imdecode raise instead of returning None, so
    # reject it up front (downloads can legitimately come back empty).
    if not image_data:
        return False

    image = cv2.imdecode(np.frombuffer(image_data, np.uint8), cv2.IMREAD_COLOR)
    if image is None:
        return False

    # blobFromImage rescales to the requested 416x416 network input itself,
    # so no separate resize step is needed.
    blob = cv2.dnn.blobFromImage(image, 1/255.0, (416, 416), swapRB=True, crop=False)
    net.setInput(blob)

    # Run the forward pass over all output layers and inspect each row:
    # columns 5+ hold the per-class scores.
    for layer_output in net.forward(net.getUnconnectedOutLayersNames()):
        scores = layer_output[:, 5:]
        if scores.size == 0:
            continue
        best_ids = scores.argmax(axis=1)
        best_scores = scores[np.arange(len(best_ids)), best_ids]
        for class_id in best_ids[best_scores > 0.5]:
            # Guard the lookup in case the names file has fewer labels than
            # the network has classes.
            if class_id < len(classes) and classes[class_id] == target_class:
                return True
    return False

def calculate_hash(image_data):
    """Return the hexadecimal MD5 digest of ``image_data``, used as a duplicate key."""
    digest = hashlib.md5()
    digest.update(image_data)
    return digest.hexdigest()

def save_resized_image(img_data, count, download_folder, hash_set):
    """Resize an image to 224x224 and save it unless it is a duplicate.

    Args:
        img_data: Raw encoded image bytes.
        count: Number used in the output file name (``img_<count>.jpg``).
        download_folder: Directory the file is written to.
        hash_set: Set of hashes of images saved so far; updated in place when
            the image is written successfully.

    Returns:
        ``True`` if the image was decoded, resized and written to disk;
        ``False`` if it is a duplicate, cannot be decoded, or cannot be saved.
    """
    # Calculate hash and check for duplicates
    img_hash = calculate_hash(img_data)
    if img_hash in hash_set:
        print("Duplicate image found, skipping.")
        return False

    try:
        # Decode and resize image
        image = cv2.imdecode(np.frombuffer(img_data, np.uint8), cv2.IMREAD_COLOR)
        if image is None:
            return False
        resized_image = cv2.resize(image, (224, 224))
        file_path = os.path.join(download_folder, f"img_{count}.jpg")
        # imwrite reports failure through its return value rather than by
        # raising, so check it before claiming success.
        if not cv2.imwrite(file_path, resized_image):
            print(f"Error saving image {count}: could not write {file_path}")
            return False
    except Exception as e:
        print(f"Error saving image {count}: {e}")
        return False

    # Only remember the hash once the file really exists on disk, so a failed
    # save does not mark the image as already collected.
    hash_set.add(img_hash)
    print(f"Downloaded and resized img_{count}.jpg containing a car")
    return True

# Seconds to wait for a single image download before giving up on it
IMAGE_REQUEST_TIMEOUT = 10


def _fetch_image_bytes(img_url):
    """Return the raw bytes behind an ``<img src>`` value, or ``None`` on failure.

    Handles both inline ``data:image`` URIs (base64 payload after the first
    comma) and regular URLs fetched over HTTP.
    """
    if img_url.startswith("data:image"):
        try:
            return base64.b64decode(img_url.split(",", 1)[1])
        except (IndexError, ValueError) as e:
            # IndexError: no comma in the URI; ValueError: invalid base64
            print(f"Error decoding inline image: {e}")
            return None

    try:
        # Without a timeout a single stalled server would hang the whole run
        response = requests.get(img_url, timeout=IMAGE_REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error downloading image: {e}")
        return None
    return response.content


# Main loop for each model
for model in models:
    download_folder = os.path.join(base_download_folder, model)
    os.makedirs(download_folder, exist_ok=True)  # Create model-specific download folder

    driver = initialize_webdriver()
    if not driver:
        print("WebDriver initialization failed. Exiting script for model:", model)
        continue

    try:
        hash_set = set()  # Track image hashes to avoid duplicate images
        successful_downloads = 0
        seen_urls = set()  # Track seen URLs to avoid duplicates

        # Start with initial queries
        queries = [f"Tesla {model} {variation}" for variation in initial_queries]
        query_idx = 0

        # Extra keywords in random order; each is used at most once so the
        # loop ends when the searches stop producing new images.
        unused_keywords = random.sample(extra_keywords, len(extra_keywords))

        while successful_downloads < num_of_images and query_idx < len(queries):
            query = queries[query_idx]
            search_images(driver, model, query)

            # Extract and process images
            soup = BeautifulSoup(driver.page_source, "html.parser")

            for img in soup.find_all("img"):
                if successful_downloads >= num_of_images:
                    break

                img_url = img.get("src")
                if not img_url or img_url in seen_urls:
                    continue
                seen_urls.add(img_url)

                img_data = _fetch_image_bytes(img_url)
                if not img_data:
                    # Download/decoding failed or the payload was empty
                    continue

                # Keep the image only if YOLO finds a car and it is not a duplicate
                if contains_car(img_data) and save_resized_image(
                    img_data, successful_downloads + 1, download_folder, hash_set
                ):
                    successful_downloads += 1

            # Move to the next query
            query_idx += 1

            # Generate a new query if needed and keywords are still available
            if successful_downloads < num_of_images and query_idx >= len(queries):
                if unused_keywords:
                    new_queries = [f"Tesla {model} {unused_keywords.pop()}"]
                    queries.extend(new_queries)
                    print(f"Generated new queries: {new_queries}")
                else:
                    print(f"No more queries for {model}; collected {successful_downloads} images.")

    finally:
        driver.quit()

Downloaded and resized img_1.jpg containing a car
Downloaded and resized img_2.jpg containing a car
Downloaded and resized img_3.jpg containing a car
Downloaded and resized img_4.jpg containing a car
Downloaded and resized img_5.jpg containing a car
Downloaded and resized img_6.jpg containing a car
Downloaded and resized img_7.jpg containing a car
Downloaded and resized img_8.jpg containing a car
Downloaded and resized img_9.jpg containing a car
Downloaded and resized img_10.jpg containing a car
Downloaded and resized img_11.jpg containing a car
Downloaded and resized img_12.jpg containing a car
Downloaded and resized img_13.jpg containing a car
Downloaded and resized img_14.jpg containing a car
Downloaded and resized img_15.jpg containing a car
Downloaded and resized img_16.jpg containing a car
Downloaded and resized img_17.jpg containing a car
Downloaded and resized img_18.jpg containing a car
Downloaded and resized img_19.jpg containing a car
Downloaded and resized img_20.jpg contai