In [1]:
import os
import requests
from bs4 import BeautifulSoup
import urllib.parse
import time

def download_images(query, num_images, output_dir, timeout=10, page_delay=5):
    """Download up to ``num_images`` images from Google Image search results.

    Result pages are requested one after another (20 results per page via the
    ``start`` URL parameter). Every ``<img>`` tag after the first one whose
    ``src``/``data-src`` is an absolute ``http`` URL is fetched and written to
    ``output_dir`` as ``<url-encoded query>_<index>.jpg``.

    Args:
        query: Plain-text search term, e.g. ``"Bengal Cat"``.
        num_images: Maximum number of images to save.
        output_dir: Destination directory; created if it does not exist.
        timeout: Seconds to wait on each HTTP request before giving up.
        page_delay: Seconds to pause between result pages.

    Returns:
        The number of images actually written to ``output_dir``. This can be
        lower than ``num_images`` when the search runs out of usable results.
    """
    # Create the directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)

    # Set the headers to mimic a browser visit
    headers = {
        "User-Agent": "Safari/537.36"
    }

    # URL-encode the query under a separate name so the caller's argument is
    # not overwritten. The encoded form is also the file-name prefix (spaces
    # become %20), which keeps names identical to files saved by earlier runs.
    encoded_query = urllib.parse.quote(query)

    downloaded = 0
    page = 0

    while downloaded < num_images:
        # Construct the Google Image search URL with pagination
        search_url = f"https://www.google.com/search?q={encoded_query}&tbm=isch&start={page*20}"

        # Get the HTML content of the search page; the timeout prevents a
        # stalled connection from hanging the notebook cell indefinitely.
        response = requests.get(search_url, headers=headers, timeout=timeout)
        if not response.ok:
            # An error page (rate limit, block, ...) carries no usable results
            print(f"Search page {page} could not be fetched; stopping.")
            break
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all image elements
        img_tags = soup.find_all('img')[1:]  # Skip the first image which is usually the Google logo

        if not img_tags:
            break

        saved_on_page = 0
        for img in img_tags:
            if downloaded >= num_images:
                break
            try:
                img_url = img.get('src') or img.get('data-src')
                if not img_url or not img_url.startswith('http'):
                    continue
                img_response = requests.get(img_url, headers=headers, timeout=timeout)
                # Never store an HTTP error body on disk as if it were an image
                img_response.raise_for_status()
                file_name = f"{encoded_query}_{downloaded}.jpg"
                with open(os.path.join(output_dir, file_name), 'wb') as handler:
                    handler.write(img_response.content)
                print(f"Downloaded {file_name}")
                downloaded += 1
                saved_on_page += 1
            except Exception as e:
                # Best effort: report the failure and move on to the next image
                print(f"Could not download image {downloaded}: {e}")

        if saved_on_page == 0:
            # The page had <img> tags but none could be saved. Without this
            # guard the loop would keep requesting pages forever.
            break

        # Increase page count to move to the next set of images
        page += 1
        if downloaded < num_images:
            time.sleep(page_delay)  # Pause to avoid overwhelming the server

    return downloaded

# Run the crawl once per breed: (search query, destination folder) pairs,
# processed in the order listed, 1000 images requested for each.
breed_targets = [
    ("Domestic Cat", "data/Domestic Cat"),
    ("Siberian Cat", "data/Siberian Cat"),
    ("Bengal Cat", "data/Bengal Cat"),
    ("Siamese Cat", "data/Siamese Cat"),
    ("Maine Coon Cat", "data/Maine Coon"),
    ("Scottish Fold Cat", "data/Scottish Fold"),
    ("British Shorthair Cat", "data/British Shorthair"),
    ("Persian Cat", "data/Persian Cat"),
    ("Ragdoll Cat", "data/Ragdoll Cat"),
    ("Abyssinian Cat", "data/Abyssinian Cat"),
]
for breed_query, breed_dir in breed_targets:
    download_images(breed_query, 1000, breed_dir)

Downloaded Domestic%20Cat_0.jpg
Downloaded Domestic%20Cat_1.jpg
Downloaded Domestic%20Cat_2.jpg
Downloaded Domestic%20Cat_3.jpg
Downloaded Domestic%20Cat_4.jpg
Downloaded Domestic%20Cat_5.jpg
Downloaded Domestic%20Cat_6.jpg
Downloaded Domestic%20Cat_7.jpg
Downloaded Domestic%20Cat_8.jpg
Downloaded Domestic%20Cat_9.jpg
Downloaded Domestic%20Cat_10.jpg
Downloaded Domestic%20Cat_11.jpg
Downloaded Domestic%20Cat_12.jpg
Downloaded Domestic%20Cat_13.jpg
Downloaded Domestic%20Cat_14.jpg
Downloaded Domestic%20Cat_15.jpg
Downloaded Domestic%20Cat_16.jpg
Downloaded Domestic%20Cat_17.jpg
Downloaded Domestic%20Cat_18.jpg
Downloaded Domestic%20Cat_19.jpg
Downloaded Domestic%20Cat_20.jpg
Downloaded Domestic%20Cat_21.jpg
Downloaded Domestic%20Cat_22.jpg
Downloaded Domestic%20Cat_23.jpg
Downloaded Domestic%20Cat_24.jpg
Downloaded Domestic%20Cat_25.jpg
Downloaded Domestic%20Cat_26.jpg
Downloaded Domestic%20Cat_27.jpg
Downloaded Domestic%20Cat_28.jpg
Downloaded Domestic%20Cat_29.jpg
Downloaded Domestic%

In [3]:
import os
import random
import shutil

# Dataset root, plus one output folder per split derived from it
base_dir = 'data'
train_dir, val_dir, test_dir = (
    os.path.join(base_dir, split_name) for split_name in ('train', 'val', 'test')
)

def create_dirs(base, classes):
    """Create ``base/<class>`` for every name in ``classes``.

    Missing parent directories are created as well, and directories that
    already exist are left untouched.
    """
    for target in (os.path.join(base, label) for label in classes):
        os.makedirs(target, exist_ok=True)

# Each sub-directory of base_dir is one class; the split output folders
# (train, val, test) live in the same place and must not be treated as classes.
class_names = [
    entry
    for entry in os.listdir(base_dir)
    if os.path.isdir(os.path.join(base_dir, entry))
    and entry not in ['train', 'val', 'test']
]

# Mirror the class folders inside each of the three split directories
for split_root in (train_dir, val_dir, test_dir):
    create_dirs(split_root, class_names)

# Fraction of each class assigned to every split
split_ratios = {'train': 0.7, 'val': 0.15, 'test': 0.15}

# Split data
# The file list is sorted and shuffled with a fixed seed, so the assignment is
# deterministic: re-running this cell on the same source files reproduces the
# same split. With an unseeded shuffle every re-run picks a new split while the
# earlier copies stay in place, leaving the same image in several splits.
# NOTE: copies left behind by earlier, unseeded runs are not removed here.
SPLIT_SEED = 42

for class_name in class_names:
    class_dir = os.path.join(base_dir, class_name)
    # Keep regular files only; shutil.copy raises on a sub-directory
    images = sorted(
        entry for entry in os.listdir(class_dir)
        if os.path.isfile(os.path.join(class_dir, entry))
    )
    # Fresh generator per class so the result does not depend on class order
    random.Random(SPLIT_SEED).shuffle(images)

    train_split = int(split_ratios['train'] * len(images))
    val_split = int(split_ratios['val'] * len(images))

    # Whatever is left after the train and val slices becomes the test set
    train_images = images[:train_split]
    val_images = images[train_split:train_split + val_split]
    test_images = images[train_split + val_split:]

    for split_dir, split_images in (
        (train_dir, train_images),
        (val_dir, val_images),
        (test_dir, test_images),
    ):
        for image in split_images:
            shutil.copy(os.path.join(class_dir, image), os.path.join(split_dir, class_name, image))

print("Dataset split complete.")


Dataset split complete.


In [10]:
import os

# Sanity check: list what each split folder contains to spot unexpected entries
train_data_dir = 'data/train'
val_data_dir = 'data/val'
test_data_dir = 'data/test'

for heading, split_path in (
    ("Training directories:", train_data_dir),
    ("Validation directories:", val_data_dir),
    ("Test directories:", test_data_dir),
):
    print(heading, os.listdir(split_path))


Training directories: ['Abyssinian Cat', 'Bengal Cat', 'British Shorthair', 'Domestic Cat', 'Maine Coon', 'Persian Cat', 'Ragdoll Cat', 'Scottish Fold', 'Siamese Cat', 'Siberian Cat', 'train']
Validation directories: ['Abyssinian Cat', 'Bengal Cat', 'British Shorthair', 'Domestic Cat', 'Maine Coon', 'Persian Cat', 'Ragdoll Cat', 'Scottish Fold', 'Siamese Cat', 'Siberian Cat', 'train', 'val']
Test directories: ['Abyssinian Cat', 'Bengal Cat', 'British Shorthair', 'Domestic Cat', 'Maine Coon', 'Persian Cat', 'Ragdoll Cat', 'Scottish Fold', 'Siamese Cat', 'Siberian Cat', 'test', 'train', 'val']
