#### **Utility for reducing sample images**

In [7]:
import os
import random
import shutil
import pandas as pd
from datetime import datetime

def select_images_and_update_csv(train_csv_path, train_images_folder, num_images_to_keep):
    """Build a smaller copy of the dataset from a random sample of images.

    Reads the CSV at ``train_csv_path``, randomly picks ``num_images_to_keep``
    distinct values from its ``ImageId`` column, copies the matching files
    from ``train_images_folder`` into
    ``severstal_dataset_<num_images_to_keep>_<dd-mm-YYYY>/train_images`` and
    writes the CSV rows for the copied images to ``train.csv`` in that new
    folder. The new folder is created relative to the current working
    directory.

    Args:
        train_csv_path: Path to the source CSV; must contain an ``ImageId``
            column.
        train_images_folder: Directory holding the image files, named exactly
            as the ``ImageId`` values.
        num_images_to_keep: Number of distinct image ids to sample.

    Raises:
        ValueError: If ``num_images_to_keep`` is negative or larger than the
            number of distinct image ids in the CSV.

    Note:
        The output folder is reused if it already exists (same count, same
        day); files left there by an earlier run are not removed.
    """
    # Load CSV file
    df = pd.read_csv(train_csv_path)

    # Distinct image ids; NaN ids can never name a file, so drop them.
    image_ids = list(df['ImageId'].dropna().unique())

    # Fail early with a clear message, before anything is created on disk.
    if not 0 <= num_images_to_keep <= len(image_ids):
        raise ValueError(
            f"num_images_to_keep must be between 0 and {len(image_ids)} "
            f"(distinct image ids in {train_csv_path}), got {num_images_to_keep}"
        )

    # Randomly select num_images_to_keep image ids
    selected_image_ids = random.sample(image_ids, num_images_to_keep)

    # Create a new directory tagged with the sample size and today's date
    current_date = datetime.now().strftime('%d-%m-%Y')
    new_folder_name = f"severstal_dataset_{num_images_to_keep}_{current_date}"
    new_images_folder = os.path.join(new_folder_name, 'train_images')
    # makedirs also creates the parent folder.
    os.makedirs(new_images_folder, exist_ok=True)

    # Copy the selected images, remembering which ones were actually found
    copied_image_ids = []
    missing_image_ids = []
    for image_id in selected_image_ids:
        image_file = os.path.join(train_images_folder, f"{image_id}")
        # isfile (not exists): a directory with that name cannot be copied.
        if os.path.isfile(image_file):
            shutil.copy(image_file, new_images_folder)
            copied_image_ids.append(image_id)
        else:
            missing_image_ids.append(image_id)

    # Keep only rows whose image was copied, so the CSV never references a
    # file that is absent from the new dataset.
    new_df = df[df['ImageId'].isin(copied_image_ids)]

    # Save the updated CSV to the new folder
    new_csv_path = os.path.join(new_folder_name, 'train.csv')
    new_df.to_csv(new_csv_path, index=False)

    if missing_image_ids:
        print(
            f"Warning: {len(missing_image_ids)} selected image(s) not found in "
            f"{train_images_folder}; their rows were left out of the CSV."
        )
    print(f"Process completed. New folder created: {new_folder_name}")
    print(f"Updated CSV saved at: {new_csv_path}")

# Example usage: sample 1000 images using paths relative to the working directory
train_csv_path = 'train.csv'
train_images_folder = 'train_images/'
num_images_to_keep = 1000

# Keyword arguments make it explicit which value feeds which parameter.
select_images_and_update_csv(
    train_csv_path=train_csv_path,
    train_images_folder=train_images_folder,
    num_images_to_keep=num_images_to_keep,
)


Process completed. New folder created: severstal_dataset_1000_17-02-2025
Updated CSV saved at: severstal_dataset_1000_17-02-2025\train.csv
