In [1]:
import os
import random
import pandas as pd
from PIL import Image
from pathlib import Path

def process_imagenet_dataset_two_stage(source_dir, output_dir, num_folders=1000, target_size=(1174, 918)):
    """
    两级随机抽取ImageNet图片
    
    参数:
    source_dir: ImageNet源目录路径
    output_dir: 输出目录路径
    num_folders: 需要选择的子文件夹数量
    target_size: 目标图像尺寸 (宽, 高)
    """
    
    # 创建输出目录
    train_dir = os.path.join(output_dir, 'train')
    test_dir = os.path.join(output_dir, 'test')
    os.makedirs(train_dir, exist_ok=True)
    os.makedirs(test_dir, exist_ok=True)
    
    # 获取所有子文件夹
    subfolders = [f.path for f in os.scandir(source_dir) if f.is_dir()]
    
    # 随机选择指定数量的子文件夹
    if len(subfolders) < num_folders:
        raise ValueError(f"子文件夹数量不足，需要至少{num_folders}个，但只有{len(subfolders)}个")
    
    selected_folders = random.sample(subfolders, num_folders)
    
    # 处理并保存图片，同时收集信息
    data = []
    train_count = 1
    test_count = 1
    
    for folder in selected_folders:
        # 获取文件夹中的所有图片
        image_files = []
        for ext in ['*.jpg', '*.JPEG', '*.png', '*.bmp', '*.tiff']:
            image_files.extend(list(Path(folder).glob(ext)))
        
        # 确保文件夹中有至少2张图片
        if len(image_files) < 2:
            print(f"跳过文件夹 {folder}，因为图片数量不足2张")
            continue
        
        # 随机选择2张不同的图片
        selected_images = random.sample(image_files, 2)
        
        # 处理第一张图片作为train
        try:
            img_path = selected_images[0]
            with Image.open(img_path) as img:
                # 转换为灰度图
                gray_img = img.convert('L')
                # 调整尺寸
                resized_img = gray_img.resize(target_size, Image.LANCZOS)
                
                # 保存图片
                new_filename = f"{train_count}.jpg"
                output_path = os.path.join(train_dir, new_filename)
                resized_img.save(output_path, 'JPEG')
                
                # 记录信息
                data.append({
                    'original_path': str(img_path),
                    'new_filename': new_filename,
                    'set': 'train'
                })
                
                train_count += 1
        except Exception as e:
            print(f"处理图片 {img_path} 时出错: {e}")
        
        # 处理第二张图片作为test
        try:
            img_path = selected_images[1]
            with Image.open(img_path) as img:
                # 转换为灰度图
                gray_img = img.convert('L')
                # 调整尺寸
                resized_img = gray_img.resize(target_size, Image.LANCZOS)
                
                # 保存图片
                new_filename = f"{test_count}.jpg"
                output_path = os.path.join(test_dir, new_filename)
                resized_img.save(output_path, 'JPEG')
                
                # 记录信息
                data.append({
                    'original_path': str(img_path),
                    'new_filename': new_filename,
                    'set': 'test'
                })
                
                test_count += 1
        except Exception as e:
            print(f"处理图片 {img_path} 时出错: {e}")
    
    # 创建DataFrame并保存为CSV
    df = pd.DataFrame(data)
    csv_path = os.path.join(output_dir, 'dataset_info.csv')
    df.to_csv(csv_path, index=False)
    
    print(f"处理完成！共处理了 {len(data)} 张图片。")
    print(f"训练集图片保存在: {train_dir}")
    print(f"测试集图片保存在: {test_dir}")
    print(f"数据集信息保存在: {csv_path}")


In [2]:
source_directory = "/media/ubuntu/sda/visual_stimuli_pattern/things/object_images"  
output_directory = "/media/ubuntu/sda/visual_stimuli_pattern/things"   

process_imagenet_dataset_two_stage(source_directory, output_directory)

处理完成！共处理了 2000 张图片。
训练集图片保存在: /media/ubuntu/sda/visual_stimuli_pattern/things/train
测试集图片保存在: /media/ubuntu/sda/visual_stimuli_pattern/things/test
数据集信息保存在: /media/ubuntu/sda/visual_stimuli_pattern/things/dataset_info.csv


In [39]:
def refill_missing_images(csv_path, output_dir, target_size=(1174, 918), type = 'train'):
    """
    检测并填补缺失的图片编号
    
    参数:
    csv_path: 原始CSV文件路径
    source_dir: ImageNet源目录路径
    output_dir: 输出目录路径
    target_size: 目标图像尺寸 (宽, 高)
    """
    
    df = pd.read_csv(csv_path)
    

    train_dir = os.path.join('/media/ubuntu/sda/visual_stimuli_pattern/imagenet_slice', type)
    
    def find_missing_numbers(directory, max_number):
        existing_files = set()
        for file in os.listdir(directory):
            if file.endswith('.jpg'):
                try:
                    num = int(file.split('.')[0])
                    existing_files.add(num)
                except ValueError:
                    continue
        
        all_numbers = set(range(1, max_number + 1))
        missing_numbers = sorted(all_numbers - existing_files)
        return missing_numbers
    
    train_missing = find_missing_numbers(train_dir, 1000)
    
    print(f"训练集缺失 {len(train_missing)} 张图片: {train_missing}")
    
    if not train_missing:
        print("没有发现缺失的图片编号")
        return
    
    def get_folder_for_number(number, set_type):
        record = df[(df['new_filename'] == f"{number}.jpg") & (df['set'] == set_type)]
        if not record.empty:
            original_path = record.iloc[0]['original_path']
            return os.path.dirname(original_path)
        return None
    
    for number in train_missing:
        folder_path = get_folder_for_number(number, 'train')
        if not folder_path:
            print(f"无法找到编号 {number} 对应的原始文件夹")
            continue
        
        image_files = []
        for ext in ['*.jpg', '*.JPEG', '*.png', '*.bmp', '*.tiff']:
            image_files.extend(list(Path(folder_path).glob(ext)))
        
        used_images = set()
        for _, row in df[df['set'] == 'train'].iterrows():
            if os.path.dirname(row['original_path']) == folder_path:
                used_images.add(row['original_path'])
        
        available_images = [img for img in image_files if str(img) not in used_images]
        
        if not available_images:
            print(f"文件夹 {folder_path} 中没有可用的图片用于填补编号 {number}")
            continue
        
        selected_image = random.choice(available_images)
        
        try:
            with Image.open(selected_image) as img:
                gray_img = img.convert('L')
                resized_img = gray_img.resize(target_size, Image.LANCZOS)
                
                # 保存图片
                new_filename = f"{number}.jpg"
                output_path = os.path.join(output_dir, new_filename)
                resized_img.save(output_path, 'JPEG')
                
                new_record = {
                    'original_path': str(selected_image),
                    'new_filename': new_filename,
                    'set': type
                }
                
                mask = (df['new_filename'] == new_filename) & (df['set'] == type)
                if mask.any():
                    df.loc[mask, 'original_path'] = str(selected_image)
                else:
                    df = pd.concat([df, pd.DataFrame([new_record])], ignore_index=True)
                
                print(f"已填补训练集编号 {number}")
        except Exception as e:
            print(f"处理图片 {selected_image} 时出错: {e}")
    
    
    df.to_csv(csv_path, index=False)
    print(f"CSV文件已更新: {csv_path}")


    

In [41]:
csv_path = "/media/ubuntu/sda/visual_stimuli_pattern/imagenet_slice/dataset_info.csv"  # 替换为您的CSV文件路径
output_directory = "/media/ubuntu/sda/visual_stimuli_pattern/imagenet_slice/new"   

refill_missing_images(csv_path, output_directory, type='test')

训练集缺失 14 张图片: [211, 410, 436, 443, 612, 614, 658, 715, 780, 797, 869, 922, 941, 969]
已填补训练集编号 211
已填补训练集编号 410
已填补训练集编号 436
已填补训练集编号 443
已填补训练集编号 612
已填补训练集编号 614
已填补训练集编号 658
已填补训练集编号 715
已填补训练集编号 780
已填补训练集编号 797
已填补训练集编号 869
已填补训练集编号 922
已填补训练集编号 941
已填补训练集编号 969
CSV文件已更新: /media/ubuntu/sda/visual_stimuli_pattern/imagenet_slice/dataset_info.csv


In [55]:
def refill_missing_images(csv_path, source_dir, output_dir, target_size=(1174, 918), type='train'):
    """
    检测并填补缺失的图片编号，允许重新选择子文件夹
    
    参数:
    csv_path: 原始CSV文件路径
    source_dir: ImageNet源目录路径
    output_dir: 输出目录路径
    target_size: 目标图像尺寸 (宽, 高)
    type: 处理的数据集类型 ('train' 或 'test')
    """
    
    df = pd.read_csv(csv_path)
    
    # 获取所有可用的子文件夹
    all_subfolders = [f.path for f in os.scandir(source_dir) if f.is_dir()]
    
    # 获取已经使用过的子文件夹（从CSV中提取）
    used_folders = set()
    for _, row in df.iterrows():
        folder_path = os.path.dirname(row['original_path'])
        used_folders.add(folder_path)
    
    # 获取可用的子文件夹（排除已使用的）
    available_folders = [f for f in all_subfolders if f not in used_folders]
    
    print(f"总共 {len(all_subfolders)} 个子文件夹，已使用 {len(used_folders)} 个，可用 {len(available_folders)} 个")
    
    if not available_folders:
        print("没有可用的子文件夹用于重新选择")
        return
    
    # 检测缺失的图片编号
    def find_missing_numbers(directory, max_number):
        existing_files = set()
        for file in os.listdir(directory):
            if file.endswith('.jpg'):
                try:
                    num = int(file.split('.')[0])
                    existing_files.add(num)
                except ValueError:
                    continue
        
        all_numbers = set(range(1, max_number + 1))
        missing_numbers = sorted(all_numbers - existing_files)
        return missing_numbers
    
    train_missing = find_missing_numbers(output_dir, 1000)
    
    print(f"{type}集缺失 {len(train_missing)} 张图片: {train_missing}")
    
    if not train_missing:
        print("没有发现缺失的图片编号")
        return
    
    # 处理缺失的图片
    for number in train_missing:
        # 随机选择一个可用的子文件夹
        selected_folder = random.choice(available_folders)
        
        # 获取文件夹中的所有图片
        image_files = []
        for ext in ['*.jpg', '*.JPEG', '*.png', '*.bmp', '*.tiff']:
            image_files.extend(list(Path(selected_folder).glob(ext)))
            image_files.extend(list(Path(selected_folder).glob(ext.upper())))
        
        if not image_files:
            print(f"文件夹 {selected_folder} 中没有图片，跳过")
            continue
        
        # 随机选择一张图片
        selected_image = random.choice(image_files)
        
        try:
            with Image.open(selected_image) as img:
                gray_img = img.convert('L')
                resized_img = gray_img.resize(target_size, Image.LANCZOS)
                
                # 保存图片
                new_filename = f"{number}.jpg"
                output_path = os.path.join('/media/ubuntu/sda/visual_stimuli_pattern/imagenet_slice/new', new_filename)
                resized_img.save(output_path, 'JPEG')
                
                # 更新CSV
                new_record = {
                    'original_path': str(selected_image),
                    'new_filename': new_filename,
                    'set': type
                }
                
                # 查找是否已有该编号的记录
                mask = (df['new_filename'] == new_filename) & (df['set'] == type)
                if mask.any():
                    df.loc[mask, 'original_path'] = str(selected_image)
                    print(f"已更新{type}集编号 {number}，使用新文件夹: {os.path.basename(selected_folder)}")
                else:
                    df = pd.concat([df, pd.DataFrame([new_record])], ignore_index=True)
                    print(f"已添加{type}集编号 {number}，使用新文件夹: {os.path.basename(selected_folder)}")
                
                # 将该文件夹标记为已使用
                used_folders.add(selected_folder)
                # 从可用文件夹列表中移除
                available_folders.remove(selected_folder)
                
        except Exception as e:
            print(f"处理图片 {selected_image} 时出错: {e}")
    
    # 保存更新后的CSV
    df.to_csv(csv_path, index=False)
    print(f"CSV文件已更新: {csv_path}")

In [80]:
csv_path = "/media/ubuntu/sda/visual_stimuli_pattern/imagenet_slice/dataset_info.csv"  # 替换为您的CSV文件路径
source_directory = "/media/ubuntu/sda/visual_stimuli_pattern/Imagenet"  # 替换为您的ImageNet文件夹路径
output_directory = "/media/ubuntu/sda/visual_stimuli_pattern/imagenet_slice/"    # 替换为您想要的输出路径

# 处理训练集
refill_missing_images(
    csv_path=csv_path,
    source_dir=source_directory,
    output_dir=os.path.join(output_directory, 'train'),
    type='train'
)

总共 10184 个子文件夹，已使用 1676 个，可用 8508 个
train集缺失 2 张图片: [479, 639]
已更新train集编号 479，使用新文件夹: n01972541
已更新train集编号 639，使用新文件夹: n01563746
CSV文件已更新: /media/ubuntu/sda/visual_stimuli_pattern/imagenet_slice/dataset_info.csv
