# Sampling CASIA v2.0 Dataset

In [1]:
# Proportionate Stratified Random Sampling

## 1. Import Libraries

In [3]:
import os
import shutil
import random
import re
from tqdm import tqdm
import pandas as pd

## 2. Au: Sampling

### Setting Paths & .txt File

In [4]:
# Paths
# NOTE(review): these are absolute, machine-specific Windows paths; adjust them before running elsewhere.
# Folder the Au images are sampled from (passed to casia25_au_sample as src_dir).
au_src_dir = r'C:\Users\User\Downloads\casia_sh\Au'
# Folder the sampled Au images are copied into.
au_output_dir = r'C:\Users\User\Documents\DS4\25-ds-casia\Au'
# Folder that holds the generated filename lists (.txt files).
txt_dir = r'C:\Users\User\Documents\DS4\25-ds-casia'
# Full path of the text file that lists the sampled Au filenames.
au_txt_dir_fp = os.path.join(txt_dir, 'au_list.txt')

In [5]:
# Create the Au output folder (and any missing parents); exist_ok=True makes re-running this cell harmless.
os.makedirs(au_output_dir, exist_ok=True)

### Dictionary of Categories & Counts

In [6]:
# Au sample sizes: category code -> number of images to draw from that category
au_cc_dict = dict(
    pla=244,
    art=228,
    nat=252,
    arc=269,
    ani=274,
    ind=87,
    cha=245,
    sec=238,
    txt=37,
)

### Storing Filenames of Copied Images

In [7]:
# Starts empty; holds the filenames of the copied Au images once the sampling cell has run
au_list = list()

### Sampling Function: Au

In [8]:
def casia25_au_sample(src_dir, output_dir, categories, seed=None):
    """Randomly sample Au images per category and copy them to an output folder.

    A file belongs to a category when its name starts with ``Au_<category>_``
    and ends with ``.jpg`` or ``.tif``. Categories that do not have enough
    files are reported and skipped entirely.

    Args:
        src_dir: Folder that contains the Au images.
        output_dir: Existing folder the sampled images are copied into.
        categories: Mapping of category code -> number of images to sample.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the same files are drawn on every run; when ``None`` the
            module-level ``random`` generator is used, as before.

    Returns:
        list of str: Filenames that were copied, in copy order. A new list is
        built on every call, so re-running the cell never accumulates
        duplicates from an earlier run.
    """
    rng = random.Random(seed) if seed is not None else random

    # The listing does not change between categories, so read it once.
    # Sorting removes the filesystem-dependent order, which a seeded draw needs.
    all_files = sorted(os.listdir(src_dir))

    copied_files = []
    for category, sample_size in categories.items():
        # Keep only this category's files with the desired extensions
        prefix = f'Au_{category}_'
        category_files = [
            f for f in all_files
            if f.startswith(prefix) and f.endswith(('.jpg', '.tif'))
        ]

        # Check if there are enough files to sample
        if len(category_files) < sample_size:
            print(f"Not enough files in category '{category}' to sample {sample_size} images. Available: {len(category_files)}.")
            continue

        # Draw without replacement, then copy each drawn file
        for file_name in rng.sample(category_files, sample_size):
            shutil.copy(os.path.join(src_dir, file_name), os.path.join(output_dir, file_name))
            copied_files.append(file_name)

        print(f"Sampled and copied {sample_size} '{category}' images to {output_dir}")

    return copied_files

### Calling Sampling Function: Au

In [9]:
# Run the sampling and copying process for the Au images;
# au_list is rebound to the list of copied filenames returned by the function.
au_list = casia25_au_sample(au_src_dir, au_output_dir, au_cc_dict)

Sampled and copied 244 'pla' images to C:\Users\User\Documents\DS4\25-ds-casia\Au
Sampled and copied 228 'art' images to C:\Users\User\Documents\DS4\25-ds-casia\Au
Sampled and copied 252 'nat' images to C:\Users\User\Documents\DS4\25-ds-casia\Au
Sampled and copied 269 'arc' images to C:\Users\User\Documents\DS4\25-ds-casia\Au
Sampled and copied 274 'ani' images to C:\Users\User\Documents\DS4\25-ds-casia\Au
Sampled and copied 87 'ind' images to C:\Users\User\Documents\DS4\25-ds-casia\Au
Sampled and copied 245 'cha' images to C:\Users\User\Documents\DS4\25-ds-casia\Au
Sampled and copied 238 'sec' images to C:\Users\User\Documents\DS4\25-ds-casia\Au
Sampled and copied 37 'txt' images to C:\Users\User\Documents\DS4\25-ds-casia\Au


### Creating .txt File: Au

In [10]:
# Persist the sampled Au filenames, one name per line
with open(au_txt_dir_fp, 'w') as list_file:
    list_file.writelines(f"{name}\n" for name in au_list)

print(f"All sampling and copying completed.\nFilenames of copied images are listed in {au_txt_dir_fp}")

All sampling and copying completed.
Filenames of copied images are listed in C:\Users\User\Documents\DS4\25-ds-casia\au_list.txt


## 2. Tp: Sampling (Tp_S)

### Setting Paths & .txt File

In [11]:
# Paths
# NOTE(review): these are absolute, machine-specific Windows paths; adjust them before running elsewhere.
# Folder the Tp images are sampled from (used by both the Tp_S and the Tp_D sampling calls).
tp_src_dir = r'C:\Users\User\Downloads\casia_sh\Tp'
# Folder the sampled Tp images are copied into.
tp_output_dir = r'C:\Users\User\Documents\DS4\25-ds-casia\Tp'
# Folder that holds the generated filename lists (.txt files).
txt_dir = r'C:\Users\User\Documents\DS4\25-ds-casia'
# Full path of the text file that lists the sampled Tp_S filenames.
tp_tps_txt_dir_fp = os.path.join(txt_dir, 'tp_tps_list.txt')

In [12]:
# Create the Tp output folder (and any missing parents); exist_ok=True makes re-running this cell harmless.
os.makedirs(tp_output_dir, exist_ok=True)

### Dictionary of Categories & Counts

In [13]:
# Tp_S sample sizes: category code -> number of images to draw from that category
tp_tps_cc_dict = dict(
    pla=103,
    art=109,
    nat=97,
    arc=109,
    ani=77,
    ind=103,
    cha=96,
    sec=95,
    txt=34,
)

### Storing Filenames of Copied Images

In [14]:
# Starts empty; holds the filenames of the copied Tp_S images once the sampling cell has run
tp_tps_list = list()

### Sampling Function: Tp (Tp_S)

In [15]:
def casia25_tp_tps_sample(tp_tps_src_dir, tp_tps_output_dir, tp_tps_categories, seed=None):
    """Randomly sample Tp_S images per category and copy them to an output folder.

    A file belongs to a category when its whole name matches
    ``Tp_S_*_<cat>NNNNN_<cat>NNNNN_*.jpg`` (or ``.tif``): the same category
    code appears twice, each time followed by five digits. Categories that do
    not have enough files are reported and skipped entirely.

    Args:
        tp_tps_src_dir: Folder that contains the Tp images.
        tp_tps_output_dir: Existing folder the sampled images are copied into.
        tp_tps_categories: Mapping of category code -> number of images to sample.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the same files are drawn on every run; when ``None`` the
            module-level ``random`` generator is used, as before.

    Returns:
        list of str: Filenames that were copied, in copy order. A new list is
        built on every call, so re-running the cell never accumulates
        duplicates from an earlier run.
    """
    rng = random.Random(seed) if seed is not None else random

    # The listing does not change between categories, so read it once.
    # Sorting removes the filesystem-dependent order, which a seeded draw needs.
    all_files = sorted(os.listdir(tp_tps_src_dir))

    copied_files = []
    for category, sample_size in tp_tps_categories.items():
        print(f"Processing category '{category}'")

        # Escape the category so it is always matched literally
        cat = re.escape(category)
        pattern = re.compile(rf'Tp_S_.*_{cat}\d{{5}}_{cat}\d{{5}}_.*\.(jpg|tif)')

        # fullmatch: the name must END in .jpg/.tif (e.g. 'x.tif.bak' is rejected)
        category_files = [f for f in all_files if pattern.fullmatch(f)]
        print(f"Found {len(category_files)} files.")

        # Check if there are enough files to sample
        if len(category_files) < sample_size:
            print(f"Not enough files in category '{category}' to sample {sample_size} images. Available: {len(category_files)}.")
            continue

        # Draw without replacement, then copy each drawn file
        for file_name in rng.sample(category_files, sample_size):
            shutil.copy(
                os.path.join(tp_tps_src_dir, file_name),
                os.path.join(tp_tps_output_dir, file_name),
            )
            copied_files.append(file_name)

        print(f"Sampled and copied {sample_size} '{category}' images to {tp_tps_output_dir}\n")

    return copied_files

### Calling Sampling Function: Tp (Tp_S)

In [16]:
# Run the sampling and copying process for Tp_S images;
# tp_tps_list is rebound to the list of copied filenames returned by the function.
tp_tps_list = casia25_tp_tps_sample(tp_src_dir, tp_output_dir, tp_tps_cc_dict)

Processing category 'pla'
Found 413 files.
Sampled and copied 103 'pla' images to C:\Users\User\Documents\DS4\25-ds-casia\Tp

Processing category 'art'
Found 437 files.
Sampled and copied 109 'art' images to C:\Users\User\Documents\DS4\25-ds-casia\Tp

Processing category 'nat'
Found 388 files.
Sampled and copied 97 'nat' images to C:\Users\User\Documents\DS4\25-ds-casia\Tp

Processing category 'arc'
Found 437 files.
Sampled and copied 109 'arc' images to C:\Users\User\Documents\DS4\25-ds-casia\Tp

Processing category 'ani'
Found 309 files.
Sampled and copied 77 'ani' images to C:\Users\User\Documents\DS4\25-ds-casia\Tp

Processing category 'ind'
Found 412 files.
Sampled and copied 103 'ind' images to C:\Users\User\Documents\DS4\25-ds-casia\Tp

Processing category 'cha'
Found 383 files.
Sampled and copied 96 'cha' images to C:\Users\User\Documents\DS4\25-ds-casia\Tp

Processing category 'sec'
Found 381 files.
Sampled and copied 95 'sec' images to C:\Users\User\Documents\DS4\25-ds-casia\

### Creating .txt File: Tp (Tp_S)

In [17]:
# Persist the sampled Tp_S filenames, one name per line
with open(tp_tps_txt_dir_fp, 'w') as list_file:
    list_file.writelines(f"{name}\n" for name in tp_tps_list)

print(f"All sampling and copying completed.\nFilenames of copied images are listed in {tp_tps_txt_dir_fp}")

All sampling and copying completed.
Filenames of copied images are listed in C:\Users\User\Documents\DS4\25-ds-casia\tp_tps_list.txt


## 3. Tp: Sampling (Tp_D)

### Setting .txt File

In [25]:
# Full path of the Tp filename list (tp_list.txt); the Gt section later reads a file of this name from txt_dir.
tp_tpd_txt_dir_fp = os.path.join(txt_dir, 'tp_list.txt')

In [19]:
# Tp_D images go to the same output folder as Tp_S; exist_ok=True makes this repeated call a no-op when it already exists.
os.makedirs(tp_output_dir, exist_ok=True)

### Dictionary of Categories and Counts

In [20]:
# Tp_D sample sizes: category code -> number of images to draw from that category
tp_tpd_cc_dict = dict(
    pla=26,
    art=56,
    nat=113,
    arc=45,
    ani=53,
    ind=28,
    cha=65,
    sec=55,
    txt=17,
)

### Storing Filenames of Copied Images

In [21]:
# Placeholder; rebound to the return value of casia25_tp_tpd_sample by the sampling cell
tp_tpd_list = list()

### Sampling Function: Tp (Tp_D)

In [22]:
def casia25_tp_tpd_sample(tp_tps_src_dir, tp_tps_output_dir, tp_tps_categories, seed=None):
    """Randomly sample Tp_D images per category, copy them, and return the full Tp list.

    A file belongs to a category when its whole name matches
    ``Tp_D_*_<cat>NNNNN_<cat>NNNNN_*.jpg`` (or ``.tif``): the same category
    code appears twice, each time followed by five digits. Categories that do
    not have enough files are reported and skipped entirely.

    The parameter names keep their historical ``tp_tps_`` prefix so existing
    callers keep working; they describe the Tp_D run.

    Args:
        tp_tps_src_dir: Folder that contains the Tp images.
        tp_tps_output_dir: Existing folder the sampled images are copied into.
        tp_tps_categories: Mapping of category code -> number of Tp_D images to sample.
        seed: Optional seed. When given, a private ``random.Random(seed)`` is
            used so the same files are drawn on every run; when ``None`` the
            module-level ``random`` generator is used, as before.

    Returns:
        list of str: A new list holding the current contents of the
        module-level ``tp_tps_list`` (the Tp_S samples) followed by the Tp_D
        filenames copied by this call. The combined list is what gets written
        to tp_list.txt. ``tp_tps_list`` itself is not modified, so re-running
        this function does not pile up duplicate Tp_D names in it.

    Raises:
        NameError: If ``tp_tps_list`` does not exist yet, i.e. the Tp_S
            sampling cells have not been run.
    """
    # Snapshot the Tp_S list first so a missing prerequisite fails before any file is copied
    tps_files = list(tp_tps_list)

    rng = random.Random(seed) if seed is not None else random

    # The listing does not change between categories, so read it once.
    # Sorting removes the filesystem-dependent order, which a seeded draw needs.
    all_files = sorted(os.listdir(tp_tps_src_dir))

    tpd_files = []
    for category, sample_size in tp_tps_categories.items():
        print(f"Processing category '{category}'")

        # Escape the category so it is always matched literally
        cat = re.escape(category)
        pattern = re.compile(rf'Tp_D_.*_{cat}\d{{5}}_{cat}\d{{5}}_.*\.(jpg|tif)')

        # fullmatch: the name must END in .jpg/.tif (e.g. 'x.tif.bak' is rejected)
        category_files = [f for f in all_files if pattern.fullmatch(f)]
        print(f"Found {len(category_files)} files.")

        # Check if there are enough files to sample
        if len(category_files) < sample_size:
            print(f"Not enough files in category '{category}' to sample {sample_size} images. Available: {len(category_files)}.")
            continue

        # Draw without replacement, then copy each drawn file
        for file_name in rng.sample(category_files, sample_size):
            shutil.copy(
                os.path.join(tp_tps_src_dir, file_name),
                os.path.join(tp_tps_output_dir, file_name),
            )
            tpd_files.append(file_name)

        print(f"Sampled and copied {sample_size} '{category}' images to {tp_tps_output_dir}\n")

    # Tp_S names first, then Tp_D, matching the order the original shared list had
    return tps_files + tpd_files

### Calling Sampling Function: Tp (Tp_D)

In [23]:
# Run the sampling and copying process for Tp_D images.
# The returned list holds the earlier Tp_S filenames followed by the Tp_D ones.
tp_tpd_list = casia25_tp_tpd_sample(tp_src_dir, tp_output_dir, tp_tpd_cc_dict)

Processing category 'pla'
Found 55 files.
Sampled and copied 26 'pla' images to C:\Users\User\Documents\DS4\25-ds-casia\Tp

Processing category 'art'
Found 62 files.
Sampled and copied 56 'art' images to C:\Users\User\Documents\DS4\25-ds-casia\Tp

Processing category 'nat'
Found 218 files.
Sampled and copied 113 'nat' images to C:\Users\User\Documents\DS4\25-ds-casia\Tp

Processing category 'arc'
Found 49 files.
Sampled and copied 45 'arc' images to C:\Users\User\Documents\DS4\25-ds-casia\Tp

Processing category 'ani'
Found 184 files.
Sampled and copied 53 'ani' images to C:\Users\User\Documents\DS4\25-ds-casia\Tp

Processing category 'ind'
Found 54 files.
Sampled and copied 28 'ind' images to C:\Users\User\Documents\DS4\25-ds-casia\Tp

Processing category 'cha'
Found 119 files.
Sampled and copied 65 'cha' images to C:\Users\User\Documents\DS4\25-ds-casia\Tp

Processing category 'sec'
Found 57 files.
Sampled and copied 55 'sec' images to C:\Users\User\Documents\DS4\25-ds-casia\Tp

Proc

### Creating .txt File: Tp (Tp_D + Tp_S)

In [26]:
# Persist the Tp filenames (Tp_S followed by Tp_D, as returned by the Tp_D sampling call), one name per line
with open(tp_tpd_txt_dir_fp, 'w') as list_file:
    list_file.writelines(f"{name}\n" for name in tp_tpd_list)

print(f"All sampling and copying completed.\nFilenames of copied images are listed in {tp_tpd_txt_dir_fp}")

All sampling and copying completed.
Filenames of copied images are listed in C:\Users\User\Documents\DS4\25-ds-casia\tp_list.txt


## 4. Gt: Sampling

### Setting Paths & .txt File

In [62]:
# Paths
# NOTE(review): these are absolute, machine-specific Windows paths; adjust them before running elsewhere.
# Folder the ground-truth (Gt) images are read from.
gt_src_dir = r'C:\Users\User\Downloads\casia_sh\Gt'
# Folder the selected Gt images are copied into.
gt_output_dir = r'C:\Users\User\Documents\DS4\25-ds-casia\Gt'
# Folder that holds the generated filename lists (.txt files).
txt_dir = r'C:\Users\User\Documents\DS4\25-ds-casia'

In [63]:
# Create the Gt output folder (and any missing parents); exist_ok=True makes re-running this cell harmless.
os.makedirs(gt_output_dir, exist_ok=True)

### Read tp_list.txt File

In [64]:
# Text file containing the list of Tp files, one filename per line
# (same txt_dir / 'tp_list.txt' location the Tp section writes to).
tp_list_file = os.path.join(txt_dir, 'tp_list.txt')

### Gt Function

In [65]:
def casia_gt(tp_list_file, gt_src_dir, gt_output_dir):
    """Copy the ground-truth (Gt) image of every listed Tp image to an output folder.

    For a Tp filename ``<stem>.<ext>`` the Gt image is looked up in
    ``gt_src_dir`` as ``<stem>_gt.png`` and, if that does not exist, as
    ``<stem>_gt.jpg``. The first one found is copied under its own filename,
    so the extension of the copy always matches its content.

    Args:
        tp_list_file: Text file with one Tp filename per line; blank lines are ignored.
        gt_src_dir: Folder that contains the Gt images.
        gt_output_dir: Existing folder the Gt images are copied into.

    Returns:
        None. The number of copied images, and a short note about Tp files
        without a Gt image (if any), are printed.
    """
    with open(tp_list_file, 'r') as file:
        # Drop blank lines: an empty name would resolve to the Gt folder itself
        tp_files = [line.strip() for line in file if line.strip()]

    successful_copies = 0
    missing = []

    for tp_filename in tp_files:
        # Strip only the real extension; str.replace would also rewrite
        # '.tif' / '.jpg' occurring in the middle of a name
        stem = os.path.splitext(tp_filename)[0]

        for gt_filename in (f'{stem}_gt.png', f'{stem}_gt.jpg'):
            gt_path = os.path.join(gt_src_dir, gt_filename)
            if os.path.isfile(gt_path):
                shutil.copy(gt_path, os.path.join(gt_output_dir, gt_filename))
                successful_copies += 1
                break
        else:
            # Neither candidate exists: remember it instead of skipping silently
            missing.append(tp_filename)

    # Print the number of successful copies
    print(f"Successfully copied {successful_copies} Gt images to {gt_output_dir}")
    if missing:
        print(f"No Gt image found for {len(missing)} Tp file(s), e.g. {missing[:5]}")

In [66]:
# Call the function to copy the Gt images that belong to the Tp files listed in tp_list_file
casia_gt(tp_list_file, gt_src_dir, gt_output_dir)

Successfully copied 1281 Gt images to C:\Users\User\Documents\DS4\25-ds-casia\Gt
