# Creating Training, Validation, and Testing Split: SIFT + SVM

In [1]:
# For image binary classification task
# Training 70%, Validation 20%, and Testing 10%
# https://youtu.be/C6wbr1jJvVs?si=g4peQ6Fr4CRez1MB

## 1. Install & Import Libraries

In [94]:
import os
import shutil
import re

## 2. Allocation: Au

### Setting Paths

In [75]:
# Source folder of the "Au" images and the three per-split destination folders.
# NOTE(review): these are absolute, machine-specific Windows paths; adjust before running elsewhere.
sds_au_dir = r'C:\Users\User\Documents\DS4\25-ds-casia-ss\Au'
sds_au_train_dst = r'C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Au'
# NOTE(review): the folder below is spelled "validaton" (missing "i"). The Tp validation
# path in this notebook uses the same spelling, so rename both together if this is fixed.
sds_au_validation_dst = r'C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Au'
sds_au_test_dst = r'C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\test\Au'

In [76]:
# Create the three Au split folders up front; exist_ok=True keeps this cell safe to re-run.
for _au_dst in (sds_au_train_dst, sds_au_validation_dst, sds_au_test_dst):
    os.makedirs(_au_dst, exist_ok=True)

### Dictionary of Categories & Counts: Train Set

In [77]:
# Category code -> number of Au images to move into the training split.
sds_au_train_cc_dict = {
    "pla": 171,
    "art": 160,
    "nat": 176,
    "arc": 188,
    "ani": 192,
    "ind": 61,
    "cha": 172,
    "sec": 167,
    "txt": 26,
}

### Moving Images Function

In [78]:
def sds_au_alloc(src_dir, output_dir, categories):
    """Move a fixed number of 'Au_<category>_*' images per category into output_dir.

    Args:
        src_dir: Folder that currently holds the images.
        output_dir: Folder the selected images are moved into (created if missing).
        categories: Mapping of category code (e.g. 'pla') to the number of
            images to move for that category.

    Returns:
        None. Progress is reported with print().

    Notes:
        Files are moved, not copied, so repeated calls against the same
        src_dir draw from whatever earlier calls left behind.
        A category with fewer matching files than requested is reported and
        skipped entirely (nothing is moved for it).
    """
    image_exts = ('.jpg', '.tif')

    # Make the function usable on its own, without a separate makedirs cell.
    os.makedirs(output_dir, exist_ok=True)

    # os.listdir() returns names in arbitrary order; sorting makes the
    # selection identical on every run and on every machine.
    all_files = sorted(os.listdir(src_dir))

    for category, sample_size in categories.items():
        # Keep only this category's files that carry one of the image extensions.
        prefix = f'Au_{category}_'
        category_files = [
            f for f in all_files
            if f.startswith(prefix) and f.endswith(image_exts)
        ]

        # Skip the whole category when it cannot supply the requested count.
        if len(category_files) < sample_size:
            print(f"Not enough files in category '{category}' to sample {sample_size} images. Available: {len(category_files)}.")
            continue

        # max(..., 0) keeps a negative count a no-op, as range() did before.
        for file_name in category_files[:max(sample_size, 0)]:
            shutil.move(
                os.path.join(src_dir, file_name),
                os.path.join(output_dir, file_name),
            )

        print(f"Moved {sample_size} '{category}' images to {output_dir}")

### Calling Moving Images Function: Train Set

In [79]:
# Training allotment of Au images. Files are moved (not copied) out of sds_au_dir.
sds_au_alloc(
    src_dir=sds_au_dir,
    output_dir=sds_au_train_dst,
    categories=sds_au_train_cc_dict,
)

Sampled and moved 171 'pla' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Au
Sampled and moved 160 'art' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Au
Sampled and moved 176 'nat' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Au
Sampled and moved 188 'arc' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Au
Sampled and moved 192 'ani' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Au
Sampled and moved 61 'ind' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Au
Sampled and moved 172 'cha' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Au
Sampled and moved 167 'sec' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Au
Sampled and moved 26 'txt' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Au


### Dictionary of Categories & Counts: Validation Set

In [80]:
# Category code -> number of Au images to move into the validation split.
sds_au_validation_cc_dict = {
    "pla": 49,
    "art": 46,
    "nat": 50,
    "arc": 54,
    "ani": 55,
    "ind": 17,
    "cha": 49,
    "sec": 48,
    "txt": 7,
}

### Calling Moving Images Function: Validation Set

In [81]:
# Validation allotment of Au images. Run after the training allocation:
# files are moved, so this call draws from what is still left in sds_au_dir.
sds_au_alloc(
    src_dir=sds_au_dir,
    output_dir=sds_au_validation_dst,
    categories=sds_au_validation_cc_dict,
)

Sampled and moved 49 'pla' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Au
Sampled and moved 46 'art' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Au
Sampled and moved 50 'nat' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Au
Sampled and moved 54 'arc' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Au
Sampled and moved 55 'ani' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Au
Sampled and moved 17 'ind' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Au
Sampled and moved 49 'cha' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Au
Sampled and moved 48 'sec' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Au
Sampled and moved 7 'txt' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Au


### Dictionary of Categories & Counts: Test Set

In [89]:
# Category code -> number of Au images to move into the test split.
sds_au_test_cc_dict = {
    "pla": 24,
    "art": 23,
    "nat": 25,
    "arc": 27,
    "ani": 27,
    "ind": 9,
    "cha": 24,  # 25-1
    "sec": 23,  # 24-1
    "txt": 4,
}

### Calling Moving Images Function: Test Set

In [None]:
# If a category reports "Not enough files" below, it is skipped entirely.
# In that case, manually cut and paste its remaining files from sds_au_dir into sds_au_test_dst.

In [90]:
# Test allotment of Au images. Run last: files are moved, so this call
# draws from what the earlier allocations left in sds_au_dir.
sds_au_alloc(
    src_dir=sds_au_dir,
    output_dir=sds_au_test_dst,
    categories=sds_au_test_cc_dict,
)

Sampled and moved 24 'pla' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\test\Au
Not enough files in category 'art' to sample 23 images. Available: 22.
Sampled and moved 25 'nat' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\test\Au
Sampled and moved 27 'arc' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\test\Au
Sampled and moved 27 'ani' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\test\Au
Sampled and moved 9 'ind' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\test\Au
Sampled and moved 24 'cha' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\test\Au
Sampled and moved 23 'sec' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\test\Au
Sampled and moved 4 'txt' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\test\Au


## 3. Allocation: Tp (Tp_S)

### Setting Paths

In [91]:
# Source folder of the "Tp" images and the three per-split destination folders.
# NOTE(review): these are absolute, machine-specific Windows paths; adjust before running elsewhere.
sds_tp_dir = r'C:\Users\User\Documents\DS4\25-ds-casia-ss\Tp'
sds_tp_train_dst = r'C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Tp'
# NOTE(review): the folder below is spelled "validaton" (missing "i"). The Au validation
# path in this notebook uses the same spelling, so rename both together if this is fixed.
sds_tp_validation_dst = r'C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Tp'
sds_tp_test_dst = r'C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\test\Tp'

In [92]:
# Create the three Tp split folders up front; exist_ok=True keeps this cell safe to re-run.
for _tp_dst in (sds_tp_train_dst, sds_tp_validation_dst, sds_tp_test_dst):
    os.makedirs(_tp_dst, exist_ok=True)

### Dictionary of Categories & Counts: Train Set

In [102]:
# For Tp_S: category code -> number of images to move into the training split.
sds_tp_tps_train_cc_dict = {
    "pla": 72,
    "art": 76,
    "nat": 68,
    "arc": 76,
    "ani": 54,
    "ind": 72,
    "cha": 67,
    "sec": 67,
    "txt": 24,
}

### Moving Images Function (Tp_S)

In [95]:
def sds_tp_tps_alloc(tp_tps_src_dir, tp_tps_output_dir, tp_tps_categories):
    """Move a fixed number of 'Tp_S_*' images per category into the output folder.

    A file belongs to a category when its whole name has the form
    ``Tp_S_<anything>_<cat>NNNNN_<cat>NNNNN_<anything>.jpg`` (or ``.tif``),
    i.e. both five-digit ids carry the same category code.

    Args:
        tp_tps_src_dir: Folder that currently holds the images.
        tp_tps_output_dir: Folder the selected images are moved into
            (created if missing).
        tp_tps_categories: Mapping of category code (e.g. 'pla') to the
            number of images to move for that category.

    Returns:
        None. Progress is reported with print().

    Notes:
        Files are moved, not copied, so repeated calls against the same
        source folder draw from whatever earlier calls left behind.
        A category with fewer matching files than requested is reported and
        skipped entirely.
        This function differs from sds_tp_tpd_alloc only in the 'Tp_S'
        filename prefix.
    """
    # Make the function usable on its own, without a separate makedirs cell.
    os.makedirs(tp_tps_output_dir, exist_ok=True)

    # os.listdir() returns names in arbitrary order; sorting makes the
    # selection identical on every run and on every machine.
    all_files = sorted(os.listdir(tp_tps_src_dir))

    for category, sample_size in tp_tps_categories.items():
        # Debug: announce which category is being processed
        print(f"Processing category '{category}'")

        # Escape the category so it is always matched literally.
        cat = re.escape(category)
        pattern = re.compile(rf'Tp_S_.*_{cat}\d{{5}}_{cat}\d{{5}}_.*\.(jpg|tif)')

        # fullmatch() requires the name to END in .jpg/.tif; match() only
        # anchored the start and so also accepted e.g. 'x.jpg.bak' or 'x.tiff'.
        category_files = [f for f in all_files if pattern.fullmatch(f)]

        # Debug: Print the number of files found for this category
        print(f"Found {len(category_files)} files.")

        # Skip the whole category when it cannot supply the requested count.
        if len(category_files) < sample_size:
            print(f"Not enough files in category '{category}' to sample {sample_size} images. Available: {len(category_files)}.")
            continue

        # max(..., 0) keeps a negative count a no-op, as range() did before.
        for file_name in category_files[:max(sample_size, 0)]:
            shutil.move(
                os.path.join(tp_tps_src_dir, file_name),
                os.path.join(tp_tps_output_dir, file_name),
            )

        print(f"Moved {sample_size} '{category}' images to {tp_tps_output_dir}\n")

### Calling Moving Images Function: Train Set

In [96]:
# Training allotment of Tp_S images. Files are moved (not copied) out of sds_tp_dir.
sds_tp_tps_alloc(
    tp_tps_src_dir=sds_tp_dir,
    tp_tps_output_dir=sds_tp_train_dst,
    tp_tps_categories=sds_tp_tps_train_cc_dict,
)

Processing category 'pla'
Found 103 files.
Moved 72 'pla' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Tp

Processing category 'art'
Found 109 files.
Moved 76 'art' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Tp

Processing category 'nat'
Found 97 files.
Moved 68 'nat' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Tp

Processing category 'arc'
Found 109 files.
Moved 76 'arc' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Tp

Processing category 'ani'
Found 77 files.
Moved 54 'ani' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Tp

Processing category 'ind'
Found 103 files.
Moved 72 'ind' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Tp

Processing category 'cha'
Found 96 files.
Moved 67 'cha' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Tp

Processing category 'sec'
Found 95 files.
Moved 67 'sec' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Tp

Proc

### Dictionary of Categories & Counts: Validation Set

In [98]:
# For Tp_S: category code -> number of images to move into the validation split.
sds_tp_tps_validation_cc_dict = {
    "pla": 21,
    "art": 22,
    "nat": 19,
    "arc": 22,
    "ani": 15,
    "ind": 21,
    "cha": 19,
    "sec": 19,
    "txt": 7,
}

### Calling Moving Images Function: Validation Set

In [99]:
# Validation allotment of Tp_S images. Run after the Tp_S training allocation:
# files are moved, so this call draws from what is still left in sds_tp_dir.
sds_tp_tps_alloc(
    tp_tps_src_dir=sds_tp_dir,
    tp_tps_output_dir=sds_tp_validation_dst,
    tp_tps_categories=sds_tp_tps_validation_cc_dict,
)

Processing category 'pla'
Found 31 files.
Moved 21 'pla' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Tp

Processing category 'art'
Found 33 files.
Moved 22 'art' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Tp

Processing category 'nat'
Found 29 files.
Moved 19 'nat' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Tp

Processing category 'arc'
Found 33 files.
Moved 22 'arc' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Tp

Processing category 'ani'
Found 23 files.
Moved 15 'ani' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Tp

Processing category 'ind'
Found 31 files.
Moved 21 'ind' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Tp

Processing category 'cha'
Found 29 files.
Moved 19 'cha' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Tp

Processing category 'sec'
Found 28 files.
Moved 19 'sec' images to C:\Users\User\Documents\DS4\25-ds-cas

## 4. Allocation: Tp (Tp_D)

### Dictionary of Categories & Counts: Train Set

In [104]:
# For Tp_D: category code -> number of images to move into the training split.
sds_tp_tpd_train_cc_dict = {
    "pla": 18,
    "art": 39,
    "nat": 79,
    "arc": 32,
    "ani": 37,
    "ind": 20,
    "cha": 46,
    "sec": 39,
    "txt": 12,
}

### Moving Images Function (Tp_D)

In [105]:
def sds_tp_tpd_alloc(tp_tpd_src_dir, tp_tpd_output_dir, tp_tpd_categories):
    """Move a fixed number of 'Tp_D_*' images per category into the output folder.

    A file belongs to a category when its whole name has the form
    ``Tp_D_<anything>_<cat>NNNNN_<cat>NNNNN_<anything>.jpg`` (or ``.tif``),
    i.e. both five-digit ids carry the same category code.

    Args:
        tp_tpd_src_dir: Folder that currently holds the images.
        tp_tpd_output_dir: Folder the selected images are moved into
            (created if missing).
        tp_tpd_categories: Mapping of category code (e.g. 'pla') to the
            number of images to move for that category.

    Returns:
        None. Progress is reported with print().

    Notes:
        Files are moved, not copied, so repeated calls against the same
        source folder draw from whatever earlier calls left behind.
        A category with fewer matching files than requested is reported and
        skipped entirely.
        This function differs from sds_tp_tps_alloc only in the 'Tp_D'
        filename prefix.
    """
    # Make the function usable on its own, without a separate makedirs cell.
    os.makedirs(tp_tpd_output_dir, exist_ok=True)

    # os.listdir() returns names in arbitrary order; sorting makes the
    # selection identical on every run and on every machine.
    all_files = sorted(os.listdir(tp_tpd_src_dir))

    for category, sample_size in tp_tpd_categories.items():
        # Debug: announce which category is being processed
        print(f"Processing category '{category}'")

        # Escape the category so it is always matched literally.
        cat = re.escape(category)
        pattern = re.compile(rf'Tp_D_.*_{cat}\d{{5}}_{cat}\d{{5}}_.*\.(jpg|tif)')

        # fullmatch() requires the name to END in .jpg/.tif; match() only
        # anchored the start and so also accepted e.g. 'x.jpg.bak' or 'x.tiff'.
        category_files = [f for f in all_files if pattern.fullmatch(f)]

        # Debug: Print the number of files found for this category
        print(f"Found {len(category_files)} files.")

        # Skip the whole category when it cannot supply the requested count.
        if len(category_files) < sample_size:
            print(f"Not enough files in category '{category}' to sample {sample_size} images. Available: {len(category_files)}.")
            continue

        # max(..., 0) keeps a negative count a no-op, as range() did before.
        for file_name in category_files[:max(sample_size, 0)]:
            shutil.move(
                os.path.join(tp_tpd_src_dir, file_name),
                os.path.join(tp_tpd_output_dir, file_name),
            )

        print(f"Moved {sample_size} '{category}' images to {tp_tpd_output_dir}\n")

### Calling Moving Images Function: Train Set

In [106]:
# Training allotment of Tp_D images. Files are moved (not copied) out of sds_tp_dir.
sds_tp_tpd_alloc(
    tp_tpd_src_dir=sds_tp_dir,
    tp_tpd_output_dir=sds_tp_train_dst,
    tp_tpd_categories=sds_tp_tpd_train_cc_dict,
)

Processing category 'pla'
Found 26 files.
Moved 18 'pla' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Tp

Processing category 'art'
Found 56 files.
Moved 39 'art' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Tp

Processing category 'nat'
Found 113 files.
Moved 79 'nat' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Tp

Processing category 'arc'
Found 45 files.
Moved 32 'arc' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Tp

Processing category 'ani'
Found 53 files.
Moved 37 'ani' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Tp

Processing category 'ind'
Found 28 files.
Moved 20 'ind' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Tp

Processing category 'cha'
Found 65 files.
Moved 46 'cha' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Tp

Processing category 'sec'
Found 55 files.
Moved 39 'sec' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\train\Tp

Process

### Dictionary of Categories & Counts: Validation Set

In [107]:
# For Tp_D: category code -> number of images to move into the validation split.
sds_tp_tpd_validation_cc_dict = {
    "pla": 5,
    "art": 11,
    "nat": 23,
    "arc": 9,
    "ani": 11,
    "ind": 6,
    "cha": 13,
    "sec": 11,
    "txt": 3,
}

In [108]:
# Validation allotment of Tp_D images. Run after the Tp_D training allocation:
# files are moved, so this call draws from what is still left in sds_tp_dir.
sds_tp_tpd_alloc(
    tp_tpd_src_dir=sds_tp_dir,
    tp_tpd_output_dir=sds_tp_validation_dst,
    tp_tpd_categories=sds_tp_tpd_validation_cc_dict,
)

Processing category 'pla'
Found 8 files.
Moved 5 'pla' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Tp

Processing category 'art'
Found 17 files.
Moved 11 'art' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Tp

Processing category 'nat'
Found 34 files.
Moved 23 'nat' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Tp

Processing category 'arc'
Found 13 files.
Moved 9 'arc' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Tp

Processing category 'ani'
Found 16 files.
Moved 11 'ani' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Tp

Processing category 'ind'
Found 8 files.
Moved 6 'ind' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Tp

Processing category 'cha'
Found 19 files.
Moved 13 'cha' images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\validaton\Tp

Processing category 'sec'
Found 16 files.
Moved 11 'sec' images to C:\Users\User\Documents\DS4\25-ds-casia-ss

## 5. Allocation: Rest of Tp for Testing (Tp_S & Tp_D)

### Moving Images Function (Tp_S & Tp_D)

In [109]:
def sds_tp_test_alloc(tp_src_dir, tp_dst_dir):
    """Move every .jpg/.tif file found in tp_src_dir into tp_dst_dir.

    Args:
        tp_src_dir: Folder to empty of image files.
        tp_dst_dir: Existing folder that receives them.

    Returns:
        None. The number of files moved is reported with print().
    """
    wanted_exts = ('.jpg', '.tif')
    moved_count = 0

    # Single pass over the directory listing: skip non-images, move the rest.
    for entry in os.listdir(tp_src_dir):
        if not entry.endswith(wanted_exts):
            continue
        origin = os.path.join(tp_src_dir, entry)
        target = os.path.join(tp_dst_dir, entry)
        shutil.move(origin, target)
        moved_count += 1

    # Report the total number of images moved
    print(f"Moved {moved_count} images to {tp_dst_dir}")

### Calling Moving Images Function: Test Set

In [110]:
# Everything still left in sds_tp_dir after the train/validation allocations becomes the Tp test set.
sds_tp_test_alloc(
    tp_src_dir=sds_tp_dir,
    tp_dst_dir=sds_tp_test_dst,
)

Moved 126 images to C:\Users\User\Documents\DS4\25-ds-casia-ss-tvt\test\Tp
