In [None]:
import random

def split_train_test(train_file, test_file, test_ratio=0.1, seed=None):
    """Carve a random subset of ``train_file`` out into ``test_file``.

    Each non-blank line of ``train_file`` is treated as one record whose
    second whitespace-separated field is an integer label (that field is the
    sort key for both outputs).  After the call:

    * ``test_file`` holds ``int(n * test_ratio)`` randomly chosen records;
    * ``train_file`` is OVERWRITTEN IN PLACE with the remaining records.

    Both files are written sorted by label and every record ends with exactly
    one newline.

    Args:
        train_file: Path of the list file to split; rewritten in place.
        test_file: Path of the list file that receives the held-out records.
        test_ratio: Fraction of records to move to ``test_file`` (0..1).
        seed: Optional seed for a reproducible split.  When ``None`` the
            module-level ``random`` state is used, as before.

    Raises:
        ValueError: If ``test_ratio`` is outside ``[0, 1]`` or a label is not
            an integer.
        IndexError: If a non-blank line has fewer than two fields.
    """
    if not 0 <= test_ratio <= 1:
        raise ValueError(f"test_ratio must be between 0 and 1, got {test_ratio!r}")

    with open(train_file, 'r') as f:
        # Drop blank lines and give every record exactly one trailing newline.
        # Without this, a final line lacking '\n' is glued onto whichever
        # record follows it once the lines have been shuffled and rewritten.
        lines = [line.rstrip('\r\n') + '\n' for line in f if line.strip()]

    # A private generator keeps a seeded split from disturbing global state.
    shuffler = random if seed is None else random.Random(seed)
    shuffler.shuffle(lines)

    num_test_samples = int(len(lines) * test_ratio)

    def label_of(record):
        return int(record.split()[1])

    # Sort BEFORE opening anything for writing: opening truncates the file,
    # so a malformed record must fail here rather than after train_file has
    # already been emptied.
    test_lines = sorted(lines[:num_test_samples], key=label_of)
    train_lines = sorted(lines[num_test_samples:], key=label_of)

    with open(test_file, 'w') as f:
        f.writelines(test_lines)

    # The remaining records replace the original contents of train_file.
    with open(train_file, 'w') as f:
        f.writelines(train_lines)

In [None]:
import os
import shutil

def move_files(train_dir, test_dir, test_file):
    """Move every file listed in ``test_file`` from ``train_dir`` to ``test_dir``.

    ``test_file`` is read line by line; the first whitespace-separated field
    of each line is taken as a path relative to ``train_dir``.  Blank lines
    are ignored.  A listed file that is not present in ``train_dir`` is
    reported on stdout and skipped rather than raising.

    Args:
        train_dir: Directory the files are moved out of.
        test_dir: Directory the files are moved into; created if missing.
        test_file: List file naming the files to move.
    """
    os.makedirs(test_dir, exist_ok=True)

    with open(test_file, 'r') as file:
        for line in file:
            fields = line.split()
            if not fields:
                # Blank line (e.g. a trailing newline): nothing to move.
                continue
            filename = fields[0]

            src_path = os.path.join(train_dir, filename)
            dest_path = os.path.join(test_dir, filename)

            if os.path.isfile(src_path):
                # Entries may carry a sub-directory ("label/clip.mp4");
                # shutil.move does not create missing parent directories.
                dest_parent = os.path.dirname(dest_path)
                if dest_parent:
                    os.makedirs(dest_parent, exist_ok=True)
                shutil.move(src_path, dest_path)
                print(f"Moved: {filename} from {src_path} to {dest_path}")
            else:
                print(f"File not found: {filename}")

    print(f"Files moved to {test_dir}.")
    
# return all test videos to train in order to make a new split
def return_files(train_dir, test_dir, test_file):
    """Undo a previous ``move_files`` call for the same ``test_file``.

    Runs ``move_files`` with the two directories swapped, so every file
    named in ``test_file`` goes from ``test_dir`` back to ``train_dir``.
    Only files on disk are moved; no list file is rewritten here.
    """
    source_dir, destination_dir = test_dir, train_dir
    move_files(source_dir, destination_dir, test_file)

In [None]:
# Directories holding the video files of each split.
train_dir = "./data/k700-2020/updated_splits/train"
test_dir = "./data/k700-2020/updated_splits/test"

# List files ('filename label' per line) describing each split.
train_file = "./data/k700-2020/updated_splits/train.txt"
test_file = "./data/k700-2020/updated_splits/test.txt"

In [None]:
# Step 1: hold out 10% of the entries in train_file; they are written to
# test_file and removed from train_file (which is rewritten in place).
split_train_test(train_file=train_file, test_file=test_file, test_ratio=0.1)

# Step 2: relocate the videos named in test_file from train_dir to test_dir.
move_files(train_dir=train_dir, test_dir=test_dir, test_file=test_file)

In [None]:
# Run this before creating a new test split: it moves every video listed in
# test_file from test_dir back into train_dir.
return_files(train_dir=train_dir, test_dir=test_dir, test_file=test_file)

In [None]:
# Paths used to build reduced-size train/val/test splits.

# --- list files -----------------------------------------------------------
# Full-size inputs.
train_file = "./data/k700-2020/updated_splits/train.txt"
val_file = "./data/k700-2020/updated_splits/val.txt"

# Reduced-size outputs.
small_train_file = "./data/k700-2020/updated_splits/small_train.txt"
small_val_file = "./data/k700-2020/updated_splits/small_val.txt"
small_test_file = "./data/k700-2020/updated_splits/small_test.txt"

# --- video directories ----------------------------------------------------
# Full-size inputs.
train_dir = "./data/k700-2020/updated_splits/train"
val_dir = "./data/k700-2020/updated_splits/val"

# Reduced-size outputs.
small_train_dir = "./data/k700-2020/updated_splits/small_train"
small_val_dir = "./data/k700-2020/updated_splits/small_val"
small_test_dir = "./data/k700-2020/updated_splits/small_test"

In [None]:
# Each row is one carve-out step: 10% of the entries in the source list are
# written to the target list, then the matching videos are moved from the
# source directory to the target directory.  Order matters: small_test is
# carved out of small_train, so the train -> small_train step must run first.
for list_src, list_dst, dir_src, dir_dst in (
    # train -> small_train
    (train_file, small_train_file, train_dir, small_train_dir),
    # small_train -> small_test
    (small_train_file, small_test_file, small_train_dir, small_test_dir),
    # val -> small_val
    (val_file, small_val_file, val_dir, small_val_dir),
):
    split_train_test(list_src, list_dst, test_ratio=0.1)
    move_files(dir_src, dir_dst, list_dst)