# This file randomly assigns all tfrecord files, grouped by their location, to train, validation, and test subfolders and moves them there

In [3]:
import zipfile
import os

In [4]:
def unzip_files_in_directory(directory):
    """
    Extracts every '.zip' archive found directly inside a directory.

    The contents of each archive are extracted into the same directory
    that holds the archive. Sub-directories are not searched.

    Args:
    - directory (str): The path to the directory containing zip files.
    """
    # The directory listing is taken once, up front; only names ending in
    # '.zip' are treated as archives.
    archive_names = (name for name in os.listdir(directory) if name.endswith('.zip'))

    for archive_name in archive_names:
        archive_path = os.path.join(directory, archive_name)
        with zipfile.ZipFile(archive_path, 'r') as archive:
            print(f"Unzipping {archive_name}...")
            archive.extractall(directory)
            print(f"Unzipped {archive_name}.")

In [3]:
# Extract every zip archive that sits directly in 'raw_records' into that same folder.
unzip_files_in_directory('raw_records')

Unzipping asset-index-2-20240103T071400Z-001.zip...
Unzipped asset-index-2-20240103T071400Z-001.zip.
Unzipping asset-index-2-20240103T071400Z-002.zip...
Unzipped asset-index-2-20240103T071400Z-002.zip.
Unzipping asset-index-2-20240103T071400Z-003.zip...
Unzipped asset-index-2-20240103T071400Z-003.zip.
Unzipping asset-index-2-20240103T071400Z-004.zip...
Unzipped asset-index-2-20240103T071400Z-004.zip.
Unzipping asset-index-2-20240103T071400Z-005.zip...
Unzipped asset-index-2-20240103T071400Z-005.zip.
Unzipping asset-index-20231229T004625Z-001.zip...
Unzipped asset-index-20231229T004625Z-001.zip.
Unzipping asset-index-20231229T004625Z-002.zip...
Unzipped asset-index-20231229T004625Z-002.zip.
Unzipping asset-index-20231229T004625Z-003.zip...
Unzipped asset-index-20231229T004625Z-003.zip.
Unzipping asset-index-20231229T004625Z-004.zip...
Unzipped asset-index-20231229T004625Z-004.zip.
Unzipping asset-index-20231229T004625Z-005.zip...
Unzipped asset-index-20231229T004625Z-005.zip.
Unzipping 

In [18]:
import os
import random
import glob

# Seed the global random generator so the shuffle done in
# split_files_by_location gives the same split on every run.
random.seed(42)

def extract_location_from_filename(filename):
    """
    Parses the location encoded in a filename.

    The filename is split on underscores; the second field is read as the
    longitude and the third field as the latitude
    (expected layout: "year_longitude_latitude_...").

    Args:
    - filename (str): The filename to parse.

    Returns:
    - (float, float): A tuple of longitude and latitude.
    """
    fields = filename.split('_')
    # Field 0 is the year; fields 1 and 2 carry the coordinates.
    return float(fields[1]), float(fields[2])

def split_files_by_location(directory, train_ratio=0.6, validation_ratio=0.2, test_ratio=0.2):
    """
    Splits files into training, validation, and test sets based on their location.

    All '*.tfrecord.gz' files in the directory are grouped by the
    (longitude, latitude) parsed from their filename. The distinct locations
    are shuffled with the global `random` generator and divided by the given
    ratios, so every file of one location ends up in the same set. The ratios
    apply to the number of locations, not to the number of files.

    Args:
    - directory (str): Directory containing the files.
    - train_ratio (float): Share of locations assigned to the training set.
    - validation_ratio (float): Share of locations assigned to the validation set.
    - test_ratio (float): Share of locations assigned to the test set.

    Returns:
    - train_files, val_files, test_files (tuple): Lists of file paths for training, validation, and test.

    Raises:
    - ValueError: If the three ratios do not sum up to 1 (within a small
      floating-point tolerance).
    """
    import math  # local import: only the ratio check below needs it

    # Compare with a tolerance: sums such as 0.7 + 0.2 + 0.1 evaluate to
    # 0.9999999999999999 and would fail an exact `== 1` test. A real exception
    # is raised (instead of `assert`) so the check also runs under `python -O`.
    ratio_sum = train_ratio + validation_ratio + test_ratio
    if not math.isclose(ratio_sum, 1.0, rel_tol=0.0, abs_tol=1e-9):
        raise ValueError(f"Ratios must sum up to 1 (got {ratio_sum})")

    # Sorted so the grouping order (and therefore the seeded shuffle) does not
    # depend on the order in which the filesystem lists the files.
    file_paths = sorted(glob.glob(os.path.join(directory, '*.tfrecord.gz')))

    # Group files by location
    location_groups = {}
    for path in file_paths:
        location = extract_location_from_filename(os.path.basename(path))
        location_groups.setdefault(location, []).append(path)

    # Split locations into train, validation, and test sets
    locations = list(location_groups)
    random.shuffle(locations)

    # Truncating both boundaries means any rounding remainder goes to the test set.
    train_end = int(len(locations) * train_ratio)
    val_end = train_end + int(len(locations) * validation_ratio)

    train_locations = set(locations[:train_end])
    val_locations = set(locations[train_end:val_end])
    test_locations = set(locations[val_end:])

    # Allocate files to train, validation, or test sets. Iterating the groups
    # (rather than the shuffled list) keeps each output in sorted-path order.
    train_files, val_files, test_files = [], [], []
    for location, paths in location_groups.items():
        if location in train_locations:
            train_files.extend(paths)
        elif location in val_locations:
            val_files.extend(paths)
        elif location in test_locations:
            test_files.extend(paths)

    return train_files, val_files, test_files

In [19]:
# Split the files in 'raw_records' by location using the default 0.6 / 0.2 / 0.2 ratios.
train, val, test = split_files_by_location('raw_records')

In [20]:
import shutil

def move_files_to_subfolders(train_files, val_files, test_files, base_directory):
    """
    Moves training, validation, and test files into their own subfolders.

    The subfolders 'train', 'val', and 'test' are created under the base
    directory if they are missing. Each file is then moved into the
    subfolder of its set, and a summary line is printed per set.

    Args:
    - train_files (list): List of training file paths.
    - val_files (list): List of validation file paths.
    - test_files (list): List of test file paths.
    - base_directory (str): The base directory where the subfolders will be created.
    """
    # Pair each list of files with the subfolder that receives it
    destinations = [
        (train_files, os.path.join(base_directory, 'train')),
        (val_files, os.path.join(base_directory, 'val')),
        (test_files, os.path.join(base_directory, 'test')),
    ]

    # Create all subfolders first, before any file is moved
    for _, target_dir in destinations:
        os.makedirs(target_dir, exist_ok=True)

    # Move each set in turn: training, then validation, then test
    for source_paths, target_dir in destinations:
        for source_path in source_paths:
            shutil.move(source_path, target_dir)
        print(f"Moved {len(source_paths)} files to {target_dir}")

In [21]:
# Move the split files into the 'train', 'val', and 'test' subfolders of 'raw_records'.
move_files_to_subfolders(train, val, test, 'raw_records')

Moved 7867 files to raw_records\train
Moved 2636 files to raw_records\val
Moved 2637 files to raw_records\test
