## Train/valid/test sampling

Copies NetCDF files from `cyclone_binaries` (currently only the first 100 entries of the directory listing) into a sibling folder `partition`, with one subfolder (`train`, `valid`, `test`) for each part of the split.

In [5]:
from tqdm import tqdm
import random
import shutil
from pathlib import Path

print("Starting...")

# Root folder for our NetCDF files
def cyclone_binary_path(user: str = 'ob2720') -> str:
    """Return the root folder that holds a user's cyclone NetCDF files.

    The path layout is based on process_tracks.py; that module is not
    imported here because doing so would pull in a large set of imports.

    Args:
        user: Name of the per-user directory under /g/data/x77. Defaults to
            the account this notebook was written for.

    Returns:
        The folder path as a string, including a trailing slash.
    """
    return f'/g/data/x77/{user}/cyclone_binaries/'

# Directory that holds the source NetCDF files
path = Path(cyclone_binary_path())

# IBTrACS cyclone NetCDF files: materialise the directory listing and keep
# only its first 100 entries (the listing order is whatever iterdir() yields)
files = list(path.iterdir())[:100]

# Split:
# 80% train
# 10% valid
# 10% test

# We make a folder of copies of NetCDF files for each set.
# The copied folders are the source of truth for the split (rather than a
# library's dataset splitting function or symlinks to the original files).
#
# The shuffle is still seeded: the copy step below reuses existing partition
# folders, so re-running this cell with an unseeded shuffle would copy a
# *different* split on top of the old one and the same cyclone could end up in
# more than one partition. Sorting first and shuffling with a fixed seed makes
# a re-run reproduce the same split, provided the directory listing (and the
# Python version) is unchanged.
SPLIT_SEED = 0  # arbitrary; change only together with wiping the partition folder

files = sorted(files)  # remove any dependence on directory listing order
# Use a dedicated generator so the global `random` state is left untouched
random.Random(SPLIT_SEED).shuffle(files)

part1 = int(len(files) * 0.8)  # end of train / start of valid
part2 = int(len(files) * 0.9)  # end of valid / start of test
train = files[:part1]
valid = files[part1:part2]
test = files[part2:]

print("Init finished")

def _copy_files(sources, destination):
    """Copy every file in `sources` into the folder `destination`.

    The folder (and any missing parents) is created first; an already
    existing folder is reused. Each file keeps its original name, and a
    progress bar labelled with the folder name is shown while copying.
    """
    destination.mkdir(parents=True, exist_ok=True)
    for source in tqdm(sources, desc=destination.name):
        # shutil.copy accepts path objects and follows symlinks by default,
        # so no str()/resolve() round-trip is needed.
        shutil.copy(source, destination / source.name)


# One subfolder per part of the split, all under a `partition` folder that
# sits next to the source folder.
partition_root = path.parent / 'partition'
train_path = partition_root / 'train'
valid_path = partition_root / 'valid'
test_path = partition_root / 'test'

# Same order as before: train, then valid, then test.
for subset, subset_path in ((train, train_path), (valid, valid_path), (test, test_path)):
    _copy_files(subset, subset_path)



Starting...
Init finished


100%|██████████| 80/80 [00:10<00:00,  7.85it/s]
100%|██████████| 10/10 [00:00<00:00, 12.25it/s]
100%|██████████| 10/10 [00:01<00:00,  8.24it/s]
