## Train/valid/test sampling

Copies all the NetCDF files from cyclone_binaries to a sibling folder partitions with subfolders for each part of the split

In [3]:
from tqdm import tqdm
import random
import shutil
from pathlib import Path

import sys
sys.path.append(str(Path().resolve().parent/'tracks'))
import tracks_module
import importlib
importlib.reload(tracks_module)

print("Starting...")

# Root folder for our NetCDF files
def cyclone_binary_path() -> str:
    # Based on process_tracks.py (we don't import it to avoid importing the universe)
    user = 'ob2720'
    return f'/g/data/x77/{user}/cyclone_binaries/'

path = Path(cyclone_binary_path())

# List of IBTrACS cyclone NetCDF files
# one of the papers we're basing this off uses 2014253N13260 as a specific example so we want this
# in our test set to get comparable results (ie it's unseen to both models)
special = path / '2014253N13260.nc'
files = [p for p in path.iterdir() if str(p) != str(special)]

# Split:
# 80% train
# 10% valid
# 10% test

# We make a folder of copies of NetCDF files for each set
# We could use a library's dataset splitting function with a random seed, or we could use
# symlinks to the original files, but this gives us 100% confidence our split stays intact

files = files[:200] # take first 199 + add the special one after to get 200  ## TODO: remove this line, change to 199 when special exists
random.shuffle(files) # much random, very unbias
# files.append(special) ## TODO: special doesn't exist yet

part1 = int(len(files) * 0.8)
part2 = int(len(files) * 0.9)
train = files[:part1]
valid = files[part1:part2]
test = files[part2:]

print("Init finished")

# Don't do anything if existing partition is there (in this case manually delete old / edit this script)
partition_path = path.parent / 'partition'
assert not partition_path.exists()

train_path = partition_path / 'train'
train_path.mkdir(parents=True, exist_ok=True)
for file in tqdm(train):
    shutil.copy(str(file.resolve()), str((train_path / file.name).resolve()))
tracks_module.all_available_tracks(data_local=str(train_path), write_file=str(partition_path / 'train.json'))

valid_path = partition_path / 'valid'
valid_path.mkdir(parents=True, exist_ok=True)
for file in tqdm(valid):
    shutil.copy(str(file.resolve()), str((valid_path / file.name).resolve()))
tracks_module.all_available_tracks(data_local=str(valid_path), write_file=str(partition_path / 'valid.json'))

test_path = partition_path / 'test'
test_path.mkdir(parents=True, exist_ok=True)
for file in tqdm(test):
    shutil.copy(str(file.resolve()), str((test_path / file.name).resolve()))
tracks_module.all_available_tracks(data_local=str(test_path), write_file=str(partition_path / 'test.json'))

Starting...
Init finished


100%|██████████| 160/160 [00:48<00:00,  3.28it/s]
 22%|██▏       | 35/160 [00:14<00:52,  2.38it/s]

Has nan: 1993045S13170


100%|██████████| 160/160 [01:07<00:00,  2.37it/s]


Has nan: 1991166N10257


100%|██████████| 20/20 [00:06<00:00,  3.21it/s]
100%|██████████| 20/20 [00:07<00:00,  2.79it/s]
100%|██████████| 20/20 [00:05<00:00,  3.36it/s]
100%|██████████| 20/20 [00:07<00:00,  2.79it/s]
