This notebook was used to create the simpler dataset (`train.csv`, `val.csv` and `test.csv`) that we use for covariance modelling.

It loads copies of the files from `/g/data/x77/jm0124/feature_vectors/` that have been downloaded to `../.data`, i.e.:
```sh
cd dl-cyclones
mkdir .data
scp ob2720@gadi.nci.org.au:/g/data/x77/jm0124/feature_vectors/feature-array-outputs-uv-train.npy .data
scp ob2720@gadi.nci.org.au:/g/data/x77/jm0124/feature_vectors/feature-array-outputs-uv-val.npy .data
scp ob2720@gadi.nci.org.au:/g/data/x77/jm0124/feature_vectors/feature-array-outputs-uv-test.npy .data
scp ob2720@gadi.nci.org.au:/g/data/x77/jm0124/feature_vectors/train_feature_labels.json .data
scp ob2720@gadi.nci.org.au:/g/data/x77/jm0124/feature_vectors/val_feature_labels.json .data
scp ob2720@gadi.nci.org.au:/g/data/x77/jm0124/feature_vectors/test_feature_labels.json .data
```

In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import json

In [5]:
# Location of the input .npy / .json files. The value is concatenated directly
# with the file names below, so it must end with a path separator.
PREFIX = '../.data/' # or '/g/data/x77/jm0124/feature_vectors/'

def load_json(name: str) -> dict:
    """Read the JSON file at path ``name`` and return the decoded object.

    The file is always decoded as UTF-8 (the standard JSON interchange
    encoding) instead of the platform/locale default, so the result does not
    depend on the machine the notebook runs on.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(name, 'r', encoding='utf-8') as f:
        return json.load(f)

def wrap_longitude(long: float) -> float:
    """Wrap a longitude (or longitude difference) in degrees into [-180, 180).

    Values that differ by a multiple of 360 map to the same result; note that
    +180 maps to -180 because the interval is half-open.
    """
    half_turn = 180
    full_turn = 360
    # Shift so the target interval starts at 0, reduce modulo a full turn, shift back.
    shifted = long + half_turn
    return shifted % full_turn - half_turn

def process_partition(partition: str):
    """Build the per-movement table for one dataset partition.

    Each row describes a single movement of a cyclone: the predicted and the
    actual long/lat displacement plus some metadata (start position and
    intensity).

    Args:
        partition: partition name used to build the input file names,
            e.g. 'train', 'val' or 'test'.

    Returns:
        A pandas DataFrame with the columns pred_long_disp, pred_lat_disp,
        true_long_disp, true_lat_disp, long, lat and intensity.
    """
    column_names = ['pred_long_disp', 'pred_lat_disp', 'true_long_disp', 'true_lat_disp', 'long', 'lat', 'intensity']

    # Prediction vectors: only the first two columns are kept, giving an
    # (n_samples, 2) array of long/lat displacements in degrees.
    predictions = np.load(f"{PREFIX}feature-array-outputs-uv-{partition}.npy")[:,[0,1]]

    # The JSON maps "[cyclone_id]-[time]" keys to objects containing some metadata.
    labels = load_json(f"{PREFIX}{partition}_feature_labels.json")

    # NOTE(review): row k of the prediction array is paired with the k-th JSON
    # entry purely by position; nothing here verifies that the two files are
    # aligned or have the same length - confirm against the code that wrote them.
    rows = []
    for row_idx, entry in enumerate(labels.values()):
        # label = [[long_old, long_new], [lat_old, lat_new], [inten, inten]]
        longs, lats, intensities = entry['label']
        predicted = predictions[row_idx]

        row = {}
        row['pred_long_disp'] = wrap_longitude(predicted[0])
        row['pred_lat_disp'] = predicted[1]
        # Longitude differences are wrapped so a crossing of the +/-180 meridian
        # does not show up as a ~360 degree jump.
        row['true_long_disp'] = wrap_longitude(longs[1] - longs[0])
        row['true_lat_disp'] = lats[1] - lats[0]
        row['long'] = wrap_longitude(longs[0])
        row['lat'] = lats[0]
        row['intensity'] = intensities[0]
        rows.append(row)

    return pd.DataFrame.from_records(rows, columns=column_names)

In [6]:
# Write one CSV per partition (relative paths, so they land in the current
# working directory); the DataFrame index is not written.
csv_targets = {'train': 'train.csv', 'val': 'val.csv', 'test': 'test.csv'}
for partition_name, csv_path in csv_targets.items():
    process_partition(partition_name).to_csv(csv_path, index=False)