In [17]:
import os
import pickle as pkl
import pandas as pd
import numpy as np
import random

# Seed Python's global `random` generator so the shuffle performed later in
# this notebook produces the same ordering on every fresh run.
SEED = 107
random.seed(SEED)

In [18]:
# Integer positions 0..10, one per feature name.
# NOTE(review): presumably these index the columns of each trajectory array
# produced by `df_to_list` (after 'traj_id' is dropped) — confirm with consumers.
(
    LAT, LON, SOG, COG, TS, MMSI,
    SWH, MWD, MWP, U10, V10,
) = np.arange(11)

# Methods

In [19]:
def min_max_scale_column(df, column):
    """Rescale ``df[column]`` to the [0, 1] range using its own min and max.

    The column is overwritten on ``df`` itself (no copy is made) and the same
    frame is returned so calls can be chained.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to rescale; modified in place.
    column : str
        Name of a numeric column in ``df``.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``column`` rescaled.
    """
    col_min = df[column].min()
    col_max = df[column].max()
    span = col_max - col_min

    shifted = df[column] - col_min
    if span == 0:
        # Constant column: dividing by a zero range would turn every value
        # into NaN. Map it to 0.0 instead (missing entries stay NaN).
        df[column] = shifted.astype(float)
    else:
        df[column] = shifted / span

    return df

def max_scale_column(df, column, max_val):
    """Divide ``df[column]`` by the fixed reference maximum ``max_val``.

    The divisor is the caller-supplied ``max_val`` rather than the largest
    value observed in the data, so the scaling is identical for every frame
    it is applied to. Values above ``max_val`` therefore map above 1.0.

    The column is overwritten on ``df`` itself (no copy is made) and the same
    frame is returned so calls can be chained.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to rescale; modified in place.
    column : str
        Name of a numeric column in ``df``.
    max_val : float
        Reference maximum used as the divisor.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with ``column`` rescaled.
    """
    df[column] = df[column] / max_val

    return df

def df_to_list(df):
    """Scale the feature columns and split the frame into per-trajectory records.

    Works on a copy, so the caller's frame is left untouched. The columns
    'latitude', 'longitude', 'swh', 'mwp', 'u10' and 'v10' go through
    ``min_max_scale_column``; 'SOG', 'COG' and 'mwd' go through
    ``max_scale_column`` with nominal maxima of 30, 360 and 360.

    Returns a list with one dict per distinct 'traj_id' (in order of first
    appearance), each holding:
      * ``mmsi`` - the trajectory's single MMSI as ``np.float64``
      * ``traj`` - the trajectory's rows sorted by 'timestamp', as a 2-D array
        of every remaining column except 'traj_id'

    Raises ``AssertionError`` if a trajectory does not map to exactly one MMSI.
    """
    scaled = df.copy()

    for name in ('latitude', 'longitude', 'swh', 'mwp', 'u10', 'v10'):
        scaled = min_max_scale_column(scaled, column=name)

    nominal_maxima = {'SOG': 30.0, 'COG': 360.0, 'mwd': 360.0}
    for name, nominal_max in nominal_maxima.items():
        scaled = max_scale_column(scaled, column=name, max_val=nominal_max)

    def _build_record(traj_id):
        # Rows of one trajectory, in time order, without the grouping key.
        rows = (
            scaled[scaled['traj_id'] == traj_id]
            .sort_values(by='timestamp')
            .drop(columns=['traj_id'])
        )

        vessel_ids = rows['MMSI'].unique().tolist()
        assert len(vessel_ids) == 1

        return dict(mmsi=np.float64(vessel_ids[0]), traj=rows.values)

    return [_build_record(tid) for tid in scaled['traj_id'].unique().tolist()]

def save_to_pkl(lst, filename, directory=None):
    """Pickle ``lst`` to disk and print where it was written.

    Parameters
    ----------
    lst : object
        Any picklable object.
    filename : str
        Name of the output file (used as the full path when ``directory`` is
        not given).
    directory : str, optional
        Folder to join in front of ``filename``; it is not created here.

    Returns
    -------
    None
    """
    target = os.path.join(directory, filename) if directory else filename

    with open(target, 'wb') as handle:
        pkl.dump(lst, handle)

    print(f'Saved to {target}')

In [20]:
# Relative paths of the weather-joined parquet inputs for the training and
# validation splits.
enriched_train_fp = '../../data/joined-stad-weather-train.parquet'
enriched_valid_fp = '../../data/joined-stad-weather-valid.parquet'

In [21]:
# Load both splits into memory, training first and validation second.
df_enriched_train, df_enriched_valid = (
    pd.read_parquet(path) for path in (enriched_train_fp, enriched_valid_fp)
)

In [22]:
# Keep only the columns needed downstream, in a fixed order.
# The selection has to be assigned back to the notebook-level names: rebinding
# a loop variable (`for df in [...]: df = df[...]`) leaves both frames untouched.
FEATURE_COLUMNS = ['latitude', 'longitude', 'SOG', 'COG', 'timestamp', 'MMSI',
                   'traj_id', 'swh', 'mwd', 'mwp', 'u10', 'v10']

df_enriched_train = df_enriched_train[FEATURE_COLUMNS]
df_enriched_valid = df_enriched_valid[FEATURE_COLUMNS]

In [23]:
# Show the column layout of both splits so they can be compared by eye.
for frame in (df_enriched_train, df_enriched_valid):
    print(frame.columns)

Index(['latitude', 'longitude', 'SOG', 'COG', 'timestamp', 'MMSI', 'traj_id',
       'swh', 'mwd', 'mwp', 'u10', 'v10'],
      dtype='object')
Index(['latitude', 'longitude', 'SOG', 'COG', 'timestamp', 'MMSI', 'traj_id',
       'swh', 'mwd', 'mwp', 'u10', 'v10'],
      dtype='object')


In [24]:
# Each split is converted from its own frame: training from the training
# frame, validation from the validation frame.
lst_train = df_to_list(df_enriched_train)
lst_valid = df_to_list(df_enriched_valid)

# Shuffle
Pool the training and validation trajectories, shuffle them, and split them again so that weather conditions are equally represented in both datasets.

In [25]:
# Pool both splits, shuffle the pooled trajectories, then cut them again:
# the first len(lst_valid) records become validation, the rest training.
n_valid = len(lst_valid)

total = lst_train + lst_valid
random.shuffle(total)

valid_enriched_shuffled = total[:n_valid]
train_enriched_shuffled = total[n_valid:]

In [26]:
# Write the reshuffled splits as pickles, training first and validation second.
for records, out_name in (
    (train_enriched_shuffled, 'joined-train-stad-weather.pkl'),
    (valid_enriched_shuffled, 'joined-valid-stad-weather.pkl'),
):
    save_to_pkl(records, directory='../../data', filename=out_name)

Saved to ../../data/joined-train-stad-weather.pkl
Saved to ../../data/joined-valid-stad-weather.pkl
