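This notebook converts the STAD dataset from its pickle format into pandas DataFrames (unscaling the normalised features along the way) and saves them as Parquet files, for easier handling when enriching the AIS data with weather data.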

In [29]:
import pandas as pd
import pickle as pkl
import numpy as np

In [30]:
# Column positions inside each trajectory array; the order matches the
# `colnames` list used in `stadset_to_df`.
(
    LAT,   # 'latitude' column
    LON,   # 'longitude' column
    SOG,   # 'SOG' column
    COG,   # 'COG' column
    TS,    # 'timestamp' column
    MMSI,  # 'MMSI' column
    TID,   # presumably the trajectory id -- not referenced in this section, TODO confirm
) = np.arange(7)

## STAD Publication West Coast Spatial Bounds

In [31]:
# West Coast bounds, based on the publication's source code (linked in the
# docstring of `stadset_to_df`). They are the min/max values used to invert
# the scaling of each feature.
wc_lonmin, wc_lonmax = -126, -122
wc_latmin, wc_latmax = 34.5, 38.5
# SOG and COG are unscaled by multiplying with these maxima.
wc_sogmax = 30.0
wc_cogmax = 360.0

# Methods

In [32]:
def unscale_vect(vect, x_min, x_max):
    """
    Invert a min-max scaling.

    Maps `vect` back onto the range spanned by `x_min` and `x_max`:
    a scaled value of 0 becomes `x_min` and a scaled value of 1 becomes `x_max`.
    Works element-wise on anything that supports `*` and `+` (scalars, arrays).
    """
    span = x_max - x_min
    return vect * span + x_min

In [33]:
def stadset_to_df(stad_data,
                  lonmin,
                  lonmax,
                  latmin,
                  latmax,
                  sogmax,
                  cogmax):

    """
    Function to inversely scale the training data in (Li et al., 2024)
    and collect all trajectories in a single DataFrame.
    Based on their methodology in: https://github.com/lihui91/ais_process/blob/7f62ff2eb5072ceee723e48f91b0d1eac9218429/src/dataset_preprocessing.py#L325

    The input is never modified: every 'traj' array is copied before it is
    unscaled, so the function can safely be called more than once on the same
    data (e.g. when a notebook cell is re-run) and always gives the same result.

    Parameters
    ----------
    stad_data : sequence of dict
        One entry per trajectory, each with the keys 'mmsi' and 'traj'.
        'traj' is a 2-D array with one row per record and six columns in the
        order latitude, longitude, SOG, COG, timestamp, MMSI (the first four
        are addressed through the module-level LAT, LON, SOG, COG indices).
    lonmin, lonmax : number
        Longitude bounds used to invert the min-max scaling.
    latmin, latmax : number
        Latitude bounds used to invert the min-max scaling.
    sogmax : number
        Factor the scaled SOG column is multiplied with.
    cogmax : number
        Factor the scaled COG column is multiplied with.

    Returns
    -------
    pandas.DataFrame
        Columns 'latitude', 'longitude', 'SOG', 'COG', 'timestamp', 'MMSI'
        and 'traj_id', where 'traj_id' is "<mmsi>_<position in stad_data>".
        'MMSI' and 'timestamp' are cast to int. The per-trajectory frames are
        concatenated without resetting the index, so the row index restarts
        at 0 for every trajectory. An empty `stad_data` yields an empty frame
        with the same columns.
    """

    colnames = [
        'latitude',
        'longitude',
        'SOG',
        'COG',
        'timestamp',
        'MMSI'
    ]

    # Unscaling
    dfs = list()
    for i, trajectory_data in enumerate(stad_data):

        mmsi = trajectory_data['mmsi']
        # Work on a copy: unscaling the caller's array in place would corrupt
        # the loaded dataset and unscale it twice on a second call.
        trj = trajectory_data['traj'].copy()

        trj_id = f"{int(mmsi)}_{i}"

        trj[:, LAT] = unscale_vect(trj[:, LAT], latmin, latmax)
        trj[:, LON] = unscale_vect(trj[:, LON], lonmin, lonmax)
        trj[:, SOG] = trj[:, SOG] * sogmax
        trj[:, COG] = trj[:, COG] * cogmax

        # Transform to dataframe
        df = pd.DataFrame(trj, columns=colnames)
        df['traj_id'] = trj_id
        dfs.append(df)

    if dfs:
        df = pd.concat(dfs, axis=0)
    else:
        # pd.concat raises on an empty list; return an empty frame instead
        df = pd.DataFrame(columns=colnames + ['traj_id'])

    df['MMSI'] = df['MMSI'].astype(int)
    df['timestamp'] = df['timestamp'].astype(int)

    return df

# Data Loading
The dataset was retrieved from https://github.com/lihui91/ais_process

## Original Train

In [None]:
# Load the original training split from the downloaded pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if the source it was downloaded from is trusted.
train_set = '../../data/ct_train.pkl'
with open(train_set, 'rb') as f:
    train = pkl.load(f)

In [35]:
# First record of the first training trajectory, as loaded from the pickle
# (column order: LAT, LON, SOG, COG, TS, MMSI).
train[0]['traj'][0, :]

array([4.12870000e-01, 8.62255000e-01, 4.20000000e-01, 9.17777778e-01,
       1.67253120e+09, 4.16497000e+08])

## Original Valid

In [None]:
# Load the original validation split from the downloaded pickle file.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# if the source it was downloaded from is trusted.
valid_set = '../../data/ct_valid.pkl'
with open(valid_set, 'rb') as f:
    valid = pkl.load(f)

In [37]:
# First record of the first validation trajectory, as loaded from the pickle
# (column order: LAT, LON, SOG, COG, TS, MMSI).
valid[0]['traj'][0, :]

array([4.72507500e-01, 7.37727500e-01, 3.53333333e-01, 9.25833333e-01,
       1.69084800e+09, 2.59157000e+08])

In [38]:
# Stack the rows of every validation trajectory into one frame and summarise
# the columns, to inspect the value ranges of the data as loaded.
df_valid = pd.DataFrame(
    np.vstack([entry['traj'] for entry in valid]),
    columns=['latitude', 'longitude', 'SOG', 'COG', 'timestamp', 'MMSI'],
)
df_valid.describe()

Unnamed: 0,latitude,longitude,SOG,COG,timestamp,MMSI
count,94370.0,94370.0,94370.0,94370.0,94370.0,94370.0
mean,0.614819,0.715759,0.274238,0.546148,1692203000.0,405686400.0
std,0.262026,0.22407,0.194626,0.280503,764886.2,129901300.0
min,8e-06,1e-05,0.0,0.0,1690848000.0,205657000.0
25%,0.422599,0.599907,0.04526,0.336111,1691550000.0,311000100.0
50%,0.703124,0.768908,0.321415,0.4953,1692191000.0,369142000.0
75%,0.82424,0.908844,0.423333,0.823217,1692837000.0,538007500.0
max,0.999997,0.999997,0.790108,1.0,1693526000.0,636093000.0


# Unscale and transform to DataFrame

In [39]:
# Unscale the training split with the West Coast bounds, then show the
# resulting shape and the first rows.
stad_train_df = stadset_to_df(
    train,
    lonmin=wc_lonmin,
    lonmax=wc_lonmax,
    latmin=wc_latmin,
    latmax=wc_latmax,
    sogmax=wc_sogmax,
    cogmax=wc_cogmax,
)
print(stad_train_df.shape)
stad_train_df.head()

(538223, 7)


Unnamed: 0,latitude,longitude,SOG,COG,timestamp,MMSI,traj_id
0,36.15148,-122.55098,12.6,330.4,1672531202,416497000,416497000_0
1,36.180222,-122.573689,12.2,327.474286,1672531802,416497000,416497000_0
2,36.208445,-122.596052,12.265217,328.3,1672532402,416497000,416497000_0
3,36.23686,-122.619197,12.405714,326.837143,1672533002,416497000,416497000_0
4,36.264735,-122.644051,12.322857,322.64,1672533602,416497000,416497000_0


In [40]:
# Summary statistics of the unscaled training data, as a sanity check of the
# value ranges after unscaling.
stad_train_df.describe()

Unnamed: 0,latitude,longitude,SOG,COG,timestamp,MMSI
count,538223.0,538223.0,538223.0,538223.0,538223.0,538223.0
mean,37.025807,-123.082414,8.110445,199.766808,1682567000.0,405191600.0
std,0.977095,0.861746,5.822389,103.043849,5337238.0,125209600.0
min,34.50001,-125.99998,0.0,0.0,1672531000.0,205559000.0
25%,36.374493,-123.548966,1.515595,121.536149,1677991000.0,311000900.0
50%,37.335614,-122.835558,9.67971,179.0,1683208000.0,369040000.0
75%,37.794796,-122.36001,12.577049,304.7,1687362000.0,538006100.0
max,38.49998,-122.00001,24.1,360.0,1690848000.0,642122000.0


In [41]:
# Unscale the validation split with the West Coast bounds, then show the
# resulting shape and the first rows.
stad_valid_df = stadset_to_df(
    valid,
    lonmin=wc_lonmin,
    lonmax=wc_lonmax,
    latmin=wc_latmin,
    latmax=wc_latmax,
    sogmax=wc_sogmax,
    cogmax=wc_cogmax,
)
print(stad_valid_df.shape)
stad_valid_df.head()

(94370, 7)


Unnamed: 0,latitude,longitude,SOG,COG,timestamp,MMSI,traj_id
0,36.39003,-123.04909,10.6,333.3,1690848001,259157000,259157000_0
1,36.416742,-123.065319,10.756338,331.398592,1690848601,259157000,259157000_0
2,36.443054,-123.08344,10.528571,330.657143,1690849201,259157000,259157000_0
3,36.46874,-123.10166,10.4,330.385915,1690849801,259157000,259157000_0
4,36.494224,-123.119153,10.357143,331.585714,1690850401,259157000,259157000_0


# Save as parquet files

In [None]:
# Write the unscaled training and validation frames to Parquet files in the
# same data directory the pickles were read from.
stad_train_df.to_parquet('../../data/df_stad_train_west.parquet')
stad_valid_df.to_parquet('../../data/df_stad_valid_west.parquet')