In [1]:
# import tensorflow as tf
import numpy as np
import h2o
import os
import pandas as pd

# from .feature_engineering_filter import Find_correct_port

In [2]:
train_data = pd.read_csv("../Datasets/ais_train.csv", delimiter="|")
test_data = pd.read_csv("../Datasets/ais_test.csv", delimiter=",")

In [3]:
vessel_data = pd.read_csv("../Datasets/vessels.csv", delimiter="|")
port_data = pd.read_csv("../Datasets/ports.csv", delimiter="|")

In [4]:
train_data["time"] = pd.to_datetime(train_data["time"])
test_data["time"] = pd.to_datetime(test_data["time"])

In [5]:
train_data=train_data.merge(vessel_data[['vesselId', 'shippingLineId']], on='vesselId', how='left')

In [6]:
port_data_renamed=pd.DataFrame()
port_data_renamed[["portId","port_latitude","port_longitude"]]=port_data[["portId","latitude","longitude"]]
train_data=train_data.merge(port_data_renamed, on="portId", how="left")

In [7]:
print(train_data.columns)

Index(['time', 'cog', 'sog', 'rot', 'heading', 'navstat', 'etaRaw', 'latitude',
       'longitude', 'vesselId', 'portId', 'shippingLineId', 'port_latitude',
       'port_longitude'],
      dtype='object')


In [None]:
train_data_preprocessed = train_data
train_data_preprocessed.loc[train_data_preprocessed["cog"] >= 360, "cog"] = np.nan
train_data_preprocessed.loc[train_data_preprocessed["sog"] >= 1023, "sog"] = np.nan
train_data_preprocessed.loc[train_data_preprocessed["rot"] == -128, "rot"] = np.nan
train_data_preprocessed.loc[train_data_preprocessed["heading"] == 511, "heading"] = (
    np.nan
)


pattern = r"^\d{2}-\d{2} \d{2}:\d{2}$"
train_data_preprocessed["etaRaw"] = train_data_preprocessed["etaRaw"].where(
    train_data_preprocessed["etaRaw"].str.match(pattern, na=False), np.nan
)


train_data_preprocessed = train_data_preprocessed.sort_values("time")

print(train_data_preprocessed.head())


train_data_preprocessed = (
    train_data_preprocessed.groupby("vesselId")
    .apply(lambda group: group.ffill().bfill())
    .reset_index(drop=True)
)


print(train_data_preprocessed.head())

train_data_preprocessed["heading"] = train_data_preprocessed["heading"].fillna(0)

train_data_preprocessed = train_data_preprocessed.dropna().reset_index(drop=True)


# Replace '00-' in etaRaw with the corresponding month and day from the 'time' column
train_data_preprocessed["etaRaw"] = train_data_preprocessed["etaRaw"].mask(
    train_data_preprocessed["etaRaw"].str.contains("00-", na=False),
    "01" + train_data_preprocessed["etaRaw"].str[2:],
)

train_data_preprocessed["etaRaw"] = train_data_preprocessed["etaRaw"].mask(
    train_data_preprocessed["etaRaw"].str.contains("-00", na=False),
    train_data_preprocessed["etaRaw"].str[:2]
    + "-01"
    + train_data_preprocessed["etaRaw"].str[5:],
)

train_data_preprocessed["etaRaw"] = train_data_preprocessed["etaRaw"].mask(
    train_data_preprocessed["etaRaw"].str.contains(":60", na=False),
    train_data_preprocessed["etaRaw"].str[:9] + "59",
)

train_data_preprocessed["etaRaw"] = train_data_preprocessed["etaRaw"].mask(
    train_data_preprocessed["etaRaw"].str.contains("60:", na=False),
    train_data_preprocessed["etaRaw"].str[:6] + "01:00",
)

train_data_preprocessed["etaRaw"] = train_data_preprocessed["etaRaw"].mask(
    train_data_preprocessed["etaRaw"].str.contains("24:", na=False),
    train_data_preprocessed["etaRaw"].str[:6] + "23:59",
)


train_data_preprocessed["etaRaw"] = pd.to_datetime(
    train_data_preprocessed["time"].dt.year.astype(str)
    + "-"
    + train_data_preprocessed["etaRaw"]
    + ":00",
    format="%Y-%m-%d %H:%M:%S",
)


train_data_preprocessed["seconds_to_eta"] = (
    train_data_preprocessed["etaRaw"] - train_data_preprocessed["time"]
).dt.total_seconds()

train_data_preprocessed = train_data_preprocessed.drop(columns=["etaRaw"])

                 time    cog   sog  rot  heading  navstat       etaRaw  \
0 2024-01-01 00:00:25  284.0   0.7  0.0     88.0        0  01-09 23:00   
1 2024-01-01 00:00:36  109.6   0.0 -6.0    347.0        1  12-29 20:00   
2 2024-01-01 00:01:45  111.0  11.0  0.0    112.0        0  01-02 09:00   
3 2024-01-01 00:03:11   96.4   0.0  0.0    142.0        1  12-31 20:00   
4 2024-01-01 00:03:51  214.0  19.7  0.0    215.0        0  01-25 12:00   

   latitude  longitude                  vesselId                    portId  \
0 -34.74370  -57.85130  61e9f3a8b937134a3c4bfdf7  61d371c43aeaecc07011a37f   
1   8.89440  -79.47939  61e9f3d4b937134a3c4bff1f  634c4de270937fc01c3a7689   
2  39.19065  -76.47567  61e9f436b937134a3c4c0131  61d3847bb7b7526e1adf3d19   
3 -34.41189  151.02067  61e9f3b4b937134a3c4bfe77  61d36f770a1807568ff9a126   
4  35.88379   -5.91636  61e9f41bb937134a3c4c0087  634c4de270937fc01c3a74f3   

             shippingLineId  port_latitude  port_longitude  
0  61ec65aea8cafc0e93f0e9

In [9]:
train_latitude_radians = np.deg2rad(train_data_preprocessed["latitude"])
train_longitude_radians = np.deg2rad(train_data_preprocessed["longitude"])
train_cog_radians = np.deg2rad(train_data_preprocessed["cog"])
train_heading_radians = np.deg2rad(train_data_preprocessed["heading"])

port_latitude_radians = np.deg2rad(train_data_preprocessed["port_latitude"])
port_longitude_radians = np.deg2rad(train_data_preprocessed["port_longitude"])

train_hour = np.deg2rad(train_data_preprocessed["time"].dt.hour * 360 / 24)
train_day = np.deg2rad(train_data_preprocessed["time"].dt.day * 360 / 30)
train_month = np.deg2rad(train_data_preprocessed["time"].dt.month * 360 / 12)


train_latitude_sin = np.sin(train_latitude_radians)
train_latitude_cos = np.cos(train_latitude_radians)
train_longitude_sin = np.sin(train_longitude_radians)
train_longitude_cos = np.cos(train_longitude_radians)

port_latitude_sin = np.sin(port_latitude_radians)
port_latitude_cos = np.cos(port_latitude_radians)
port_longitude_sin = np.sin(port_longitude_radians)
port_longitude_cos = np.cos(port_longitude_radians)

train_cog_sin = np.sin(train_cog_radians)
train_cog_cos = np.cos(train_cog_radians)

train_heading_sin = np.sin(train_heading_radians)
train_heading_cos = np.cos(train_heading_radians)

train_hour_sin = np.sin(train_hour)
train_hour_cos = np.cos(train_hour)

train_day_sin = np.sin(train_day)
train_day_cos = np.cos(train_day)

train_month_sin = np.sin(train_month)
train_month_cos = np.cos(train_month)


train_data_preprocessed["latitude_sin"] = train_latitude_sin
train_data_preprocessed["latitude_cos"] = train_latitude_cos
train_data_preprocessed["longitude_sin"] = train_longitude_sin
train_data_preprocessed["longitude_cos"] = train_longitude_cos
train_data_preprocessed["port_latitude_sin"] = train_latitude_sin
train_data_preprocessed["port_latitude_cos"] = train_latitude_cos
train_data_preprocessed["port_longitude_sin"] = train_longitude_sin
train_data_preprocessed["port_longitude_cos"] = train_longitude_cos
train_data_preprocessed["cog_sin"] = train_cog_sin
train_data_preprocessed["cog_cos"] = train_cog_cos
train_data_preprocessed["heading_sin"] = train_heading_sin
train_data_preprocessed["heading_cos"] = train_heading_cos

train_data_preprocessed["hour_sin"] = train_hour_sin
train_data_preprocessed["hour_cos"] = train_hour_cos
train_data_preprocessed["day_sin"] = train_day_sin
train_data_preprocessed["day_cos"] = train_day_cos
train_data_preprocessed["month_sin"] = train_month_sin
train_data_preprocessed["month_cos"] = train_month_cos


train_data_preprocessed["cog_sog_sin"] = train_data_preprocessed["cog_sin"]*train_data_preprocessed["sog"]
train_data_preprocessed["cog_sog_cos"] = train_data_preprocessed["cog_cos"]*train_data_preprocessed["sog"]

train_data_preprocessed = train_data_preprocessed.drop(
    columns=["latitude", "longitude", "cog", "heading", "portId","cog_sin","cog_cos","sog"], axis=1
)
print(train_data_preprocessed.head(10))

                 time  rot  navstat                  vesselId  \
0 2024-01-12 14:07:47 -6.0        0  61e9f38eb937134a3c4bfd8b   
1 2024-01-12 14:31:00  5.0        0  61e9f38eb937134a3c4bfd8b   
2 2024-01-12 14:57:23  5.0        0  61e9f38eb937134a3c4bfd8b   
3 2024-01-12 15:18:48  6.0        0  61e9f38eb937134a3c4bfd8b   
4 2024-01-12 15:39:47  7.0        0  61e9f38eb937134a3c4bfd8b   
5 2024-01-12 15:54:48  5.0        0  61e9f38eb937134a3c4bfd8b   
6 2024-01-12 16:14:59 -6.0        0  61e9f38eb937134a3c4bfd8b   
7 2024-01-12 16:35:24  2.0        0  61e9f38eb937134a3c4bfd8b   
8 2024-01-12 16:55:24 -1.0        0  61e9f38eb937134a3c4bfd8b   
9 2024-01-12 17:14:36  6.0        0  61e9f38eb937134a3c4bfd8b   

             shippingLineId  port_latitude  port_longitude  seconds_to_eta  \
0  61a8e672f9cba188601e84ab      13.263333       80.341111       -374867.0   
1  61a8e672f9cba188601e84ab      18.941944       72.885278        205140.0   
2  61a8e672f9cba188601e84ab      18.941944       7

In [10]:
print(train_data_preprocessed[["latitude_sin", "latitude_cos", "longitude_sin", "longitude_cos"]].describe())

       latitude_sin  latitude_cos  longitude_sin  longitude_cos
count  1.522065e+06  1.522065e+06   1.522065e+06   1.522065e+06
mean   5.674148e-01  7.325459e-01   4.657097e-02   5.090908e-01
std    3.533228e-01  1.287637e-01   5.297501e-01   6.767740e-01
min   -7.376648e-01  3.328656e-01  -1.000000e+00  -9.997826e-01
25%    5.666483e-01  6.245345e-01  -9.085523e-02  -9.307284e-02
50%    6.721562e-01  7.403239e-01   7.383488e-02   9.743727e-01
75%    7.809972e-01  8.197004e-01   3.145059e-01   9.969210e-01
max    9.429743e-01  1.000000e+00   9.923086e-01   1.000000e+00


In [11]:
def Last_known_location_training_data(
    data: pd.DataFrame, shift_lenghts
) -> pd.DataFrame:
    """_summary_  Groups training data by vesselId, and propogates all data from last known location

    Args:
    data (_type_): _description_ the data to be altered

    Returns:
        _type_:? _description_ the altered data

    """
    all_test_data = pd.DataFrame()

    for shift_length in shift_lenghts:

        grouped_data = data.groupby("vesselId").apply(lambda x: x.sort_values("time"))

        grouped_data["time_diff"] = (
            grouped_data["time"].diff(-shift_length).dt.total_seconds().abs()
        )

        original_time_and_id = grouped_data[
            [
                "time",
                "vesselId",
                "latitude_sin",
                "latitude_cos",
                "longitude_sin",
                "longitude_cos",
            ]
        ]

        shifted_data = grouped_data.shift(shift_length)
        shifted_data[
            [
                "last_latitude_sin",
                "last_latitude_cos",
                "last_longitude_sin",
                "last_longitude_cos",
            ]
        ] = shifted_data[
            ["latitude_sin", "latitude_cos", "longitude_sin", "longitude_cos"]
        ]

        shifted_data[
            [
                "time",
                "vesselId",
                "latitude_sin",
                "latitude_cos",
                "longitude_sin",
                "longitude_cos",
            ]
        ] = original_time_and_id[
            [
                "time",
                "vesselId",
                "latitude_sin",
                "latitude_cos",
                "longitude_sin",
                "longitude_cos",
            ]
        ]

        # Drops all values with nan values
        result = shifted_data.dropna().reset_index(drop=True)

        all_test_data = pd.concat([all_test_data, result], ignore_index=True)

    # Uncomment the line below if you want to remove the "time" column after processing
    # data = data.drop("time", axis=1)

    return all_test_data

In [12]:
print(train_data_preprocessed.columns)

train_data_shifted_df = Last_known_location_training_data(
    train_data_preprocessed, [10,25]
)

# train_data_shifted_df = train_data_shifted_df.drop(columns=["time"], axis=1)

Index(['time', 'rot', 'navstat', 'vesselId', 'shippingLineId', 'port_latitude',
       'port_longitude', 'seconds_to_eta', 'latitude_sin', 'latitude_cos',
       'longitude_sin', 'longitude_cos', 'port_latitude_sin',
       'port_latitude_cos', 'port_longitude_sin', 'port_longitude_cos',
       'heading_sin', 'heading_cos', 'hour_sin', 'hour_cos', 'day_sin',
       'day_cos', 'month_sin', 'month_cos', 'cog_sog_sin', 'cog_sog_cos'],
      dtype='object')


  grouped_data = data.groupby("vesselId").apply(lambda x: x.sort_values("time"))
  grouped_data = data.groupby("vesselId").apply(lambda x: x.sort_values("time"))


In [13]:
def append_last_known_data_test(
    test_data: pd.DataFrame, known_data: pd.DataFrame
) -> pd.DataFrame:
    """_summary_  Groups training data by vesselId, and propogates all data from last known location

    Args:
    data (_type_): _description_ the data to be altered

    Returns:
        _type_:? _description_ the altered data
    """

    if not test_data["vesselId"].isin(known_data["vesselId"]).all():
        missing_vessels = test_data[
            ~test_data["vesselId"].isin(known_data["vesselId"])
        ]["vesselId"].unique()
        raise ValueError(
            f"The following vesselIds are missing in known_data: {missing_vessels}"
        )
    print(
        test_data[~test_data["vesselId"].isin(known_data["vesselId"])][
            "vesselId"
        ].unique()
    )

    grouped_data = (
        known_data.sort_values("time")
        .groupby("vesselId")
        .tail(1)
        .reset_index(drop=True)
    )
    original_time = test_data[["time"]]
    test_data = test_data.drop("time", axis=1)

    result = pd.merge(test_data, grouped_data, how="left", on="vesselId")

    result["time_diff"] = (original_time["time"] - result["time"]).dt.total_seconds()

    return result

In [14]:
test_data_with_last_known_df = append_last_known_data_test(
    test_data, train_data_preprocessed
)
test_data_with_last_known_df[
    [
        "last_latitude_sin",
        "last_latitude_cos",
        "last_longitude_sin",
        "last_longitude_cos",
    ]
] = test_data_with_last_known_df[
    [
        "latitude_sin",
        "latitude_cos",
        "longitude_sin",
        "longitude_cos",
    ]
]
test_data_with_last_known_df = test_data_with_last_known_df.drop(
    columns=[
        "latitude_sin",
        "latitude_cos",
        "longitude_sin",
        "longitude_cos",
    ],
    axis=1,
)

print(test_data_with_last_known_df.head())

[]
   ID                  vesselId  scaling_factor                time  rot  \
0   0  61e9f3aeb937134a3c4bfe3d             0.3 2024-05-07 23:48:16  0.0   
1   1  61e9f473b937134a3c4c02df             0.3 2024-05-07 23:57:16  0.0   
2   2  61e9f469b937134a3c4c029b             0.3 2024-05-07 23:59:08  0.0   
3   3  61e9f45bb937134a3c4c0221             0.3 2024-05-07 23:52:34  0.0   
4   4  61e9f38eb937134a3c4bfd8d             0.3 2024-05-07 23:51:29  0.0   

   navstat            shippingLineId  port_latitude  port_longitude  \
0        5  61a8e672f9cba188601e84ac      31.140556      -81.496667   
1        5  61be24574ea00ae59d0fe388      14.808333      120.279444   
2        0  61ec6303a8cafc0e93f0e8f3      42.098889       11.780833   
3        1  61be24564ea00ae59d0fe37a     -43.606111      172.716111   
4        2  61ec94f1a8cafc0e93f0e92a      48.380556       -4.474167   

   seconds_to_eta  ...   day_cos  month_sin  month_cos  cog_sog_sin  \
0       -133396.0  ...  0.104528        0.

In [15]:
h2o.init(max_mem_size="26g")

train_data_shifted = h2o.H2OFrame(train_data_shifted_df)
test_data_with_last_known = h2o.H2OFrame(test_data_with_last_known_df)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "11.0.24" 2024-07-16; OpenJDK Runtime Environment (build 11.0.24+8-post-Debian-2deb11u1); OpenJDK 64-Bit Server VM (build 11.0.24+8-post-Debian-2deb11u1, mixed mode, sharing)
  Starting server from /opt/conda/lib/python3.10/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/tmp/tmp7oof4y9s
  JVM stdout: /var/tmp/tmp7oof4y9s/h2o_jupyter_started_from_python.out
  JVM stderr: /var/tmp/tmp7oof4y9s/h2o_jupyter_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Etc/UTC
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,1 month and 24 days
H2O_cluster_name:,H2O_from_python_jupyter_2np633
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,26 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [16]:
train_data_shifted_without_validation, validation_data_shifted = (
    train_data_shifted.split_frame(ratios=[0.9], seed=42)   
)               

In [27]:
features_lat = [
    # "vesselId",
    "cog_sog_sin",
    "cog_sog_cos",
    "rot",
    # "heading_sin",
    # "heading_cos",
    # "navstat",
    "shippingLineId",
    "time_diff",
    "seconds_to_eta",
    "last_latitude_sin",
    "last_latitude_cos",
    "last_longitude_sin",
    "last_longitude_cos",
    "port_latitude_sin",
    "port_latitude_cos",
    "port_longitude_sin",
    "port_longitude_cos",
    # "hour_sin",
    # "hour_cos",
    "day_sin",
    "day_cos",
    # "month_sin",
    # "month_cos",
]
features_long = [
    # "vesselId",
    "cog_sog_sin",
    "cog_sog_cos",
    "rot",
    "shippingLineId",
    # "heading_sin",
    # "heading_cos",
    # "navstat",
    "time_diff",
    "seconds_to_eta",
    "last_latitude_sin",
    "last_latitude_cos",
    "last_longitude_sin",
    "last_longitude_cos",
    "port_latitude_sin",
    "port_latitude_cos",
    "port_longitude_sin",
    "port_longitude_cos",
    # "hour_sin",
    # "hour_cos",
    "day_sin",
    "day_cos",
    # "month_sin",
    # "month_cos",
    "latitude_sin",
    "latitude_cos",
]
target_long_sin = "longitude_sin"
target_long_cos = "longitude_cos"
target_lat_sin = "latitude_sin"
target_lat_cos = "latitude_cos"

In [18]:
params_lat_sin = {
    "ntrees": 200,  # Maximum number of trees
    "max_depth": 10,  # Maximum depth of each tree
    "min_rows": 15,  # Minimum number of rows per leaf
    "learn_rate": 0.1,  # Learning rate
    "sample_rate": 0.8,  # Row sample rate per tree
    "col_sample_rate": 0.8,  # Column sample rate per tree
    "reg_lambda": 1.0,  # L2 regularization term
    "reg_alpha": 0.1,  # L1 regularization term
    "seed": 42,  # Random seed for reproducibility
    "stopping_rounds": 10,  # Early stopping rounds
    "stopping_metric": "AUTO",  # Metric for early stopping
    "stopping_tolerance": 0.0005,  # Tolerance for early stopping
}
params_lat_cos = {
    "ntrees": 200,  # Maximum number of trees
    "max_depth": 8,  # Maximum depth of each tree
    "min_rows": 15,  # Minimum number of rows per leaf
    "learn_rate": 0.1,  # Learning rate
    "sample_rate": 0.8,  # Row sample rate per tree
    "col_sample_rate": 0.8,  # Column sample rate per tree
    "reg_lambda": 1.0,  # L2 regularization term
    "reg_alpha": 0.1,  # L1 regularization term
    "seed": 42,  # Random seed for reproducibility
    "stopping_rounds": 10,  # Early stopping rounds
    "stopping_metric": "AUTO",  # Metric for early stopping
    "stopping_tolerance": 0.0005,  # Tolerance for early stopping
}
params_long_sin = {
    "ntrees": 200,  # Maximum number of trees
    "max_depth": 10,  # Maximum depth of each tree
    "min_rows": 15,  # Minimum number of rows per leaf
    "learn_rate": 0.1,  # Learning rate
    "sample_rate": 0.8,  # Row sample rate per tree
    "col_sample_rate": 0.8,  # Column sample rate per tree
    "reg_lambda": 1.0,  # L2 regularization term
    "reg_alpha": 0.1,  # L1 regularization term
    "seed": 42,  # Random seed for reproducibility
    "stopping_rounds": 10,  # Early stopping rounds
    "stopping_metric": "AUTO",  # Metric for early stopping
    "stopping_tolerance": 0.0005,  # Tolerance for early stopping
}
params_long_cos = {
    "ntrees": 200,  # Maximum number of trees
    "max_depth": 10,  # Maximum depth of each tree
    "min_rows": 15,  # Minimum number of rows per leaf
    "learn_rate": 0.1,  # Learning rate
    "sample_rate": 0.8,  # Row sample rate per tree
    "col_sample_rate": 0.8,  # Column sample rate per tree
    "reg_lambda": 1.0,  # L2 regularization term
    "reg_alpha": 0.1,  # L1 regularization term
    "seed": 42,  # Random seed for reproducibility
    "stopping_rounds": 10,  # Early stopping rounds
    "stopping_metric": "AUTO",  # Metric for early stopping
    "stopping_tolerance": 0.0005,  # Tolerance for early stopping
}
# score_eval_metric_only if you want to predict only on the evaluation dataset, could help against overfitting

gbm_lat_sin = h2o.estimators.H2OXGBoostEstimator(**params_lat_sin)
gbm_lat_cos = h2o.estimators.H2OXGBoostEstimator(**params_lat_cos)
gbm_long_sin = h2o.estimators.H2OXGBoostEstimator(**params_long_sin)
gbm_long_cos = h2o.estimators.H2OXGBoostEstimator(**params_long_cos)


In [19]:
gbm_lat_sin.train(
    x=features_lat,
    y=target_lat_sin,
    training_frame=train_data_shifted_without_validation,
    validation_frame=validation_data_shifted,
)

xgboost Model Build progress: |



██████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees
,200.0

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
,2024-10-23 17:51:55,0.063 sec,0.0,0.3595303,0.2734889,0.1292621,0.3611453,0.2743229,0.1304259
,2024-10-23 17:52:08,13.133 sec,1.0,0.3246105,0.2468269,0.1053720,0.3260392,0.2475796,0.1063016
,2024-10-23 17:52:15,20.560 sec,3.0,0.2651148,0.2013443,0.0702858,0.2662615,0.2019625,0.0708952
,2024-10-23 17:52:22,27.486 sec,5.0,0.2170913,0.1645002,0.0471286,0.2180250,0.1650128,0.0475349
,2024-10-23 17:52:29,34.389 sec,7.0,0.1786021,0.1347438,0.0318987,0.1793485,0.1351702,0.0321659
,2024-10-23 17:52:36,40.734 sec,9.0,0.1477417,0.1106250,0.0218276,0.1483684,0.1109934,0.0220132
,2024-10-23 17:52:43,47.834 sec,11.0,0.1231925,0.0911122,0.0151764,0.1237134,0.0914252,0.0153050
,2024-10-23 17:52:50,54.856 sec,13.0,0.1037461,0.0752777,0.0107633,0.1042222,0.0755492,0.0108623
,2024-10-23 17:52:56,1 min 1.001 sec,15.0,0.0884223,0.0624789,0.0078185,0.0888673,0.0627164,0.0078974
,2024-10-23 17:53:03,1 min 8.352 sec,17.0,0.0766632,0.0521700,0.0058772,0.0770993,0.0523815,0.0059443

variable,relative_importance,scaled_importance,percentage
port_latitude_sin,888261.6875000,1.0,0.6199992
last_latitude_sin,402101.75,0.4526839,0.2806636
time_diff,81888.5625000,0.0921897,0.0571575
seconds_to_eta,10405.1259766,0.0117140,0.0072627
port_longitude_sin,8298.9638672,0.0093429,0.0057926
port_latitude_cos,8030.0434570,0.0090402,0.0056049
port_longitude_cos,7500.4135742,0.0084439,0.0052352
cog_sog_cos,5714.5219727,0.0064334,0.0039887
cog_sog_sin,3088.9331055,0.0034775,0.0021560
day_sin,2408.0761719,0.0027110,0.0016808


In [None]:
gbm_lat_cos.train(
    x=features_lat,  # .append(latitude_sin)
    y=target_lat_cos,
    training_frame=train_data_shifted_without_validation,
    validation_frame=validation_data_shifted,
)

xgboost Model Build progress: |██████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees
,200.0

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
,2024-10-23 18:02:38,0.003 sec,0.0,0.2658095,0.2333972,0.0706547,0.2658358,0.2334562,0.0706687
,2024-10-23 18:02:48,10.411 sec,1.0,0.2394420,0.2100813,0.0573325,0.2394698,0.2101393,0.0573458
,2024-10-23 18:02:52,14.534 sec,3.0,0.1944062,0.1702267,0.0377938,0.1944391,0.1702870,0.0378066
,2024-10-23 18:02:57,18.959 sec,5.0,0.1580008,0.1379674,0.0249642,0.1580368,0.1380254,0.0249756
,2024-10-23 18:03:02,23.966 sec,7.0,0.1286266,0.1119217,0.0165448,0.1286621,0.1119765,0.0165539
,2024-10-23 18:03:08,29.668 sec,9.0,0.1049229,0.0908898,0.0110088,0.1049599,0.0909421,0.0110166
,2024-10-23 18:03:12,34.059 sec,11.0,0.0858693,0.0739086,0.0073735,0.0859018,0.0739570,0.0073791
,2024-10-23 18:03:17,38.783 sec,13.0,0.0705884,0.0602156,0.0049827,0.0706179,0.0602582,0.0049869
,2024-10-23 18:03:21,43.380 sec,15.0,0.0583497,0.0491939,0.0034047,0.0583704,0.0492305,0.0034071
,2024-10-23 18:03:28,50.355 sec,18.0,0.0445807,0.0366670,0.0019874,0.0445927,0.0366950,0.0019885

variable,relative_importance,scaled_importance,percentage
port_latitude_cos,80885.1562500,1.0,0.4270864
port_latitude_sin,58138.1835938,0.7187744,0.3069788
last_latitude_sin,31050.7753906,0.3838872,0.1639530
last_latitude_cos,9819.4472656,0.1213999,0.0518482
time_diff,5212.4741211,0.0644429,0.0275227
cog_sog_cos,958.1893921,0.0118463,0.0050594
port_longitude_sin,776.0270386,0.0095942,0.0040975
seconds_to_eta,745.3275146,0.0092146,0.0039354
port_longitude_cos,507.5866394,0.0062754,0.0026801
last_longitude_sin,242.1687775,0.0029940,0.0012787


In [None]:
performance_lat_sin = gbm_lat_sin.model_performance(validation_data=validation_data_shifted)
performance_lat_cos = gbm_lat_cos.model_performance(validation_data=validation_data_shifted)


# Print the performance metrics
print(performance_lat_sin)
print(performance_lat_cos)

ModelMetricsRegression: xgboost
** Reported on test data. **

MSE: 0.0006835261207320038
RMSE: 0.026144332478225635
MAE: 0.007364094917138825
RMSLE: 0.028054700157270504
Mean Residual Deviance: 0.0006835261207320038
ModelMetricsRegression: xgboost
** Reported on test data. **

MSE: 0.0001531987167772356
RMSE: 0.012377346919967768
MAE: 0.004976473001395157
RMSLE: 0.006846469856594696
Mean Residual Deviance: 0.0001531987167772356


In [28]:
gbm_long_sin.train(
    x=features_long,
    y=target_long_sin,
    training_frame=train_data_shifted_without_validation,
    validation_frame=validation_data_shifted,
)
#MSE: 0.00023605270678827214

xgboost Model Build progress: |██████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees
,200.0

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
,2024-10-23 18:35:44,0.004 sec,0.0,0.6973163,0.5570588,0.4862500,0.6972209,0.5570385,0.4861169
,2024-10-23 18:35:56,11.696 sec,1.0,0.6282376,0.5018189,0.3946824,0.6281696,0.5018207,0.3945971
,2024-10-23 18:36:00,15.976 sec,2.0,0.5660889,0.4521011,0.3204566,0.5660516,0.4521273,0.3204144
,2024-10-23 18:36:06,21.835 sec,3.0,0.5101545,0.4073787,0.2602576,0.5101391,0.4074170,0.2602419
,2024-10-23 18:36:12,27.826 sec,5.0,0.4145340,0.3307790,0.1718385,0.4145599,0.3308467,0.1718599
,2024-10-23 18:36:18,33.959 sec,7.0,0.3371456,0.2687276,0.1136671,0.3372221,0.2688168,0.1137187
,2024-10-23 18:36:24,39.954 sec,9.0,0.2746777,0.2184999,0.0754478,0.2748224,0.2186025,0.0755274
,2024-10-23 18:36:31,46.505 sec,11.0,0.2242104,0.1778185,0.0502703,0.2244557,0.1779433,0.0503804
,2024-10-23 18:36:37,53.370 sec,13.0,0.1834598,0.1448776,0.0336575,0.1837965,0.1450152,0.0337812
,2024-10-23 18:36:44,59.695 sec,15.0,0.1507228,0.1182524,0.0227174,0.1511884,0.1184092,0.0228579

variable,relative_importance,scaled_importance,percentage
port_longitude_sin,2545027.5,1.0,0.7870876
last_longitude_sin,366211.8437500,0.1438931,0.1132565
time_diff,155822.2343750,0.0612261,0.0481903
latitude_sin,58521.5507812,0.0229945,0.0180987
latitude_cos,55505.4414062,0.0218094,0.0171659
port_longitude_cos,11429.7246094,0.0044910,0.0035348
seconds_to_eta,10290.5615234,0.0040434,0.0031825
cog_sog_sin,7832.3222656,0.0030775,0.0024223
port_latitude_sin,6349.2592773,0.0024948,0.0019636
last_longitude_cos,2330.2480469,0.0009156,0.0007207


In [29]:
gbm_long_cos.train(
    x=features_long,  # .append("longitude_sin")
    y=target_long_cos,
    training_frame=train_data_shifted_without_validation,
    validation_frame=validation_data_shifted,
)

xgboost Model Build progress: |██████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees
,200.0

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance,validation_rmse,validation_mae,validation_deviance
,2024-10-23 18:46:43,0.002 sec,0.0,0.6766950,0.6008440,0.4579162,0.6781215,0.6021293,0.4598487
,2024-10-23 18:46:54,11.568 sec,1.0,0.6094457,0.5410543,0.3714240,0.6107209,0.5422068,0.3729800
,2024-10-23 18:47:00,17.248 sec,3.0,0.4946528,0.4388955,0.2446814,0.4956834,0.4398341,0.2457020
,2024-10-23 18:47:07,23.936 sec,5.0,0.4017378,0.3561276,0.1613932,0.4025713,0.3568762,0.1620636
,2024-10-23 18:47:13,29.860 sec,7.0,0.3264033,0.2890067,0.1065391,0.3270993,0.2896143,0.1069940
,2024-10-23 18:47:19,35.773 sec,9.0,0.2655192,0.2347252,0.0705004,0.2661159,0.2352144,0.0708177
,2024-10-23 18:47:25,42.142 sec,11.0,0.2162785,0.1907324,0.0467764,0.2168025,0.1911243,0.0470033
,2024-10-23 18:47:31,48.282 sec,13.0,0.1764651,0.1550742,0.0311399,0.1769301,0.1553840,0.0313043
,2024-10-23 18:47:37,54.448 sec,15.0,0.1442845,0.1261366,0.0208180,0.1447493,0.1263959,0.0209524
,2024-10-23 18:47:43,1 min 0.627 sec,17.0,0.1183662,0.1027028,0.0140106,0.1188459,0.1029225,0.0141244

variable,relative_importance,scaled_importance,percentage
port_longitude_cos,3872978.5,1.0,0.7336033
last_longitude_cos,1194061.7500000,0.3083058,0.2261742
time_diff,73537.0703125,0.0189872,0.0139291
latitude_sin,63236.4296875,0.0163276,0.0119780
latitude_cos,32448.0664062,0.0083781,0.0061462
port_longitude_sin,15314.6601562,0.0039542,0.0029008
port_latitude_sin,6217.9462891,0.0016055,0.0011778
seconds_to_eta,5036.0346680,0.0013003,0.0009539
cog_sog_sin,2522.0976562,0.0006512,0.0004777
shippingLineId.61ec6303a8cafc0e93f0e8f3,2073.7460938,0.0005354,0.0003928


In [30]:
performance_long_sin = gbm_long_sin.model_performance(test_data=validation_data_shifted)
performance_long_cos = gbm_long_cos.model_performance(test_data=validation_data_shifted)

print(performance_long_sin)
print(performance_long_cos)

ModelMetricsRegression: xgboost
** Reported on test data. **

MSE: 0.00036365483471347996
RMSE: 0.019069736094489614
MAE: 0.005900173867516693
RMSLE: NaN
Mean Residual Deviance: 0.00036365483471347996
ModelMetricsRegression: xgboost
** Reported on test data. **

MSE: 0.00019859001791834753
RMSE: 0.014092197057888011
MAE: 0.0033277104937216727
RMSLE: NaN
Mean Residual Deviance: 0.00019859001791834753


In [31]:
test_data_with_predicted_lat = test_data_with_last_known

lat_predictions_sin = gbm_lat_sin.predict(test_data_with_last_known)
test_data_with_last_known["latitude_sin"] = lat_predictions_sin
lat_predictions_cos = gbm_lat_cos.predict(test_data_with_last_known)
test_data_with_predicted_lat["latitude_cos"] = lat_predictions_cos

test_data_with_predicted_lat["latitude_sin"] = lat_predictions_sin
test_data_with_predicted_lat["latitude_cos"] = lat_predictions_cos

long_predictions_sin = gbm_long_sin.predict(test_data_with_predicted_lat)
test_data_with_last_known["longitude_sin"] = long_predictions_sin
long_predictions_cos = gbm_long_cos.predict(test_data_with_predicted_lat)
test_data_with_last_known["longitude_cos"] = long_predictions_cos

xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%


In [38]:
# Convert sine and cosine values back to radians
lat_predictions_sin = lat_predictions_sin.as_data_frame()
lat_predictions_cos = lat_predictions_cos.as_data_frame()
long_predictions_sin = long_predictions_sin.as_data_frame()
long_predictions_cos = long_predictions_cos.as_data_frame()


lat_predictions_radians = np.arctan2(lat_predictions_sin, lat_predictions_cos)
long_predictions_radians = np.arctan2(long_predictions_sin, long_predictions_cos)

# Convert radians to degrees
lat_predictions_degrees = np.rad2deg(lat_predictions_radians)
long_predictions_degrees = np.rad2deg(long_predictions_radians)

# Print the first few rows to verify the conversion
print(lat_predictions_degrees.head())
print(long_predictions_degrees.head())

AttributeError: 'DataFrame' object has no attribute 'as_data_frame'

In [33]:
def create_prediction_visualization_data(validation_data):
    lat_val_sin = gbm_lat_sin.predict(validation_data)
    lat_val_cos = gbm_lat_cos.predict(validation_data)
    long_val_sin = gbm_long_sin.predict(validation_data)
    long_val_cos = gbm_long_cos.predict(validation_data)

    lat_val_sin = lat_val_sin.as_data_frame()
    lat_val_cos = lat_val_cos.as_data_frame()
    long_val_sin = long_val_sin.as_data_frame()
    long_val_cos = long_val_cos.as_data_frame()

    validation_data = validation_data.as_data_frame()

    lat_val_radians = np.arctan2(lat_val_sin, lat_val_cos)
    long_val_radians = np.arctan2(long_val_sin, long_val_cos)

    evaluation_lat_radians = np.arctan2(
        validation_data["latitude_sin"], validation_data["latitude_cos"]
    )
    evaluation_long_radians = np.arctan2(
        validation_data["longitude_sin"], validation_data["longitude_cos"]
    )

    # Convert radians to degrees
    lat_val_degrees = np.rad2deg(lat_val_radians)
    long_val_degrees = np.rad2deg(long_val_radians)

    evaluation_lat_degrees = np.rad2deg(evaluation_lat_radians)
    evaluation_long_degrees = np.rad2deg(evaluation_long_radians)

    eval_predictions = pd.concat([lat_val_degrees, long_val_degrees], axis=1)

    eval_actual = pd.concat([evaluation_lat_degrees, evaluation_long_degrees], axis=1)

    eval_predictions.columns = ["latitude_predicted", "longitude_predicted"]
    eval_actual.columns = ["latitude", "longitude"]

    eval = pd.DataFrame()
    eval[["latitude_predicted", "longitude_predicted"]] = eval_predictions
    eval[["latitude", "longitude"]] = eval_actual
    eval[["vesselId", "time"]] = validation_data[["vesselId", "time"]]
    eval.to_csv("eval_predictions.csv")


# create_prediction_visualization_data(validation_data_shifted)

In [39]:
predictions = pd.concat([lat_predictions_degrees, long_predictions_degrees], axis=1)
predictions.columns = ["latitude_predicted", "longitude_predicted"]

In [40]:
predictions["ID"] = test_data["ID"]
predictions = predictions[["ID", "longitude_predicted", "latitude_predicted"]]

In [41]:
print(predictions.columns)

Index(['ID', 'longitude_predicted', 'latitude_predicted'], dtype='object')


In [42]:
predictions.to_csv("predictions.csv", index=False)