In [1]:
# import tensorflow as tf
import numpy as np
import h2o
import os
import pandas as pd
import dask.dataframe as dd

# from .feature_engineering_filter import Find_correct_port

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
h2o.init(max_mem_size="4g")

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
  Java Version: openjdk version "17.0.1" 2021-10-19; OpenJDK Runtime Environment Temurin-17.0.1+12 (build 17.0.1+12); OpenJDK 64-Bit Server VM Temurin-17.0.1+12 (build 17.0.1+12, mixed mode, sharing)
  Starting server from /Users/kristofferseyffarth/Downloads/lokalFiles/emner/Maskinlering/Gruppe/.venv/lib/python3.12/site-packages/h2o/backend/bin/h2o.jar
  Ice root: /var/folders/3t/hy3nmqqx6f70nkbvw0n8lbh80000gn/T/tmpf1_ta36m
  JVM stdout: /var/folders/3t/hy3nmqqx6f70nkbvw0n8lbh80000gn/T/tmpf1_ta36m/h2o_kristofferseyffarth_started_from_python.out
  JVM stderr: /var/folders/3t/hy3nmqqx6f70nkbvw0n8lbh80000gn/T/tmpf1_ta36m/h2o_kristofferseyffarth_started_from_python.err
  Server is running at http://127.0.0.1:54339
Connecting to H2O server at http://127.0.0.1:54339 ... successful.


0,1
H2O_cluster_uptime:,06 secs
H2O_cluster_timezone:,Europe/Oslo
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.5
H2O_cluster_version_age:,1 month and 20 days
H2O_cluster_name:,H2O_from_python_kristofferseyffarth_2e4g6t
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,4 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


In [3]:
train_data = pd.read_csv("../Datasets/ais_train.csv", delimiter="|")
test_data = pd.read_csv("../Datasets/ais_test.csv", delimiter=",")

In [4]:

train_data["time"] = pd.to_datetime(train_data["time"])
test_data["time"] = pd.to_datetime(test_data["time"])
train_data["navstat"] = train_data["navstat"].astype("category")
train_data_preprocessed = train_data.copy()

In [5]:
train_latitude_radians = np.deg2rad(train_data["latitude"])
train_longitude_radians = np.deg2rad(train_data["longitude"])
train_cog_radians = np.deg2rad(train_data["longitude"])
train_heading_radians = np.deg2rad(train_data["longitude"])


train_latitude_sin = np.sin(train_latitude_radians)
train_latitude_cos = np.cos(train_latitude_radians)

train_longitude_sin = np.sin(train_longitude_radians)
train_longitude_cos = np.cos(train_longitude_radians)

train_cog_sin = np.sin(train_cog_radians)
train_cog_cos = np.cos(train_cog_radians)

train_heading_sin = np.sin(train_heading_radians)
train_heading_cos = np.cos(train_heading_radians)


train_data_preprocessed["latitude_sin"] = train_latitude_sin
train_data_preprocessed["latitude_cos"] = train_latitude_cos
train_data_preprocessed["longitude_sin"] = train_longitude_sin
train_data_preprocessed["longitude_cos"] = train_longitude_cos
train_data_preprocessed["cog_sin"] = train_cog_sin
train_data_preprocessed["cog_cos"] = train_cog_cos
train_data_preprocessed["heading_sin"] = train_heading_sin
train_data_preprocessed["heading_cos"] = train_heading_cos

train_data_preprocessed = train_data_preprocessed.drop(
    columns=["latitude", "longitude", "cog", "heading"], axis=1
)
print(train_data_preprocessed.head())

                 time   sog  rot navstat       etaRaw  \
0 2024-01-01 00:00:25   0.7    0       0  01-09 23:00   
1 2024-01-01 00:00:36   0.0   -6       1  12-29 20:00   
2 2024-01-01 00:01:45  11.0    0       0  01-02 09:00   
3 2024-01-01 00:03:11   0.0    0       1  12-31 20:00   
4 2024-01-01 00:03:51  19.7    0       0  01-25 12:00   

                   vesselId                    portId  latitude_sin  \
0  61e9f3a8b937134a3c4bfdf7  61d371c43aeaecc07011a37f     -0.569906   
1  61e9f3d4b937134a3c4bff1f  634c4de270937fc01c3a7689      0.154614   
2  61e9f436b937134a3c4c0131  61d3847bb7b7526e1adf3d19      0.631903   
3  61e9f3b4b937134a3c4bfe77  61d36f770a1807568ff9a126     -0.565138   
4  61e9f41bb937134a3c4c0087  634c4de270937fc01c3a74f3      0.586143   

   latitude_cos  longitude_sin  longitude_cos   cog_sin   cog_cos  \
0      0.821710      -0.846670       0.532118 -0.846670  0.532118   
1      0.987975      -0.983189       0.182589 -0.983189  0.182589   
2      0.775048      -0

In [6]:
def Last_known_location_training_data(data: pd.DataFrame) -> pd.DataFrame:
    """_summary_  Groups training data by vesselId, and propogates all data from last known location

    Args:
    data (_type_): _description_ the data to be altered

    Returns:
        _type_:? _description_ the altered data
    """

    grouped_data = data.groupby("vesselId").apply(lambda x: x.sort_values("time"))

    print(grouped_data.index)

    grouped_data["time_diff"] = (
        grouped_data["time"].diff(-1).dt.total_seconds().abs().fillna(0)
    )

    original_time_and_id = grouped_data[["time", "vesselId"]]

    shifted_data = grouped_data.shift(1)
    shifted_data[["time", "vesselId"]] = original_time_and_id[["time", "vesselId"]]

    # Drops all values with nan values
    result = shifted_data.dropna().reset_index(drop=True)

    # Uncomment the line below if you want to remove the "time" column after processing
    # data = data.drop("time", axis=1)

    return result

In [7]:
train_data_shifted_df = Last_known_location_training_data(train_data_preprocessed)

train_data_shifted_df = train_data_shifted_df.drop(columns=["time"], axis=1)

train_data_shifted = h2o.H2OFrame(train_data_shifted_df)

  grouped_data = data.groupby("vesselId").apply(lambda x: x.sort_values("time"))


MultiIndex([( '61e9f38eb937134a3c4bfd8b',  131115),
            ( '61e9f38eb937134a3c4bfd8b',  131279),
            ( '61e9f38eb937134a3c4bfd8b',  131514),
            ( '61e9f38eb937134a3c4bfd8b',  131696),
            ( '61e9f38eb937134a3c4bfd8b',  131885),
            ( '61e9f38eb937134a3c4bfd8b',  132038),
            ( '61e9f38eb937134a3c4bfd8b',  132237),
            ( '61e9f38eb937134a3c4bfd8b',  132394),
            ( '61e9f38eb937134a3c4bfd8b',  132538),
            ( '61e9f38eb937134a3c4bfd8b',  132673),
            ...
            ('clh6aqawa0007gh0z9h6zi9bo', 1520243),
            ('clh6aqawa0007gh0z9h6zi9bo', 1520424),
            ('clh6aqawa0007gh0z9h6zi9bo', 1520635),
            ('clh6aqawa0007gh0z9h6zi9bo', 1520806),
            ('clh6aqawa0007gh0z9h6zi9bo', 1521048),
            ('clh6aqawa0007gh0z9h6zi9bo', 1521244),
            ('clh6aqawa0007gh0z9h6zi9bo', 1521409),
            ('clh6aqawa0007gh0z9h6zi9bo', 1521625),
            ('clh6aqawa0007gh0z9h6zi9bo', 152182

In [8]:
train_data_shifted_without_validation, validation_data_shifted = train_data_shifted.split_frame(
    ratios=[0.8], seed=42
)

In [9]:
def append_last_known_data_test(
    test_data: pd.DataFrame, known_data: pd.DataFrame
) -> pd.DataFrame:
    """_summary_  Groups training data by vesselId, and propogates all data from last known location

    Args:
    data (_type_): _description_ the data to be altered

    Returns:
        _type_:? _description_ the altered data
    """

    if not test_data["vesselId"].isin(known_data["vesselId"]).all():
        missing_vessels = test_data[
            ~test_data["vesselId"].isin(known_data["vesselId"])
        ]["vesselId"].unique()
        raise ValueError(
            f"The following vesselIds are missing in known_data: {missing_vessels}"
        )
    print(
        test_data[~test_data["vesselId"].isin(known_data["vesselId"])][
            "vesselId"
        ].unique()
    )

    grouped_data = (
        known_data.sort_values("time")
        .groupby("vesselId")
        .tail(1)
        .reset_index(drop=True)
    )
    original_time = test_data[["time"]]
    test_data = test_data.drop("time", axis=1)

    result = pd.merge(test_data, grouped_data, how="left", on="vesselId")

    result["time_diff"] = (original_time["time"] - result["time"]).dt.total_seconds()

    print(result.columns)

    return result

In [10]:
test_data_with_last_known_df = append_last_known_data_test(test_data, train_data_preprocessed)
test_data_with_last_known = h2o.H2OFrame(test_data_with_last_known_df)

[]
Index(['ID', 'vesselId', 'scaling_factor', 'time', 'sog', 'rot', 'navstat',
       'etaRaw', 'portId', 'latitude_sin', 'latitude_cos', 'longitude_sin',
       'longitude_cos', 'cog_sin', 'cog_cos', 'heading_sin', 'heading_cos',
       'time_diff'],
      dtype='object')
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [11]:
test_data_with_last_known_df.to_csv("../Datasets/test_data_with_last_known.csv")

In [12]:
features_lat = [
    "time",
    "vesselId",
    "cog_sin",
    "cog_cos",
    "sog",
    "rot",
    "heading_sin",
    "heading_cos",
    "navstat",
    "time_diff",
]
features_long = [
    "time",
    "vesselId",
    "cog_sin",
    "cog_cos",
    "sog",
    "rot",
    "heading_sin",
    "heading_cos",
    "navstat",
    "time_diff",
    "latitude_sin",
    "latitude_cos",
]
target_long_sin = "longitude_sin"
target_long_cos = "longitude_cos"
target_lat_sin = "latitude_sin" 
target_lat_cos = "latitude_cos" 


In [13]:
params = {
    "ntrees": 300,  # Maximum number of trees
    "max_depth": 10,  # Maximum depth of each tree
    "min_rows": 15,  # Minimum number of rows per leaf
    "learn_rate": 0.05,  # Learning rate
    "sample_rate": 0.9,  # Row sample rate per tree
    "col_sample_rate": 0.9,  # Column sample rate per tree
    "reg_lambda": 1.0,  # L2 regularization term
    "reg_alpha": 0.1,  # L1 regularization term
    "seed": 42,  # Random seed for reproducibility
}

gbm_lat_sin = h2o.estimators.H2OXGBoostEstimator(
    ntrees=200,  # Maximum number of trees
    max_depth=10,  # Maximum depth of each tree
    min_rows=15,  # Minimum number of rows per leaf
    learn_rate=0.05,  # Learning rate
    sample_rate=0.9,  # Row sample rate per tree
    col_sample_rate=0.9,  # Column sample rate per tree
    reg_lambda=1.0,  # L2 regularization term
    reg_alpha=0.1,  # L1 regularization term
    seed=42,  # Random seed for reproducibility
)
gbm_lat_cos = h2o.estimators.H2OXGBoostEstimator(
    ntrees=200,  # Maximum number of trees
    max_depth=10,  # Maximum depth of each tree
    min_rows=15,  # Minimum number of rows per leaf
    learn_rate=0.05,  # Learning rate
    sample_rate=0.9,  # Row sample rate per tree
    col_sample_rate=0.9,  # Column sample rate per tree
    reg_lambda=1.0,  # L2 regularization term
    reg_alpha=0.1,  # L1 regularization term
    seed=42,  # Random seed for reproducibility
)
gbm_long_sin = h2o.estimators.H2OXGBoostEstimator(
    ntrees=200,  # Maximum number of trees
    max_depth=10,  # Maximum depth of each tree
    min_rows=15,  # Minimum number of rows per leaf
    learn_rate=0.05,  # Learning rate
    sample_rate=0.9,  # Row sample rate per tree
    col_sample_rate=0.9,  # Column sample rate per tree
    reg_lambda=1.0,  # L2 regularization term
    reg_alpha=0.1,  # L1 regularization term
    seed=42,  # Random seed for reproducibility
)
gbm_long_cos = h2o.estimators.H2OXGBoostEstimator(
    ntrees=200,  # Maximum number of trees
    max_depth=10,  # Maximum depth of each tree
    min_rows=15,  # Minimum number of rows per leaf
    learn_rate=0.05,  # Learning rate
    sample_rate=0.9,  # Row sample rate per tree
    col_sample_rate=0.9,  # Column sample rate per tree
    reg_lambda=1.0,  # L2 regularization term
    reg_alpha=0.1,  # L1 regularization term
    seed=42,  # Random seed for reproducibility
)

# gbm_cog = h2o.estimators.H2OXGBoostEstimator()
# gbm_sog = h2o.estimators.H2OXGBoostEstimator()
# gbm_rot = h2o.estimators.H2OXGBoostEstimator()
# gbm_heading = h2o.estimators.H2OXGBoostEstimator()
# gbm_navstat = h2o.estimators.H2OXGBoostEstimator()
# # gbm_etaRaw = h2o.esti#mators.H2OXGBoostEstimator() #Remove etaRaw because it requires preprocessing
# # gbm_portId = h2o.estimators.H2OXGBoostEstimator()

In [14]:
gbm_lat_sin.train(x=features_lat, y=target_lat_sin, training_frame=train_data_shifted)
gbm_lat_cos.train(x=features_lat, y=target_lat_cos, training_frame=train_data_shifted)

xgboost Model Build progress: |██████████████████████████████████████████████████| (done) 100%
xgboost Model Build progress: |██████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees
,300.0

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2024-10-19 20:21:36,0.026 sec,0.0,0.2658871,0.2334876,0.0706960
,2024-10-19 20:21:41,5.847 sec,1.0,0.2529641,0.2218747,0.0639908
,2024-10-19 20:21:48,12.769 sec,3.0,0.2290449,0.2004179,0.0524616
,2024-10-19 20:21:55,19.921 sec,5.0,0.2075363,0.1811442,0.0430713
,2024-10-19 20:22:02,26.656 sec,7.0,0.1881947,0.1638148,0.0354172
,2024-10-19 20:22:09,33.254 sec,9.0,0.1708098,0.1482363,0.0291760
,2024-10-19 20:22:16,40.037 sec,11.0,0.1552024,0.1342199,0.0240878
,2024-10-19 20:22:21,45.309 sec,12.0,0.1479805,0.1277534,0.0218982
,2024-10-19 20:22:28,52.648 sec,14.0,0.1347182,0.1158647,0.0181490
,2024-10-19 20:22:35,59.690 sec,16.0,0.1227906,0.1051666,0.0150775

variable,relative_importance,scaled_importance,percentage
cog_cos,110726.8046875,1.0,0.4980093
cog_sin,67251.5390625,0.6073646,0.3024732
heading_cos,10444.7187500,0.0943287,0.0469766
heading_sin,9094.5302734,0.0821349,0.0409039
sog,5566.5170898,0.0502725,0.0250362
vesselId.61e9f466b937134a3c4c0273,2200.5139160,0.0198734,0.0098971
rot,1101.8889160,0.0099514,0.0049559
time,1019.9309082,0.0092112,0.0045873
navstat,672.8659668,0.0060768,0.0030263
vesselId.61e9f466b937134a3c4c0277,667.0029297,0.0060239,0.0029999


In [17]:
performance_lat_sin = gbm_lat_sin.model_performance(test_data=validation_data_shifted)
performance_lat_cos = gbm_lat_cos.model_performance(test_data=validation_data_shifted)


# Print the performance metrics
print(performance_lat_sin)
print(performance_lat_cos)

ModelMetricsRegression: xgboost
** Reported on test data. **

MSE: 0.004088381638205383
RMSE: 0.06394045384735225
MAE: 0.02762457544084362
RMSLE: 0.07094945402813012
Mean Residual Deviance: 0.004088381638205383
ModelMetricsRegression: xgboost
** Reported on test data. **

MSE: 0.0007050074374241704
RMSE: 0.026551976149133804
MAE: 0.014481068739010946
RMSLE: 0.015454602708807616
Mean Residual Deviance: 0.0007050074374241704


In [18]:
gbm_long_sin.train(x=features_long, y=target_long_sin, training_frame=train_data_shifted)
gbm_long_cos.train(x=features_long, y=target_long_cos, training_frame=train_data_shifted)

xgboost Model Build progress: |██████████████████████████

In [None]:
performance_long = gbm_long.model_performance(test_data=validation_data_shifted)
print(performance_long)

In [None]:
lat_predictions_sin = gbm_lat_sin.predict(test_data_with_last_known)
lat_predictions_cos = gbm_lat_cos.predict(test_data_with_last_known)

test_data_with_predicted_lat = test_data_with_last_known
test_data_with_predicted_lat["latitude_sin"] = lat_predictions_sin
test_data_with_predicted_lat["latitude_cos"] = lat_predictions_cos

long_predictions_sin = gbm_long_sin.predict(test_data_with_predicted_lat)
long_predictions_cos = gbm_long_cos.predict(test_data_with_predicted_lat)

In [None]:
lat_predictions = lat_predictions.as_data_frame()
long_predictions = long_predictions.as_data_frame()

In [None]:
predictions = pd.concat([lat_predictions, long_predictions], axis=1)
predictions.columns = ["latitude_predicted", "longitude_predicted"]

In [None]:
predictions["ID"] = test_data["ID"]
predictions = predictions[["ID", "longitude_predicted", "latitude_predicted"]]

In [None]:
print(predictions.columns)

In [None]:
predictions.to_csv("predictions.csv", index=False)