# XGBOOST 1

In [9]:
import numpy as np
import h2o
import os
import pandas as pd

## Data retrieval

In [10]:
# AIS position reports: the training file is pipe-separated, the test file comma-separated.
train_data = pd.read_csv("../Datasets/ais_train.csv", sep="|")
test_data = pd.read_csv("../Datasets/ais_test.csv", sep=",")

In [11]:
# Static lookup tables (both pipe-separated): one for vessels, one for ports.
vessel_data = pd.read_csv("../Datasets/vessels.csv", sep="|")
port_data = pd.read_csv("../Datasets/ports.csv", sep="|")

In [12]:
# Parse the "time" column of both frames into datetimes so it can be sorted and subtracted.
for _frame in (train_data, test_data):
    _frame["time"] = pd.to_datetime(_frame["time"])

In [13]:
# Keep only the port id and its coordinates, prefixing the coordinates with "port_"
# so they do not collide with the vessel's own latitude/longitude after the join.
port_data_renamed = port_data[["portId", "latitude", "longitude"]].rename(
    columns={"latitude": "port_latitude", "longitude": "port_longitude"}
)
# Left join: every AIS row is kept; rows without a matching portId get NaN port coordinates.
train_data = train_data.merge(port_data_renamed, on="portId", how="left")

## Preprocessing

In [14]:
# NOTE: this binds a second name to the same DataFrame (no copy is made), so the
# in-place .loc assignments below also change `train_data`.
train_data_preprocessed = train_data

# Replace out-of-range / placeholder readings with NaN so they can be imputed below
# (presumably these thresholds are the AIS "not available" codes - TODO confirm).
train_data_preprocessed.loc[train_data_preprocessed["cog"] >= 360, "cog"] = np.nan
train_data_preprocessed.loc[train_data_preprocessed["sog"] >= 1023, "sog"] = np.nan
train_data_preprocessed.loc[train_data_preprocessed["rot"] == -128, "rot"] = np.nan
train_data_preprocessed.loc[train_data_preprocessed["heading"] == 511, "heading"] = (
    np.nan
)

# Keep etaRaw only where it has the exact "MM-DD HH:MM" shape; anything else
# (including missing values, because of na=False) becomes NaN.
pattern = r"^\d{2}-\d{2} \d{2}:\d{2}$"
train_data_preprocessed["etaRaw"] = train_data_preprocessed["etaRaw"].where(
    train_data_preprocessed["etaRaw"].str.match(pattern, na=False), np.nan
)


# Chronological order is required so that the forward/backward fill below
# propagates values along each vessel's timeline.
train_data_preprocessed = train_data_preprocessed.sort_values("time")


# Forward fill, then backward fill, the missing values within each vessel.
# The resulting frame is ordered by vessel and gets a fresh 0..n-1 index.
train_data_preprocessed = (
    train_data_preprocessed.groupby("vesselId")
    .apply(lambda group: group.ffill().bfill())
    .reset_index(drop=True)
)


# A heading that is still missing (no valid heading anywhere for that vessel) is set to 0.
train_data_preprocessed["heading"] = train_data_preprocessed["heading"].fillna(0)

# Drop every row that still has a NaN in any column.
train_data_preprocessed = train_data_preprocessed.dropna().reset_index(drop=True)

## Preprocess the etaRaw column
# The steps below rewrite field values that cannot be parsed as a date/time.
# A month of "00" is replaced with "01".
train_data_preprocessed["etaRaw"] = train_data_preprocessed["etaRaw"].mask(
    train_data_preprocessed["etaRaw"].str.contains("00-", na=False),
    "01" + train_data_preprocessed["etaRaw"].str[2:],
)

# A day of "00" is replaced with "01".
train_data_preprocessed["etaRaw"] = train_data_preprocessed["etaRaw"].mask(
    train_data_preprocessed["etaRaw"].str.contains("-00", na=False),
    train_data_preprocessed["etaRaw"].str[:2]
    + "-01"
    + train_data_preprocessed["etaRaw"].str[5:],
)

# A minute value of "60" is replaced with "59".
train_data_preprocessed["etaRaw"] = train_data_preprocessed["etaRaw"].mask(
    train_data_preprocessed["etaRaw"].str.contains(":60", na=False),
    train_data_preprocessed["etaRaw"].str[:9] + "59",
)

# An hour value of "60" replaces the whole time part with "01:00".
train_data_preprocessed["etaRaw"] = train_data_preprocessed["etaRaw"].mask(
    train_data_preprocessed["etaRaw"].str.contains("60:", na=False),
    train_data_preprocessed["etaRaw"].str[:6] + "01:00",
)

# An hour value of "24" replaces the whole time part with "23:59".
train_data_preprocessed["etaRaw"] = train_data_preprocessed["etaRaw"].mask(
    train_data_preprocessed["etaRaw"].str.contains("24:", na=False),
    train_data_preprocessed["etaRaw"].str[:6] + "23:59",
)


# Build a full timestamp by prefixing the year of the message's own "time" and
# appending ":00" seconds. With an explicit format, any value that is still not a
# valid date raises here.
# NOTE(review): an ETA that falls in the following calendar year still receives
# the message's year - confirm this is acceptable for the data at hand.
train_data_preprocessed["etaRaw"] = pd.to_datetime(
    train_data_preprocessed["time"].dt.year.astype(str)
    + "-"
    + train_data_preprocessed["etaRaw"]
    + ":00",
    format="%Y-%m-%d %H:%M:%S",
)

  .apply(lambda group: group.ffill().bfill())


## Feature Engineering

In [15]:
# Seconds between the message timestamp and the reported ETA
# (negative when the ETA lies before the message time).
train_data_preprocessed["seconds_to_eta"] = (
    train_data_preprocessed["etaRaw"] - train_data_preprocessed["time"]
).dt.total_seconds()

train_data_preprocessed = train_data_preprocessed.drop(columns=["etaRaw"])

# calculate the radians of all circular features
train_latitude_radians = np.deg2rad(train_data_preprocessed["latitude"])
train_longitude_radians = np.deg2rad(train_data_preprocessed["longitude"])
train_cog_radians = np.deg2rad(train_data_preprocessed["cog"])
train_heading_radians = np.deg2rad(train_data_preprocessed["heading"])

port_latitude_radians = np.deg2rad(train_data_preprocessed["port_latitude"])
port_longitude_radians = np.deg2rad(train_data_preprocessed["port_longitude"])

# Calendar fields are mapped onto a circle before being encoded.
# NOTE(review): the day is scaled by 360 / 30, so day 31 wraps past a full turn;
# the day encodings are dropped at the end of this cell, so the final frame is unaffected.
train_hour = np.deg2rad(train_data_preprocessed["time"].dt.hour * 360 / 24)
train_day = np.deg2rad(train_data_preprocessed["time"].dt.day * 360 / 30)
train_month = np.deg2rad(train_data_preprocessed["time"].dt.month * 360 / 12)

# calculate the sin and cos encodings of the radians
train_latitude_sin = np.sin(train_latitude_radians)
train_latitude_cos = np.cos(train_latitude_radians)
train_longitude_sin = np.sin(train_longitude_radians)
train_longitude_cos = np.cos(train_longitude_radians)

port_latitude_sin = np.sin(port_latitude_radians)
port_latitude_cos = np.cos(port_latitude_radians)
port_longitude_sin = np.sin(port_longitude_radians)
port_longitude_cos = np.cos(port_longitude_radians)

train_cog_sin = np.sin(train_cog_radians)
train_cog_cos = np.cos(train_cog_radians)

train_heading_sin = np.sin(train_heading_radians)
train_heading_cos = np.cos(train_heading_radians)

train_hour_sin = np.sin(train_hour)
train_hour_cos = np.cos(train_hour)

train_day_sin = np.sin(train_day)
train_day_cos = np.cos(train_day)

train_month_sin = np.sin(train_month)
train_month_cos = np.cos(train_month)

# generate new features of encodings
train_data_preprocessed["latitude_sin"] = train_latitude_sin
train_data_preprocessed["latitude_cos"] = train_latitude_cos
train_data_preprocessed["longitude_sin"] = train_longitude_sin
train_data_preprocessed["longitude_cos"] = train_longitude_cos
# The port columns must be built from the port coordinates, not from the vessel's
# own position; otherwise they are exact duplicates of the four columns above.
train_data_preprocessed["port_latitude_sin"] = port_latitude_sin
train_data_preprocessed["port_latitude_cos"] = port_latitude_cos
train_data_preprocessed["port_longitude_sin"] = port_longitude_sin
train_data_preprocessed["port_longitude_cos"] = port_longitude_cos
train_data_preprocessed["cog_sin"] = train_cog_sin
train_data_preprocessed["cog_cos"] = train_cog_cos
train_data_preprocessed["heading_sin"] = train_heading_sin
train_data_preprocessed["heading_cos"] = train_heading_cos

train_data_preprocessed["hour_sin"] = train_hour_sin
train_data_preprocessed["hour_cos"] = train_hour_cos
train_data_preprocessed["day_sin"] = train_day_sin
train_data_preprocessed["day_cos"] = train_day_cos
train_data_preprocessed["month_sin"] = train_month_sin
train_data_preprocessed["month_cos"] = train_month_cos

# Velocity components: speed over ground projected onto the course direction.
train_data_preprocessed["cog_sog_sin"] = (
    train_data_preprocessed["cog_sin"] * train_data_preprocessed["sog"]
)
train_data_preprocessed["cog_sog_cos"] = (
    train_data_preprocessed["cog_cos"] * train_data_preprocessed["sog"]
)

# Drop the raw columns and the encodings that are not used as model inputs.
train_data_preprocessed = train_data_preprocessed.drop(
    columns=[
        "latitude",
        "longitude",
        "cog",
        "heading",
        "portId",
        "cog_sin",
        "cog_cos",
        "sog",
        "port_latitude",
        "port_longitude",
        "hour_sin",
        "hour_cos",
        "day_sin",
        "day_cos",
        "month_sin",
        "month_cos",
        "rot",
        "heading_sin",
        "heading_cos",
        "navstat",
    ]
)
print("train_data_preprocessed")
print(train_data_preprocessed.columns)

train_data_preprocessed
Index(['time', 'vesselId', 'seconds_to_eta', 'latitude_sin', 'latitude_cos',
       'longitude_sin', 'longitude_cos', 'port_latitude_sin',
       'port_latitude_cos', 'port_longitude_sin', 'port_longitude_cos',
       'cog_sog_sin', 'cog_sog_cos'],
      dtype='object')


In [16]:
# optimize the dataframe to use lower memory (32bit)
def optimize_dataframe(df):
    """
    Shrink the numeric columns of ``df`` to smaller dtypes.

    ``float64`` columns go through ``pd.to_numeric(..., downcast='float')`` and
    ``int64`` columns through ``pd.to_numeric(..., downcast='integer')``; columns
    of any other dtype are left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    downcast_by_dtype = (("float64", "float"), ("int64", "integer"))
    for source_dtype, downcast_kind in downcast_by_dtype:
        for column_name in df.select_dtypes(include=[source_dtype]).columns:
            df[column_name] = pd.to_numeric(df[column_name], downcast=downcast_kind)
    return df

# Downcast the numeric columns; the frame is modified in place and the same object is rebound.
train_data_preprocessed = optimize_dataframe(train_data_preprocessed)

In [None]:
def Last_known_location_training_data(
    data: pd.DataFrame, max_shift_lengths, max_instances_per_group=1000
) -> pd.DataFrame:
    """Build (earlier state -> current position) training pairs for every vessel.

    For a growing sequence of lags ``k`` (1, 2, 3, ... up to ``max_shift_lengths``),
    each row is paired with the row ``k`` steps earlier *of the same vessel*:

    * ``time``, ``vesselId`` and the four ``latitude_*/longitude_*`` columns keep the
      current row's values (the position columns are the prediction targets),
    * every other input column carries the earlier row's value,
    * ``last_latitude_sin`` ... ``last_longitude_cos`` hold the earlier row's position,
    * ``time_diff`` is the number of seconds between the two rows.

    Rows without an earlier row ``k`` steps back (the first ``k`` rows of each vessel)
    are dropped, so a vessel is never paired with another vessel's history.

    Args:
        data: Frame with at least ``time``, ``vesselId``, ``latitude_sin``,
            ``latitude_cos``, ``longitude_sin`` and ``longitude_cos``. Not modified.
        max_shift_lengths: Largest lag (in rows) to generate.
        max_instances_per_group: Per lag, vessels with more rows than this are
            randomly down-sampled to this many rows (fixed seed).

    Returns:
        All lagged frames concatenated with a fresh index; an empty DataFrame when
        no lag produced any row.
    """
    position_columns = [
        "latitude_sin",
        "latitude_cos",
        "longitude_sin",
        "longitude_cos",
    ]

    # The ordering does not depend on the lag, so sort and group once up front.
    ordered = data.sort_values(["vesselId", "time"]).reset_index(drop=True)
    per_vessel = ordered.groupby("vesselId", sort=False)
    output_columns = (
        list(ordered.columns)
        + ["time_diff"]
        + ["last_" + column for column in position_columns]
    )

    lagged_frames = []
    shift_length = 1
    while shift_length <= max_shift_lengths:
        # Group-wise shift: the first `shift_length` rows of each vessel become NaN
        # instead of inheriting the tail of the previous vessel.
        result = per_vessel.shift(shift_length)

        # `result["time"]` is still the earlier timestamp at this point.
        result["time_diff"] = (ordered["time"] - result["time"]).dt.total_seconds()
        for column in position_columns:
            result["last_" + column] = result[column]
            result[column] = ordered[column]
        result["time"] = ordered["time"]
        result["vesselId"] = ordered["vesselId"]

        # Drops all rows with nan values (i.e. rows without enough history).
        result = result[output_columns].dropna().reset_index(drop=True)

        if not result.empty:
            # Cap the number of rows each vessel contributes for this lag.
            sampled_groups = [
                group.sample(n=max_instances_per_group, random_state=42)
                if len(group) > max_instances_per_group
                else group
                for _, group in result.groupby("vesselId")
            ]
            lagged_frames.append(pd.concat(sampled_groups, ignore_index=True))

        # Lags grow roughly geometrically (k -> k**1.1), but always by at least 1.
        prev_shift_length = shift_length
        shift_length = int(shift_length ** (1.1))
        if shift_length == prev_shift_length:
            shift_length += 1
        print(shift_length)

    if not lagged_frames:
        return pd.DataFrame()
    return pd.concat(lagged_frames, ignore_index=True)

In [None]:

# Expand the preprocessed training data into lagged pairs, using lags of up to 400 rows.
train_data_shifted_df = Last_known_location_training_data(
    data=train_data_preprocessed, max_shift_lengths=400
)

In [None]:
def append_last_known_data_test(
    test_data: pd.DataFrame, known_data: pd.DataFrame
) -> pd.DataFrame:
    """Attach each vessel's most recent known row to the rows of ``test_data``.

    Args:
        test_data: Rows to predict; needs ``vesselId`` and ``time``. Not modified.
        known_data: Historical rows; needs ``vesselId`` and ``time``. For every
            vessel, the row with the latest ``time`` is used.

    Returns:
        One row per ``test_data`` row (same order, fresh 0..n-1 index) holding the
        remaining ``test_data`` columns, the columns of the vessel's latest known
        row (so ``time`` is the *known* timestamp) and ``time_diff``: the seconds
        from that known timestamp to the test row's own ``time``.

    Raises:
        ValueError: If ``test_data`` contains a ``vesselId`` absent from ``known_data``.
    """
    has_history = test_data["vesselId"].isin(known_data["vesselId"])
    if not has_history.all():
        missing_vessels = test_data.loc[~has_history, "vesselId"].unique()
        raise ValueError(
            f"The following vesselIds are missing in known_data: {missing_vessels}"
        )

    # Latest known row of every vessel.
    last_known = (
        known_data.sort_values("time")
        .groupby("vesselId")
        .tail(1)
        .reset_index(drop=True)
    )

    # Re-index the requested timestamps positionally: the merge below returns a fresh
    # RangeIndex, so subtracting by label would yield NaN for any other input index.
    requested_time = test_data["time"].reset_index(drop=True)

    result = pd.merge(
        test_data.drop(columns="time"),
        last_known,
        how="left",
        on="vesselId",
        validate="many_to_one",  # exactly one known row per vessel keeps the row count
    )

    result["time_diff"] = (requested_time - result["time"]).dt.total_seconds()

    return result

In [None]:
# retrieve last known locations for test data
test_data_with_last_known_df = append_last_known_data_test(
    test_data=test_data, known_data=train_data_preprocessed
)

# The joined position columns describe the last known position, so move them to the
# "last_*" names used by the training data and remove the unprefixed originals.
_position_columns = ["latitude_sin", "latitude_cos", "longitude_sin", "longitude_cos"]
for _column in _position_columns:
    test_data_with_last_known_df["last_" + _column] = test_data_with_last_known_df[
        _column
    ]
test_data_with_last_known_df = test_data_with_last_known_df.drop(
    columns=_position_columns
)


In [None]:
# Report the in-memory size of the expanded training frame (bytes divided by 1024**3).
memory_usage_bytes = train_data_shifted_df.memory_usage(deep=True).sum()
memory_usage_gb = memory_usage_bytes / (1024 ** 3)
print(f"DataFrame size: {memory_usage_gb:.2f} GB")

# Start h2o with a 32 GB memory limit; alter as needed.
h2o.init(max_mem_size="32g")

# Hand both pandas frames over to H2O.
train_data_shifted = h2o.H2OFrame(train_data_shifted_df)
test_data_with_last_known = h2o.H2OFrame(test_data_with_last_known_df)

# The pandas originals are no longer needed; unbind them so the memory can be reclaimed.
del train_data_shifted_df, test_data_with_last_known_df

In [None]:
# Seeded split of the training frame: ratio 0.9 for training, the remainder for validation.
(
    train_data_shifted_without_validation,
    validation_data_shifted,
) = train_data_shifted.split_frame(ratios=[0.9], seed=42)

In [None]:
# Predictors and targets of the four regressors; features selected based on feature importance.
# Candidates currently excluded from both lists: vesselId, rot, shippingLineId,
# heading_sin/heading_cos, navstat and the hour/day/month sin/cos encodings.

# Inputs of the two latitude models.
features_lat = [
    "cog_sog_sin",
    "cog_sog_cos",
    "time_diff",
    "seconds_to_eta",
    "last_latitude_sin",
    "last_latitude_cos",
    "last_longitude_sin",
    "last_longitude_cos",
    "port_latitude_sin",
    "port_latitude_cos",
    "port_longitude_sin",
    "port_longitude_cos",
]

# The longitude models use the same inputs plus the latitude encoding
# (at prediction time this is the predicted latitude).
features_long = [*features_lat, "latitude_sin", "latitude_cos"]

# Target columns: one model per sin/cos component.
target_lat_sin = "latitude_sin"
target_lat_cos = "latitude_cos"
target_long_sin = "longitude_sin"
target_long_cos = "longitude_cos"

In [None]:
# Hyper-parameters of the four models, found through grid search.
# Most values are shared; each model only states where it deviates.
_shared_params = {
    "ntrees": 400,  # Maximum number of trees
    "max_depth": 10,  # Maximum depth of each tree
    "min_rows": 15,  # Minimum number of rows per leaf
    "sample_rate": 0.8,  # Row sample rate per tree
    # NOTE(review): in H2O's XGBoost this looks like the per-level column rate
    # (colsample_bylevel); the per-tree one is col_sample_rate_per_tree - confirm intent.
    "col_sample_rate": 0.8,
    "reg_lambda": 1.0,  # L2 regularization term
    "reg_alpha": 0.1,  # L1 regularization term
    "seed": 42,  # Random seed for reproducibility
    "stopping_metric": "AUTO",  # Metric for early stopping
    "distribution": "gaussian",  # Gaussian distribution for regression
    "stopping_rounds": 10,  # Early stopping rounds
}

params_lat_sin = dict(_shared_params)

# Latitude-cosine model: no L1 penalty and no "stopping_rounds" entry.
# NOTE(review): this is the only model without stopping_rounds - confirm it is intentional.
params_lat_cos = {
    key: value for key, value in _shared_params.items() if key != "stopping_rounds"
}
params_lat_cos["reg_alpha"] = 0.0

params_long_sin = dict(_shared_params)

# Longitude-cosine model: more trees and a lower row sample rate.
params_long_cos = {**_shared_params, "ntrees": 600, "sample_rate": 0.7}

# One H2O XGBoost estimator per target component, each with its own parameter set.
_XGBoostEstimator = h2o.estimators.H2OXGBoostEstimator
gbm_lat_sin = _XGBoostEstimator(**params_lat_sin)
gbm_lat_cos = _XGBoostEstimator(**params_lat_cos)
gbm_long_sin = _XGBoostEstimator(**params_long_sin)
gbm_long_cos = _XGBoostEstimator(**params_long_cos)

In [None]:
# Registry that ties each model to the feature list and target column it is trained on.
training_params = {
    name: {"model": model, "features": features, "target": target}
    for name, model, features, target in (
        ("gbm_lat_sin", gbm_lat_sin, features_lat, target_lat_sin),
        ("gbm_lat_cos", gbm_lat_cos, features_lat, target_lat_cos),
        ("gbm_long_sin", gbm_long_sin, features_long, target_long_sin),
        ("gbm_long_cos", gbm_long_cos, features_long, target_long_cos),
    )
}

In [None]:
# Fit the latitude-sine model; metrics are also computed on the held-out validation frame.
gbm_lat_sin.train(
    training_frame=train_data_shifted_without_validation,
    validation_frame=validation_data_shifted,
    x=features_lat,
    y=target_lat_sin,
)

In [None]:
# Fit the latitude-cosine model on the same features and frames as the sine model.
gbm_lat_cos.train(
    training_frame=train_data_shifted_without_validation,
    validation_frame=validation_data_shifted,
    x=features_lat,
    y=target_lat_cos,
)

In [None]:
# Score both latitude models on the validation frame ...
performance_lat_sin, performance_lat_cos = (
    model.model_performance(test_data=validation_data_shifted)
    for model in (gbm_lat_sin, gbm_lat_cos)
)

# ... and print the performance metrics.
for _performance in (performance_lat_sin, performance_lat_cos):
    print(_performance)

In [None]:
# Fit the longitude-sine model; its feature list includes the latitude encoding.
gbm_long_sin.train(
    training_frame=train_data_shifted_without_validation,
    validation_frame=validation_data_shifted,
    x=features_long,
    y=target_long_sin,
)
# MSE: 0.00023605270678827214 (value noted from an earlier run)

In [None]:
# Fit the longitude-cosine model on the same features and frames as the sine model.
gbm_long_cos.train(
    training_frame=train_data_shifted_without_validation,
    validation_frame=validation_data_shifted,
    x=features_long,
    y=target_long_cos,
)

In [None]:
# Score both longitude models on the validation frame, then print the metrics.
performance_long_sin, performance_long_cos = (
    model.model_performance(test_data=validation_data_shifted)
    for model in (gbm_long_sin, gbm_long_cos)
)

for _performance in (performance_long_sin, performance_long_cos):
    print(_performance)

In [None]:
# Second name for the same H2OFrame (no copy): columns added through either name
# are visible through the other.
test_data_with_predicted_lat = test_data_with_last_known

# Stage 1: predict the latitude encoding from the last known state.
lat_predictions_sin = gbm_lat_sin.predict(test_data_with_last_known)
lat_predictions_cos = gbm_lat_cos.predict(test_data_with_last_known)
test_data_with_predicted_lat["latitude_sin"] = lat_predictions_sin
test_data_with_predicted_lat["latitude_cos"] = lat_predictions_cos

# Stage 2: the longitude models take the predicted latitude encoding as extra input.
long_predictions_sin = gbm_long_sin.predict(test_data_with_predicted_lat)
long_predictions_cos = gbm_long_cos.predict(test_data_with_predicted_lat)
test_data_with_last_known["longitude_sin"] = long_predictions_sin
test_data_with_last_known["longitude_cos"] = long_predictions_cos

In [None]:
# Pull the four H2O prediction columns into pandas. Separate names are used for the
# pandas copies so that the H2O frames stay bound to their original names and this
# cell can be re-run without failing on `.as_data_frame()`.
lat_predictions_sin_df = lat_predictions_sin.as_data_frame()
lat_predictions_cos_df = lat_predictions_cos.as_data_frame()
long_predictions_sin_df = long_predictions_sin.as_data_frame()
long_predictions_cos_df = long_predictions_cos.as_data_frame()

# Convert sine and cosine values back to radians: atan2(sin, cos) recovers the angle.
lat_predictions_radians = np.arctan2(lat_predictions_sin_df, lat_predictions_cos_df)
long_predictions_radians = np.arctan2(long_predictions_sin_df, long_predictions_cos_df)

# Convert radians to degrees
lat_predictions_degrees = np.rad2deg(lat_predictions_radians)
long_predictions_degrees = np.rad2deg(long_predictions_radians)

# Print the first few rows to verify the conversion
print(lat_predictions_degrees.head())
print(long_predictions_degrees.head())

In [None]:
def create_prediction_visualization_data(validation_data):
    """Write predicted and actual positions of a validation frame to ``eval_predictions.csv``.

    The four module-level models (``gbm_lat_sin``, ``gbm_lat_cos``, ``gbm_long_sin``,
    ``gbm_long_cos``) are run on ``validation_data``; predicted and actual sin/cos
    pairs are converted to degrees and saved together with ``vesselId`` and ``time``.

    Args:
        validation_data: H2O frame containing the model features plus
            ``latitude_sin``, ``latitude_cos``, ``longitude_sin``, ``longitude_cos``,
            ``vesselId`` and ``time``.

    Returns:
        None. The result is written to ``eval_predictions.csv`` (index included).
    """

    def _angle_in_degrees(sine, cosine):
        # atan2(sin, cos) recovers the angle in radians; convert it to degrees.
        return np.rad2deg(np.arctan2(sine, cosine))

    # Run every model on the H2O frame first, then move everything into pandas.
    h2o_predictions = [
        model.predict(validation_data)
        for model in (gbm_lat_sin, gbm_lat_cos, gbm_long_sin, gbm_long_cos)
    ]
    lat_sin_df, lat_cos_df, long_sin_df, long_cos_df = (
        prediction.as_data_frame() for prediction in h2o_predictions
    )
    validation_df = validation_data.as_data_frame()

    predicted = pd.concat(
        [
            _angle_in_degrees(lat_sin_df, lat_cos_df),
            _angle_in_degrees(long_sin_df, long_cos_df),
        ],
        axis=1,
    )
    predicted.columns = ["latitude_predicted", "longitude_predicted"]

    actual = pd.concat(
        [
            _angle_in_degrees(
                validation_df["latitude_sin"], validation_df["latitude_cos"]
            ),
            _angle_in_degrees(
                validation_df["longitude_sin"], validation_df["longitude_cos"]
            ),
        ],
        axis=1,
    )
    actual.columns = ["latitude", "longitude"]

    # Column order: predictions, ground truth, then the identifying columns.
    evaluation = pd.concat(
        [predicted, actual, validation_df[["vesselId", "time"]]], axis=1
    )
    evaluation.to_csv("eval_predictions.csv")


# create_prediction_visualization_data(validation_data_shifted)

In [None]:
# Put the predicted latitude and longitude (in degrees) side by side, one row per test sample.
predictions = pd.concat(
    objs=[lat_predictions_degrees, long_predictions_degrees], axis="columns"
)
predictions.columns = ["latitude_predicted", "longitude_predicted"]

In [None]:
# Attach the test IDs (matched on the row index) and order the columns as ID, longitude, latitude.
predictions = predictions.assign(ID=test_data["ID"])[
    ["ID", "longitude_predicted", "latitude_predicted"]
]

In [None]:
# Sanity check: show the column names and their order before writing the file.
print(predictions.columns)

In [None]:
# Write the predictions to predictions.csv without the pandas index column.
predictions.to_csv("predictions.csv", index=False)