In [None]:
#import tensorflow as tf
import numpy as np
import h2o
import os
import pandas as pd
import dask.dataframe as dd

# from .feature_engineering_filter import Find_correct_port

In [None]:
# Start (or connect to) the H2O backend, passing max_mem_size="4g".
h2o.init(max_mem_size="4g")

In [None]:
# Load the three source files into H2O frames. header=1 marks the first line
# of each file as the column-name row.

# Columns: time|cog|sog|rot|heading|navstat|etaRaw|latitude|longitude|vesselId|portId
train_data = h2o.import_file(path="../Datasets/ais_train.csv", header=1, sep="|")

# Columns: ID,vesselId,time,scaling_factor
test_data = h2o.import_file(path="../Datasets/ais_test.csv", header=1, sep=",")

schedules = h2o.import_file(path="../Datasets/schedules_to_may_2024.csv", header=1, sep="|")

In [None]:
# NOTE(review): a relative import (leading dot) only resolves inside a package;
# executed from a notebook kernel it is expected to raise ImportError — confirm
# and switch to an absolute import if so. `Preprocessing` is not referenced by
# any other visible cell.
from .preprocessing import Preprocessing

# TODO: implement preprocessing

In [None]:
# Convert the current `train_data` and `schedules` frames to pandas DataFrames.
# NOTE(review): neither result is referenced by any other visible cell.
train_data_df = train_data.as_data_frame()
schedules_df = schedules.as_data_frame()

In [None]:

def _report_null_counts(frame):
    """Compute the per-column null counts of ``frame``, print and return them."""
    null_counts = frame.isnull().sum().compute()
    print(null_counts)
    return null_counts


# Re-read the same three CSV files with dask. This rebinds `train_data`,
# `test_data` and `schedules`, replacing whatever those names held before.
train_data = dd.read_csv("../Datasets/ais_train.csv", delimiter="|")
test_data = dd.read_csv("../Datasets/ais_test.csv", delimiter=",")
schedules = dd.read_csv("../Datasets/schedules_to_may_2024.csv", delimiter="|")

nan_counts_before = _report_null_counts(schedules)

# Forward-fill: each missing value takes the last non-missing value above it
# in the same column.
# NOTE(review): the fill is not grouped by vessel, so a value may be copied
# from a row belonging to a different vessel — confirm this is acceptable.
schedules = schedules.ffill()

nan_counts_after = _report_null_counts(schedules)


In [None]:
# feature engineering
from h2o.frame import H2OFrame


def append_current_schedule(
    queries_frame: H2OFrame, schedules_frame: H2OFrame
) -> H2OFrame:
    """Merge queries with schedule rows and keep those inside the date window.

    Args:
        queries_frame: frame holding the query rows; must contain ``time``.
        schedules_frame: frame containing ``vesselId``, ``portLatitude``,
            ``portLongitude``, ``arrivalDate`` and ``sailingDate``.

    Returns:
        The merged rows for which ``sailingDate <= time <= arrivalDate``.

    The merge is called without explicit keys, so the join columns are
    whatever H2O picks by default (presumably the shared ``vesselId`` — verify).
    NOTE(review): the window keeps rows whose time lies between sailingDate
    and arrivalDate; confirm that is the intended orientation of the two dates.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"
    schedule_columns = [
        "vesselId", "portLatitude", "portLongitude", "arrivalDate", "sailingDate"
    ]

    merged = queries_frame.merge(schedules_frame[schedule_columns])
    print(merged.columns)

    # Parse the three timestamp columns with the same format string.
    for column in ("time", "arrivalDate", "sailingDate"):
        merged[column] = merged[column].as_date(timestamp_format)

    not_after_arrival = merged["time"] <= merged["arrivalDate"]
    not_before_sailing = merged["time"] >= merged["sailingDate"]
    return merged[not_after_arrival & not_before_sailing]


# Apply the schedule join to the training data and show the first rows.
# NOTE(review): an earlier cell rebinds `train_data` and `schedules` to dask
# DataFrames, so on a top-to-bottom run this call does not receive the
# H2OFrames the function above is annotated for — confirm this cell is still
# meant to run.
train_data_appended = append_current_schedule(train_data, schedules)
print(train_data_appended.head())

In [None]:
def append_current_schedule(queries_frame: dd.DataFrame, schedules_frame: dd.DataFrame) -> dd.DataFrame:
    """Attach to each query row the next scheduled port arrival of its vessel.

    Args:
        queries_frame: dask frame with at least ``vesselId`` and a ``time``
            column formatted as ``%Y-%m-%d %H:%M:%S``.
        schedules_frame: dask frame with ``vesselId``, ``portLatitude``,
            ``portLongitude``, ``arrivalDate`` and ``sailingDate``. A
            ``+HH:MM`` offset in the two date columns is stripped before
            parsing.

    Returns:
        A lazy dask frame with the query columns plus the schedule columns of
        the first schedule row of the same vessel whose ``arrivalDate`` is at
        or after ``time`` (``direction="forward"``). Rows are ordered by
        ``time``, not by their position in ``queries_frame``.

    Neither input frame is modified: parsed columns are written to new frames
    instead of being assigned back onto the caller's objects.
    """
    timestamp_format = "%Y-%m-%d %H:%M:%S"

    def _parse_schedule_date(column):
        # Drop the timezone offset (+00:00) and parse the remaining timestamp.
        without_offset = column.astype(str).str.replace(r"\+\d{2}:\d{2}", "", regex=True)
        return dd.to_datetime(without_offset, format=timestamp_format)

    print(queries_frame["time"].head())

    # Keep only the schedule columns needed for the join.
    schedules_frame_stripped = schedules_frame[
        ["vesselId", "portLatitude", "portLongitude", "arrivalDate", "sailingDate"]
    ]
    print("Schedules frame stripped to relevant columns")

    # Convert timestamps to datetime on fresh frames (no caller-side mutation).
    queries_parsed = queries_frame.assign(
        time=dd.to_datetime(queries_frame["time"], format=timestamp_format)
    )
    schedules_parsed = schedules_frame_stripped.assign(
        arrivalDate=_parse_schedule_date(schedules_frame_stripped["arrivalDate"]),
        sailingDate=_parse_schedule_date(schedules_frame_stripped["sailingDate"]),
    )
    print("Converted time and date columns to correct format")
    print(queries_parsed["time"].head())
    print(schedules_parsed["arrivalDate"].head())

    # merge_asof requires both sides to be sorted by their merge key.
    queries_sorted = queries_parsed.sort_values("time")
    schedules_sorted = schedules_parsed.sort_values("arrivalDate")

    print(schedules_sorted.head())

    # As-of join per vessel; the sorted query frame is the left side.
    queries_frame_merged = dd.merge_asof(
        queries_sorted,
        schedules_sorted,
        left_on="time",
        right_on="arrivalDate",
        by="vesselId",
        direction="forward",
    )
    print("Merged data successfully")
    print(queries_frame_merged.columns)

    # sailingDate is carried through unfiltered; no time-window filter is
    # applied to the merged rows.
    return queries_frame_merged

# Run the schedule join on the test set and cache the result on disk.
print("Displaying the head of the schedules frame for reference:")
print(schedules.head())

# `test_data` and `schedules` are the dask DataFrames loaded above.
# NOTE(review): the result is named `train_data_appended` although it holds
# the test set; the name is kept so existing references keep working.
train_data_appended = append_current_schedule(test_data, schedules)

print("sucsessfully generated data, attempting to store")

# Materialise the dask graph once and reuse the pandas result for both the
# CSV export and the NaN report (computing twice would redo the whole join).
appended_df = train_data_appended.compute()

# Export the joined frame to a CSV file, creating the folder if needed.
os.makedirs("intermediate", exist_ok=True)
appended_df.to_csv("intermediate/test_data_with_schedule_whole.csv", index=False)

nan_counts_after = appended_df.isnull().sum()
print(nan_counts_after)

In [None]:
# Load the cached schedule-augmented training set.
# NOTE(review): no visible cell writes train_data_with_schedule_whole.csv (only
# the test variant is exported) — presumably it comes from an earlier run of
# the export cell with the training data; confirm the file exists.
train_data_with_schedule=pd.read_csv("intermediate/train_data_with_schedule_whole.csv", delimiter=",")

In [None]:
# Load the cached schedule-augmented test set written by the export cell.
test_data_with_schedule=pd.read_csv("intermediate/test_data_with_schedule_whole.csv", delimiter=",")

In [None]:
# Load the pipe-separated ports file.
# NOTE(review): `port_data` is not referenced by any other visible cell.
port_data=pd.read_csv("../Datasets/ports.csv", delimiter="|")

In [None]:
# NOTE(review): this is a bare attribute reference, not a call — nothing is
# shut down here; the cell only evaluates to the function object. Later cells
# still construct H2OFrames, so adding parentheses would break them; confirm
# whether this cell should simply be removed.
h2o.shutdown

In [None]:
print(train_data_with_schedule.columns)

# Columns kept for the schedule-aware model (features + both targets).
schedule_training_columns = [
    "time", "latitude", "longitude", "vesselId",
    "portLatitude", "portLongitude", "arrivalDate",
]
# Columns kept for the fallback model, which uses no schedule information.
plain_training_columns = ["time", "vesselId", "latitude", "longitude"]

train_data_with_schedule = train_data_with_schedule[schedule_training_columns]
# Derived before any rows are dropped from the schedule-aware frame.
train_data_without_schedule = train_data_with_schedule[plain_training_columns]


In [None]:
# Feature columns for the schedule-aware model and for the fallback model.
schedule_test_columns = ["time","vesselId","portLatitude","portLongitude","arrivalDate"]
plain_test_columns = ["time","vesselId"]

test_data_with_schedule = test_data_with_schedule[schedule_test_columns]
test_data_without_schedule = test_data_with_schedule[plain_test_columns]

# Record each row's position as `id` so rows can be matched back later.
# NOTE(review): this uses the row position in the cached CSV as the id —
# presumably that equals the test set's own ID; verify against the source file.
test_data_with_schedule = test_data_with_schedule.assign(
    id=test_data_without_schedule.index
)

In [None]:
# Drop every training row that has a missing value in any remaining column.
train_data_with_schedule=train_data_with_schedule.dropna()

In [None]:
# Drop every test row that has a missing value in any remaining column.
# `test_data_without_schedule` was derived earlier and is not affected.
test_data_with_schedule=test_data_with_schedule.dropna()

In [None]:
# Convert the timestamp columns to whole seconds since the Unix epoch.
# "int64" is requested explicitly because plain `int` is platform-dependent.
# `.assign` builds new frames instead of writing into column slices.
train_data_with_schedule = train_data_with_schedule.assign(
    time=pd.to_datetime(train_data_with_schedule["time"]).astype("int64") // 10**9,
    arrivalDate=pd.to_datetime(train_data_with_schedule["arrivalDate"]).astype("int64") // 10**9,
)
train_data_without_schedule = train_data_without_schedule.assign(
    time=pd.to_datetime(train_data_without_schedule["time"]).astype("int64") // 10**9
)

# Convert the pandas DataFrames to H2O Frames. Each H2O frame is built from
# its own pandas frame (the fallback frame was previously built from the
# schedule-aware one by mistake).
train_data_with_schedule = h2o.H2OFrame(train_data_with_schedule)
train_data_without_schedule = h2o.H2OFrame(train_data_without_schedule)

In [None]:
# Show the last rows of whatever `train_data` is currently bound to (an
# earlier cell rebinds it to a dask DataFrame).
print(train_data.tail())

In [None]:
# Convert the `time` column of both test frames to whole seconds since the
# Unix epoch. "int64" is requested explicitly because plain `int` is
# platform-dependent; `.assign` avoids writing into column slices.
test_data_with_schedule = test_data_with_schedule.assign(
    time=pd.to_datetime(test_data_with_schedule["time"]).astype("int64") // 10**9
)
test_data_without_schedule = test_data_without_schedule.assign(
    time=pd.to_datetime(test_data_without_schedule["time"]).astype("int64") // 10**9
)

# Convert the pandas DataFrames to H2O Frames.
test_data_with_schedule = h2o.H2OFrame(test_data_with_schedule)
test_data_without_schedule = h2o.H2OFrame(test_data_without_schedule)


In [None]:
# Show the first rows of both test frames, schedule-aware frame first.
for test_frame in (test_data_with_schedule, test_data_without_schedule):
    print(test_frame.head())

In [None]:
# Split `train_data` into two parts with ratio 0.7 and a fixed seed.
# NOTE(review): `train_data` is rebound to a dask DataFrame by an earlier
# cell, and neither result is used by the live training cells below — this
# cell looks like a leftover from the H2O-only version; confirm.
splits = train_data.split_frame(
    ratios=[0.7], seed=1
)  # 70% for train_data, 30% for validation_data

# First part becomes the training set, second part the validation set.
train_data = splits[0]
validation_data = splits[1]

In [None]:
# Print the columns of the schedule-aware test frame, then drop the
# `scaling_factor` column from `test_data` (a different frame).
print(test_data_with_schedule.columns)
#test_data = test_data.drop("ID", axis=1)
test_data = test_data.drop("scaling_factor", axis=1)

In [None]:
# Show the columns left in `test_data`.
print(test_data.columns)

In [None]:
# Predictors for the fallback model (no schedule information).
features_without_schedule = ["time", "vesselId"]

# Predictors for the schedule-aware model: the same two plus the port columns.
features_with_schedule = features_without_schedule + [
    "portLatitude",
    "portLongitude",
    "arrivalDate",
]

# Target columns; one model is trained per coordinate.
target_long = "longitude"
target_lat = "latitude"

# target_cog = "cog"
# target_sog = "sog"
# target_rot = "rot"
# target_heading = "heading"
# target_navstat = "navstat"
# # target_etaRaw = "etaRaw" #Remove etaRaw because it requires preprocessing
# target_portId = "portId"

In [None]:
# Candidate values for a grid search over tree count and learning rate.
# NOTE(review): only referenced by the commented-out grid-search cells.
hyper_params = {"ntrees": [50, 100, 200], "learn_rate": [0.01, 0.1, 0.2, 0.3]}

In [None]:
# One default-configured XGBoost estimator per (coordinate, feature-set) pair.
# The `gbm_` prefix is historical; all four are H2OXGBoostEstimator instances.
(
    gbm_lat_with_schedule,
    gbm_long_with_schedule,
    gbm_lat_without_schedule,
    gbm_long_without_schedule,
) = (h2o.estimators.H2OXGBoostEstimator() for _ in range(4))

# gbm_cog = h2o.estimators.H2OXGBoostEstimator()
# gbm_sog = h2o.estimators.H2OXGBoostEstimator()
# gbm_rot = h2o.estimators.H2OXGBoostEstimator()
# gbm_heading = h2o.estimators.H2OXGBoostEstimator()
# gbm_navstat = h2o.estimators.H2OXGBoostEstimator()
# # gbm_etaRaw = h2o.esti#mators.H2OXGBoostEstimator() #Remove etaRaw because it requires preprocessing
# # gbm_portId = h2o.estimators.H2OXGBoostEstimator()

In [None]:
# NOTE(review): `gbm_cog`, `target_cog` and `features_test` have no live
# definition in the visible cells (the first two appear only in commented-out
# lines) — this cell looks stale; confirm before running.
# Fit the cog model.
gbm_cog.train(x=features_test, y=target_cog, training_frame=train_data)

# Report MSE on the validation frame, then predict cog for the test frame.
cog_mse = gbm_cog.model_performance(validation_data).mse()
print("cog_mse: ", cog_mse)
cog_predicted = gbm_cog.predict(test_data)


# Ask H2O to remove the model object once predictions are taken.
h2o.remove(gbm_cog)

In [None]:
# NOTE(review): `gbm_sog`, `target_sog` and `features_test` have no live
# definition in the visible cells — this cell looks stale; confirm.
# Fit the sog model, report validation MSE, predict, then remove the model.
gbm_sog.train(x=features_test, y=target_sog, training_frame=train_data)
sog_mse = gbm_sog.model_performance(validation_data).mse()
print("sog_mse: ", sog_mse)
sog_predicted = gbm_sog.predict(test_data)
h2o.remove(gbm_sog)

In [None]:
# NOTE(review): `gbm_rot`, `target_rot` and `features_test` have no live
# definition in the visible cells — this cell looks stale; confirm.
# Fit the rot model, report validation MSE, predict, then remove the model.
gbm_rot.train(x=features_test, y=target_rot, training_frame=train_data)
rot_mse = gbm_rot.model_performance(validation_data).mse()
print("rot_mse: ", rot_mse)
rot_predicted = gbm_rot.predict(test_data)
h2o.remove(gbm_rot)

In [None]:
# NOTE(review): `gbm_heading`, `target_heading` and `features_test` have no
# live definition in the visible cells — this cell looks stale; confirm.
# Fit the heading model, report validation MSE, predict, then remove the model.
gbm_heading.train(x=features_test, y=target_heading, training_frame=train_data)
heading_mse = gbm_heading.model_performance(validation_data).mse()
print("heading_mse: ", heading_mse)
heading_predicted = gbm_heading.predict(test_data)
h2o.remove(gbm_heading)

In [None]:
# NOTE(review): `gbm_navstat`, `target_navstat` and `features_test` have no
# live definition in the visible cells — this cell looks stale; confirm.
# Convert the navstat column to a factor (categorical) before training.
train_data[target_navstat] = train_data[target_navstat].asfactor()

# Fit the navstat model, score it on the validation frame and predict.
gbm_navstat.train(x=features_test, y=target_navstat, training_frame=train_data)
navstat_performance = gbm_navstat.model_performance(validation_data)
navstat_predicted = gbm_navstat.predict(test_data)

In [None]:
# Re-score the navstat model after converting the validation target to a
# factor, then print the metrics.
# NOTE(review): `gbm_navstat` and `target_navstat` have no live definition in
# the visible cells — this cell looks stale; confirm before running.
validation_data[target_navstat] = validation_data[target_navstat].asfactor()
navstat_performance = gbm_navstat.model_performance(validation_data)

# The first value is the MSE, so it is labelled as such (it used to be
# printed under the label "navstat_precision").
print("navstat_mse: ", navstat_performance.mse())
print("navstat_recall: ", navstat_performance.recall())
print("navstat_f1: ", navstat_performance.F1())
print("navstat_auc: ", navstat_performance.auc())

In [None]:
# gbm_etaRaw.train(x=features_test, y=target_etaRaw, training_frame=train_data)
# etaRaw_predicted = gbm_etaRaw.predict(test_data)
# h2o.remove(gbm_etaRaw)

In [None]:
# gbm_portId.train(x=features_test, y=target_portId, training_frame=train_data)
# portId_predicted = gbm_portId.predict(test_data)
# h2o.remove(gbm_portId)

In [None]:

# Fit the schedule-aware longitude and latitude models on the schedule-aware
# training frame, using the schedule feature list.
gbm_long_with_schedule.train(x=features_with_schedule, y=target_long, training_frame=train_data_with_schedule)
gbm_lat_with_schedule.train(x=features_with_schedule, y=target_lat, training_frame=train_data_with_schedule)

In [None]:
# Fit the fallback longitude and latitude models, which use only the
# features listed in `features_without_schedule`.
gbm_long_without_schedule.train(x=features_without_schedule, y=target_long, training_frame=train_data_without_schedule)
gbm_lat_without_schedule.train(x=features_without_schedule, y=target_lat, training_frame=train_data_without_schedule)

In [None]:
# Rename the single column of each auxiliary prediction frame to its target.
# NOTE(review): these frames come from the auxiliary-target cells above,
# which reference undefined models — confirm this cell is still needed.
cog_predicted.set_names(["cog"])
sog_predicted.set_names(["sog"])
rot_predicted.set_names(["rot"])
heading_predicted.set_names(["heading"])
navstat_predicted.set_names(["navstat"])
# etaRaw_predicted.set_names(["etaRaw"])
# portId_predicted.set_names(["portId"])

In [None]:
# Append every auxiliary prediction column to the test frame. Each cbind
# starts from the running result; binding to `test_data` each time would
# discard all but the last column.
# NOTE(review): the *_predicted frames come from cells that reference
# undefined models, and `test_data` is rebound to a dask DataFrame by an
# earlier cell — confirm this cell is still needed.
test_data_predicted = test_data
for predicted_column in (
    cog_predicted,
    sog_predicted,
    rot_predicted,
    heading_predicted,
    navstat_predicted,
):
    test_data_predicted = test_data_predicted.cbind(predicted_column)
# test_data_predicted = test_data.cbind(etaRaw_predicted)
# test_data_predicted = test_data.cbind(portId_predicted)

In [None]:
# grid_lat = h2o.grid.grid_search.H2OGridSearch(gbm_lat, hyper_params)
# grid_lat.train(x=features, y=target_lat, training_frame=train_data)

In [None]:
# gridperf = grid_lat.get_grid(sort_by="mse", decreasing=True)

# gbm_lat = gridperf.models[0]

In [None]:
# print(gbm_lat.params["learn_rate"]["actual"])
# print(gbm_lat.params["ntrees"]["actual"])

# 0.01
# 50

In [None]:
# gbm_long = h2o.estimators.H2OXGBoostEstimator(
#     learn_rate=gbm_lat.params["learn_rate"]["actual"],
#     ntrees=gbm_lat.params["ntrees"]["actual"],
# )

In [None]:
# gbm_long.train(x=features, y=target_long, training_frame=train_data)

In [129]:
# Predict latitude and longitude for the test rows that kept their schedule
# columns (the frame from which rows with missing values were dropped).
lat_predictions_with_schedule = gbm_lat_with_schedule.predict(test_data_with_schedule)
long_predictions_with_schedule = gbm_long_with_schedule.predict(test_data_with_schedule)

xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%


In [130]:
# Predict latitude and longitude with the fallback models on the frame that
# holds only `time` and `vesselId`.
lat_predictions_without_schedule = gbm_lat_without_schedule.predict(test_data_without_schedule)
long_predictions_without_schedule = gbm_long_without_schedule.predict(test_data_without_schedule)

xgboost prediction progress: |███████████████████████████████████████████████████| (done) 100%
xgboost prediction progress: |



███████████████████████████████████████████████████| (done) 100%


In [131]:
# Convert the four H2O prediction frames to pandas, reusing the same names.
# Re-running this cell without re-running the prediction cells would call
# as_data_frame() on objects that are already pandas DataFrames.
lat_predictions_with_schedule = lat_predictions_with_schedule.as_data_frame()
long_predictions_with_schedule = long_predictions_with_schedule.as_data_frame()
lat_predictions_without_schedule = lat_predictions_without_schedule.as_data_frame()
long_predictions_without_schedule = long_predictions_without_schedule.as_data_frame()







In [142]:
# Place latitude and longitude predictions side by side and give both result
# frames the same two column names.
prediction_columns = ["latitude_predicted", "longitude_predicted"]

predictions_with_schedule_df = pd.concat(
    [lat_predictions_with_schedule, long_predictions_with_schedule], axis=1
).set_axis(prediction_columns, axis=1)

predictions_without_schedule_df = pd.concat(
    [lat_predictions_without_schedule, long_predictions_without_schedule], axis=1
).set_axis(prediction_columns, axis=1)

In [133]:
# Copy the `id` column of the schedule-aware test frame onto its predictions.
# The assignment aligns on the pandas index — presumably both sides carry a
# default 0..n-1 index here; verify.
predictions_with_schedule_df["id"]=test_data_with_schedule.as_data_frame()["id"]




In [134]:
# Re-index the schedule-aware predictions by `id` (the column is consumed).
predictions_with_schedule_df = predictions_with_schedule_df.set_index('id')

In [135]:
# Overwrite, in place, the fallback predictions with the schedule-aware ones
# wherever the index labels and column names match; other rows keep their
# fallback values. Relies on the schedule-aware frame being indexed by `id`.
predictions_without_schedule_df.update(predictions_with_schedule_df)


In [143]:
# Expose the row index as an `ID` column and order the columns as
# ID, longitude, latitude.
predictions_without_schedule_df = predictions_without_schedule_df.assign(
    ID=predictions_without_schedule_df.index
)[["ID", "longitude_predicted", "latitude_predicted"]]

In [144]:
# Check the final column order before writing the file.
print(predictions_without_schedule_df.columns)

Index(['ID', 'longitude_predicted', 'latitude_predicted'], dtype='object')


In [145]:
# Write the combined predictions to predictions.csv without the pandas index
# (the ID is already a regular column).
predictions_without_schedule_df.to_csv("predictions.csv", index=False)