In [2]:
import pandas as pd
import numpy as np
from darts import TimeSeries
from darts.models import LightGBMModel
import lightgbm as lgb

In [None]:
import pandas as pd
import numpy as np
from darts import TimeSeries
from darts.models import LightGBMModel
from darts.dataprocessing.transformers import Scaler
import lightgbm as lgb
import warnings

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings("ignore")

# Training data: '|'-separated file; convert the 'time' column to datetimes.
train_df = pd.read_csv('ais_train.csv', sep='|').assign(
    time=lambda frame: pd.to_datetime(frame['time'])
)

# Test data: ','-separated file; same datetime conversion.
test_df = pd.read_csv('ais_test.csv', sep=',').assign(
    time=lambda frame: pd.to_datetime(frame['time'])
)

# Keep only the training rows whose vessel also appears in the test set.
common_vessel_ids = set(train_df['vesselId']) & set(test_df['vesselId'])
train_df = train_df.loc[train_df['vesselId'].isin(common_vessel_ids)]

# Columns carried through the hourly resampling step.
feature_columns = ['latitude', 'longitude', 'sog', 'cog']  # Add more if available

# Per-vessel outputs, all keyed by vesselId:
#   target_series_dict   -> TimeSeries with latitude/longitude
#   past_covariates_dict -> TimeSeries with sog/cog (or None when absent)
#   last_train_time      -> last timestamp of the resampled hourly index
target_series_dict, past_covariates_dict, last_train_time = {}, {}, {}

for vessel_id, vessel_rows in train_df.groupby('vesselId'):
    # Order by time, index on it, then collapse to hourly means and fill the
    # empty hours by linear interpolation.
    hourly_df = (
        vessel_rows
        .sort_values('time')
        .set_index('time')[feature_columns]
        .resample('H')
        .mean()
        .interpolate(method='linear')
    )

    # Target series: the position columns.
    target_series_dict[vessel_id] = TimeSeries.from_dataframe(
        hourly_df[['latitude', 'longitude']]
    )

    # Past covariates: sog/cog when both columns are present, else None.
    has_motion_columns = {'sog', 'cog'}.issubset(hourly_df.columns)
    past_covariates_dict[vessel_id] = (
        TimeSeries.from_dataframe(hourly_df[['sog', 'cog']])
        if has_motion_columns
        else None
    )

    # Remember where the training data ends; forecast horizons are measured
    # from this timestamp.
    last_train_time[vessel_id] = hourly_df.index.max()

# Forecast store: vessel_id -> (forecast TimeSeries, last training timestamp).
predictions = {}

# Latest requested timestamp per vessel, computed in a single pass over
# test_df instead of re-filtering the whole test frame for every vessel.
# Only the maximum is needed because the horizon is driven by the furthest
# test time.
latest_test_time = test_df.groupby('vesselId')['time'].max().to_dict()

# Fit one LightGBM model per vessel and forecast up to its furthest test time.
for vessel_id, target_ts in target_series_dict.items():
    past_covariates_ts = past_covariates_dict[vessel_id]
    # Get the last training time
    last_time = last_train_time[vessel_id]

    if vessel_id not in latest_test_time:
        continue  # Skip if no test times for this vessel

    # Hours between the end of the training series and the furthest test time.
    horizon_hours = (latest_test_time[vessel_id] - last_time).total_seconds() / 3600
    if np.isnan(horizon_hours):
        continue  # Skip if the vessel has no valid (non-NaT) test time

    # Get the maximum forecast horizon needed (whole hours, rounded up)
    max_n = int(np.ceil(horizon_hours))
    if max_n <= 0:
        continue  # Skip if no future times to predict

    # Scale the target; the same scaler is reused to invert the forecast.
    scaler_target = Scaler()
    target_ts_scaled = scaler_target.fit_transform(target_ts)

    # Scale the past covariates when present and enable their lags.
    if past_covariates_ts is not None:
        scaler_covariates = Scaler()
        past_covariates_ts_scaled = scaler_covariates.fit_transform(past_covariates_ts)
        lags_past_covariates = 24  # Use desired lag value
    else:
        past_covariates_ts_scaled = None
        lags_past_covariates = None  # Set to None if no past covariates

    # Initialize LightGBM model with appropriate lags
    model = LightGBMModel(
        lags=24,  # Try different lag values
        lags_past_covariates=lags_past_covariates,
        output_chunk_length=max_n,  # Predict up to the maximum horizon needed
        random_state=42,
        # Hyperparameters tuning
        n_estimators=500,
        learning_rate=0.05,
        num_leaves=31,
        max_depth=-1,
        min_child_samples=20,
    )

    # `past_covariates` defaults to None in both fit() and predict(), so the
    # same call covers vessels with and without covariates.
    model.fit(series=target_ts_scaled, past_covariates=past_covariates_ts_scaled)
    forecast_scaled = model.predict(n=max_n, past_covariates=past_covariates_ts_scaled)

    # Inverse transform the forecast back to the original coordinate scale
    forecast = scaler_target.inverse_transform(forecast_scaled)

    # Store the forecast and last time
    predictions[vessel_id] = (forecast, last_time)

# Build the submission: one row per test record with the forecast position
# nearest to the requested timestamp (NaN when no forecast is available).
submission_rows = []

# Extract each vessel's forecast arrays once. Doing this inside the row loop
# re-slices the TimeSeries for every test row.
forecast_lookup = {
    vessel_id: (
        forecast_ts['latitude'].values()[:, 0],
        forecast_ts['longitude'].values()[:, 0],
        last_time,
    )
    for vessel_id, (forecast_ts, last_time) in predictions.items()
}

# Iterate only over the three columns that are used ('ID' is assumed to exist
# in test_df); this avoids building a Series per row as iterrows() does.
for test_id, vessel_id, test_time in zip(test_df['ID'], test_df['vesselId'], test_df['time']):
    predicted_lat = np.nan
    predicted_lon = np.nan

    if vessel_id in forecast_lookup:
        lat_values, lon_values, last_time = forecast_lookup[vessel_id]
        time_diff = (test_time - last_time).total_seconds() / 3600
        # Only timestamps after the end of training can be forecast; a NaT
        # test time yields NaN here and is left unpredicted.
        if time_diff > 0:
            # Forecast step k (index k - 1) corresponds to last_time + k hours.
            # Timestamps less than half an hour after last_time round to
            # step 0, so clamp them to the first forecast step instead of
            # producing index -1 and a NaN prediction.
            index = max(int(np.round(time_diff)) - 1, 0)
            # Check if index is within forecast horizon
            if index < len(lat_values):
                predicted_lat = lat_values[index]
                predicted_lon = lon_values[index]

    # Append the prediction to the submission list
    submission_rows.append({
        'ID': test_id,
        'longitude_predicted': predicted_lon,
        'latitude_predicted': predicted_lat
    })

# Create a submission DataFrame from the list
submission_df = pd.DataFrame(submission_rows)

# Save the submission file
submission_df.to_csv('submission.csv', index=False)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001878 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 24480
[LightGBM] [Info] Number of data points in the train set: 2928, number of used features: 96
[LightGBM] [Info] Start training from score 0.678728
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002090 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24480
[LightGBM] [Info] Number of data points in the train set: 2928, number of used features: 96
[LightGBM] [Info] Start training from score 0.585493
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001001 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 24480
[LightGBM] [Info] Number of data points in the train se

In [14]:
# Build the submission: one row per test record with the forecast position
# nearest to the requested timestamp (NaN when no forecast is available).
submission_rows = []

# Convert each vessel's forecast to a DataFrame once. The conversion does not
# depend on the test row, so doing it inside the row loop repeats it for every
# one of the test records.
forecast_lookup = {}
for vessel_id, (forecast_ts, last_time) in predictions.items():
    forecast_df = forecast_ts.pd_dataframe()
    forecast_lookup[vessel_id] = (
        forecast_df['latitude'].to_numpy(),
        forecast_df['longitude'].to_numpy(),
        last_time,
    )

# Iterate only over the three columns that are used ('ID' is assumed to exist
# in test_df); this avoids building a Series per row as iterrows() does.
for test_id, vessel_id, test_time in zip(test_df['ID'], test_df['vesselId'], test_df['time']):
    predicted_lat = np.nan
    predicted_lon = np.nan

    if vessel_id in forecast_lookup:
        lat_values, lon_values, last_time = forecast_lookup[vessel_id]
        time_diff = (test_time - last_time).total_seconds() / 3600
        # Only timestamps after the end of training can be forecast; a NaT
        # test time yields NaN here and is left unpredicted.
        if time_diff > 0:
            # Forecast step k (index k - 1) corresponds to last_time + k hours.
            # Timestamps less than half an hour after last_time round to
            # step 0, so clamp them to the first forecast step instead of
            # producing index -1 and a NaN prediction.
            index = max(int(np.round(time_diff)) - 1, 0)
            # Check if index is within forecast horizon
            if index < len(lat_values):
                predicted_lat = lat_values[index]
                predicted_lon = lon_values[index]

    # Append the prediction to the submission list
    submission_rows.append({
        'ID': test_id,
        'longitude_predicted': predicted_lon,
        'latitude_predicted': predicted_lat
    })

# Create a submission DataFrame from the list
submission_df = pd.DataFrame(submission_rows)

# Save the submission file
submission_df.to_csv('submission.csv', index=False)

print(submission_df)


          ID  longitude_predicted  latitude_predicted
0          0           -81.495496           31.214338
1          1           120.286498           14.889166
2          2            10.808803           38.368990
3          3           172.918765          -43.541015
4          4            -6.262637           48.254646
...      ...                  ...                 ...
51734  51734           -76.568329           26.032523
51735  51735           150.671586          -20.580527
51736  51736           141.360545           35.446813
51737  51737            21.812115           58.795670
51738  51738            10.279530           56.519043

[51739 rows x 3 columns]
