In [5]:
import pandas as pd
import numpy as np
from darts import TimeSeries
from darts.models import LightGBMModel
import lightgbm as lgb

In [28]:
import pandas as pd
import numpy as np
from darts import TimeSeries
from darts.models import LightGBMModel
import lightgbm as lgb

# Load ais_train.csv with separator '|'
train_df = pd.read_csv('ais_train.csv', sep='|')
train_df['time'] = pd.to_datetime(train_df['time'])

###

# Filter out unrealistic speeds
train_df = train_df[train_df['sog'] < 25]

# Map 'navstat' values
train_df['navstat'] = train_df['navstat'].replace(8, 0)  # Under way sailing -> Under way using engine
train_df = train_df[~((train_df['navstat'].isin([1, 5])) & (train_df['sog'] > 0))]
train_df = train_df[~((train_df['navstat'] == 2) & (train_df['sog'] > 5))]

train_df = train_df.drop_duplicates()

###

# Load ais_test.csv with separator ','
test_df = pd.read_csv('ais_test.csv', sep=',')
test_df['time'] = pd.to_datetime(test_df['time'])

# Use 'vesselId' instead of 'vessel_id'
# Select only vessel IDs that are in both train and test datasets
common_vessel_ids = set(train_df['vesselId']).intersection(set(test_df['vesselId']))
train_df = train_df[train_df['vesselId'].isin(common_vessel_ids)]

# Group the training data by vesselId
groups = train_df.groupby('vesselId')

# Initialize dictionaries to store TimeSeries objects and last training times
timeseries_dict = {}
last_train_time = {}

# Process each vesselId group
for vessel_id, group_df in groups:
    # Sort on time
    group_df = group_df.sort_values('time')
    # Set index to time
    group_df = group_df.set_index('time')
    # Select features (latitude and longitude)
    features_df = group_df[['latitude', 'longitude', 'cog', 'sog', 'heading']]
    # Resample data to hourly frequency with mean and linear interpolation
    features_df = features_df.resample('h').mean().interpolate(method='cubic') 
    # Create Darts TimeSeries object
    ts = TimeSeries.from_dataframe(features_df, value_cols=['latitude', 'longitude', 'cog', 'sog', 'heading'])
    # Store the TimeSeries object and last training time
    timeseries_dict[vessel_id] = ts
    last_train_time[vessel_id] = features_df.index.max()

# Initialize a dictionary to store predictions
predictions = {}

# Fit LightGBM models and predict for each TimeSeries object
for vessel_id, ts in timeseries_dict.items():
    # Get the last training time
    last_time = last_train_time[vessel_id]
    # Get test times for this vessel
    vessel_test_df = test_df[test_df['vesselId'] == vessel_id]
    test_times = vessel_test_df['time']
    # Compute the time differences in hours
    time_diffs = (test_times - last_time).dt.total_seconds() / 3600
    # Get the maximum forecast horizon needed
    max_n = int(np.ceil(time_diffs.max()))
    if max_n <= 0:
        continue  # Skip if no future times to predict
    # Initialize LightGBM model with lag parameters
    model = LightGBMModel(
        lags=48, # the correct value is between 48 and 96
        #learning_rate=0.1,
        #n_estimators=50,
        #max_depth=10,
    )
    # Fit the model
    model.fit(ts)
    # Predict up to the maximum horizon needed
    forecast = model.predict(max_n)
    # Store the forecast and last time
    predictions[vessel_id] = (forecast, last_time)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004047 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 61200
[LightGBM] [Info] Number of data points in the train set: 3024, number of used features: 240
[LightGBM] [Info] Start training from score 44.152506
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002881 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 61200
[LightGBM] [Info] Number of data points in the train set: 3024, number of used features: 240
[LightGBM] [Info] Start training from score -4.685347
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002193 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 61200
[LightGBM] [Info] Number of data points in the train set: 3024, number of used features: 240
[LightGBM] [Info] Star

In [31]:
# Initialize a list to store submission rows
submission_rows = []

# Generate predictions for the submission file
for idx, row in test_df.iterrows():
    vessel_id = row['vesselId']
    test_time = row['time']
    test_id = row['ID']  # Assuming 'ID' column exists in test_df
    # Check if predictions are available for this vessel_id
    if vessel_id in predictions:
        forecast_ts, last_time = predictions[vessel_id]
        time_diff = (test_time - last_time).total_seconds() / 3600
        index = int(np.round(time_diff)) - 1  # Adjust index since forecast starts from last_time + 1 hour
        # Convert forecast_ts to DataFrame
        forecast_df = forecast_ts.pd_dataframe()
        # Check if index is within forecast horizon
        if 0 <= index < len(forecast_df):
            predicted_lat = forecast_df['latitude'].iloc[index]
            predicted_lon = forecast_df['longitude'].iloc[index]
        else:
            predicted_lat = np.nan
            predicted_lon = np.nan
    else:
        predicted_lat = np.nan
        predicted_lon = np.nan
    # Append the prediction to the submission list
    submission_rows.append({
        'ID': test_id,
        'longitude_predicted': predicted_lon,
        'latitude_predicted': predicted_lat
    })

# Create a submission DataFrame from the list
submission_df = pd.DataFrame(submission_rows)

# Save the submission file
submission_df.to_csv('submission.csv', index=False)

print(submission_df)


          ID  longitude_predicted  latitude_predicted
0          0           -81.935237           31.382728
1          1           120.325526           14.871281
2          2            10.810347           38.431123
3          3           172.800364          -43.541216
4          4            -6.258304           48.376097
...      ...                  ...                 ...
51734  51734           -82.393327           26.236814
51735  51735           143.215373           41.380120
51736  51736           141.362491           35.222822
51737  51737            18.632427           57.637358
51738  51738            10.589267           56.297888

[51739 rows x 3 columns]


In [32]:
submission_df.describe()

Unnamed: 0,ID,longitude_predicted,latitude_predicted
count,51739.0,51739.0,51739.0
mean,25869.0,10.087505,38.850523
std,14935.907126,56.342379,21.636129
min,0.0,-122.856405,-43.556698
25%,12934.5,-5.208432,36.047248
50%,25869.0,4.363376,43.539921
75%,38803.5,14.581402,52.557843
max,51738.0,174.882949,63.640495
