In [1]:
import pandas as pd
from darts import TimeSeries
from darts.models import LightGBMModel
from darts.dataprocessing.transformers import Scaler
from darts.metrics import mae, mse
from sklearn.model_selection import train_test_split

In [2]:
"""
ais_train.csv:
Index(['time', 'cog', 'sog', 'rot', 'heading', 'navstat', 'etaRaw', 'latitude',
       'longitude', 'vesselId', 'portId', 'elapsed_time'], dtype='object')
"""
ais_train = pd.read_csv("ais_train.csv", sep="|")
ais_train['time'] = pd.to_datetime(ais_train['time'])

# map vessel ids
vessel_mapping = {vessel: idx for idx, vessel in enumerate(ais_train['vesselId'].unique())}
ais_train['vesselId'] = ais_train['vesselId'].map(vessel_mapping)

ais_train = ais_train[ais_train['cog']!=360] # cog=360 is not available
ais_train = ais_train[(ais_train['cog'] <= 360) | (ais_train['cog'] > 409.5)] # this range should not be used

ais_train = ais_train[ais_train['heading'] != 511] # unavailable
ais_train = ais_train[ais_train['sog'] < 25]

# Map 'navstat' values
ais_train['navstat'] = ais_train['navstat'].replace(8, 0)  # Under way sailing -> Under way using engine
ais_train = ais_train[~((ais_train['navstat'].isin([1, 5])) & (ais_train['sog'] > 0))]
ais_train = ais_train[~((ais_train['navstat'] == 2) & (ais_train['sog'] > 5))]
ais_train = ais_train.drop(['cog', 'sog', 'rot', 'heading', 'navstat', 'etaRaw'], axis=1)


print("Describe ais_train.csv after pruning")
print(ais_train.describe())

"""
schedules_to_may_2024.csv
Index(['vesselId', 'shippingLineId', 'shippingLineName', 'arrivalDate',
       'sailingDate', 'portName', 'portId', 'portLatitude', 'portLongitude'],
      dtype='object')
"""
schedules = pd.read_csv("schedules_to_may_2024.csv", sep="|")
print("Describe schedules_to_may_2024.csv")
print(schedules.describe())
schedules['vesselId'] = schedules['vesselId'].map(vessel_mapping)
schedules['sailingDate'] = pd.to_datetime(schedules['sailingDate']).dt.tz_localize(None)
schedules['arrivalDate'] = pd.to_datetime(schedules['arrivalDate']).dt.tz_localize(None)
schedules = schedules.dropna(subset=['portLatitude']) # drop nan values
schedules = schedules.drop_duplicates() # many duplicate values

"""
ports.csv
Index(['portId', 'name', 'portLocation', 'longitude', 'latitude', 'UN_LOCODE',
       'countryName', 'ISO'],
      dtype='object')
"""
ports = pd.read_csv('ports.csv', sep='|')
ports = ports.drop('portLocation', axis=1)
ports = ports.drop('UN_LOCODE', axis=1)
ports = ports.drop('countryName', axis=1)
ports = ports.drop('ISO', axis=1)
ports = ports.drop('name', axis=1)
ports = ports.rename(columns={'longitude': 'portLon', 'latitude': 'portLat'})
print("Describe ports.csv")
print(ports.describe())

"""
ais_test.csv
Index(['ID', 'vesselId', 'time', 'scaling_factor'], dtype='object')
"""
ais_test = pd.read_csv("ais_test.csv") # sep=","
ais_test['time'] = pd.to_datetime(ais_test['time'])
ais_test['vesselId'] = ais_test['vesselId'].map(vessel_mapping)
print("Describe ais_test.csv")
print(ais_test.describe())

Describe ais_train.csv after pruning
                                time      latitude     longitude      vesselId
count                        1399413  1.399413e+06  1.399413e+06  1.399413e+06
mean   2024-03-06 08:34:26.835677184  3.703542e+01  1.112335e+01  2.695627e+02
min              2024-01-01 00:00:25 -4.753287e+01 -1.675409e+02  0.000000e+00
25%              2024-02-03 09:35:12  3.468584e+01 -5.053500e+00  1.140000e+02
50%              2024-03-07 19:52:24  4.292247e+01  4.221300e+00  2.290000e+02
75%              2024-04-07 08:39:04  5.137469e+01  1.811785e+01  4.180000e+02
max              2024-05-07 23:59:08  7.055720e+01  1.788054e+02  6.870000e+02
std                              NaN  2.266437e+01  6.733346e+01  1.887221e+02
Describe schedules_to_may_2024.csv
        portLatitude  portLongitude
count  131848.000000  131848.000000
mean       28.021038       4.338822
std        27.401476      80.059006
min       -37.832778    -149.571389
25%        19.208333     -76.558889
5

In [3]:
import pandas as pd
from tqdm import tqdm

# Create an empty list to hold new rows
new_rows = []

# Iterate through each row in schedules
for _, sail in tqdm(schedules.iterrows(), desc="Generating training data from schedules", total=len(schedules)):
    # Skip if portId is NaN
    if pd.isna(sail['portId']):
        continue
    
    # Find corresponding port information in the ports DataFrame
    port_info = ports[ports['portId'] == sail['portId']]
    if port_info.empty:
        continue

    # Extract port latitude and longitude
    port_lat = port_info['portLat'].values[0]
    port_lon = port_info['portLon'].values[0]

    # Create a new row with the required data
    new_row = {
        'time': sail['arrivalDate'],  # or 'sailingDate' depending on what you need
        'latitude': port_lat,
        'longitude': port_lon,
        'vesselId': sail['vesselId'],
        'portId': sail['portId']
    }

    # Add the new row to the list of new rows
    new_rows.append(new_row)

# Convert the list of new rows to a DataFrame
new_rows_df = pd.DataFrame(new_rows)

# Concatenate the new rows DataFrame with ais_train
ais_train = pd.concat([ais_train, new_rows_df], ignore_index=True)
ais_train.dropna(subset=['vesselId'], inplace=True)
ais_train = ais_train.drop(['portId'], axis=1)
print(ais_train)

Generating training data from schedules: 100%|██████████████████████| 47043/47043 [00:08<00:00, 5626.04it/s]


                       time   latitude   longitude  vesselId
0       2024-01-01 00:00:25 -34.743700  -57.851300       0.0
1       2024-01-01 00:00:36   8.894400  -79.479390       1.0
2       2024-01-01 00:01:45  39.190650  -76.475670       2.0
3       2024-01-01 00:03:11 -34.411890  151.020670       3.0
4       2024-01-01 00:03:51  35.883790   -5.916360       4.0
...                     ...        ...         ...       ...
1446451 2024-02-19 00:00:00 -34.098889  -59.007778     342.0
1446452 2023-12-24 00:00:00  53.563611    8.554722     309.0
1446453 2023-12-22 00:00:00  50.902500   -1.428889     309.0
1446454 2023-12-26 00:00:00  51.336389    3.207222     309.0
1446455 2023-12-15 00:00:00  41.340278    2.164722     309.0

[1441863 rows x 4 columns]


In [4]:
import pandas as pd
import numpy as np
from scipy.interpolate import CubicSpline
from tqdm import tqdm

# Assuming ais_train is already loaded as a DataFrame
print(f"Initial number of entries: {len(ais_train)}")
print(f"Columns: {ais_train.columns.tolist()}")

# Step 1: Handle NaT Values in 'time' Column
num_nat = ais_train['time'].isna().sum()
print(f"Number of NaT in 'time' column: {num_nat}")

if num_nat > 0:
    print("Sample rows with NaT in 'time':")
    print(ais_train[ais_train['time'].isna()].head())
    
    # Drop rows where 'time' is NaT
    ais_train = ais_train.dropna(subset=['time'])
    print(f"After dropping NaT, number of entries: {len(ais_train)}")

# Step 2: Remove Duplicate (vesselId, time) Pairs
ais_train = ais_train.drop_duplicates(subset=['vesselId', 'time'])
print(f"After dropping duplicates: {len(ais_train)}")

# Step 3: Convert 'time' to datetime if it's not already
if not np.issubdtype(ais_train['time'].dtype, np.datetime64):
    ais_train['time'] = pd.to_datetime(ais_train['time'], errors='coerce')  # Coerce invalid formats to NaT
    
    # Check again for NaT after conversion
    num_nat = ais_train['time'].isna().sum()
    if num_nat > 0:
        print(f"Number of NaT after conversion: {num_nat}")
        print("Dropping rows with NaT after conversion.")
        ais_train = ais_train.dropna(subset=['time'])
        print(f"After dropping NaT post-conversion, number of entries: {len(ais_train)}")

# Step 4: Sort the DataFrame by 'vesselId' and 'time'
ais_train = ais_train.sort_values(by=['vesselId', 'time'])
print("DataFrame sorted by 'vesselId' and 'time'.")

# Step 5: Define the Interpolation Function with Enhanced Robustness
def interpolate_vessel(df, new_time_resolution='1min'):
    """
    Interpolate latitude and longitude for a single vessel.

    Parameters:
    - df: DataFrame containing 'time', 'latitude', 'longitude' for a vessel
    - new_time_resolution: String representing the new time frequency (e.g., '1T' for 1 minute)

    Returns:
    - DataFrame with interpolated 'time', 'latitude', 'longitude', 'vesselId'
    """
    # Ensure the data is sorted by time
    df = df.sort_values('time')
    
    # Check for any NaT in 'time'
    if df['time'].isna().any():
        print(f"Skipping vesselId {df['vesselId'].iloc[0]} due to NaT in 'time'")
        return df
    
    # Convert time to numerical format (e.g., timestamp in seconds)
    try:
        # Use .view('int64') instead of .astype(np.int64) for better compatibility
        time_num = df['time'].view('int64') // 10**9  # Convert to seconds
    except AttributeError as e:
        print(f"AttributeError for vesselId {df['vesselId'].iloc[0]}: {e}")
        return df
    
    # Define the new time range
    start_time = df['time'].iloc[0]
    end_time = df['time'].iloc[-1]
    
    # Check if start_time and end_time are valid and not equal
    if pd.isna(start_time) or pd.isna(end_time):
        print(f"Invalid start or end time for vesselId {df['vesselId'].iloc[0]}")
        return df
    
    if start_time == end_time:
        print(f"Start and end times are the same for vesselId {df['vesselId'].iloc[0]}")
        return df
    
    # Define the new time points based on the desired resolution
    new_time = pd.date_range(start=start_time,
                             end=end_time,
                             freq=new_time_resolution)
    
    # Convert new_time to numerical format (seconds)
    try:
        new_time_num = new_time.view('int64') // 10**9  # Convert to seconds
    except AttributeError as e:
        print(f"AttributeError when converting new_time for vesselId {df['vesselId'].iloc[0]}: {e}")
        return df
    
    # Handle cases with insufficient data points
    if len(df) < 4:
        # Cubic spline requires at least 4 points
        print(f"Insufficient data points for vesselId {df['vesselId'].iloc[0]}. Required: 4, Available: {len(df)}")
        return df

    try:
        # Create cubic spline interpolators
        cs_lat = CubicSpline(time_num, df['latitude'])
        cs_lon = CubicSpline(time_num, df['longitude'])
        
        # Interpolate latitude and longitude
        interp_lat = cs_lat(new_time_num)
        interp_lon = cs_lon(new_time_num)
        
        # Create the interpolated DataFrame
        interp_df = pd.DataFrame({
            'time': new_time,
            'latitude': interp_lat,
            'longitude': interp_lon,
            'vesselId': df['vesselId'].iloc[0]
        })
        
        return interp_df
    except Exception as e:
        # In case of any error (e.g., duplicate time points), return the original data
        print(f"Interpolation failed for vesselId {df['vesselId'].iloc[0]}: {e}")
        return df

# Step 6: Define the Desired Time Resolution for Interpolation
new_time_resolution = '1d'  # 1 minute intervals
# Note: In your original code, you set '1d' with a comment indicating 1 minute intervals. 
#       Ensure the frequency string matches the intended resolution.

# Step 7: Initialize a List to Hold Interpolated DataFrames
interpolated_dfs = []

# Step 8: Perform Interpolation Using a For Loop
print("Starting interpolation for each vessel...")
for vessel_id, group in tqdm(ais_train.groupby('vesselId'), total=ais_train['vesselId'].nunique()):
    interp_df = interpolate_vessel(group, new_time_resolution=new_time_resolution)
    interpolated_dfs.append(interp_df)

# Step 9: Concatenate All Interpolated DataFrames
ais_interpolated = pd.concat(interpolated_dfs, ignore_index=True)
print("Interpolation completed.")

print(f"Original number of entries after preprocessing: {len(ais_train)}")
print(f"Interpolated number of entries: {len(ais_interpolated)}")
print("Sample of interpolated data:")
print(ais_interpolated.head())

ais_train = pd.concat([ais_train, ais_interpolated], ignore_index=True)
print(f"Final number of interpolated entries: {len(ais_train)}")


Initial number of entries: 1441863
Columns: ['time', 'latitude', 'longitude', 'vesselId']
Number of NaT in 'time' column: 2726
Sample rows with NaT in 'time':
        time   latitude   longitude  vesselId
1399486  NaT   1.292778  103.725278     532.0
1399487  NaT  51.297778    4.299722     532.0
1399488  NaT  35.562222  140.064444     532.0
1399489  NaT  24.258333  120.506111     532.0
1399490  NaT  51.498889   -2.712222     532.0
After dropping NaT, number of entries: 1439137
After dropping duplicates: 1427120
DataFrame sorted by 'vesselId' and 'time'.
Starting interpolation for each vessel...


  time_num = df['time'].view('int64') // 10**9  # Convert to seconds
  time_num = df['time'].view('int64') // 10**9  # Convert to seconds
  time_num = df['time'].view('int64') // 10**9  # Convert to seconds
  time_num = df['time'].view('int64') // 10**9  # Convert to seconds
  time_num = df['time'].view('int64') // 10**9  # Convert to seconds
  time_num = df['time'].view('int64') // 10**9  # Convert to seconds
  time_num = df['time'].view('int64') // 10**9  # Convert to seconds
  time_num = df['time'].view('int64') // 10**9  # Convert to seconds
  time_num = df['time'].view('int64') // 10**9  # Convert to seconds
  time_num = df['time'].view('int64') // 10**9  # Convert to seconds
  time_num = df['time'].view('int64') // 10**9  # Convert to seconds
  time_num = df['time'].view('int64') // 10**9  # Convert to seconds
  time_num = df['time'].view('int64') // 10**9  # Convert to seconds
  time_num = df['time'].view('int64') // 10**9  # Convert to seconds
  time_num = df['time'].view('int6

Start and end times are the same for vesselId 682.0
Interpolation completed.
Original number of entries after preprocessing: 1427120
Interpolated number of entries: 173458
Sample of interpolated data:
                 time   latitude  longitude  vesselId
0 2024-01-01 00:00:25 -34.743700 -57.851300       0.0
1 2024-01-02 00:00:25 -37.797839 -55.876592       0.0
2 2024-01-03 00:00:25 -41.848525 -59.062143       0.0
3 2024-01-04 00:00:25 -44.511504 -61.696796       0.0
4 2024-01-05 00:00:25 -45.802562 -63.890997       0.0
Final number of interpolated entries: 1600578


In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error
from lightgbm import LGBMRegressor

# Feature engineering
def create_features(df):
    df['datetime'] = df['time']
    df['year'] = df['datetime'].dt.year
    df['month'] = df['datetime'].dt.month
    df['day'] = df['datetime'].dt.day
    df['hour'] = df['datetime'].dt.hour
    df['minute'] = df['datetime'].dt.minute
    df['elapsed_time'] = df['datetime'].astype(np.int64) // 10**9  # Convert to seconds since epoch
    return df

# Load your datasets
# ais_train = pd.read_csv('ais_train.csv')
# ais_test = pd.read_csv('ais_test.csv')

# Apply feature engineering
ais_train = create_features(ais_train)
ais_test = create_features(ais_test)

# Convert vesselId to integer
ais_train['vesselId'] = ais_train['vesselId'].astype(int)
ais_test['vesselId'] = ais_test['vesselId'].astype(int)

# Define features
features = ['vesselId', 'elapsed_time']

# Prepare training data
X = ais_train[features]
y_lat = ais_train['latitude']
y_lon = ais_train['longitude']

# Train/test split
X_train, X_val, y_train_lat, y_val_lat, y_train_lon, y_val_lon = train_test_split(X, y_lat, y_lon, test_size=0.2, random_state=42)

# Train LightGBM models for latitude and longitude
lat_model = LGBMRegressor()
lon_model = LGBMRegressor()

lat_model.fit(X_train, y_train_lat)
lon_model.fit(X_train, y_train_lon)

# Predict on validation set to evaluate the model
lat_val_pred = lat_model.predict(X_val)
lon_val_pred = lon_model.predict(X_val)

# Calculate MAE and MSE for latitude
mae_lat = mean_absolute_error(y_val_lat, lat_val_pred)
mse_lat = mean_squared_error(y_val_lat, lat_val_pred)

# Calculate MAE and MSE for longitude
mae_lon = mean_absolute_error(y_val_lon, lon_val_pred)
mse_lon = mean_squared_error(y_val_lon, lon_val_pred)

# Print the evaluation metrics
print(f'Latitude MAE: {mae_lat:.4f}')
print(f'Latitude MSE: {mse_lat:.4f}')
print(f'Longitude MAE: {mae_lon:.4f}')
print(f'Longitude MSE: {mse_lon:.4f}')

# Prepare the test data
X_test = ais_test[features]

# Make predictions on the test set
lat_test_pred = lat_model.predict(X_test)
lon_test_pred = lon_model.predict(X_test)

# Create the submission file
submission = pd.DataFrame({
    'ID': ais_test['ID'],
    'longitude_predicted': lon_test_pred,
    'latitude_predicted': lat_test_pred
})

# Save submission to CSV
submission.to_csv('submission.csv', index=False)

print("Submission file created.")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001971 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 1280462, number of used features: 2
[LightGBM] [Info] Start training from score -644082.729993
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002209 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 510
[LightGBM] [Info] Number of data points in the train set: 1280462, number of used features: 2
[LightGBM] [Info] Start training from score 636366.361832
Latitude MAE: 214048.0912
Latitude MSE: 10288524649498.9531
Longitude MAE: 211498.0607
Longitude MSE: 10042303338311.0527
Submission file created.
