In [1]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Sequential

"""
    Load and Preprocess Data
"""

# Read ais_train.csv
ais_train = pd.read_csv("ais_train.csv", sep='|')

# Temporal features
ais_train['time'] = pd.to_datetime(ais_train['time'])
ais_train['elapsed_time'] = (ais_train['time'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
ais_train['day_of_week'] = ais_train['time'].dt.dayofweek  # Monday=0, Sunday=6
ais_train['hour_of_day'] = ais_train['time'].dt.hour
ais_train = pd.get_dummies(ais_train, columns=['day_of_week', 'hour_of_day'], drop_first=True)

# Filter out unrealistic speeds
ais_train = ais_train[ais_train['sog'] < 25]

# Map 'navstat' values
ais_train['navstat'] = ais_train['navstat'].replace(8, 0)  # Under way sailing -> Under way using engine
ais_train = ais_train[~((ais_train['navstat'].isin([1, 5])) & (ais_train['sog'] > 0))]
ais_train = ais_train[~((ais_train['navstat'] == 2) & (ais_train['sog'] > 5))]

# One-hot encode 'navstat'
ais_train = pd.get_dummies(ais_train, columns=['navstat'])

# Vessel metadata: keep the shipping line and vessel id, and give every
# vessel row a running integer id.
vessels = pd.read_csv("vessels.csv", sep='|')
vessels = vessels[['shippingLineId', 'vesselId']].assign(new_id=range(len(vessels)))

# Lookup from the original vessel id to the running integer id.
vessel_id_to_new_id = {
    original_id: running_id
    for original_id, running_id in zip(vessels['vesselId'], vessels['new_id'])
}

# Left join so every AIS row is kept even without matching vessel metadata.
ais_train = ais_train.merge(vessels, how='left', on='vesselId')

# The model predicts the position columns.
target_columns = ['latitude', 'longitude']

# Collect the one-hot columns produced during preprocessing, in frame order.
day_columns = [name for name in ais_train.columns if 'day_of_week_' in name]
hour_columns = [name for name in ais_train.columns if 'hour_of_day_' in name]
navstat_columns = [name for name in ais_train.columns if name.startswith('navstat_')]

# Model inputs: kinematic columns first, then weekday, hour and navstat dummies.
input_features = [
    'latitude', 'longitude', 'sog', 'cog', 'heading', 'elapsed_time',
    *day_columns,
    *hour_columns,
    *navstat_columns,
]

# Fit one min-max scaler on the model inputs and a separate one on the
# targets, so predictions can be mapped back with the target scaler alone.
scaler_input = MinMaxScaler().fit(ais_train[input_features])
scaler_output = MinMaxScaler().fit(ais_train[target_columns])

input_data = scaler_input.transform(ais_train[input_features])
output_data = scaler_output.transform(ais_train[target_columns])

# Work on a copy so the unscaled frame stays available; the target columns
# are written last and therefore hold the target scaler's values.
ais_train_scaled = ais_train.copy()
ais_train_scaled[input_features] = input_data
ais_train_scaled[target_columns] = output_data

# Function to create sequences per vessel
def create_sequences_per_vessel(df, time_steps, input_cols=None, target_cols=None):
    """Build sliding-window training samples separately for every vessel.

    Rows are grouped by ``vesselId`` and ordered by ``elapsed_time`` inside
    each group. Every run of ``time_steps`` consecutive rows becomes one input
    window, and the target is taken from the row right after the window, so a
    window never mixes rows of different vessels.

    Args:
        df: DataFrame holding ``vesselId``, ``elapsed_time`` and the selected
            input/target columns.
        time_steps: Number of consecutive rows per input window (>= 1).
        input_cols: Columns used as model inputs. Defaults to the module-level
            ``input_features`` list.
        target_cols: Columns used as targets. Defaults to the module-level
            ``target_columns`` list.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape
        ``(n_samples, time_steps, len(input_cols))`` and ``y`` has shape
        ``(n_samples, len(target_cols))``. Both keep these ranks even when no
        vessel has enough rows (``n_samples == 0``).

    Raises:
        ValueError: If ``time_steps`` is smaller than 1.
    """
    if time_steps < 1:
        raise ValueError(f"time_steps must be >= 1, got {time_steps}")

    # Fall back to the notebook-level column lists when none are given.
    input_cols = list(input_features if input_cols is None else input_cols)
    target_cols = list(target_columns if target_cols is None else target_cols)

    X, y = [], []
    # One pass over the groups instead of re-filtering the full frame for each
    # vessel; sort=False keeps vessels in order of first appearance.
    for _, vessel_data in df.groupby('vesselId', sort=False):
        vessel_data = vessel_data.sort_values('elapsed_time')
        inputs = vessel_data[input_cols].to_numpy()
        targets = vessel_data[target_cols].to_numpy()
        # A vessel needs time_steps rows for the window plus one for the target.
        for start in range(len(inputs) - time_steps):
            stop = start + time_steps
            X.append(inputs[start:stop])
            y.append(targets[stop])

    if not X:
        # Keep the documented ranks so callers can still read shape[1] / shape[2].
        return (
            np.empty((0, time_steps, len(input_cols))),
            np.empty((0, len(target_cols))),
        )
    return np.array(X), np.array(y)

# Create sequences: each sample is a window of `time_step` consecutive rows
# of one vessel, with the following row's position as the target.
time_step = 10
X, y = create_sequences_per_vessel(ais_train_scaled, time_step)

# Split into training and validation sets.
# A fixed random_state makes the shuffled split (and therefore the reported
# val_loss) reproducible between runs.
# NOTE(review): consecutive windows of a vessel overlap in all but one row, so
# a random split puts near-identical windows on both sides and val_loss is
# likely optimistic - consider splitting by vessel or by time instead.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

"""
    Define and Train the Model
"""

# Define the LSTM Model
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(time_step, X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(32))
model.add(Dropout(0.2))
model.add(Dense(y_train.shape[1]))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
    verbose=1
)

"""
    Prepare Test Data and Make Predictions
"""

# Load test data
ais_test = pd.read_csv("ais_test.csv")
ais_test['time'] = pd.to_datetime(ais_test['time'])
ais_test['elapsed_time'] = (ais_test['time'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
ais_test['new_id'] = ais_test['vesselId'].map(vessel_id_to_new_id)

ais_test['day_of_week'] = ais_test['time'].dt.dayofweek
ais_test['hour_of_day'] = ais_test['time'].dt.hour

# One-hot encode
ais_test = pd.get_dummies(ais_test, columns=['day_of_week', 'hour_of_day'], drop_first=True)

# Ensure all columns in ais_test match those in ais_train
for col in input_features:
    if col not in ais_test.columns:
        ais_test[col] = 0

# One-hot encode 'navstat' in test data (if available)
# If 'navstat' is not available in test data, you may need to handle this accordingly

# Merge with last known positions from training data
# Get the last 'time_step' records for each vessel from training data
last_positions = ais_train_scaled.groupby('vesselId').apply(lambda x: x.sort_values('elapsed_time').tail(time_step))
last_positions = last_positions.reset_index(drop=True)

# Prepare sequences for each vessel in the test set
vessel_sequences = {}
for vessel_id in ais_test['vesselId'].unique():
    if vessel_id in last_positions['vesselId'].values:
        vessel_data = last_positions[last_positions['vesselId'] == vessel_id]
        seq = vessel_data[input_features].values
        if len(seq) < time_step:
            # Pad sequences if necessary
            seq = np.pad(seq, ((time_step - len(seq), 0), (0, 0)), mode='constant')
        vessel_sequences[vessel_id] = seq
    else:
        # If no data available, create a default sequence (e.g., zeros or mean values)
        seq = np.zeros((time_step, len(input_features)))
        vessel_sequences[vessel_id] = seq

# Create test sequences
X_test = []
for idx, row in ais_test.iterrows():
    vessel_id = row['vesselId']
    seq = vessel_sequences[vessel_id]
    X_test.append(seq)
X_test = np.array(X_test)

# Make predictions
y_pred = model.predict(X_test)

# Inverse transform predictions
y_pred_inverse = scaler_output.inverse_transform(y_pred)

"""
    Prepare Submission File
"""

# Prepare submission
submission_df = pd.DataFrame({
    'ID': ais_test['ID'].values,
    'longitude_predicted': y_pred_inverse[:, target_columns.index('longitude')],
    'latitude_predicted': y_pred_inverse[:, target_columns.index('latitude')]
})

# Ensure the submission file has the required columns
submission_df = submission_df[['ID', 'longitude_predicted', 'latitude_predicted']]

# Save submission file
submission_df.to_csv("submission.csv", index=False)

# Display submission
print(submission_df.head())
print(f"Submission DataFrame shape: {submission_df.shape}")

                 time    cog   sog  rot  heading  navstat       etaRaw  \
0 2024-01-01 00:00:25  284.0   0.7    0       88        0  01-09 23:00   
1 2024-01-01 00:00:36  109.6   0.0   -6      347        1  12-29 20:00   
2 2024-01-01 00:01:45  111.0  11.0    0      112        0  01-02 09:00   
3 2024-01-01 00:03:11   96.4   0.0    0      142        1  12-31 20:00   
4 2024-01-01 00:03:51  214.0  19.7    0      215        0  01-25 12:00   

   latitude  longitude                  vesselId  ... hour_of_day_14  \
0 -34.74370  -57.85130  61e9f3a8b937134a3c4bfdf7  ...          False   
1   8.89440  -79.47939  61e9f3d4b937134a3c4bff1f  ...          False   
2  39.19065  -76.47567  61e9f436b937134a3c4c0131  ...          False   
3 -34.41189  151.02067  61e9f3b4b937134a3c4bfe77  ...          False   
4  35.88379   -5.91636  61e9f41bb937134a3c4c0087  ...          False   

   hour_of_day_15  hour_of_day_16  hour_of_day_17  hour_of_day_18  \
0           False           False           False    

  super().__init__(**kwargs)


Epoch 1/100
[1m17520/17520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m156s[0m 9ms/step - loss: 0.0045 - val_loss: 2.5583e-04
Epoch 2/100
[1m17520/17520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 9ms/step - loss: 9.2231e-04 - val_loss: 2.9519e-04
Epoch 3/100
[1m17520/17520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 8ms/step - loss: 8.6831e-04 - val_loss: 3.4145e-04
Epoch 4/100
[1m17520/17520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 9ms/step - loss: 8.5906e-04 - val_loss: 2.4289e-04
Epoch 5/100
[1m17520/17520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m151s[0m 9ms/step - loss: 8.4516e-04 - val_loss: 2.3921e-04
Epoch 6/100
[1m17520/17520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 9ms/step - loss: 8.4418e-04 - val_loss: 2.4918e-04
Epoch 7/100
[1m17520/17520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m158s[0m 9ms/step - loss: 8.3263e-04 - val_loss: 2.5516e-04
Epoch 8/100
[1m17520/17520[0m [32m━━━━━━━━━━━━━━━━━━━━[

  last_positions = ais_train_scaled.groupby('vesselId').apply(lambda x: x.sort_values('elapsed_time').tail(time_step))


[1m1617/1617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
   ID  longitude_predicted  latitude_predicted
0   0           -84.068459           31.791695
1   1           122.680740           14.680503
2   2            10.316170           37.626579
3   3           175.012146          -40.195789
4   4            -5.778167           49.089722
Submission DataFrame shape: (51739, 3)
