In [5]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dropout, Dense
from tensorflow.keras.callbacks import EarlyStopping

"""
    Load and Preprocess Data
"""

# Read ais_train.csv
ais_train = pd.read_csv("ais_train.csv", sep='|')
ais_train['time'] = pd.to_datetime(ais_train['time'])
ais_train['elapsed_time'] = (ais_train['time'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')

# Filter out unrealistic speeds
ais_train = ais_train[ais_train['sog'] < 25]

# Map 'navstat' values
ais_train['navstat'] = ais_train['navstat'].replace(8, 0)  # Under way sailing -> Under way using engine
ais_train = ais_train[~((ais_train['navstat'].isin([1, 5])) & (ais_train['sog'] > 0))]
ais_train = ais_train[~((ais_train['navstat'] == 2) & (ais_train['sog'] > 5))]

# One-hot encode 'navstat'
ais_train = pd.get_dummies(ais_train, columns=['navstat'])

# Merge with vessel data
vessels = pd.read_csv("vessels.csv", sep='|')[['shippingLineId', 'vesselId']]
vessels['new_id'] = range(len(vessels))
vessel_id_to_new_id = dict(zip(vessels['vesselId'], vessels['new_id']))
ais_train = pd.merge(ais_train, vessels, on='vesselId', how='left')

# Define input and target features
input_features = ['latitude', 'longitude', 'sog', 'cog', 'heading', 'elapsed_time']
navstat_columns = [col for col in ais_train.columns if col.startswith('navstat_')]
input_features.extend(navstat_columns)
target_columns = ['latitude', 'longitude']

# Initialize scalers
scaler_input = MinMaxScaler()
scaler_output = MinMaxScaler()

# Scale input and output features
input_data = scaler_input.fit_transform(ais_train[input_features])
output_data = scaler_output.fit_transform(ais_train[target_columns])

# Add scaled features back to DataFrame
ais_train_scaled = ais_train.copy()
ais_train_scaled[input_features] = input_data
ais_train_scaled[target_columns] = output_data

# Function to create sequences per vessel
def create_sequences_per_vessel(df, time_steps):
    X, y = [], []
    vessel_ids = df['vesselId'].unique()
    for vessel_id in vessel_ids:
        vessel_data = df[df['vesselId'] == vessel_id].sort_values('elapsed_time')
        inputs = vessel_data[input_features].values
        targets = vessel_data[target_columns].values
        if len(inputs) < time_steps:
            continue  # Skip sequences shorter than time_steps
        for i in range(len(inputs) - time_steps):
            X.append(inputs[i:i + time_steps])
            y.append(targets[i + time_steps])
    return np.array(X), np.array(y)

# Create sequences
time_step = 10
X, y = create_sequences_per_vessel(ais_train_scaled, time_step)

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, shuffle=True)

"""
    Define and Train the Model
"""

# Define the LSTM Model
model = Sequential()
model.add(LSTM(64, return_sequences=True, input_shape=(time_step, X_train.shape[2])))
model.add(Dropout(0.2))
model.add(LSTM(32))
model.add(Dropout(0.2))
model.add(Dense(y_train.shape[1]))

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Train the model
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

model.fit(
    X_train, y_train,
    epochs=100,
    batch_size=64,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
    verbose=1
)

"""
    Prepare Test Data and Make Predictions
"""

# Load test data
ais_test = pd.read_csv("ais_test.csv")
ais_test['time'] = pd.to_datetime(ais_test['time'])
ais_test['elapsed_time'] = (ais_test['time'] - pd.Timestamp("1970-01-01")) // pd.Timedelta('1s')
ais_test['new_id'] = ais_test['vesselId'].map(vessel_id_to_new_id)

# One-hot encode 'navstat' in test data (if available)
# If 'navstat' is not available in test data, you may need to handle this accordingly

# Merge with last known positions from training data
# Get the last 'time_step' records for each vessel from training data
last_positions = ais_train_scaled.groupby('vesselId').apply(lambda x: x.sort_values('elapsed_time').tail(time_step))
last_positions = last_positions.reset_index(drop=True)

# Prepare sequences for each vessel in the test set
vessel_sequences = {}
for vessel_id in ais_test['vesselId'].unique():
    if vessel_id in last_positions['vesselId'].values:
        vessel_data = last_positions[last_positions['vesselId'] == vessel_id]
        seq = vessel_data[input_features].values
        if len(seq) < time_step:
            # Pad sequences if necessary
            seq = np.pad(seq, ((time_step - len(seq), 0), (0, 0)), mode='constant')
        vessel_sequences[vessel_id] = seq
    else:
        # If no data available, create a default sequence (e.g., zeros or mean values)
        seq = np.zeros((time_step, len(input_features)))
        vessel_sequences[vessel_id] = seq

# Create test sequences
X_test = []
for idx, row in ais_test.iterrows():
    vessel_id = row['vesselId']
    seq = vessel_sequences[vessel_id]
    X_test.append(seq)
X_test = np.array(X_test)

# Make predictions
y_pred = model.predict(X_test)

# Inverse transform predictions
y_pred_inverse = scaler_output.inverse_transform(y_pred)

"""
    Prepare Submission File
"""

# Prepare submission
submission_df = pd.DataFrame({
    'ID': ais_test['ID'].values,
    'longitude_predicted': y_pred_inverse[:, target_columns.index('longitude')],
    'latitude_predicted': y_pred_inverse[:, target_columns.index('latitude')]
})

# Ensure the submission file has the required columns
submission_df = submission_df[['ID', 'longitude_predicted', 'latitude_predicted']]

# Save submission file
submission_df.to_csv("submission.csv", index=False)

# Display submission
print(submission_df.head())
print(f"Submission DataFrame shape: {submission_df.shape}")

  super().__init__(**kwargs)


Epoch 1/100
[1m17520/17520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 6ms/step - loss: 0.0039 - val_loss: 2.6323e-04
Epoch 2/100
[1m17520/17520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 6ms/step - loss: 8.8203e-04 - val_loss: 2.9042e-04
Epoch 3/100
[1m17520/17520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m107s[0m 6ms/step - loss: 8.5491e-04 - val_loss: 2.2642e-04
Epoch 4/100
[1m17520/17520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m111s[0m 6ms/step - loss: 8.3340e-04 - val_loss: 2.3481e-04
Epoch 5/100
[1m17520/17520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m108s[0m 6ms/step - loss: 8.2362e-04 - val_loss: 2.3615e-04
Epoch 6/100
[1m17520/17520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 6ms/step - loss: 8.1254e-04 - val_loss: 2.5413e-04
Epoch 7/100
[1m17520/17520[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m103s[0m 6ms/step - loss: 8.3622e-04 - val_loss: 2.8646e-04
Epoch 8/100
[1m17520/17520[0m [32m━━━━━━━━━━━━━━━━━━━━[

  last_positions = ais_train_scaled.groupby('vesselId').apply(lambda x: x.sort_values('elapsed_time').tail(time_step))


[1m1617/1617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 1ms/step
   ID  longitude_predicted  latitude_predicted
0   0           -82.688011           30.940355
1   1           119.445145           14.739825
2   2            11.130962           38.543983
3   3           168.906738          -42.034435
4   4            -2.105234           48.945454
Submission DataFrame shape: (51739, 3)
