# Imports, CSV reading, and function definitions

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from utils_rnn import add_xy_and_deltas
from utils_rnn import split_train_val_test
from utils_rnn import create_sequences
from utils_rnn import autoregressive_predict
from utils_rnn import reconstruct_positions
from utils_rnn import plot_input_and_predictions
from utils_rnn import folium_plot_trip_with_prediction
from utils_rnn import mass_xy_to_latlon
from utils_rnn import compute_errors
from utils_rnn import haversine

# --------------------------
# Configurable parameters
# --------------------------
# Path of the CSV file read with pd.read_csv in the data-loading cell.
CSV_PATH = "data/ais_data_5min_clean.csv"   # <-- replace with your file
# Column names handed to create_sequences as model inputs and targets.
INPUT_FEATURES = ["dx", "dy"]  # easy to change later
OUTPUT_FEATURES = ["dx", "dy"] # identical for autoregression
# NOTE(review): TEST_SIZE, VAL_SIZE and RANDOM_STATE are not passed to
# split_train_val_test(df) in the cells of this notebook, so editing them may
# not change the split — confirm against utils_rnn.
TEST_SIZE = 0.2
VAL_SIZE = 0.1
RANDOM_STATE = 42

# Train / Validation / Test split

In [None]:
from sklearn.preprocessing import StandardScaler

# 1. Load data
df = pd.read_csv(CSV_PATH)

# Expect columns: MMSI, segment, lat, lon, timestamp (optional)
# NOTE(review): the evaluation cell reads df[["Latitude", "Longtitude"]] (sic),
# so the position columns must carry exactly those names — confirm the full
# schema against utils_rnn.add_xy_and_deltas.
print("Loaded data:", df.shape)

# 2. Convert lat/lon to x/y and compute deltas
df = add_xy_and_deltas(df)

# 3. Split into train/val/test
train_df, val_df, test_df = split_train_val_test(df)

print("Train size:", train_df.shape)
print("Val size:", val_df.shape)
print("Test size:", test_df.shape)

# 4. Build fixed-length input windows and their targets.
# Only the test metadata is kept; the evaluation cell uses it to look up rows in df.
SEQ_LEN = 10
X_train, y_train, _         = create_sequences(train_df, INPUT_FEATURES, OUTPUT_FEATURES, seq_len=SEQ_LEN)
X_val, y_val, _             = create_sequences(val_df, INPUT_FEATURES, OUTPUT_FEATURES, seq_len=SEQ_LEN)
X_test, y_test, test_meta   = create_sequences(test_df, INPUT_FEATURES, OUTPUT_FEATURES, seq_len=SEQ_LEN)

num_sequences, seq_len, num_features = X_train.shape
# One row per (sequence, timestep): the layout StandardScaler expects.
X_train_flat = X_train.reshape(-1, num_features)

# 5. Optional standardisation.
# WITHOUT TRANSFORM IT WORKS BEST, so scaling is off by default. The previous
# version kept this code inside a string literal and referred to undefined
# *_original arrays; it is now a real, opt-in branch.
# NOTE: when enabled, the model predicts in scaled space, so predictions must be
# passed through scaler.inverse_transform before positions are reconstructed.
USE_SCALER = False
if USE_SCALER:
    # Targets reuse the input scaler, which is only valid when both use the same columns.
    assert INPUT_FEATURES == OUTPUT_FEATURES, "target scaling reuses the input scaler"

    # Fit on training data only to avoid leaking validation/test statistics.
    scaler = StandardScaler().fit(X_train_flat)

    def _standardize(arr):
        """Scale ``arr`` feature-wise with the train-fitted scaler, keeping its shape."""
        return scaler.transform(arr.reshape(-1, num_features)).reshape(arr.shape)

    X_train = _standardize(X_train)
    X_val = _standardize(X_val)
    X_test = _standardize(X_test)

    y_train = _standardize(y_train)
    y_val = _standardize(y_val)
    y_test = _standardize(y_test)

print("Feature shapes:", X_train.shape, y_train.shape)


Loaded data: (71321, 9)
Train size: (52983, 13)
Val size: (5339, 13)
Test size: (12999, 13)
Feature shapes: (50543, 10, 2) (50543, 2)


# RNN

In [None]:
%matplotlib inline
# Import deep learning libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

# Report the TensorFlow version in use and the GPU devices it can see.
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")


2025-11-28 21:16:14.016794: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-28 21:16:14.897234: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2025-11-28 21:16:14.900599: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


TensorFlow version: 2.13.1
GPU Available: []


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2

def build_rnn_model(input_shape, output_dim):
    """Assemble and compile the stacked SimpleRNN regression network.

    Args:
        input_shape: Shape of a single input sample; forwarded as
            ``input_shape`` to the first SimpleRNN layer.
        output_dim: Number of units in the final (linear) Dense layer.

    Returns:
        A compiled ``Sequential`` model using Adam (learning rate 1e-4),
        MSE loss and MAE as metric.
    """
    layer_stack = [
        # Recurrent part: the first layer returns the whole sequence so the
        # second one can consume it and emit only its final state.
        SimpleRNN(256, activation='relu', return_sequences=True, input_shape=input_shape),
        Dropout(0.3),
        SimpleRNN(128, activation='relu', return_sequences=False),
        # Dense head for the nonlinear mapping.
        BatchNormalization(),
        Dense(64, activation='relu'),
        Dropout(0.3),
        # Linear regression output with a light L2 penalty on its kernel.
        Dense(output_dim, kernel_regularizer=l2(1e-4)),
    ]

    network = Sequential(layer_stack)
    network.compile(
        loss='mse',
        metrics=['mae'],
        optimizer=Adam(learning_rate=1e-4),
    )
    return network


In [None]:
# Read the network dimensions off the prepared training arrays.
n_timesteps, n_features = X_train.shape[1], X_train.shape[2]
n_targets = y_train.shape[1]

# Build the compiled model and show its layer table.
model = build_rnn_model((n_timesteps, n_features), n_targets)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 simple_rnn (SimpleRNN)      (None, 10, 256)           66304     
                                                                 
 dropout (Dropout)           (None, 10, 256)           0         
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 128)               49280     
                                                                 
 batch_normalization (Batch  (None, 128)               512       
 Normalization)                                                  
                                                                 
 dense (Dense)               (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                        

In [None]:
import signal
from tensorflow.keras.callbacks import Callback

class GracefulInterrupt(Callback):
    """Keras callback that turns SIGINT (Ctrl-C / kernel interrupt) into a clean stop.

    While training is running, SIGINT does not raise ``KeyboardInterrupt``;
    the current epoch is allowed to finish and training stops afterwards.

    The handler is installed when training begins and the previous handler is
    restored when training ends. Installing it in ``__init__`` and never
    restoring it (as before) left the notebook kernel unable to interrupt any
    later cell.
    """

    def __init__(self):
        super().__init__()
        # Set by the signal handler, read at the end of every epoch.
        self.stop_training = False
        # SIGINT handler that was active before training started.
        self._previous_handler = None
        self._handler_installed = False

    def handle_sigint(self, signum, frame):
        """Signal handler: remember the request, do not interrupt the running epoch."""
        print("\nSIGINT received: Training will stop after this epoch.\n")
        self.stop_training = True

    def on_train_begin(self, logs=None):
        """Reset the stop flag and take over SIGINT for the duration of training."""
        # Reset so a reused callback does not stop the next fit after one epoch.
        self.stop_training = False
        self._previous_handler = signal.signal(signal.SIGINT, self.handle_sigint)
        self._handler_installed = True

    def on_train_end(self, logs=None):
        """Give SIGINT back to whoever handled it before training."""
        if not self._handler_installed:
            return
        previous = self._previous_handler
        # signal.signal() reports None for handlers not installed from Python;
        # those cannot be re-installed, so fall back to the default action.
        signal.signal(signal.SIGINT, signal.SIG_DFL if previous is None else previous)
        self._previous_handler = None
        self._handler_installed = False

    def on_epoch_end(self, epoch, logs=None):
        """Ask Keras to stop once the epoch in which SIGINT arrived has finished."""
        if self.stop_training:
            print(f"Stopping at epoch {epoch+1}.")
            self.model.stop_training = True


# Training callbacks: stop when validation loss stalls (restoring the best
# weights), halve the learning rate on shorter plateaus, and allow a clean
# manual interrupt.
callbacks = [
    EarlyStopping(restore_best_weights=True, patience=10, monitor='val_loss'),
    ReduceLROnPlateau(patience=5, factor=0.5, monitor='val_loss'),
    GracefulInterrupt(),
]

# Fit on the training windows, validating on the held-out validation windows.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=100,
    validation_data=(X_val, y_val),
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100


# Evaluation

In [35]:
# Number of autoregressive steps predicted (and scored) per test sequence.
horizon = 10
# Highest test-sequence id that is evaluated; ids 0..MAX_EVAL_SEQ_ID inclusive
# are considered, which keeps the one-by-one prediction loop affordable.
MAX_EVAL_SEQ_ID = 1000
all_stats = []  # NOTE(review): never filled in this cell — kept in case later cells use it

# container for per-step errors across all sequences
step_errors = [[] for _ in range(horizon)]
# Sequences dropped because df does not hold a full ground-truth window for them.
skipped_sequences = 0

for seq_id in range(len(X_test)):
    if seq_id > MAX_EVAL_SEQ_ID:
        break
    if seq_id % 50 == 0:
        print(seq_id, "out of", len(X_test))

    # --- Ground-truth window ---
    # The true positions are looked up in df by label, `horizon` consecutive
    # labels starting at the target row. Windows that run past the labels in df
    # would raise KeyError in df.loc, so they are skipped before any prediction.
    # NOTE(review): the window is not checked to stay inside one vessel/segment;
    # that may explain the very large std in the recorded output — confirm
    # against how create_sequences builds test_meta.
    target_indices = [test_meta[seq_id]["target_index"] + k for k in range(horizon)]
    if not all(idx in df.index for idx in target_indices):
        skipped_sequences += 1
        continue

    # --- Predict horizon steps ---
    preds = autoregressive_predict(model, X_test[seq_id], horizon)

    # --- Reconstruct positions ---
    start_idx = test_meta[seq_id]["end_index"]  # last input row
    start_xy = df.loc[start_idx, ["x","y"]].values
    # Drop the first element: it is the starting point (last true position), not a prediction.
    pred_positions_xy = reconstruct_positions(preds, start_xy)[1:]

    # --- Convert to lat/lon ---
    pred_positions_latlon = mass_xy_to_latlon(pred_positions_xy)
    true_positions_latlon = df.loc[target_indices, ["Latitude","Longtitude"]].values

    # --- Compute stats per step ---
    # compute haversine distance for each step
    for step in range(horizon):
        err = haversine(tuple(true_positions_latlon[step]), tuple(pred_positions_latlon[step]))
        step_errors[step].append(err)

if skipped_sequences:
    print(f"Skipped {skipped_sequences} sequences without a complete {horizon}-step ground-truth window")

# --- Aggregate error per step ---
print("\nStep-wise error statistics (meters):")
for step in range(horizon):
    errs = np.array(step_errors[step])
    if errs.size == 0:
        # Nothing was evaluated; avoid nan statistics and numpy RuntimeWarnings.
        print(f"Step {step+1}: no evaluated sequences")
        continue
    mean_e = np.mean(errs)
    std_e = np.std(errs)
    med_e = np.median(errs)
    print(f"Step {step+1}: mean={mean_e:.2f}, std={std_e:.2f}, median={med_e:.2f}, n={len(errs)}")



0 out of 12309
Object was never used (type <class 'tensorflow.python.ops.tensor_array_ops.TensorArray'>):
<tensorflow.python.ops.tensor_array_ops.TensorArray object at 0x7256c9a05970>
If you want to mark it as used call its "mark_used()" method.
It was originally created here:
  File "/home/chris/ml_env/lib/python3.8/site-packages/keras/src/backend.py", line 5161, in <genexpr>
    ta.write(ta_index_to_write, out)  File "/home/chris/ml_env/lib/python3.8/site-packages/tensorflow/python/util/tf_should_use.py", line 288, in wrapped
50 out of 12309
100 out of 12309
150 out of 12309
200 out of 12309
250 out of 12309
300 out of 12309
350 out of 12309
400 out of 12309
450 out of 12309
500 out of 12309
550 out of 12309
600 out of 12309
650 out of 12309
700 out of 12309
750 out of 12309
800 out of 12309
850 out of 12309
900 out of 12309
950 out of 12309
1000 out of 12309

Step-wise error statistics (meters):
Step 1: mean=2153.72, std=10714.67, median=1337.28, n=1001
Step 2: mean=4219.32, std=149