In [None]:
import pandas as pd
from utils import create_mmsi_dict_from_file
from utils import filter_stationary_ships

In [None]:
# Build the lookup that a later cell uses to translate MMSI strings into the 'Type' column.
file_name = "data/mmsi_type.txt"
mmsi_map = create_mmsi_dict_from_file(file_name)


# Only announce success when the helper returned a non-empty (truthy) result;
# an empty or None result is silent here.
if mmsi_map:
    print("--- Successfully created dictionary ---")

In [None]:
# Load the combined AIS records from CSV; the trailing expression previews the first rows.
df = pd.read_csv("data/ais_combined.csv")
df.head()

In [None]:
# Derive a typed copy of the raw frame: each MMSI is stringified and looked up in
# mmsi_map, and the result is stored in a 'Type' column. assign() returns a new
# frame, so `df` itself is not modified. MMSIs absent from the map become NaN.
df_with_types = df.assign(Type=df['MMSI'].astype(str).map(mmsi_map))
df_with_types.head()

In [None]:
# Quick inventory of the dataset: distinct vessel ids (from the raw frame) and
# distinct values of the mapped 'Type' column (NaN appears here for unmapped MMSIs).
unique_mmsi = df['MMSI'].unique()
unique_types = df_with_types['Type'].unique()

print("Total unique MMSI count:", len(unique_mmsi))
print("Unique ship types in dataset:", unique_types)

In [None]:
# Ship types kept for the analysis: cargo ships and tankers, including hazard classes.
# NOTE(review): 'Cargo ship (HAZ-C)' is not listed although 'Tanker (HAZ-C)' is —
# confirm whether that type is absent from the data or was omitted by mistake.
allowed_type = [
    'Cargo ship', 'Cargo ship (HAZ-A)', 'Cargo ship (HAZ-B)', 'Cargo ship (HAZ-D)',
    'Tanker', 'Tanker (HAZ-A)', 'Tanker (HAZ-B)', 'Tanker (HAZ-C)', 'Tanker (HAZ-D)',
]

# Keep only rows whose mapped type is allowed; rows with an unmapped (NaN) type are dropped too.
df_cargo = df_with_types[df_with_types['Type'].isin(allowed_type)]

# 'Type' was only needed for the filter above. `columns=` already identifies the
# axis, so the redundant `axis=1` argument has been removed.
df_cargo = df_cargo.drop(columns=["Type"])

df_cargo_filtered = filter_stationary_ships(df_cargo)  # This df has dropped stationary ships
# NOTE(review): the preprocessing cell further down reads `df_cargo`, not
# `df_cargo_filtered` — confirm which of the two is meant to feed the pipeline.

# Preview as the cell's last expression so it is actually rendered (the original
# mid-cell `df_cargo.head()` produced no output).
df_cargo.head()

In [None]:
import numpy as np
from utils import segment_and_renumber, haversine_m

# Configuration parameters
GAP_BREAK_MIN = 10          # minutes to start a new segment
INTERP_LIMIT_MIN = 10        # interpolate gaps up to 10 minutes
MAX_DISTANCE_M = 3000       # ~97 knots max distance per minute
MAX_SOG_KNOTS = 40          # maximum speed over ground
OUTPUT_PATH = "data/ais_data_1min_clean.csv"
NUM_COLS = ["SOG", "COG", "Longtitude", "Latitude"]

print("="*60)
print("STEP 1: Data Preprocessing")
print("="*60)

# Parse timestamps BEFORE sorting. Sorting first would order the raw column
# (lexicographically if it holds strings), and re-running the cell would then
# sort real datetimes instead, so the result depended on how often it was run.
# NOTE(review): this cell consumes `df_cargo`; the `df_cargo_filtered` frame built
# in an earlier cell is not used here — confirm that is intended.
df_cargo = df_cargo.assign(
    Timestamp=pd.to_datetime(df_cargo["Timestamp"], errors="coerce")
)

# errors="coerce" turns unparseable values into NaT; such rows cannot be placed
# on a time axis, so drop them explicitly rather than letting them reach the
# segmentation / resampling steps.
unparsed_rows = int(df_cargo["Timestamp"].isna().sum())
if unparsed_rows:
    print(f"Dropping {unparsed_rows} rows with missing/unparseable timestamps")
    df_cargo = df_cargo.dropna(subset=["Timestamp"])

# Sort data by MMSI and Timestamp
df_cargo = df_cargo.sort_values(["MMSI", "Timestamp"]).reset_index(drop=True)

print(f"Initial data shape: {df_cargo.shape}")
print(f"Data types:\n{df_cargo.dtypes}\n")

# Segment trajectories based on time gaps
print("Segmenting trajectories...")
df = segment_and_renumber(df_cargo, GAP_BREAK_MIN)

# Downsample & interpolate per segment
print("Downsampling to 1-minute intervals and interpolating...")
results = []

for (mmsi, seg), g in df.groupby(["MMSI", "Segment"], observed=True):
    g = g.set_index("Timestamp")

    # Downsample to 1-minute intervals (keep last observation)
    g1 = g.resample("1min").last()

    # Interpolate numeric columns for short gaps only
    # NOTE(review): COG is interpolated linearly like the other columns. If it is
    # a course in degrees, values straddling the 0/360 wrap would be interpolated
    # through ~180 — TODO confirm and consider circular interpolation.
    g1[NUM_COLS] = g1[NUM_COLS].interpolate(
        method="time", limit=INTERP_LIMIT_MIN, limit_direction="both"
    )

    # Drop minutes still NaN (beyond real range or long gaps)
    g1 = g1.dropna(subset=NUM_COLS, how="all")

    # A segment whose numeric columns are entirely missing leaves nothing to
    # process; skip it instead of failing on lat[0] below.
    if g1.empty:
        continue

    # Fill identifiers (resampling leaves them NaN on newly created minutes)
    g1["MMSI"] = mmsi
    g1["Segment"] = seg

    # Calculate distance and speed between consecutive points
    lat = g1["Latitude"].to_numpy()
    lon = g1["Longtitude"].to_numpy()
    lat_prev, lon_prev = np.roll(lat, 1), np.roll(lon, 1)
    # np.roll wraps the last point to the front; replace it with the first point
    # itself so the first row is compared against its own position.
    lat_prev[0], lon_prev[0] = lat[0], lon[0]

    g1["distance_m"] = haversine_m(lat, lon, lat_prev, lon_prev)
    g1.loc[g1.index[0], "distance_m"] = 0.0
    # Rows are one minute apart, so metres per minute / 60 gives metres per second.
    g1["speed_mps_track"] = g1["distance_m"] / 60.0

    # Filter unrealistic movement or SOG (rows with NaN SOG fail the comparison and are dropped)
    g1 = g1[(g1["distance_m"] < MAX_DISTANCE_M) & (g1["SOG"] <= MAX_SOG_KNOTS)]

    results.append(g1)

# Combine all segments
if not results:
    # pd.concat([]) would raise a ValueError as well, but with a message that
    # does not point at the cause.
    raise ValueError(
        "No trajectory segments survived preprocessing; check the input data and thresholds."
    )
df_clean = pd.concat(results).reset_index()

print("="*60)
print("STEP 2: Data Quality Check")
print("="*60)
print(f"Rows before cleaning: {len(df_clean)}")

# Check for missing data
missing = df_clean[df_clean[["SOG", "COG", "Latitude", "Longtitude"]].isna().any(axis=1)]
# Guard the percentage against an empty frame (every row filtered out above).
missing_pct = len(missing) / len(df_clean) * 100 if len(df_clean) else 0.0
print(f"Rows with missing numeric data: {len(missing)} ({missing_pct:.2f}%)")
print(f"MMSI with missing data: {missing['MMSI'].nunique()}")

# Remove rows with missing critical data
df_clean = df_clean.dropna(subset=["SOG", "COG", "Latitude", "Longtitude", "MMSI", "Segment"])
print(f"Rows after cleaning: {len(df_clean)}")

# Verify time gaps: minutes between consecutive rows within each (MMSI, Segment).
# Computed once and reused for both the maximum and the > 5 minute check.
gap_minutes = (
    df_clean.groupby(["MMSI", "Segment"])["Timestamp"]
    .diff().dt.total_seconds().div(60)
)
max_gap = gap_minutes.max()
print(f"Maximum time gap in cleaned data: {max_gap:.2f} minutes")
has_large_gaps = bool((gap_minutes > 5).any())
print(f"Has gaps > 5 minutes: {has_large_gaps}")

print("\n" + "="*60)
print("STEP 3: Final Dataset Summary")
print("="*60)
print(f"Total rows: {len(df_clean)}")
print(f"Unique vessels (MMSI): {df_clean['MMSI'].nunique()}")
print(f"Total segments: {df_clean.groupby(['MMSI', 'Segment']).ngroups}")
print(f"Average segment length: {df_clean.groupby(['MMSI', 'Segment']).size().mean():.1f} minutes")
print(f"Columns: {list(df_clean.columns)}")

# Save cleaned data
df_clean.to_csv(OUTPUT_PATH, index=False)
print(f"\nCleaned data saved to: {OUTPUT_PATH}")


In [None]:
%matplotlib inline
# Import deep learning libraries
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# Record the runtime environment: TensorFlow version and the GPU devices it reports
# (an empty list is printed when none are found).
print(f"TensorFlow version: {tf.__version__}")
print(f"GPU Available: {tf.config.list_physical_devices('GPU')}")


In [None]:
# Configuration for GRU model
# These constants are passed to prepare_training_data() in a later cell.
SEQUENCE_LENGTH = 10  # Use 10 minutes of history to predict next minute
FEATURES = ["Latitude", "Longtitude", "SOG", "COG"]  # Input features
TARGET_FEATURES = ["Latitude", "Longtitude", "SOG", "COG"]  # What to predict
# Segments with fewer rows than this are skipped; the +5 margin means every
# retained segment yields at least 5 input/target pairs.
MIN_SEGMENT_LENGTH = SEQUENCE_LENGTH + 5  # Minimum segment length to use

print(f"Sequence length: {SEQUENCE_LENGTH} minutes")
print(f"Input features: {FEATURES}")
print(f"Target features: {TARGET_FEATURES}")


In [None]:
def create_sequences(data, sequence_length, features, target_features):
    """
    Create sliding-window sequences for time series prediction.

    Window ``i`` consists of rows ``i .. i + sequence_length - 1`` of the
    feature columns, and its target is row ``i + sequence_length`` of the
    target columns. Rows are used in the order they appear in ``data``; the
    caller is responsible for sorting them chronologically.

    Parameters:
    -----------
    data : pd.DataFrame
        Input dataframe for a single segment
    sequence_length : int
        Number of timesteps to use as input
    features : list
        List of feature column names to use as input
    target_features : list
        List of feature column names to predict

    Returns:
    --------
    X : np.array
        Input sequences of shape (n_samples, sequence_length, n_features)
    y : np.array
        Target values of shape (n_samples, n_target_features)

    ``n_samples`` is ``len(data) - sequence_length``. When ``data`` has no more
    than ``sequence_length`` rows, correctly shaped empty arrays
    (``n_samples == 0``) are returned so the results can still be concatenated
    with those of other segments.
    """
    data_values = data[features].values
    target_values = data[target_features].values

    n_windows = len(data_values) - sequence_length
    if n_windows <= 0:
        # Not enough rows for a single (window, target) pair. Return empty
        # arrays that keep the trailing dimensions instead of shape-(0,) arrays.
        empty_X = np.empty((0, sequence_length) + data_values.shape[1:], dtype=data_values.dtype)
        empty_y = np.empty((0,) + target_values.shape[1:], dtype=target_values.dtype)
        return empty_X, empty_y

    # Row i of window_idx holds the row positions i, i+1, ..., i+sequence_length-1,
    # so a single fancy-indexing step gathers all windows at once.
    window_idx = np.arange(n_windows)[:, None] + np.arange(sequence_length)[None, :]
    X = data_values[window_idx]

    # The target of window i is the row immediately after it.
    y = target_values[sequence_length:].copy()

    return X, y


def prepare_training_data(df, sequence_length, features, target_features, min_segment_length):
    """
    Prepare training data from the entire dataset.

    The frame is grouped by ("MMSI", "Segment"); each group with at least
    ``min_segment_length`` rows is sorted by "Timestamp" and turned into
    sliding-window sequences with ``create_sequences``. Sequences never span
    two groups.

    Parameters:
    -----------
    df : pd.DataFrame
        Cleaned AIS data with "MMSI", "Segment" and "Timestamp" columns plus
        the feature/target columns
    sequence_length : int
        Number of timesteps for input sequences
    features : list
        Input feature names
    target_features : list
        Target feature names
    min_segment_length : int
        Minimum segment length to include

    Returns:
    --------
    X : np.array
        All input sequences
    y : np.array
        All target values
    segment_info : list
        One dict per segment that contributed sequences, with keys 'mmsi',
        'segment', 'length' (rows in the segment) and 'sequences' (number of
        sequences it produced). Entries are in the same order in which the
        segments' sequences were concatenated into X and y, so the
        'sequences' counts can be used to map each row of X back to its
        segment.

    Raises:
    -------
    ValueError
        If no segment yields any sequence.
    """
    X_all, y_all = [], []
    segment_info = []

    for (mmsi, seg), group in df.groupby(["MMSI", "Segment"]):
        # Skip short segments
        if len(group) < min_segment_length:
            continue

        # Sort by timestamp to ensure correct order
        group = group.sort_values("Timestamp")

        # Create sequences for this segment
        X_seg, y_seg = create_sequences(group, sequence_length, features, target_features)

        # Keep X_all / y_all / segment_info in lockstep: a segment is recorded
        # only if it actually contributed sequences.
        if len(X_seg) > 0:
            X_all.append(X_seg)
            y_all.append(y_seg)
            segment_info.append({
                'mmsi': mmsi,
                'segment': seg,
                'length': len(group),
                'sequences': len(X_seg)
            })

    if not X_all:
        # np.concatenate([]) would raise a ValueError too, but without saying why.
        raise ValueError(
            "No segment produced any sequences: every (MMSI, Segment) group has fewer than "
            f"min_segment_length={min_segment_length} rows or no more than "
            f"sequence_length={sequence_length} rows."
        )

    # Concatenate all sequences
    X = np.concatenate(X_all, axis=0)
    y = np.concatenate(y_all, axis=0)

    return X, y, segment_info


# Status message emitted once the function definitions in this cell have executed.
print("Data preparation functions defined successfully.")


In [None]:
# Build the model inputs/targets from the cleaned data and report their sizes.
print("="*60)
print("Preparing Training Data")
print("="*60)

# Prepare sequences
# X holds the input windows, y the matching targets, and segment_info one
# dict per segment that contributed sequences.
X, y, segment_info = prepare_training_data(
    df_clean, 
    SEQUENCE_LENGTH, 
    FEATURES, 
    TARGET_FEATURES, 
    MIN_SEGMENT_LENGTH
)

print(f"Total sequences created: {len(X)}")
print(f"Input shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"Segments used: {len(segment_info)}")
print(f"Average sequences per segment: {len(X) / len(segment_info):.1f}")

# Display some statistics
# 'length' is the row count of each used segment (one row per minute after resampling).
segment_lengths = [s['length'] for s in segment_info]
print(f"\nSegment statistics:")
print(f"  Min length: {min(segment_lengths)} minutes")
print(f"  Max length: {max(segment_lengths)} minutes")
print(f"  Mean length: {np.mean(segment_lengths):.1f} minutes")
print(f"  Median length: {np.median(segment_lengths):.1f} minutes")


In [None]:
# Normalize the data
print("="*60)
print("Normalizing Data")
print("="*60)

# StandardScaler works on 2-D input, so collapse (samples, timesteps, features)
# into (samples * timesteps, features) and standardize per feature column.
n_samples, n_timesteps, n_features = X.shape
X_reshaped = X.reshape(-1, n_features)

# Fit on every sequence in X, then restore the original 3-D layout.
# NOTE(review): the scalers are fitted on ALL sequences, before the
# train/val/test split that happens in a later cell, so validation and test
# statistics influence the normalization. Consider fitting on the training
# portion only.
scaler_X = StandardScaler()
scaler_X.fit(X_reshaped)
X_normalized = scaler_X.transform(X_reshaped).reshape(n_samples, n_timesteps, n_features)

# Targets are already 2-D and get their own scaler.
scaler_y = StandardScaler()
scaler_y.fit(y)
y_normalized = scaler_y.transform(y)

print(f"Input data normalized: {X_normalized.shape}")
print(f"Target data normalized: {y_normalized.shape}")
print(f"\nFeature means: {scaler_X.mean_}")
print(f"Feature stds: {scaler_X.scale_}")


In [None]:
# Split data into train, validation, and test sets BY SHIP (MMSI)
print("="*60)
print("Splitting Data by Ships (MMSI)")
print("="*60)

# Get unique MMSIs from segment_info. Sorted so the input order to
# train_test_split (and therefore the seeded split) is the same on every run;
# plain set iteration order is not guaranteed.
unique_mmsis = sorted(set(seg['mmsi'] for seg in segment_info))
n_ships = len(unique_mmsis)

print(f"Total unique ships: {n_ships}")

# Split ships into train (64%), val (16%), test (20%)
# First split: 80% train+val, 20% test
mmsi_temp, mmsi_test = train_test_split(
    unique_mmsis, test_size=0.2, random_state=42, shuffle=True
)

# Second split: 80% train, 20% val (of the temp set)
mmsi_train, mmsi_val = train_test_split(
    mmsi_temp, test_size=0.2, random_state=42, shuffle=True
)

print(f"\nShips in training set: {len(mmsi_train)} ({len(mmsi_train)/n_ships*100:.1f}%)")
print(f"Ships in validation set: {len(mmsi_val)} ({len(mmsi_val)/n_ships*100:.1f}%)")
print(f"Ships in test set: {len(mmsi_test)} ({len(mmsi_test)/n_ships*100:.1f}%)")

# Create sets of MMSIs for fast lookup
mmsi_train_set = set(mmsi_train)
mmsi_val_set = set(mmsi_val)
mmsi_test_set = set(mmsi_test)

# Label every SEQUENCE with the ship it came from. segment_info has one entry
# per segment, in the same order in which the segments' sequences were
# concatenated into X / y, and each entry records how many sequences the
# segment produced. Repeating each MMSI by that count gives one label per row
# of X_normalized. (Indexing X_normalized with positions in segment_info, as
# before, selected the wrong rows and discarded most of the data.)
seq_counts = [seg['sequences'] for seg in segment_info]
seq_mmsi = np.repeat([seg['mmsi'] for seg in segment_info], seq_counts)
assert len(seq_mmsi) == X_normalized.shape[0], (
    "segment_info sequence counts do not add up to the number of sequences"
)

# Split sequences based on which ship they belong to
train_indices = np.flatnonzero(np.isin(seq_mmsi, mmsi_train))
val_indices = np.flatnonzero(np.isin(seq_mmsi, mmsi_val))
test_indices = np.flatnonzero(np.isin(seq_mmsi, mmsi_test))

# Every sequence must land in exactly one of the three sets.
assert len(train_indices) + len(val_indices) + len(test_indices) == X_normalized.shape[0]

# Get the actual sequences for each set
X_train = X_normalized[train_indices]
y_train = y_normalized[train_indices]

X_val = X_normalized[val_indices]
y_val = y_normalized[val_indices]

X_test = X_normalized[test_indices]
y_test = y_normalized[test_indices]

print(f"\nSequences in training set: {X_train.shape[0]} ({X_train.shape[0]/X_normalized.shape[0]*100:.1f}%)")
print(f"Sequences in validation set: {X_val.shape[0]} ({X_val.shape[0]/X_normalized.shape[0]*100:.1f}%)")
print(f"Sequences in test set: {X_test.shape[0]} ({X_test.shape[0]/X_normalized.shape[0]*100:.1f}%)")

print("\n✓ Data split by ships - no temporal leakage between sets!")
