In [4]:
# Import required libraries
import pandas as pd
import logging
import sys
from typing import Tuple, Dict
from colorlog import ColoredFormatter
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Input, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
import numpy as np
import tensorflow as tf
from tqdm import tqdm

In [5]:
# Configure the colorful logger
def setup_logger() -> logging.Logger:
    """Set up a colorful logger for the pipeline.
    
    Returns:
        Configured logger instance.
    """
    logger = logging.getLogger("ML_Pipeline")
    logger.setLevel(logging.INFO)
    
    # Define log colors for different levels
    formatter = ColoredFormatter(
        "%(log_color)s%(levelname)-8s%(reset)s | %(log_color)s%(message)s%(reset)s",
        datefmt=None,
        log_colors={
            'DEBUG':    'cyan',
            'INFO':     'white',
            'WARNING':  'yellow',
            'ERROR':    'red',
            'CRITICAL': 'bold_red',
        }
    )
    
    # Stream handler for console output
    handler = logging.StreamHandler(sys.stdout)
    handler.setLevel(logging.INFO)
    handler.setFormatter(formatter)
    
    logger.addHandler(handler)
    
    return logger

# Initialize the logger
logger = setup_logger()

In [6]:
def load_data() -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Load the AIS and auxiliary datasets.

    Returns:
        A tuple containing the loaded DataFrames:
        (ais_train, ais_test, vessels, ports, schedules).
    """
    logger.info("Loading datasets.")
    ais_train = pd.read_csv('ais_train.csv', sep='|')
    ais_test = pd.read_csv('ais_test.csv')
    vessels = pd.read_csv('vessels.csv', sep='|')
    ports = pd.read_csv('ports.csv', sep='|')
    schedules = pd.read_csv('schedules_to_may_2024.csv', sep='|', on_bad_lines='skip')
    logger.info("Datasets loaded successfully.")
    return ais_train, ais_test, vessels, ports, schedules

# Load the data
ais_train, ais_test, vessels, ports, schedules = load_data()

[37mINFO    [0m | [37mLoading datasets.[0m
[37mINFO    [0m | [37mLoading datasets.[0m
[37mINFO    [0m | [37mLoading datasets.[0m
[37mINFO    [0m | [37mDatasets loaded successfully.[0m
[37mINFO    [0m | [37mDatasets loaded successfully.[0m
[37mINFO    [0m | [37mDatasets loaded successfully.[0m


In [14]:


def prepare_data(
    ais_train: pd.DataFrame,
    vessels: pd.DataFrame, 
    ports: pd.DataFrame, 
    schedules: pd.DataFrame,
    vessel_ids: Dict
) -> Tuple[np.ndarray, np.ndarray, MinMaxScaler, MinMaxScaler]:
    """Prepare the data for the LSTM model, including additional time-based, vessel-specific, and port proximity features.
    
    Args:
        ais_train: DataFrame containing AIS training data.
        
    Returns:
        Tuple containing the feature array (X), target array (y), and the fitted scaler.
    """
    logger.info("Preparing data for the model.")
    
    # Convert the 'time' column to datetime format for feature extraction
    ais_train['time'] = pd.to_datetime(ais_train['time'])
    for col in ['latitude', 'longitude', 'cog', 'sog', 'rot', 'heading', 'etaRaw']:
        ais_train[col] = pd.to_numeric(ais_train[col], errors='coerce')

    # Clip latitude and longitude to their valid ranges
    ais_train['latitude'] = ais_train['latitude'].clip(-90, 90)
    ais_train['longitude'] = ais_train['longitude'].clip(-180, 180)

    # List to store the interpolated data points
    interpolated_data = []

    # Loop through each vessel ID
    for vessel_id in tqdm(vessel_ids.keys(), desc="Interpolating Missing Days For Vessels", unit="vessel"):
        # Filter and sort the data for the current vessel ID
        vessel_data = ais_train[ais_train['vesselId'] == vessel_id].sort_values(by='time').reset_index(drop=True)

        # Proceed only if vessel_data is not empty
        if len(vessel_data) > 0:
            # Loop through the sorted data to check time differences
            for i in range(len(vessel_data) - 1):
                current_row = vessel_data.iloc[i]
                next_row = vessel_data.iloc[i + 1]

                time_difference = next_row['time'] - current_row['time']
                
                # Add the current row to the interpolated data list
                interpolated_data.append(current_row)

                # Check if the time difference is greater than 1 day
                if time_difference > pd.Timedelta(days=1):
                    # Calculate the number of missing days
                    num_missing_days = (time_difference.days - 1)

                    # Linearly interpolate values for each missing day
                    for day in range(1, num_missing_days + 1):
                        interpolated_time = current_row['time'] + pd.Timedelta(days=day)
                        
                        # Interpolate all relevant columns
                        interpolated_values = {}
                        for col in ['latitude', 'longitude']:
                            value_diff = (next_row[col] - current_row[col]) / (num_missing_days + 1)
                            interpolated_values[col] = current_row[col] + value_diff * day
                        
                        # Create a new interpolated data point
                        interpolated_point = current_row.copy()
                        interpolated_point['time'] = interpolated_time
                        interpolated_point['latitude'] = interpolated_values['latitude']
                        interpolated_point['longitude'] = interpolated_values['longitude']
                        interpolated_point['vesselId'] = current_row['vesselId']

                        # Add the interpolated point to the list
                        interpolated_data.append(interpolated_point)

            # Add the last row to the interpolated data list
            interpolated_data.append(vessel_data.iloc[-1])
    
    # Convert the list of interpolated data back into a DataFrame
    interpolated_df = pd.DataFrame(interpolated_data)

    # Combine the interpolated data with the original ais_train DataFrame
    combined_df = pd.concat([ais_train, interpolated_df]).drop_duplicates().sort_values(by=['vesselId', 'time']).reset_index(drop=True)
    
    ais_train = combined_df

    # Calculate the time elapsed since the first recorded entry for each vessel
    ais_train['time_elapsed'] = (ais_train['time'] - ais_train['time'].min()).dt.total_seconds()
   
    # Map vesselId to its encoded value using vessel_ids dictionary
    ais_train['vesselId_encoded'] = ais_train['vesselId'].map(vessel_ids).fillna(-1).astype(int)

    # Extract the relevant features, including the new ones
    features = ais_train[['vesselId_encoded', 'time_elapsed']].values
    target = ais_train[['latitude', 'longitude']].shift(-1).ffill().values

    # Normalize features
    feature_scaler = MinMaxScaler()
    features_scaled = feature_scaler.fit_transform(features)

    
    y = ais_train[['latitude', 'longitude']].shift(-1).ffill().values

    # Reshape for LSTM input: (samples, timesteps, features)
    X = features_scaled.reshape((features_scaled.shape[0], 1, features_scaled.shape[1]))
    
    # Normalize target data (latitude and longitude)
    target_scaler = MinMaxScaler()
    y = target_scaler.fit_transform(target)    
    
    logger.info("Data preparation complete.")
    return X, y, feature_scaler, target_scaler


# Prepare the data
vessel_id_dict = {row["vesselId"]: i for i, row in vessels.iterrows()}
X_train, y_train, feature_scaler, target_scaler = prepare_data(ais_train, vessels, ports, schedules, vessel_id_dict)

[37mINFO    [0m | [37mPreparing data for the model.[0m
[37mINFO    [0m | [37mPreparing data for the model.[0m
[37mINFO    [0m | [37mPreparing data for the model.[0m


Interpolating Missing Days For Vessels: 100%|██████████████████████████| 711/711 [02:05<00:00,  5.67vessel/s]


[37mINFO    [0m | [37mData preparation complete.[0m
[37mINFO    [0m | [37mData preparation complete.[0m
[37mINFO    [0m | [37mData preparation complete.[0m


In [15]:

def geodesic_loss(y_true, y_pred):
    """Calculate the Haversine distance between true and predicted coordinates.
    
    Args:
        y_true: Tensor of true coordinates (latitude, longitude).
        y_pred: Tensor of predicted coordinates (latitude, longitude).
    
    Returns:
        Tensor representing the geodesic distance (Haversine distance) between the true and predicted points.
    """
    # Radius of the Earth in kilometers
    R = 6371.0
    
    # Convert latitude and longitude from degrees to radians
    y_true = tf.cast(y_true, dtype=tf.float32)
    y_pred = tf.cast(y_pred, dtype=tf.float32)
    
    # Split the latitude and longitude into separate tensors
    lat_true, lon_true = tf.split(y_true, num_or_size_splits=2, axis=1)
    lat_pred, lon_pred = tf.split(y_pred, num_or_size_splits=2, axis=1)
    
    # Convert degrees to radians manually
    lat_true = lat_true * tf.constant(np.pi / 180.0)
    lon_true = lon_true * tf.constant(np.pi / 180.0)
    lat_pred = lat_pred * tf.constant(np.pi / 180.0)
    lon_pred = lon_pred * tf.constant(np.pi / 180.0)
    
    # Compute the differences between true and predicted coordinates
    dlat = lat_pred - lat_true
    dlon = lon_pred - lon_true
    
    # Haversine formula
    a = tf.square(tf.sin(dlat / 2)) + tf.cos(lat_true) * tf.cos(lat_pred) * tf.square(tf.sin(dlon / 2))
    c = 2 * tf.atan2(tf.sqrt(a), tf.sqrt(1 - a))
    distance = R * c
    
    return tf.reduce_mean(distance)


In [16]:
def build_model(input_shape: Tuple[int, int]) -> Sequential:
    """Build the LSTM model.
    
    Args:
        input_shape: Shape of the input data (timesteps, features).
        
    Returns:
        Compiled LSTM model.
    """
    logger.info("Building the LSTM model.")
 
    model = Sequential()
    model.add(Input(shape=input_shape))  # Use Input layer to specify the shape
    model.add(LSTM(units=50, return_sequences=True))
    model.add(Dropout(0.2))
    model.add(LSTM(units=50))
    model.add(Dropout(0.2))
    model.add(Dense(units=2))  # Output: latitude and longitude
    model.compile(optimizer='adam', loss=geodesic_loss)
    logger.info("Model built successfully.")
    return model


# Build the model
model = build_model(input_shape=(X_train.shape[1], X_train.shape[2]))

[37mINFO    [0m | [37mBuilding the LSTM model.[0m
[37mINFO    [0m | [37mBuilding the LSTM model.[0m
[37mINFO    [0m | [37mBuilding the LSTM model.[0m
[37mINFO    [0m | [37mModel built successfully.[0m
[37mINFO    [0m | [37mModel built successfully.[0m
[37mINFO    [0m | [37mModel built successfully.[0m


In [17]:
def train_model(model: Sequential, X_train: np.ndarray, y_train: np.ndarray) -> None:
    """Train the LSTM model.
    
    Args:
        model: The LSTM model to train.
        X_train: Training features.
        y_train: Training targets.
    """
    logger.info("Starting model training.")
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, min_lr=1e-6)

    model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2,callbacks=[early_stopping, reduce_lr])
    logger.info("Model training complete.")

# Train the model
train_model(model, X_train, y_train)

[37mINFO    [0m | [37mStarting model training.[0m
[37mINFO    [0m | [37mStarting model training.[0m
[37mINFO    [0m | [37mStarting model training.[0m
Epoch 1/10
[1m38965/38965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 934us/step - loss: 24.9886 - val_loss: 35.4740 - learning_rate: 0.0010
Epoch 2/10
[1m38965/38965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 975us/step - loss: 23.6152 - val_loss: 19.8003 - learning_rate: 0.0010
Epoch 3/10
[1m38965/38965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 1ms/step - loss: 23.5554 - val_loss: 25.7895 - learning_rate: 0.0010
Epoch 4/10
[1m38965/38965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 1ms/step - loss: 23.5387 - val_loss: 27.1325 - learning_rate: 0.0010
Epoch 5/10
[1m38965/38965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 1ms/step - loss: 23.4886 - val_loss: 27.8399 - learning_rate: 0.0010
Epoch 6/10
[1m38965/38965[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[

In [18]:
def generate_submission(model: Sequential, ais_test: pd.DataFrame, feature_scaler: MinMaxScaler, target_scaler: MinMaxScaler, vessel_ids: Dict) -> None:
    """Generate a submission file with the predicted vessel positions.
    
    Args:
        model: Trained LSTM model.
        ais_test: DataFrame containing AIS test data.
        feature_scaler: Scaler used to normalize the features.
        target_scaler: Scaler used to normalize the target coordinates.
        vessel_ids: Dictionary to map vessel IDs to numerical indices.
    """
    logger.info("Generating predictions for the test set.")
    
    # Convert the 'time' column to datetime format to handle arithmetic operations
    ais_test['time'] = pd.to_datetime(ais_test['time'], errors='coerce')
    
    # Map vesselId to its encoded value using vessel_ids dictionary
    ais_test['vesselId_encoded'] = ais_test['vesselId'].map(vessel_ids).fillna(-1).astype(int)
   
    # Calculate the time elapsed since the first recorded entry for each vessel
    ais_test['time_elapsed'] = (ais_test['time'] - ais_test['time'].min()).dt.total_seconds()

    # Extract the relevant features for the test data
    test_features = ais_test[['vesselId_encoded', 'time_elapsed']].values

    # Since the test data only has one feature, we need to adjust the input shape to match the model's expectation
    num_train_features = feature_scaler.n_features_in_
    test_features_padded = np.zeros((test_features.shape[0], num_train_features))
    test_features_padded[:, :test_features.shape[1]] = test_features

    # Normalize the padded test features to match the training data scale
    test_features_scaled = feature_scaler.transform(test_features_padded)
    X_test = test_features_scaled.reshape((test_features_scaled.shape[0], 1, test_features_scaled.shape[1]))
    
    # Make predictions using the model
    predictions = model.predict(X_test)
    
    # Inverse transform the predictions using the target scaler
    predictions = target_scaler.inverse_transform(predictions)
    
    # Create the submission DataFrame in the required format
    submission = pd.DataFrame({
        'ID': ais_test['ID'],
        'longitude_predicted': predictions[:, 1],
        'latitude_predicted': predictions[:, 0]
    })
   
    # Ensure that the submission file has exactly 51739 rows as required
    assert submission.shape[0] == 51739, "The submission file must have exactly 51739 rows."
    
    # Save the predictions to submission.csv
    submission.to_csv('submission.csv', index=False)
    logger.info("Submission file saved as submission.csv.")


# Generate the submission file
generate_submission(model, ais_test, feature_scaler, target_scaler, vessel_id_dict)

[37mINFO    [0m | [37mGenerating predictions for the test set.[0m
[37mINFO    [0m | [37mGenerating predictions for the test set.[0m
[37mINFO    [0m | [37mGenerating predictions for the test set.[0m
[1m1617/1617[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 438us/step
[37mINFO    [0m | [37mSubmission file saved as submission.csv.[0m
[37mINFO    [0m | [37mSubmission file saved as submission.csv.[0m
[37mINFO    [0m | [37mSubmission file saved as submission.csv.[0m


In [19]:
# Load the CSV file
df = pd.read_csv('submission.csv')

# Calculate the maximum and minimum values for longitude_predicted and latitude_predicted
longitude_max = df['longitude_predicted'].max()
longitude_min = df['longitude_predicted'].min()
latitude_max = df['latitude_predicted'].max()
latitude_min = df['latitude_predicted'].min()

# Print the results
print(f"Maximum Longitude Predicted: {longitude_max}")
print(f"Minimum Longitude Predicted: {longitude_min}")
print(f"Maximum Latitude Predicted: {latitude_max}")
print(f"Minimum Latitude Predicted: {latitude_min}")

Maximum Longitude Predicted: 28.37741
Minimum Longitude Predicted: 3.8418357
Maximum Latitude Predicted: 56.508556
Minimum Latitude Predicted: 38.977943
