# Imports

In [14]:
from utils import (
    segment_and_renumber,
    haversine_m,
    create_mmsi_dict_from_file
)
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np


# Fetching Data

In [15]:
# Load the MMSI -> vessel type lookup used below to select vessel types.
file_name = "data/mmsi_type.txt"
mmsi_map = create_mmsi_dict_from_file(file_name)

# Fail fast: every later cell depends on this lookup, so an empty or missing
# mapping should stop the run here rather than surface later as an obscure
# error or an empty dataset.
if not mmsi_map:
    raise ValueError(f"No MMSI -> type mapping could be loaded from {file_name!r}")

print("--- Successfully created dictionary ---")

--- Successfully created dictionary ---


In [16]:
# Read the merged AIS records and keep only the vessel types listed below.
df = pd.read_csv("data/ais_combined_merged.csv")

allowed_type = [
    'Cargo ship',
    'Cargo ship (HAZ-A)',
    'Cargo ship (HAZ-B)',
    'Cargo ship (HAZ-D)',
    'Tanker',
    'Tanker (HAZ-A)',
    'Tanker (HAZ-B)',
    'Tanker (HAZ-C)',
    'Tanker (HAZ-D)',
]

# Look up each row's type from its MMSI (converted to str before the lookup)
# and filter with the resulting mask. This avoids duplicating the whole raw
# frame just to add and then drop a temporary "Type" column; MMSIs missing
# from the map yield NaN and are excluded by isin().
vessel_type = df['MMSI'].astype(str).map(mmsi_map)
df_cargo = df[vessel_type.isin(allowed_type)]

# Data preprocessing

In [17]:
# Configuration parameters for the preprocessing pipeline.
# Passed to segment_and_renumber for the initial segmentation
# (presumably a time-gap threshold in minutes — confirm in utils).
GAP_BREAK_MIN = 180
# Used as `limit=` in DataFrame.interpolate, i.e. a count of consecutive missing
# resampled rows to fill, not a duration in minutes despite the name.
INTERPOLATION_LIMIT_MIN = 3
# Upper bound on the haversine distance between consecutive resampled points.
MAX_DISTANCE_M = 3000 # meters
# NOTE(review): the name says knots, but the value is 40 kn converted to m/s
# (~20.58). It is compared directly against the SOG column during filtering;
# confirm which unit SOG uses in this dataset.
MAX_SOG_KNOTS = 40 * 0.514444  # Convert knots to m/s
# Numeric columns that get interpolated and null-checked ("Longtitude" matches
# the column spelling in the input data).
NUM_COLS = ["SOG", "COG", "Longtitude", "Latitude"]
# Minimum number of rows a segment must have to be kept.
MIN_SEGMENT_LENGTH = 35
# Resampling interval in minutes (used as f"{INTERVAL}min").
INTERVAL = 5

### Check Stats

In [18]:
# Parse timestamps BEFORE sorting so rows are ordered chronologically rather
# than by the lexical order of their raw representation (which is only
# chronological for ISO-like formats).
df_cargo = df_cargo.assign(
    Timestamp=pd.to_datetime(df_cargo["Timestamp"], errors="coerce")
)

# errors="coerce" turns unparseable values into NaT; surface them instead of
# letting them pass silently into segmentation.
n_unparsed = int(df_cargo["Timestamp"].isna().sum())
if n_unparsed:
    print(f"Warning: {n_unparsed} rows have a missing or unparseable Timestamp")

df_cargo = df_cargo.sort_values(["MMSI", "Timestamp"]).reset_index(drop=True)
print(f"Initial data shape: {df_cargo.shape}")
print(f"Data types:\n{df_cargo.dtypes}\n")

Initial data shape: (1737515, 7)
Data types:
MMSI                   int64
SOG                  float64
COG                  float64
Longtitude           float64
Latitude             float64
Timestamp     datetime64[ns]
Segment                int64
dtype: object



### Segmentation

In [19]:
# Assign segments with the utils helper, using GAP_BREAK_MIN as the break
# threshold (presumably minutes between consecutive points — confirm in utils).
# Note: this rebinds `df`, which previously held the raw CSV contents.
df = segment_and_renumber(df_cargo, GAP_BREAK_MIN)

### Downsampling & interpolation

In [20]:
def _unwrap_degrees(angles):
    """Return a float copy of ``angles`` (degrees) with 360-degree jumps removed.

    Each value is shifted by a whole multiple of 360 so that consecutive steps
    follow the shortest arc, e.g. ``[350, 10]`` becomes ``[350, 370]``. Values
    without a wrap between them are left untouched.
    """
    unwrapped = np.array(angles, dtype=float)
    if unwrapped.size > 1:
        full_turns = np.round(np.diff(unwrapped) / 360.0)
        unwrapped[1:] -= 360.0 * np.cumsum(full_turns)
    return unwrapped


results = []

for (mmsi, seg), g in df.groupby(["MMSI", "Segment"]):
    # Put the segment on a regular INTERVAL-minute grid, keeping the last row
    # in each bin; bins without any row become all-NaN rows.
    g1 = g.set_index("Timestamp").resample(f"{INTERVAL}min").last()

    # COG is circular (it is reduced modulo 360 below). Interpolating the raw
    # values across the 0/360 boundary would sweep through ~180 degrees, so
    # make the observed values continuous first.
    cog_observed = g1["COG"].copy()
    has_cog = cog_observed.notna()
    if has_cog.any():
        g1.loc[has_cog, "COG"] = _unwrap_degrees(cog_observed[has_cog].to_numpy())

    # Cubic spline interpolation needs enough known points in every numeric
    # column; otherwise fall back to linear interpolation.
    min_points = g1[NUM_COLS].notnull().sum().min()
    if min_points >= 4:
        # Spline interpolation requires a numeric index: seconds since the
        # first bin of this segment.
        g1_numeric_idx = g1.copy()
        g1_numeric_idx.index = (g1_numeric_idx.index - g1_numeric_idx.index[0]).total_seconds()

        g1_numeric_idx[NUM_COLS] = g1_numeric_idx[NUM_COLS].interpolate(
            method="spline", order=3, limit=INTERPOLATION_LIMIT_MIN, limit_direction="both"
        )

        g1[NUM_COLS] = g1_numeric_idx[NUM_COLS].values
    else:
        g1[NUM_COLS] = g1[NUM_COLS].interpolate(
            method="linear", limit=INTERPOLATION_LIMIT_MIN, limit_direction="both"
        )

    # Put the original observations back so only the filled rows carry
    # unwrapped values; the modulo below then maps everything into [0, 360).
    g1["COG"] = g1["COG"].where(~has_cog, cog_observed)

    # Rows that could not be filled within the interpolation limit are dropped.
    # The explicit copy keeps the column assignments below from acting on a
    # view of the resampled frame.
    g1 = g1.dropna(subset=NUM_COLS, how="any").copy()

    if len(g1) < 1:
        continue

    g1["COG"] = g1["COG"] % 360

    # The key columns were resampled along with the data (NaN in empty bins);
    # restore them from the group key.
    g1["MMSI"] = mmsi
    g1["Segment"] = seg

    # Distance from each point to its predecessor. np.roll wraps the last
    # point around to position 0, so the first distance is overwritten with 0.
    lat = g1["Latitude"].to_numpy()
    lon = g1["Longtitude"].to_numpy()
    lat_prev = np.roll(lat, 1)
    lon_prev = np.roll(lon, 1)

    g1["distance_m"] = haversine_m(lat, lon, lat_prev, lon_prev)
    g1.loc[g1.index[0], "distance_m"] = 0.0

    # NOTE(review): MAX_SOG_KNOTS is defined as a knots-to-m/s converted value;
    # confirm that SOG is expressed in the same unit.
    g1 = g1[(g1["distance_m"] <= MAX_DISTANCE_M) & (g1["SOG"] <= MAX_SOG_KNOTS)]

    results.append(g1)

# pd.concat raises an uninformative error on an empty list; say what happened.
if not results:
    raise ValueError("No segment produced any rows after resampling and filtering")

df_clean = pd.concat(results).reset_index()
print(f"Cleaned data shape: {df_clean.shape}")

Cleaned data shape: (70343, 8)


### Data quality check

In [21]:
# Report how many rows still have a gap in any of the numeric columns.
total_rows = len(df_clean)
print(f"Rows before cleaning: {total_rows}")

has_missing_value = df_clean[NUM_COLS].isna().any(axis=1)
missing = df_clean.loc[has_missing_value]
n_missing = len(missing)
print(f"Rows with missing numeric data: {n_missing} ({n_missing/total_rows*100:.2f}%)")
print(f"MMSI with missing data: {missing['MMSI'].nunique()}")

Rows before cleaning: 70343
Rows with missing numeric data: 0 (0.00%)
MMSI with missing data: 0


In [22]:
# Drop rows lacking any numeric value or either of the segment key columns.
required_cols = [*NUM_COLS, "MMSI", "Segment"]
df_clean = df_clean.dropna(subset=required_cols)
print(f"Rows after cleaning: {len(df_clean)}")

Rows after cleaning: 70343


### Re-segmentation

In [23]:
print(f"\nRe-segmenting based on gaps > {INTERVAL} minutes")
df_clean = df_clean.sort_values(["MMSI", "Segment", "Timestamp"]).reset_index(drop=True)
# Use INTERVAL rather than a literal 5 so the threshold always matches both the
# resampling grid and the message printed above.
df_clean = segment_and_renumber(df_clean, GAP_BREAK_MIN=INTERVAL)

# Sanity check: largest gap (in minutes) between consecutive rows of a segment.
max_gap_after = (
    df_clean.groupby(["MMSI", "Segment"])["Timestamp"]
    .diff()
    .dt.total_seconds()
    .div(60)
    .max()
)
print(f"Maximum time gap after re-segmentation: {max_gap_after:.2f} minutes")


Re-segmenting based on gaps > 5 minutes
Maximum time gap after re-segmentation: 5.00 minutes


### Segment length filtering

In [24]:
segment_keys = ["MMSI", "Segment"]

print(f"\nFiltering segments with < {MIN_SEGMENT_LENGTH} points")
print(f"Segments before filtering: {df_clean.groupby(segment_keys).ngroups}")
print(f"Rows before filtering: {len(df_clean)}")

# Count rows per (MMSI, Segment) and keep the keys that reach the minimum.
segment_sizes = df_clean.groupby(segment_keys).size()
long_enough = segment_sizes >= MIN_SEGMENT_LENGTH
valid_segments = segment_sizes[long_enough].index

# Select the surviving segments through a MultiIndex lookup, then turn the
# key levels back into ordinary columns.
indexed_by_segment = df_clean.set_index(segment_keys)
df_clean = indexed_by_segment.loc[valid_segments].reset_index()

print(f"Segments after filtering: {df_clean.groupby(segment_keys).ngroups}")
print(f"Rows after filtering: {len(df_clean)}")


Filtering segments with < 35 points
Segments before filtering: 873
Rows before filtering: 70343
Segments after filtering: 388
Rows after filtering: 67102


### Dataset summary

In [25]:
# Group once and reuse it for both segment statistics.
segments = df_clean.groupby(['MMSI', 'Segment'])
points_per_segment = segments.size()

print(f"Total rows: {len(df_clean)}")
print(f"Unique vessels (MMSI): {df_clean['MMSI'].nunique()}")
print(f"Total segments: {segments.ngroups}")
# NOTE(review): this is mean point count times INTERVAL; the elapsed time
# between a segment's first and last point would be one interval shorter.
print(f"Average segment length: {points_per_segment.mean()*INTERVAL:.1f} minutes")
print(f"Columns: {list(df_clean.columns)}")

Total rows: 67102
Unique vessels (MMSI): 272
Total segments: 388
Average segment length: 864.7 minutes
Columns: ['MMSI', 'Segment', 'Timestamp', 'SOG', 'COG', 'Longtitude', 'Latitude', 'distance_m']


### Save cleaned data

In [26]:
# Derive the file name from INTERVAL so the name cannot disagree with the
# resampling interval actually used (identical to the old path for INTERVAL = 5).
OUTPUT_PATH = f"data/ais_data_{INTERVAL}min_clean.csv"
df_clean.to_csv(OUTPUT_PATH, index=False)
print(f"\nCleaned data saved to: {OUTPUT_PATH}")


Cleaned data saved to: data/ais_data_5min_clean.csv
