In [None]:
import pandas as pd
!pip install folium

from utils import create_mmsi_dict_from_file




In [None]:
file_name = "data/mmsi_type.txt"
mmsi_map = create_mmsi_dict_from_file(file_name)


if mmsi_map:
    print("--- Successfully created dictionary ---")

--- Successfully created dictionary ---


In [None]:
df = pd.read_csv("data/ais_combined.csv")
df.head()

Unnamed: 0,MMSI,SOG,COG,Longtitude,Latitude,Timestamp,Segment
0,200000000,6.739216,90.1,11.591552,56.123522,2025-02-27 00:00:05,0
1,200000000,6.739216,90.0,11.59177,56.123522,2025-02-27 00:00:06,0
2,200000000,6.739216,90.0,11.59177,56.123522,2025-02-27 00:00:07,0
3,200000000,6.739216,89.8,11.59253,56.123518,2025-02-27 00:00:13,0
4,200000000,6.739216,89.7,11.592637,56.123518,2025-02-27 00:00:14,0


In [4]:
df_with_types = df.copy()
df_with_types['Type'] = df_with_types['MMSI'].astype(str).map(mmsi_map)
df_with_types.head()

Unnamed: 0,MMSI,SOG,COG,Longtitude,Latitude,Timestamp,Segment,Type
0,200000000,6.739216,90.1,11.591552,56.123522,2025-02-27 00:00:05,0,Tanker
1,200000000,6.739216,90.0,11.59177,56.123522,2025-02-27 00:00:06,0,Tanker
2,200000000,6.739216,90.0,11.59177,56.123522,2025-02-27 00:00:07,0,Tanker
3,200000000,6.739216,89.8,11.59253,56.123518,2025-02-27 00:00:13,0,Tanker
4,200000000,6.739216,89.7,11.592637,56.123518,2025-02-27 00:00:14,0,Tanker


In [5]:
unique_mmsi = df['MMSI'].unique()
unique_types = df_with_types['Type'].unique()

print("Total unique MMSI count:", len(unique_mmsi))
print("Unique ship types in dataset:", unique_types)

Total unique MMSI count: 991
Unique ship types in dataset: ['Tanker' 'Unknown' 'Sailing vessel' 'Other type' 'Cargo ship'
 'Passenger ship' 'Tanker (HAZ-D)' 'Tanker (HAZ-A)' 'Cargo ship (HAZ-A)'
 'Tanker (HAZ-B)' 'Fishing vessel' 'Law enforcment' 'Pilot' 'Military ops'
 'Dredging or UW ops' 'Tug' 'Anti-polution' 'HSC' 'SAR'
 'Ship type not found' 'Pleasure craft' 'Towing vessel' 'Port tender'
 'Diving ops' 'Cargo ship (HAZ-D)' 'Passenger ship (HAZ-A)'
 'Other type (HAZ-A)' 'Other type (HAZ-B)' 'Cargo ship (HAZ-B)'
 'Tanker (HAZ-C)']


In [6]:
allowed_type = ['Cargo ship', 'Cargo ship (HAZ-A)', 'Cargo ship (HAZ-B)']
df_cargo = df_with_types[df_with_types['Type'].isin(allowed_type)]
df_cargo = df_cargo.drop(columns=["Type"], axis= 1)
df_cargo.head()

Unnamed: 0,MMSI,SOG,COG,Longtitude,Latitude,Timestamp,Segment
33012,209056000,5.504551,286.9,14.087317,54.67755,2025-02-27 00:00:08,0
33013,209056000,5.504551,287.0,14.08655,54.677682,2025-02-27 00:00:17,0
33014,209056000,5.504551,287.0,14.0857,54.677825,2025-02-27 00:00:27,0
33015,209056000,5.504551,287.1,14.084912,54.677972,2025-02-27 00:00:38,0
33016,209056000,5.504551,287.1,14.084072,54.678125,2025-02-27 00:00:47,0


In [7]:
import numpy as np
from utils import segment_and_renumber, haversine_m

GAP_BREAK_MIN = 10          # minutes to start a new segment
INTERP_LIMIT_MIN = 5        # interpolate gaps up to 5 minutes
MAX_DISTANCE_M = 3000       # ~97 knots
MAX_SOG_KNOTS = 40
OUTPUT_PATH = "data/ais_data_1min_clean.csv"
NUM_COLS = ["SOG", "COG", "Longtitude", "Latitude"]
# ---------------------------------------

# --- Load data
df_cargo = df_cargo.sort_values(["MMSI", "Timestamp"]).reset_index(drop=True)
# --- Segment first (sequential per MMSI)
# print(df_cargo)
print(df_cargo.dtypes)
df_cargo["Timestamp"] = pd.to_datetime(df_cargo["Timestamp"], errors="coerce")
df = segment_and_renumber(df_cargo, GAP_BREAK_MIN)

# --- Downsample & interpolate per segment
results = []
for (mmsi, seg), g in df.groupby(["MMSI", "Segment"], observed=True):
    g = g.set_index("Timestamp")

    # Downsample to 1-minute intervals (keep last)
    g1 = g.resample("1min").last()

    # Interpolate numeric columns for short gaps only
    g1[NUM_COLS] = g1[NUM_COLS].interpolate(
        method="time", limit=INTERP_LIMIT_MIN, limit_direction="both"
    )

    # Drop minutes still NaN (beyond real range or long gaps)
    g1 = g1.dropna(subset=NUM_COLS, how="all")

    # Fill identifiers
    g1["MMSI"] = mmsi
    g1["Segment"] = seg

    # --- Outlier guards ---
    lat = g1["Latitude"].to_numpy()
    lon = g1["Longtitude"].to_numpy()
    lat_prev, lon_prev = np.roll(lat, 1), np.roll(lon, 1)
    lat_prev[0], lon_prev[0] = lat[0], lon[0]

    g1["distance_m"] = haversine_m(lat, lon, lat_prev, lon_prev)
    g1.loc[g1.index[0], "distance_m"] = 0.0
    g1["speed_mps_track"] = g1["distance_m"] / 60.0

    # Filter unrealistic movement or SOG
    g1 = g1[(g1["distance_m"] < MAX_DISTANCE_M) & (g1["SOG"] <= MAX_SOG_KNOTS)]

    results.append(g1)


# --- Combine & save
df_clean = pd.concat(results).reset_index()
print("Before deleting", len(df_clean))
missing = df_clean[df_clean[["SOG", "COG", "Latitude", "Longtitude"]].isna().any(axis=1)]
unique_mmsi_total = missing["MMSI"].nunique()

mmsi_ids_missing = missing["MMSI"].unique()
print("MMSI IDs with missing data:")
print(mmsi_ids_missing)


print("Unique MMSI in full dataset:", unique_mmsi_total)
print(f"Missing numeric data rows: {len(missing)}")
# Removing rows with empty data approximately 6%
df_clean = df_clean.dropna(subset=["SOG", "COG", "Latitude", "Longtitude", "MMSI", "Segment"])
print("After deleting", len(df_clean))

print((df_clean.groupby(["MMSI","Segment"])["Timestamp"]
   .diff().dt.total_seconds().div(60)
   .max() > 5).any())

MMSI            int64
SOG           float64
COG           float64
Longtitude    float64
Latitude      float64
Timestamp      object
Segment         int64
dtype: object
Before deleting 200338
MMSI IDs with missing data:
[219003217 219004907 219009081 219011283]
Unique MMSI in full dataset: 4
Missing numeric data rows: 2933
After deleting 197405
True
