In [None]:
import pandas as pd
import numpy as np
from pyproj import Transformer


INPUT_CSV   = "ais_combined.csv"
OUTPUT_CSV  = "ais_clean_XY.csv"
CHUNKSIZE   = 300_000

# Set your custom (0,0) origin in lat/lon (degrees)
lat0, lon0  = 56.0, 10.0

# If Segment is missing, we will create it by splitting on gaps > MAX_GAP (per MMSI)
MAX_GAP = pd.Timedelta("15min")

# Column candidates to tolerate naming differences
LAT_CANDS = ["Latitude", "lat", "Lat"]
LON_CANDS = ["Longitude", "Longtitude", "lon", "Lon"]
MMSI_CANDS = ["MMSI", "mmsi"]
SOG_CANDS  = ["SOG", "sog"]
COG_CANDS  = ["COG", "cog"]
TS_CANDS   = ["Timestamp", "timestamp", "time", "DateTime", "datetime"]
SEG_CANDS  = ["Segment", "segment", "SegmentID", "segment_id"]


def find_col(cols, candidates, required=True):
    """Return the first name in *candidates* that is present in *cols*.

    Candidates are tried in the order given, so earlier entries take
    priority when several spellings exist in the data.

    Args:
        cols: Container of available column names (anything supporting ``in``).
        candidates: Acceptable spellings, in priority order.
        required: When true, a missing column is an error; otherwise ``None``
            is returned.

    Raises:
        ValueError: If no candidate is present and *required* is true.
    """
    matches = [name for name in candidates if name in cols]
    if matches:
        return matches[0]
    if not required:
        return None
    raise ValueError(f"None of {candidates} found in columns: {list(cols)}")

def ensure_timestamp_series(s):
    """Parse *s* into timezone-aware UTC timestamps.

    Values that cannot be parsed are coerced to ``NaT`` rather than raising;
    this function does not remove them, so the caller decides whether to
    drop those rows.
    """
    return pd.to_datetime(s, errors="coerce", utc=True)

def make_segments_if_missing(df, ts_col, mmsi_col, max_gap=None):
    """Number the track segments of each vessel by splitting on time gaps.

    Rows are ordered by vessel and time; within each vessel a new segment
    starts whenever the gap to the previous row is strictly greater than the
    threshold. Segment ids start at 0 for every vessel.

    Args:
        df: Frame holding at least *ts_col* and *mmsi_col*. It is not modified.
        ts_col: Name of the timestamp column (must already be datetime typed).
        mmsi_col: Name of the vessel identifier column.
        max_gap: Gap threshold, as anything ``pd.Timedelta`` accepts. ``None``
            (the default) uses the module-level ``MAX_GAP``.

    Returns:
        An int64 Series of segment ids. Its rows are in sorted
        (vessel, time) order but keep the original index labels, so assigning
        it to a column of *df* aligns correctly by index.
    """
    gap = MAX_GAP if max_gap is None else pd.Timedelta(max_gap)

    # Sort so that consecutive rows of one vessel are adjacent and in time order.
    ordered = df.sort_values([mmsi_col, ts_col])

    # Per-vessel time deltas; the first row of each vessel is NaT, which
    # compares False below and therefore never opens a new segment.
    deltas = ordered.groupby(mmsi_col)[ts_col].diff()
    starts_new_segment = deltas.gt(gap)

    # Running count of breaks within each vessel is the segment id.
    return starts_new_segment.groupby(ordered[mmsi_col]).cumsum().astype("int64")


# Projection from geographic lon/lat (EPSG:4326) to EPSG:3035; always_xy=True
# means coordinates are passed as (lon, lat). The projected custom origin is
# subtracted later so output coordinates are relative to (lat0, lon0).
to_laea = Transformer.from_crs("EPSG:4326", "EPSG:3035", always_xy=True)
X0, Y0 = to_laea.transform(lon0, lat0)

# Load only the header row so the actual column spellings can be detected
# without reading any data.
head = pd.read_csv(INPUT_CSV, nrows=0)
cols = head.columns

# Columns that must exist: a missing one raises ValueError.
lat_col, lon_col, mmsi_col, ts_col = (
    find_col(cols, cands)
    for cands in (LAT_CANDS, LON_CANDS, MMSI_CANDS, TS_CANDS)
)

# Columns that may be absent: these resolve to None when not found.
sog_col, cog_col, seg_col = (
    find_col(cols, cands, required=False)
    for cands in (SOG_CANDS, COG_CANDS, SEG_CANDS)
)

# True until the first chunk has been written (controls the CSV header).
first_write = True

# Stream the input in chunks: clean each chunk, project it to local X/Y
# metres, and write it to OUTPUT_CSV with a fixed column order.
#
# The detected column names (sog_col, cog_col, seg_col) are deliberately NOT
# reassigned inside the loop: they describe the input file, and every chunk
# has the same columns. Overwriting them after the first chunk would make
# later chunks look up a column that does not exist in the input.
for i, chunk in enumerate(pd.read_csv(INPUT_CSV, chunksize=CHUNKSIZE)):
    print(f"Processing chunk {i+1} with {len(chunk)} rows...")

    # Keep only rows with valid lat/lon
    chunk[lat_col] = pd.to_numeric(chunk[lat_col], errors="coerce")
    chunk[lon_col] = pd.to_numeric(chunk[lon_col], errors="coerce")
    chunk = chunk.dropna(subset=[lat_col, lon_col])

    # Timestamp handling: unparseable values become NaT and are dropped
    chunk[ts_col] = ensure_timestamp_series(chunk[ts_col])
    chunk = chunk.dropna(subset=[ts_col])

    # MMSI as string (keeps leading zeros if ever present)
    chunk[mmsi_col] = chunk[mmsi_col].astype(str)

    # SOG/COG: numeric when present in the input, otherwise all-NaN so the
    # output columns always exist.
    if sog_col is not None:
        sog = pd.to_numeric(chunk[sog_col], errors="coerce").to_numpy()
    else:
        sog = np.full(len(chunk), np.nan)

    if cog_col is not None:
        cog = pd.to_numeric(chunk[cog_col], errors="coerce").to_numpy()
    else:
        cog = np.full(len(chunk), np.nan)

    # Segment handling: keep existing, else create by gap rule
    if seg_col is None:
        # NOTE: segments are derived from this chunk alone, so ids restart in
        # every chunk and a track spanning a chunk boundary is not continued.
        segment = make_segments_if_missing(chunk, ts_col, mmsi_col)
        # The helper returns rows in sorted order; realign to chunk row order.
        segment = segment.reindex(chunk.index).to_numpy()
    else:
        # Cast to int64 where possible; values that cannot be cast are kept as-is
        segment = chunk[seg_col].astype("int64", errors="ignore").to_numpy()

    # PROJECT to meters (single CRS) and RECENTER to your origin
    X, Y = to_laea.transform(chunk[lon_col].values, chunk[lat_col].values)
    X_local = X - X0
    Y_local = Y - Y0

    # Assemble output with the exact column order you want
    out = pd.DataFrame({
        "MMSI":      chunk[mmsi_col].to_numpy(),
        "SOG":       sog,
        "COG":       cog,
        "X":         X_local,
        "Y":         Y_local,
        # ISO-like string with UTC offset
        "Timestamp": chunk[ts_col].dt.strftime("%Y-%m-%d %H:%M:%S%z").to_numpy(),
        "Segment":   segment,
    })

    # The first chunk truncates any file left over from a previous run and
    # writes the header; later chunks append without a header.
    out.to_csv(
        OUTPUT_CSV,
        mode="w" if first_write else "a",
        index=False,
        header=first_write,
        float_format="%.3f"
    )
    first_write = False

print(f" Done. Saved ordered columns to: {OUTPUT_CSV}")


Processing chunk 1 with 300000 rows...
Processing chunk 2 with 300000 rows...
Processing chunk 3 with 300000 rows...
Processing chunk 4 with 300000 rows...
Processing chunk 5 with 300000 rows...
Processing chunk 6 with 300000 rows...
Processing chunk 7 with 300000 rows...
Processing chunk 8 with 300000 rows...
Processing chunk 9 with 300000 rows...
Processing chunk 10 with 300000 rows...
Processing chunk 11 with 300000 rows...
Processing chunk 12 with 300000 rows...
Processing chunk 13 with 300000 rows...
Processing chunk 14 with 300000 rows...
Processing chunk 15 with 300000 rows...
Processing chunk 16 with 300000 rows...
Processing chunk 17 with 300000 rows...
Processing chunk 18 with 142595 rows...
✅ Done. Saved ordered columns to: ais_clean_XY.csv
