In [None]:
import pandas as pd


# Raw AIS export; the path looks like a Google Drive mount inside Colab,
# so adjust it when running elsewhere.
file_path = "/content/drive/MyDrive/F AIS.csv"
ais_df = pd.read_csv(file_path)


# Keep only the columns used by the rest of the notebook. The explicit
# .copy() makes gps_df an independent frame, so later cleaning steps do not
# raise SettingWithCopyWarning or depend on ais_df.
required_columns = ["MMSI", "BaseDateTime", "LAT", "LON", "SOG", "COG", "Heading"]
gps_df = ais_df[required_columns].copy()


print(gps_df.head())
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print() (that would emit a stray "None" line).
gps_df.info()


        MMSI         BaseDateTime       LAT        LON   SOG    COG  Heading
0  538008468  2020-01-01T00:00:00  38.25802  -76.29487  14.9  338.6    337.0
1  368120510  2020-01-01T00:00:00  27.58610  -82.75991   0.0   58.6    511.0
2  368063930  2020-01-01T00:00:00  40.71045  -73.97588  11.2  208.9    207.0
3  368106220  2020-01-01T00:00:00  38.53932  -90.25523   0.2  161.8     31.0
4  367336180  2020-01-01T00:00:00  56.02945 -132.68705   9.0  325.5    511.0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5401 entries, 0 to 5400
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MMSI          5401 non-null   int64  
 1   BaseDateTime  5401 non-null   object 
 2   LAT           5401 non-null   float64
 3   LON           5401 non-null   float64
 4   SOG           5401 non-null   float64
 5   COG           5401 non-null   float64
 6   Heading       5401 non-null   float64
dtypes: float64(5), int64(1), object(1)
memor

## Remove missing or invalid values:
SOG (Speed Over Ground)
0 – 30 knots (typical)
*****
COG (Course Over Ground)
0° – 360° (normal values)


In [None]:

# Drop rows with any missing value. Rebinding (instead of inplace=True) avoids
# mutating a frame that may be a slice of ais_df.
gps_df = gps_df.dropna()


# Keep only physically plausible reports. The mask is built from gps_df and
# must be applied to gps_df as well: indexing ais_df with it would bring back
# every original column and undo the dropna above.
# NOTE(review): COG is not range-checked here; add a filter if out-of-range
# courses should be removed too.
valid_rows = (
    (gps_df['SOG'] >= 0)
    & gps_df['LAT'].between(-90, 90)
    & gps_df['LON'].between(-180, 180)
)
gps_df = gps_df.loc[valid_rows].copy()


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  gps_df.dropna(inplace=True)


In [None]:

# A report is identified by vessel, timestamp and position. De-duplicating on
# that key (keeping the first occurrence) also removes every fully identical
# row, so a separate full-row drop_duplicates() pass is not needed.
# Rebinding instead of inplace=True keeps the cell safe to re-run and avoids
# mutating a frame that may be a slice of another one.
gps_df = gps_df.drop_duplicates(subset=['MMSI', 'BaseDateTime', 'LAT', 'LON'])


## Convert timestamp and sort

In [None]:

# Parse the timestamp strings into datetime64 and order every vessel's reports
# chronologically. assign() + sort_values() return new frames, so nothing is
# written into a possible slice and the cell can be re-run safely.
gps_df = (
    gps_df
    .assign(BaseDateTime=pd.to_datetime(gps_df['BaseDateTime']))
    .sort_values(by=['MMSI', 'BaseDateTime'])
)


In [None]:
# info() writes the summary to stdout and returns None; wrapping it in print()
# would append a stray "None" line to the output.
gps_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5401 entries, 3216 to 1079
Data columns (total 7 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   MMSI          5401 non-null   int64         
 1   BaseDateTime  5401 non-null   datetime64[ns]
 2   LAT           5401 non-null   float64       
 3   LON           5401 non-null   float64       
 4   SOG           5401 non-null   float64       
 5   COG           5401 non-null   float64       
 6   Heading       5401 non-null   float64       
dtypes: datetime64[ns](1), float64(5), int64(1)
memory usage: 337.6 KB
None


## Feature engineering

In [None]:
import numpy as np

# Haversine formula to calculate distance between two lat/lon points
def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in metres between two points.

    Parameters
    ----------
    lat1, lon1 : float or array-like
        Latitude and longitude of the first point, in decimal degrees.
    lat2, lon2 : float or array-like
        Latitude and longitude of the second point, in decimal degrees.

    Returns
    -------
    float or array-like
        Distance along a sphere of radius 6371 km. Scalars give a scalar;
        NumPy arrays / pandas Series are processed element-wise and NaN
        inputs produce NaN outputs.
    """
    R = 6371e3  # Earth radius in meters
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(lon2 - lon1)
    a = np.sin(delta_phi/2)**2 + np.cos(phi1)*np.cos(phi2)*np.sin(delta_lambda/2)**2
    # Rounding can push `a` a hair outside [0, 1] (e.g. for antipodal points),
    # which would make sqrt(1 - a) return NaN. Clamp it; NaN inputs stay NaN.
    a = np.minimum(1.0, np.maximum(0.0, a))
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c


# --- Per-vessel kinematic features -------------------------------------------
# Every difference, shift and rolling window below is evaluated inside one
# vessel's (MMSI) track, so values from two different vessels are never mixed.
HEADING_NOT_AVAILABLE = 511  # AIS sentinel meaning "true heading not available"
ROLLING_WINDOW = 5           # number of reports in each moving average

gps_df = gps_df.sort_values(["MMSI", "BaseDateTime"]).copy()
track_key = gps_df["MMSI"]

# Previous report of the same vessel (NaN/NaT on the first report of a track).
prev = gps_df.groupby("MMSI", sort=False)[["BaseDateTime", "LAT", "LON"]].shift()

# Seconds since the previous report; the first report of a track gets 1 s so
# that its speed evaluates to 0 instead of NaN.
gps_df["delta_time"] = (gps_df["BaseDateTime"] - prev["BaseDateTime"]).dt.total_seconds().fillna(1)
# Divisor that is NaN when two reports share a timestamp, so rates come out as
# NaN ("undefined") instead of +/-inf.
safe_dt = gps_df["delta_time"].where(gps_df["delta_time"] > 0)

gps_df["distance"] = haversine(prev["LAT"], prev["LON"], gps_df["LAT"], gps_df["LON"]).fillna(0)
gps_df["speed_calc"] = gps_df["distance"] / safe_dt  # m/s
# NaN on the first report of each track (no previous speed to compare with).
gps_df["acceleration"] = gps_df["speed_calc"].groupby(track_key).diff() / safe_dt

# Heading change, wrapped into [-180, 180) so that 359 -> 1 counts as +2 and
# not -358. Reports without a usable heading contribute a change of 0.
heading = gps_df["Heading"].where(gps_df["Heading"] != HEADING_NOT_AVAILABLE)
raw_turn = heading.groupby(track_key).diff()
gps_df["delta_heading"] = ((raw_turn + 180) % 360 - 180).fillna(0)

# Moving averages over the last ROLLING_WINDOW reports of the same vessel.
moving_averages = {
    "speed_ma": "speed_calc",
    "acceleration_ma": "acceleration",
    "delta_heading_ma": "delta_heading",
}
for ma_col, source_col in moving_averages.items():
    gps_df[ma_col] = gps_df.groupby("MMSI", sort=False)[source_col].transform(
        lambda track: track.rolling(ROLLING_WINDOW, min_periods=1).mean()
    )

# Constant-velocity dead reckoning: predict the current position from the
# previous position and the velocity observed over the *previous* interval.
# (Using the current interval's own displacement would reproduce the observed
# position exactly and make prediction_error identically zero.)
for axis in ("LAT", "LON"):
    velocity = (gps_df[axis] - prev[axis]) / safe_dt              # degrees per second
    prev_velocity = velocity.groupby(track_key).shift().fillna(0)  # unknown -> assume stationary
    predicted = prev[axis] + prev_velocity * gps_df["delta_time"]
    # No history on the first report of a track: fall back to the observed
    # position so the error is 0 rather than NaN.
    gps_df[f"{axis}_pred"] = predicted.fillna(gps_df[axis])

# Euclidean distance in degrees between predicted and observed position.
gps_df["prediction_error"] = np.sqrt((gps_df["LAT_pred"] - gps_df["LAT"])**2 + (gps_df["LON_pred"] - gps_df["LON"])**2)

features = [
    "MMSI", "BaseDateTime", "LAT", "LON", "SOG", "COG", "Heading",
    "speed_calc", "acceleration", "delta_heading",
    "speed_ma", "acceleration_ma", "delta_heading_ma",
    "LAT_pred", "LON_pred", "prediction_error"
]
df_final = gps_df[features]


df_final.to_csv("AIS_features_ready.csv", index=False)

print("Processed dataset ready for AI model!")
print(df_final.head())


Processed dataset ready for AI model!
         MMSI        BaseDateTime       LAT        LON  SOG    COG  Heading  \
3216   270995 2020-01-01 00:00:03  35.09744  -90.10038  0.0  245.1    511.0   
5326   507027 2020-01-01 00:00:08  37.00643  -76.25625  0.1  308.3    511.0   
374   1056261 2020-01-01 00:00:12  26.11822  -80.14813  0.0  248.4    511.0   
922   1193046 2020-01-01 00:00:08  32.68644 -117.13303  0.0    0.0    511.0   
336   3669999 2020-01-01 00:00:01  33.72340 -118.28068  0.1  258.5    511.0   

      speed_calc  acceleration  delta_heading  speed_ma  acceleration_ma  \
3216         0.0           NaN            0.0       0.0              NaN   
5326         0.0           NaN            0.0       0.0              NaN   
374          0.0           NaN            0.0       0.0              NaN   
922          0.0           NaN            0.0       0.0              NaN   
336          0.0           NaN            0.0       0.0              NaN   

      delta_heading_ma  LAT_pr