In [95]:
### IMPORTS AND GLOBAL VARIABLES

import pandas as pd
import folium
from folium.plugins import AntPath, MousePosition
import numpy as np
import pandas as pd
from geopy.distance import geodesic
from sklearn.cluster import DBSCAN
from geopy.distance import great_circle
from geopy.point import Point
from sklearn.metrics import pairwise_distances
import random
import datetime
from itertools import combinations
from pyproj import Geod
from math import radians, cos, sin, asin, sqrt
import matplotlib.pyplot as plt
import seaborn as sns
import os
from tqdm import tqdm
from folium.plugins import HeatMap
# Bounding box used by the cleaning step to keep only positions inside the study area:
# latitude limits first, then longitude limits.
MIN_LAT, MAX_LAT = 54.00, 56.00
MIN_LONG, MAX_LONG = 12.00, 15.00

In [42]:
# Load one cleaned daily file into the working frame `df`.
df = pd.read_csv(
    filepath_or_buffer=r'D:\Python\master thesis\data\cleaned_aisdk-2024-09-14.csv',
)

In [67]:
### FUNCTIONS

def draw_ship(mmsi,df):
    """
    Draw the track of a single vessel on a folium map and save it to 'map.html'.

    The track is drawn in the row order of `df`; sort the frame by timestamp
    beforehand if the rows are not already chronological.

    Parameters
    ----------
    mmsi : value compared against the 'MMSI' column to select the vessel.
    df : pandas.DataFrame with 'MMSI', 'Latitude' and 'Longitude' columns.

    Raises
    ------
    ValueError
        If `df` contains no rows for `mmsi`.
    """
    df_mmsi = df[df['MMSI'] == mmsi]
    if df_mmsi.empty:
        # Fail early with a clear message instead of an unrelated error on the empty path below.
        raise ValueError(f"No rows found for MMSI {mmsi}")

    # Column-wise extraction avoids the per-row overhead of iterrows().
    path = list(zip(df_mmsi['Latitude'], df_mmsi['Longitude']))

    # Centre the map on the average position of the track.
    map_object = folium.Map(location=[np.average(df_mmsi['Latitude']),np.average(df_mmsi['Longitude'])],zoom_start=8)
    AntPath(path,delay=400,weight=3,dash_array=[30,15]).add_to(map_object)

    # Mark the first and last positions of the track.
    folium.CircleMarker(location=path[0],radius=10,fill=True,fill_opacity=0.6,popup="start" + str(mmsi)).add_to(map_object)
    folium.CircleMarker(location=path[-1],radius=10,fill=True,fill_opacity=0.6,popup="end" + str(mmsi)).add_to(map_object)

    MousePosition().add_to(map_object)
    map_object.save('map.html')

def draw_bounding_box(lats, longs):
    """
    Draw a red rectangle on a folium map and save it to 'map.html'.

    Parameters
    ----------
    lats : indexable holding the two latitude limits (elements 0 and 1 are used).
    longs : indexable holding the two longitude limits (elements 0 and 1 are used).
    """
    # The map is centred on the average of the supplied limits.
    centre = [np.average(lats), np.average(longs)]
    box_map = folium.Map(location=centre, zoom_start=8)

    lat_a, lat_b = lats[0], lats[1]
    lon_a, lon_b = longs[0], longs[1]
    corners = [(lat_a, lon_a), (lat_a, lon_b), (lat_b, lon_b), (lat_b, lon_a)]
    # Repeat the first corner so the outline is closed.
    outline = corners + corners[:1]

    outline_layer = AntPath(outline, delay=1000000, weight=3, color='red', dash_array=[0, 0])
    outline_layer.add_to(box_map)
    MousePosition().add_to(box_map)
    box_map.save('map.html')

# def haversine_distance(point1, point2):
#     return great_circle(Point(point1), Point(point2)).kilometers

def haversine(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance in kilometers between two points
    on the earth (specified in decimal degrees).

    Parameters
    ----------
    lon1, lat1 : longitude and latitude of the first point.
    lon2, lat2 : longitude and latitude of the second point.

    Returns
    -------
    float
        Distance in kilometers on a sphere of radius 6371 km. NaN is returned
        when any input is NaN.
    """
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    root = sqrt(a)
    # Round-off can push the value marginally above 1 for (near-)antipodal points,
    # which would make asin() raise a domain error. A plain comparison is used
    # instead of min() so that NaN still propagates to the result.
    if root > 1.0:
        root = 1.0
    c = 2 * asin(root)
    r = 6371 # Radius of earth in kilometers. Use 3956 for miles. Determines return value units.
    return c * r

def draw_ships(number_of_ships,df,map_object=None):
    """
    Draw the tracks of randomly chosen vessels on a folium map and save it to 'map.html'.

    Parameters
    ----------
    number_of_ships : int
        How many distinct vessels to draw. If it exceeds the number of distinct
        MMSIs in `df`, every vessel is drawn.
    df : pandas.DataFrame with 'MMSI', 'Latitude' and 'Longitude' columns.
    map_object : folium.Map, optional
        Existing map to draw on. When omitted a new map is created; a supplied
        map is modified in place.
    """
    all_mmsis = df['MMSI'].unique().tolist()
    if map_object is None:
        map_object = folium.Map(location=[54.8292,12.9694],zoom_start=8)

    # Sample without replacement in one call instead of retrying random picks
    # until enough distinct vessels have been collected.
    sample_size = max(0, min(number_of_ships, len(all_mmsis)))
    selected_mmsis = random.sample(all_mmsis, sample_size)

    for i in tqdm(selected_mmsis):
        df_mmsi = df[df['MMSI'] == i]
        # Column-wise extraction avoids the per-row overhead of iterrows().
        path = list(zip(df_mmsi['Latitude'], df_mmsi['Longitude']))
        # Every track is drawn as a thick, paused blue line.
        AntPath(path,delay=1,weight=20,color='blue',dash_array=[1000,1],paused=True).add_to(map_object)
    MousePosition().add_to(map_object)
    map_object.save('map.html')

def static_value_clean(df):
    """
    Replace each vessel's static attributes with that vessel's most frequent value.

    For 'Ship type', 'Width' and 'Length' the most frequent value per MMSI is
    determined (missing values are counted as a value of their own) and written
    to every row of that MMSI.

    Parameters
    ----------
    df : pandas.DataFrame with 'MMSI', 'Ship type', 'Width' and 'Length' columns.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index; the input frame is not modified.
    """
    # Work on a re-indexed copy so the caller's frame stays untouched.
    cleaned = df.reset_index(drop=True)
    for column in ['Ship type','Width','Length']:
        most_frequent = (cleaned
                         .groupby('MMSI')[column]
                         .agg(lambda values: values.value_counts(dropna=False).idxmax()))
        # Every row receives its vessel's most frequent value. Mapping through the
        # MMSI avoids a temporary merge column and a slow row-by-row apply.
        cleaned[column] = cleaned['MMSI'].map(most_frequent)
    return cleaned

In [None]:
### MASS CLEANING
# Clean every raw file in the data folder and store the result next to it as 'cleaned_<file>'.
# Raw strings keep the Windows backslashes literal (no invalid-escape warnings).
files_list = [i for i in os.listdir(r'D:\Python\master thesis\master-thesis')]
path = r'D:\Python\master thesis\data'
# Cleaned files are written to `path`, so that folder has to be checked too; otherwise
# files cleaned in an earlier run are processed again on every run.
already_cleaned = set(files_list) | set(os.listdir(path))
for file in tqdm(os.listdir(path)):
    if 'cleaned' not in file and f'cleaned_{file}' not in already_cleaned:
        print(file)
        file_path = os.path.join(path,file)
        df = pd.read_csv(file_path)
        # Keep only positions strictly inside the bounding box.
        df = df[(df['Latitude']<MAX_LAT)&(df['Latitude']>MIN_LAT)&(df['Longitude']<MAX_LONG)&(df['Longitude']>MIN_LONG)]
        df = df[df['Ship type'] == 'Cargo']
        # NOTE(review): no explicit format/dayfirst is given; if the raw timestamps are
        # day-first (dd/mm/yyyy), ambiguous dates could be parsed month-first - confirm.
        df['# Timestamp'] = pd.to_datetime(df['# Timestamp'])
        df = df.sort_values(by=['MMSI', '# Timestamp'])
        # One record per vessel and timestamp; the first one is kept.
        df = df.drop_duplicates(subset=['# Timestamp','MMSI'],keep='first')
        df = df.drop(columns=['Type of position fixing device','Data source type','IMO','Callsign','ROT','Cargo type','Destination','Name','A','B','C','D'], errors='ignore')
        # The index is written as well; the READ DATA cell drops the resulting unnamed columns.
        df.to_csv(os.path.join(path, f'cleaned_{file}'))

In [None]:
## Joining all files
# Release frames left over from earlier runs before loading the whole period into memory.
# Each name is removed independently, so a missing `df` cannot leave a stale `final_path` behind.
for stale_name in ('df', 'final_path'):
    globals().pop(stale_name, None)
path = r'D:\Python\master thesis\data'
cleaned_frames = []
for file in tqdm(os.listdir(path)):
    # Only the per-file outputs named 'cleaned_<file>' are joined. A plain substring test
    # would also pick up joined/downsampled exports, whose names contain 'cleaned' as well.
    if file.startswith('cleaned_'):
        file_path = os.path.join(path,file)
        cleaned_frames.append(pd.read_csv(file_path))
if not cleaned_frames:
    raise FileNotFoundError(f"No 'cleaned_*' files found in {path}")
# A single concat avoids re-copying the accumulated frame on every iteration.
final_path = pd.concat(cleaned_frames)
del cleaned_frames
        

In [44]:
# Persist the joined frame; the relative path resolves against the current working directory.
# NOTE(review): the READ DATA cell loads this file name from the data folder - confirm the file is moved there.
final_path.to_csv(
    path_or_buf='joined_cleaned_aisdk-2024-09-14 - 2024-10-14.csv',
)

In [3]:
### READ DATA
# Timestamps are parsed while reading: read_csv would otherwise return them as strings,
# and the resampling / DateDiff cells need real datetimes to run on a fresh kernel.
df = pd.read_csv(r'D:\Python\master thesis\data\joined_cleaned_aisdk-2024-09-14 - 2024-10-14.csv', parse_dates=['# Timestamp'])
# Alternative inputs:
# df = pd.read_csv(r'D:\Python\master thesis\data\aisdk-2024-10-14.csv')
# df = pd.read_csv(r"C:\Users\Kasparas\Desktop\master thesis\data\first_mil_rows.csv")
# df = pd.read_csv(r"C:\Users\Kasparas\Desktop\master thesis\data\downsample_dataset.csv")
# df = pd.read_csv(r"C:\Users\Kasparas\Desktop\master thesis\data\final_dataset.csv")
# Remove index columns left behind by earlier to_csv calls; files without them load as well.
df = df.drop(columns=['Unnamed: 0','Unnamed: 0.1'], errors='ignore')

In [11]:
### DROPPING DUPLICATES (ROWS SHARING BOTH TIMESTAMP AND MMSI), KEEPING THE FIRST RECORD

is_repeat = df.duplicated(subset=['# Timestamp','MMSI'], keep='first')
df = df[~is_repeat]

In [16]:
### REMOVING IRRELEVANT COLUMNS
###   Type of position fixing device, Data source type - describe the transmission, not the vessel traffic
###   IMO, Callsign - vessel identifiers made redundant by MMSI
###   ROT, Name, Cargo type, Destination - too many empty values to be considered useful
###   A, B, C, D - removed as well (no reason recorded)
###   Columns that are already absent are skipped silently.

df = df.drop(
    labels=[
        'Type of position fixing device',
        'Data source type',
        'IMO',
        'Callsign',
        'ROT',
        'Cargo type',
        'Destination',
        'Name',
        'A',
        'B',
        'C',
        'D',
    ],
    axis='columns',
    errors='ignore',
)


In [None]:
### Resampling
# Put every vessel's track on a 60-second grid: each grid point takes the nearest record
# (limit=1) and grid points that stay completely empty are dropped.

mmsis = df['MMSI'].unique()
resampled_frames = []
# One grouped pass replaces a full-frame scan per vessel; sort=False keeps the vessels
# in order of first appearance, matching df['MMSI'].unique().
for mmsi, mmsi_df in tqdm(df.groupby('MMSI', sort=False)):
    # resample() needs a DatetimeIndex, while timestamps read back from CSV are plain
    # strings; the conversion leaves existing datetimes unchanged.
    mmsi_df = mmsi_df.assign(**{'# Timestamp': pd.to_datetime(mmsi_df['# Timestamp'])})
    mmsi_df = mmsi_df.set_index('# Timestamp')
    mmsi_df = mmsi_df.resample('60s').nearest(limit=1).dropna(how='all')
    mmsi_df = mmsi_df.reset_index()
    mmsi_df = mmsi_df.drop_duplicates(subset=['# Timestamp','MMSI'],keep='first')
    mmsi_df = mmsi_df.sort_values(by=['MMSI', '# Timestamp'])
    resampled_frames.append(mmsi_df)
# Concatenate once at the end instead of growing a frame inside the loop.
mmsis_df = pd.concat(resampled_frames) if resampled_frames else pd.DataFrame([])
mmsis_df.head()

In [22]:
# Persist the resampled frame; the relative path resolves against the current working directory.
mmsis_df.to_csv(
    path_or_buf='downsampled_joined_cleaned_aisdk_2024-09-14 - 2024-10-14.csv',
)

In [28]:
### DATEDIFF BETWEEN CURRENT AND PREVIOUS TIMESTAMP
# Minutes elapsed since the previous record of the same vessel (0 for a vessel's first record).
# NOTE(review): relies on the rows being in chronological order within each MMSI - confirm.

# Timestamps read back from CSV are strings, which cannot be subtracted; convert a local
# copy (existing datetimes are left unchanged) without altering the column itself.
timestamps = pd.to_datetime(df['# Timestamp'])
seconds_since_previous = (timestamps - timestamps.groupby(df['MMSI']).shift(1)).dt.total_seconds()
df['DateDiff'] = seconds_since_previous.fillna(0).astype(int) / 60

In [30]:
### LATITUDE/LONGITUDE SPEED AND LAGS
# Lags are the previous position of the same vessel; speeds are the change in
# latitude/longitude per hour (DateDiff is in minutes), with missing results set to 0.

by_vessel = df.groupby('MMSI')
previous_lat = by_vessel['Latitude'].shift(1)
previous_long = by_vessel['Longitude'].shift(1)
elapsed_hours = df['DateDiff'] / 60

lat_change = df['Latitude'] - previous_lat
long_change = df['Longitude'] - previous_long

df['Lat_speed'] = (lat_change / elapsed_hours).fillna(0)
df['Long_speed'] = (long_change / elapsed_hours).fillna(0)
df['Lat_lag'] = previous_lat
df['Long_lag'] = previous_long

In [32]:
### HAVERSINE DISTANCE COLUMN
# Distance between each record and the lagged position stored in Lat_lag / Long_lag.

# Iterating over the four columns directly is far cheaper than iterrows(), which builds
# a Series object for every row; the values passed to haversine() are the same.
haversine_distance = [
    haversine(lon, lat, lon_lag, lat_lag)
    for lon, lat, lon_lag, lat_lag in zip(df['Longitude'], df['Latitude'], df['Long_lag'], df['Lat_lag'])
]
df['haversine_distance'] = haversine_distance
# haversine() returns kilometers; the column is stored in meters.
df['haversine_distance'] = df['haversine_distance']*1000

In [33]:
### RELEVANT COLUMNS FILTER
# Keep only the columns below, in this order.

df = df.loc[:, [
    'MMSI',
    '# Timestamp',
    'Latitude',
    'Longitude',
    'SOG',
    'Heading',
    'DateDiff',
    'Lat_speed',
    'Long_speed',
    'Lat_lag',
    'Long_lag',
    'haversine_distance',
]]

In [35]:
# Persist the final feature frame; the relative path resolves against the current working directory.
df.to_csv(
    path_or_buf='final_dataset.csv',
)