## Data Preprocessing

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import folium
#import movingpandas as mpd
from shapely.geometry import Point
import os

In [3]:
# Location of the raw vessel-history CSV export. The default is the original
# workstation path; set the VESSEL_HISTORY_CSV environment variable to run the
# notebook on another machine without editing this cell.
file_path = os.environ.get(
    "VESSEL_HISTORY_CSV",
    r"C:\Users\slab\Desktop\Slab Project\Stage2 ETA\Raw Data\477769500_VesselHistoryLineInfo.csv",
)


In [4]:
# Load the raw CSV once per kernel session. An already loaded `df` is reused
# instead of being parsed a second time, which also avoids holding two copies of
# the table in memory while a re-read is in progress. Run `del df` before this
# cell to force a fresh read (for example after changing file_path).
# low_memory=False: parse the file in one pass instead of in chunks, so column
# dtypes are inferred from the whole column (see the pandas read_csv docs).
if "df" not in globals():
    df = pd.read_csv(file_path, low_memory=False)

In [5]:
# Only the track of vessel 477769500 is plotted, so rows of every other MMSI
# are removed first. The result is an independent copy of the matching rows.
mmsi_target = 477769500
df_filtered = df.loc[df['MMSI'].eq(mmsi_target)].copy()
print(df_filtered.shape[0])
print(df_filtered.head())

264586
      Pky      IMO       MMSI CallSign  FacNumber ShipName_CH ShipName_ENG  \
0  378737  9322736  477769500  VRPK2@@        NaN         豐順輪    FENG SHUN   
1  378738  9322736  477769500  VRPK2@@        NaN         豐順輪    FENG SHUN   
2  378741  9322736  477769500  VRPK2@@        NaN         豐順輪    FENG SHUN   
3  378742  9322736  477769500  VRPK2@@        NaN         豐順輪    FENG SHUN   
4  378744  9322736  477769500  VRPK2@@        NaN         豐順輪    FENG SHUN   

   VesselLine VesselType  VesselSeries  ...  IsDel           CreateTime  \
0         NaN        散裝輪           NaN  ...      1  2025/09/15 11:40:19   
1         NaN        散裝輪           NaN  ...      1  2025/09/15 11:42:19   
2         NaN        散裝輪           NaN  ...      1  2025/09/15 11:48:19   
3         NaN        散裝輪           NaN  ...      1  2025/09/15 11:50:19   
4         NaN        散裝輪           NaN  ...      1  2025/09/15 11:54:19   

   LastUpdateUserID       LastUpdateTime      LastUpdatePage  B_Rudder  \

In [6]:
# Fraction of the raw rows that survived the MMSI filter (1.0 = nothing dropped)
print("有效data數量約為:", df_filtered.shape[0] / df.shape[0])

有效data數量約為: 1.0


In [7]:
# Keep only rows whose latitude lies in [-90, 90] and longitude in [-180, 180]
# (both bounds inclusive). Rows with a missing coordinate fail the test and are
# dropped as well.
df_filtered = df_filtered.loc[
    df_filtered['Lat'].between(-90, 90) & df_filtered['Lng'].between(-180, 180)
].copy()

# Sanity check: summary statistics of the remaining coordinates
print(df_filtered[['Lat', 'Lng']].describe())

                 Lat            Lng
count  263812.000000  263812.000000
mean       26.857002     123.005259
std         3.074700       5.045676
min        25.029188     119.479290
25%        25.155197     120.273480
50%        25.507810     121.389634
75%        26.133184     121.391440
max        36.123352     139.860230


In [9]:
# Make sure both coordinate columns are stored as floats
for coord_col in ('Lat', 'Lng'):
    df_filtered[coord_col] = df_filtered[coord_col].astype(float)

In [10]:
# Parse the CreateTime strings into timestamps; values that cannot be parsed
# become NaT (errors='coerce') instead of raising.
df_filtered['CreateTime'] = df_filtered['CreateTime'].pipe(pd.to_datetime, errors='coerce')

In [11]:
# Discard every row that lacks a position or a timestamp
df_filtered = df_filtered.dropna(
    axis='index',
    how='any',
    subset=['Lat', 'Lng', 'CreateTime'],
)

In [12]:
# To keep the plotted line from breaking, re-express longitude from the
# [-180, 180] convention in the [0, 360) convention (negative values gain 360).
df_filtered['Lng_360'] = df_filtered['Lng'].mod(360)

In [13]:
# Map centre: mean latitude and mean (0-360) longitude of the cleaned track
map_center_lat, map_center_lon = (
    df_filtered[column].mean() for column in ('Lat', 'Lng_360')
)

In [14]:
# Put the fixes in chronological order and renumber the index 0..n-1
df_filtered = df_filtered.sort_values(by='CreateTime', ignore_index=True)

## Trajectory Reconstruction

In [15]:
# Base map on OpenStreetMap tiles, centred on the mean track position
m = folium.Map(
    tiles='OpenStreetMap',
    zoom_start=4,
    location=[map_center_lat, map_center_lon],
)

In [16]:
import folium

# Draw the time-ordered track as one polyline and mark its first and last fix.
# TRACK_STEP thins the track (1 = every fix, 10 = every 10th fix) to keep the
# generated HTML small. The markers read their timestamp from the same thinned
# frame as their position, so both always describe the same row whatever the
# step is.
TRACK_STEP = 1
track = df_filtered.iloc[::TRACK_STEP]

coords = track[['Lat', 'Lng_360']].values.tolist()
folium.PolyLine(
    coords,
    color='blue',
    weight=3,
    opacity=0.7,
    popup=f"航線點數: {len(coords)}"
).add_to(m)

# Start / end markers (skipped when the track is empty)
if len(coords) > 0:
    # Start of the track
    folium.Marker(
        location=coords[0],
        popup=f"起點\n時間: {track.iloc[0]['CreateTime']}",
        icon=folium.Icon(color='green', icon='play')
    ).add_to(m)

    # End of the track
    folium.Marker(
        location=coords[-1],
        popup=f"終點\n時間: {track.iloc[-1]['CreateTime']}",
        icon=folium.Icon(color='red', icon='stop')
    ).add_to(m)

In [17]:
# Write the interactive map to a standalone HTML file in the working directory
html_file = "ship_trajectory_360.html"
m.save(outfile=html_file)

In [18]:
# Open the saved map in the default browser.
# Path.as_uri() builds a well-formed file URL from the absolute path
# (file:///C:/... on Windows, with spaces percent-encoded), which plain
# 'file://' + path concatenation does not guarantee.
import webbrowser
from pathlib import Path

webbrowser.open(Path(html_file).resolve().as_uri())

True

## DBSCAN clustering

In [20]:
from sklearn.cluster import DBSCAN

In [21]:
from haversine import haversine

In [22]:
import numpy as np

In [25]:
# Stop candidates: fixes whose Sog value is below 0.5, as an independent copy
stop_df = df_filtered.loc[df_filtered["Sog"].lt(0.5)].copy()

MemoryError: Unable to allocate 1.38 MiB for an array with shape (181082,) and data type int64

In [24]:
## DBSCAN clustering for single ship
def _haversine_km(lat0, lon0, lat, lon, earth_radius_km=6371.0088):
    """Great-circle distance in km from one point to one or many points.

    All angles are in degrees. `lat` / `lon` may be scalars or NumPy arrays and
    the result has their broadcast shape. Longitudes may use either the
    [-180, 180] or the [0, 360) convention because only their difference enters
    the formula. The default radius is the IUGG mean Earth radius.
    """
    lat0, lon0, lat, lon = (np.radians(v) for v in (lat0, lon0, lat, lon))
    a = (
        np.sin((lat - lat0) / 2.0) ** 2
        + np.cos(lat0) * np.cos(lat) * np.sin((lon - lon0) / 2.0) ** 2
    )
    # clip guards against rounding pushing `a` marginally outside [0, 1]
    return 2.0 * earth_radius_km * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))


STOP_SOG_MAX = 0.5       # fixes slower than this count as stopped (presumably knots - confirm)
DBSCAN_EPS_RAD = 0.001   # haversine eps is in radians: 0.001 rad * 6371 km is about 6.4 km
DBSCAN_MIN_SAMPLES = 10  # minimum number of fixes in a neighbourhood to form a cluster
COORD_DECIMALS = 4       # grid used to merge repeated positions: 1e-4 degree, about 11 m of latitude

# Stop candidates (assumes the Sog column exists)
stop_df = df_filtered[df_filtered["Sog"] < STOP_SOG_MAX].copy()

if stop_df.empty:
    print("沒有偵測到停泊點")
else:
    coords = stop_df[['Lat', 'Lng_360']].to_numpy()

    # A stopped ship reports almost the same position over and over. Feeding
    # every row to DBSCAN makes each eps-neighbourhood enormous, and the recorded
    # run of this cell died with MemoryError. As the scikit-learn DBSCAN notes
    # suggest, near-duplicate positions are merged and their multiplicity is
    # passed as sample_weight. Rounding moves a point by a few metres at most,
    # which is negligible against eps; raise COORD_DECIMALS to merge only
    # (nearly) exact duplicates.
    unique_deg, row_to_unique, n_obs = np.unique(
        np.round(coords, COORD_DECIMALS),
        axis=0,
        return_inverse=True,
        return_counts=True,
    )

    # The haversine metric expects [lat, lon] pairs in radians
    coords_rad = np.radians(unique_deg)
    db = DBSCAN(
        eps=DBSCAN_EPS_RAD,
        min_samples=DBSCAN_MIN_SAMPLES,
        metric='haversine',
        algorithm='ball_tree',
    ).fit(coords_rad, sample_weight=n_obs)

    # db.labels_ has one entry per merged position; spread it back to every row
    stop_df['cluster'] = db.labels_[row_to_unique.reshape(-1)]

    # Centre (mean position) and radius (largest distance to the centre) of each
    # cluster, computed from the original, unrounded fixes. Clusters are visited
    # in order of first appearance in stop_df.
    port_list = []
    for cluster_id, cluster_points in stop_df.groupby('cluster', sort=False)[['Lat', 'Lng_360']]:
        if cluster_id == -1:
            continue  # -1 is DBSCAN's label for noise points

        center_lat = cluster_points['Lat'].mean()
        center_lon = cluster_points['Lng_360'].mean()

        radius_km = float(
            _haversine_km(
                center_lat,
                center_lon,
                cluster_points['Lat'].to_numpy(),
                cluster_points['Lng_360'].to_numpy(),
            ).max()
        )

        port_list.append({
            'MMSI': mmsi_target,
            'cluster': cluster_id,
            'lat': center_lat,
            'lon': center_lon,
            'radius_km': radius_km
        })

    # Explicit columns keep the schema stable even when no cluster was found
    port_df = pd.DataFrame(
        port_list, columns=['MMSI', 'cluster', 'lat', 'lon', 'radius_km']
    )
    print(f"共偵測到 {port_df.shape[0]} 個停泊點 / 港口")
    print(port_df.head())


MemoryError: 

In [None]:
# Multi-ship variant: cleaned_data is assumed to be an iterable of per-ship
# DataFrames whose coordinates have already been cleaned.
# NOTE(review): cleaned_data is not defined in the cells shown here, and this
# cell reads 'Long' / 'DeviceID' columns rather than 'Lng' / 'MMSI' - confirm
# against the data it is meant to run on.
port_list = []

for df_ship in cleaned_data:
    # Ship identifier, taken from the first row of the ship's frame
    device_id = df_ship["DeviceID"].iloc[0]

    # Stop candidates: rows with Sog below 0.5 (assumes the Sog column exists)
    stop_df = df_ship.loc[df_ship["Sog"].lt(0.5)].copy()
    if stop_df.empty:
        # nothing to cluster for this ship
        continue

    # The haversine metric works on [lat, lon] pairs expressed in radians
    coords = stop_df[['Lat', 'Long']].to_numpy()
    coords_rad = np.radians(coords)

    # eps is an angle in radians: 0.01 rad * 6371 km is about 64 km
    db = DBSCAN(metric='haversine', eps=0.01, min_samples=10).fit(coords_rad)
    stop_df['cluster'] = db.labels_

    # One summary record per cluster, visited in order of first appearance
    for cluster_id, members in stop_df.groupby('cluster', sort=False)[['Lat', 'Long']]:
        if cluster_id == -1:
            # skip noise points
            continue

        center_lat = members['Lat'].mean()
        center_lon = members['Long'].mean()

        # Radius: the largest distance from the centre to any member
        radius_km = members.apply(
            lambda pt: haversine((center_lat, center_lon), (pt['Lat'], pt['Long'])),
            axis=1,
        ).max()

        port_list.append({
            'DeviceID': device_id,
            'cluster': cluster_id,
            'lat': center_lat,
            'lon': center_lon,
            'radius_km': radius_km
        })

# Collect all records into a single DataFrame
port_df = pd.DataFrame(port_list)
print(f"共偵測到 {port_df.shape[0]} 個港口")
print(port_df.head())