In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import MinMaxScaler
import joblib
import os

# Pick the compute device: CUDA when available, otherwise CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# --- Input paths ---
data_path = '../data/processed/youbike_weather_merged.csv'
info_path = '../data/raw/station_info.csv'

# 1. Load the main training data; without it nothing below can run, so fail fast.
if not os.path.exists(data_path):
    raise FileNotFoundError(f"找不到訓練資料：{data_path}")
df = pd.read_csv(data_path)
df['record_time'] = pd.to_datetime(df['record_time'])
print(f"[Info] 成功讀取訓練資料，共 {len(df)} 筆")

# 2. Load station metadata (used to look up the Chinese station names).
if os.path.exists(info_path):
    # The file may be tab- or comma-separated: try tab first and fall back to
    # comma when parsing fails or everything collapses into a single column.
    try:
        df_info = pd.read_csv(info_path, sep='\t')
        if len(df_info.columns) <= 1:
            df_info = pd.read_csv(info_path)
    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt /
        # SystemExit are not swallowed by the fallback.
        df_info = pd.read_csv(info_path)

    # Lookup table: station_no (as str) -> Chinese name without the
    # "YouBike2.0_" prefix. regex=False makes the "." a literal dot on every
    # pandas version (older releases treat the pattern as a regex by default).
    name_map = dict(zip(
        df_info['station_no'].astype(str),
        df_info['name_tw'].str.replace('YouBike2.0_', '', regex=False),
    ))
    print(f"[Info] 成功讀取站點資訊檔")
else:
    # Metadata is optional: downstream code falls back to default names.
    print(f"[Warning] 找不到站點資訊檔，將使用預設名稱")
    name_map = {}

Using device: cpu
[Info] 成功讀取訓練資料，共 4377102 筆
[Info] 成功讀取站點資訊檔


In [2]:
# --- Rainfall bucketing helper ---
def get_rain_category(val):
    """Map a rainfall reading to an ordinal category.

    Args:
        val: Numeric rainfall amount (units are not shown in this notebook;
            presumably millimetres -- confirm against the data source).

    Returns:
        int: 0 = no rain (also used for NaN and non-positive readings),
             1 = drizzle (0 < val <= 2),
             2 = light/moderate rain (2 < val <= 10),
             3 = heavy rain (val > 10).
    """
    # `not val > 0` is True for 0, for negative readings and for NaN (every
    # comparison with NaN is False). Without this guard NaN would fall through
    # to the heavy-rain bucket and negative readings would count as drizzle.
    if not val > 0:
        return 0
    if val <= 2:
        return 1
    if val <= 10:
        return 2
    return 3

# Missing rainfall readings are treated as "no rain" before bucketing.
df['rain'] = df['rain'].fillna(0)

# Derive the ordinal rainfall category, one value per row.
df['Rain_Cat'] = df['rain'].map(get_rain_category)

print(" 特徵工程完成：已新增 'Rain_Cat' (降雨分級)")
# Quick sanity check: show the first rows of the source and derived columns.
print(df.loc[:, ['record_time', 'rain', 'Rain_Cat']].head())

 特徵工程完成：已新增 'Rain_Cat' (降雨分級)
          record_time  rain  Rain_Cat
0 2025-12-09 21:31:29   0.0         0
1 2025-12-09 21:31:29   0.0         0
2 2025-12-09 21:31:29   0.0         0
3 2025-12-09 21:31:29   0.0         0
4 2025-12-09 21:31:29   0.0         0


In [3]:
selected_stations = []
station_info_map = {}  # station_no (str) -> display label, consumed by the dashboard

# The per-district selection needs a 'district' column; without it there is
# nothing to pick from, so report the problem and leave the selection empty.
if 'district' not in df.columns:
    print("[Error] 資料中找不到 'district' 欄位")
    top_5_stations = []
else:
    districts = df['district'].unique()
    print(f"發現行政區: {districts}")

    for dist in districts:
        # All rows belonging to this district.
        dist_df = df[df['district'] == dist]
        if dist_df.empty:
            continue

        # Representative station = the one with the most rows in this district.
        top_station = dist_df['station_no'].value_counts().idxmax()
        selected_stations.append(top_station)

        # Prefer the real Chinese name; otherwise fall back to a generic label.
        real_name = name_map.get(str(top_station), f"{dist}熱門站")

        # Example entry: "500101": "捷運科技大樓站 (大安區)"
        station_info_map[str(top_station)] = f"{real_name} ({dist})"

    # Keep the first five districts' representatives (in order of appearance).
    top_5_stations = selected_stations[:5]
    print(f" 鎖定 5 個分區代表站點: {top_5_stations}")

發現行政區: ['大安區' '大同區' '士林區' '文山區' '中正區' '中山區' '內湖區' '北投區' '松山區' '南港區' '信義區' '萬華區'
 '臺大公館校區']
✅ 鎖定 5 個分區代表站點: [500101037, 500103021, 500104112, 500105044, 500106074]


In [4]:
# Restrict the data to the selected representative stations.
df_filtered = df.loc[df['station_no'].isin(top_5_stations)].copy()

# Internal training ids: station number (as str) -> 0, 1, 2, ...
station_mapping = dict(zip(map(str, top_5_stations), range(len(top_5_stations))))
df_filtered['station_idx'] = df_filtered['station_no'].astype(str).map(station_mapping)

print("Station Mapping:", station_mapping)

# Columns whose gaps get filled (includes the derived Rain_Cat).
features_to_fill = ['bikes_available', 'temperature', 'rain', 'Rain_Cat']


def _fill_gaps(series):
    """Linear interpolation, then forward/backward fill for the remaining edges."""
    return series.interpolate(method='linear').ffill().bfill()


# Order rows by station and time so interpolation runs along each station's
# own timeline, then fill every station independently.
df_filtered = df_filtered.sort_values(['station_idx', 'record_time'])
per_station = df_filtered.groupby('station_idx')[features_to_fill]
df_filtered[features_to_fill] = per_station.transform(_fill_gaps)

print(f" 資料過濾與補值完成，剩餘筆數: {len(df_filtered)}")

Station Mapping: {'500101037': 0, '500103021': 1, '500104112': 2, '500105044': 3, '500106074': 4}
 資料過濾與補值完成，剩餘筆數: 12815


In [6]:
# Model input columns (4 numeric features).
feature_cols = ['bikes_available', 'temperature', 'rain', 'Rain_Cat']

# 1. Min-max scaling: learn the per-column range, then overwrite the columns
#    with their scaled values.
# NOTE(review): df_filtered is overwritten in place, so running this cell a
# second time without rebuilding df_filtered refits the scaler on
# already-scaled values.
scaler = MinMaxScaler()
scaler.fit(df_filtered[feature_cols])
df_filtered[feature_cols] = scaler.transform(df_filtered[feature_cols])

# 2. Build sliding-window sequences
def create_multistation_dataset(data, time_steps=3, columns=None):
    """Turn per-station rows into (window, next-value) training pairs.

    Windows never cross station boundaries. The function does not sort, so
    rows of each station must already be in chronological order.

    Args:
        data: DataFrame with a 'station_idx' column plus the feature columns.
        time_steps: Number of consecutive rows per input window.
        columns: Feature column names; the first one is the prediction target.
            Defaults to the notebook-level ``feature_cols`` when omitted.

    Returns:
        tuple: ``(X, y)`` where ``X`` has shape
        ``(n_samples, time_steps, n_features + 1)`` -- the last column is the
        station index -- and ``y`` has shape ``(n_samples,)`` holding the first
        feature of the row right after each window. When no station has more
        than ``time_steps`` rows, correctly shaped empty arrays are returned.
    """
    cols = feature_cols if columns is None else list(columns)
    X_list, y_list = [], []

    for station_idx in data['station_idx'].unique():
        station_data = data[data['station_idx'] == station_idx]

        values = station_data[cols].values                      # numeric features
        ids = station_data['station_idx'].values.reshape(-1, 1)  # station id column

        # Attach the id column once per station instead of once per window.
        rows = np.hstack((values, ids))

        for i in range(len(values) - time_steps):
            X_list.append(rows[i:i + time_steps])
            # Label: first feature of the row that follows the window.
            y_list.append(values[i + time_steps, 0])

    if not X_list:
        # Keep the documented rank even with zero samples so downstream
        # reshapes / tensor conversions keep working.
        return np.empty((0, time_steps, len(cols) + 1)), np.empty((0,))

    return np.array(X_list), np.array(y_list)

# Build the windowed dataset.
TIME_STEPS = 3
X, y = create_multistation_dataset(df_filtered, time_steps=TIME_STEPS)

# Convert to float32 tensors on the selected device; targets become a column vector.
X_tensor = torch.tensor(X, dtype=torch.float32).to(device)
y_tensor = torch.tensor(y, dtype=torch.float32).reshape(-1, 1).to(device)

print(f" 序列資料製作完成。Input Shape: {X_tensor.shape}")
# Expected: (n_samples, 3, 5) -> 3 time steps, 5 columns (4 numeric + 1 station id)

 序列資料製作完成。Input Shape: torch.Size([12800, 3, 5])


In [7]:
class MultiStationLSTM(nn.Module):
    """LSTM regressor conditioned on a learned per-station embedding.

    Every time step of the input holds ``input_size`` numeric features followed
    by one column containing the integer station index (stored as a float).

    Args:
        num_stations: Number of distinct station indices (embedding rows).
        input_size: Number of numeric feature columns per time step.
        hidden_size: LSTM hidden state size.
        output_size: Size of the prediction for each sample.
        embedding_dim: Length of the station embedding vector.
    """

    def __init__(self, num_stations, input_size=4, hidden_size=64, output_size=1, embedding_dim=5):
        super().__init__()

        # Remembered so forward() can split the input without hard-coding 4.
        self.input_size = input_size

        # 1. Station embedding: integer id -> dense vector.
        self.station_embedding = nn.Embedding(num_stations, embedding_dim)

        # 2. LSTM over [numeric features | station embedding].
        self.lstm_input_size = input_size + embedding_dim
        self.lstm = nn.LSTM(self.lstm_input_size, hidden_size, batch_first=True)

        # 3. Output head.
        self.dropout = nn.Dropout(0.2)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Predict from a batch of windows.

        Args:
            x: Tensor of shape ``(batch, time, input_size + 1)``; the column at
                index ``input_size`` holds the station index.

        Returns:
            Tensor of shape ``(batch, output_size)``.
        """
        # Split the input: leading columns are numeric, the next one is the id.
        numerical_features = x[:, :, :self.input_size]
        station_ids = x[:, :, self.input_size].long()

        # Look up the embedding for every time step.
        station_embedded = self.station_embedding(station_ids)

        # Concatenate along the feature axis (input_size + embedding_dim).
        combined_input = torch.cat((numerical_features, station_embedded), dim=2)

        out, _ = self.lstm(combined_input)

        # Use only the output of the last time step.
        out = out[:, -1, :]
        out = self.dropout(out)
        out = self.fc(out)
        return out

print(" 模型架構定義完成")

 模型架構定義完成


In [8]:
# Seed PyTorch before the model is created so weight initialisation and
# dropout masks are repeatable from a fresh kernel.
SEED = 42
torch.manual_seed(SEED)

# Build the model: one embedding row per mapped station.
num_stations = len(station_mapping)
model = MultiStationLSTM(num_stations=num_stations, input_size=4).to(device)

# Hyperparameters
learning_rate = 0.001
num_epochs = 100
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

print(" 開始訓練模型...")
model.train()

# Full-batch training: every epoch is one optimisation step over all samples.
for epoch in range(num_epochs):
    optimizer.zero_grad()

    # Forward
    outputs = model(X_tensor)
    loss = criterion(outputs, y_tensor)

    # Backward
    loss.backward()
    optimizer.step()

    # Report progress every 10 epochs.
    if (epoch + 1) % 10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Leave the model in inference mode so dropout is disabled for any later use.
model.eval()

print(" 訓練結束！")

 開始訓練模型...
Epoch [10/100], Loss: 0.0508
Epoch [20/100], Loss: 0.0410
Epoch [30/100], Loss: 0.0334
Epoch [40/100], Loss: 0.0294
Epoch [50/100], Loss: 0.0255
Epoch [60/100], Loss: 0.0220
Epoch [70/100], Loss: 0.0178
Epoch [80/100], Loss: 0.0129
Epoch [90/100], Loss: 0.0088
Epoch [100/100], Loss: 0.0074
 訓練結束！


In [9]:
save_path = '../api/model_files'
# exist_ok=True creates the directory when missing and is a no-op otherwise,
# avoiding the check-then-create race.
os.makedirs(save_path, exist_ok=True)

# 1. Model weights. Tensors are moved to CPU first so the file can be loaded
#    on a machine without CUDA without needing `map_location`.
cpu_state_dict = {key: tensor.detach().cpu() for key, tensor in model.state_dict().items()}
torch.save(cpu_state_dict, os.path.join(save_path, 'youbike_lstm_multistation.pth'))

# 2. Fitted scaler (needed at prediction time to scale inputs and invert outputs).
joblib.dump(scaler, os.path.join(save_path, 'scaler.pkl'))

# 3. Station id mapping (real station number -> training index 0, 1, 2, ...).
joblib.dump(station_mapping, os.path.join(save_path, 'station_mapping.pkl'))

# 4. Dashboard labels (station number -> Chinese display name).
joblib.dump(station_info_map, os.path.join(save_path, 'station_info_map.pkl'))

print(f" 所有檔案已儲存至: {save_path}")
print("包含: model, scaler, mapping, info_map")

 所有檔案已儲存至: ../api/model_files
包含: model, scaler, mapping, info_map
