In [22]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


(1) 資料轉換

In [23]:
%%writefile dataset_preprocessing.py
import xml.etree.ElementTree as ET
import numpy as np
import re

def load_dataset(xml_file):
    # 讀取 XML
    tree = ET.parse(xml_file)
    root = tree.getroot()
    ns = {"cwa": "urn:cwa:gov:tw:cwacommon:0.1"}
    content_text = root.find(".//cwa:Content", ns).text.strip()

    # 抓所有浮點數 (科學記號格式，例如 -999.0E+00)
    values = [float(v) for v in re.findall(r"[-+]?\d+\.\d+E[+-]\d+", content_text)]

    # 格點大小：120 × 67 = 8040
    data = np.array(values).reshape(120, 67)

    # 經緯度範圍
    lon = np.linspace(120.00, 121.98, 67)   # 東經
    lat = np.linspace(21.88, 25.45, 120)   # 北緯
    lon_grid, lat_grid = np.meshgrid(lon, lat)

    # 展平成一維
    X = np.column_stack([lon_grid.ravel(), lat_grid.ravel()])
    y = data.ravel()

    # (a) 分類資料集：label = 0 (無效值 -999), 1 (有效值)
    labels = (y != -999.0) * 1
    data_cla = np.column_stack([X, labels])

    # (b) 回歸資料集：只保留有效值
    mask = y != -999.0
    X_reg = X[mask]
    y_reg = y[mask]
    data_reg = np.column_stack([X_reg, y_reg])

    # ====== 檢查輸出 ======
    print("總資料筆數:", len(y))
    print("\n[分類資料集範例] (lon, lat, label)")
    print(data_cla[:5])
    print("\n[回歸資料集範例] (lon, lat, value)")
    print(data_reg[:5])
    print("\n分類資料集大小:", data_cla.shape)
    print("回歸資料集大小:", data_reg.shape)

    return data_cla, data_reg

# ✅ 呼叫測試
data_cla, data_reg = load_dataset("/content/drive/MyDrive/Colab Notebooks/O-A0038-003.xml")


Writing dataset_preprocessing.py


(2a) 分類模型

In [24]:
from dataset_preprocessing import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

# ===== 讀取分類資料集 =====
data_cla, _ = load_dataset("/content/drive/MyDrive/Colab Notebooks/O-A0038-003.xml")

# 特徵與標籤
X = data_cla[:, :2].astype(float)   # (lon, lat)
y = data_cla[:, 2].astype(int)      # label (0 or 1)

# 標準化（對 Random Forest 不是必要，但加上也OK）
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 切分訓練/測試集 (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ===== 建立 Random Forest 模型 =====
clf = RandomForestClassifier(
    n_estimators=200,    # 樹的數量
    max_depth=None,      # 不限制樹深
    random_state=42
)
clf.fit(X_train, y_train)

# ===== 評估模型 =====
y_pred = clf.predict(X_test)
acc = accuracy_score(y_test, y_pred)

print("✅ Random Forest 分類模型準確率:", acc)
print("\n分類報告:")
print(classification_report(y_test, y_pred, digits=3))


總資料筆數: 8040

[分類資料集範例] (lon, lat, label)
[[120.    21.88   0.  ]
 [120.03  21.88   0.  ]
 [120.06  21.88   0.  ]
 [120.09  21.88   0.  ]
 [120.12  21.88   0.  ]]

[回歸資料集範例] (lon, lat, value)
[[120.84  21.94  28.1 ]
 [120.72  21.97  28.6 ]
 [120.75  21.97  28.6 ]
 [120.78  21.97  27.8 ]
 [120.81  21.97  26.5 ]]

分類資料集大小: (8040, 3)
回歸資料集大小: (3495, 3)
總資料筆數: 8040

[分類資料集範例] (lon, lat, label)
[[120.    21.88   0.  ]
 [120.03  21.88   0.  ]
 [120.06  21.88   0.  ]
 [120.09  21.88   0.  ]
 [120.12  21.88   0.  ]]

[回歸資料集範例] (lon, lat, value)
[[120.84  21.94  28.1 ]
 [120.72  21.97  28.6 ]
 [120.75  21.97  28.6 ]
 [120.78  21.97  27.8 ]
 [120.81  21.97  26.5 ]]

分類資料集大小: (8040, 3)
回歸資料集大小: (3495, 3)
✅ Random Forest 分類模型準確率: 0.9819651741293532

分類報告:
              precision    recall  f1-score   support

           0      0.978     0.991     0.985       929
           1      0.988     0.969     0.978       679

    accuracy                          0.982      1608
   macro avg      0.983     0

(2b) 回歸模型

In [25]:
from dataset_preprocessing import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np

# ===== 讀取回歸資料集 =====
_, data_reg = load_dataset("/content/drive/MyDrive/Colab Notebooks/O-A0038-003.xml")

# 特徵與目標值
X = data_reg[:, :2].astype(float)   # (lon, lat)
y = data_reg[:, 2].astype(float)    # 溫度值 (°C)

# 標準化 (提升模型效果，尤其對於經緯度數值差異大時)
scaler = StandardScaler()
X = scaler.fit_transform(X)

# 切分訓練/測試集 (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ===== 建立 Random Forest Regressor =====
reg = RandomForestRegressor(
    n_estimators=200,    # 樹的數量
    max_depth=None,      # 不限制樹深
    random_state=42
)
reg.fit(X_train, y_train)

# ===== 評估模型 =====
y_pred = reg.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print("✅ Random Forest 回歸模型")
print("MSE :", mse)
print("RMSE:", rmse)
print("R²  :", r2)


總資料筆數: 8040

[分類資料集範例] (lon, lat, label)
[[120.    21.88   0.  ]
 [120.03  21.88   0.  ]
 [120.06  21.88   0.  ]
 [120.09  21.88   0.  ]
 [120.12  21.88   0.  ]]

[回歸資料集範例] (lon, lat, value)
[[120.84  21.94  28.1 ]
 [120.72  21.97  28.6 ]
 [120.75  21.97  28.6 ]
 [120.78  21.97  27.8 ]
 [120.81  21.97  26.5 ]]

分類資料集大小: (8040, 3)
回歸資料集大小: (3495, 3)
✅ Random Forest 回歸模型
MSE : 4.8137069242207176
RMSE: 2.1940161631630515
R²  : 0.8580994551680236
