In [1]:
!pip -q install lxml numpy pandas scikit-learn


In [2]:
from google.colab import files

# Opens the Colab file picker; choose O-A0038-003.xml.
uploaded = files.upload()

# `uploaded` is indexed by file name below. Presumably it is empty when the
# dialog is dismissed without a file (TODO confirm); fail with a readable
# message instead of an IndexError from taking the first key.
if not uploaded:
    raise RuntimeError("未上傳任何檔案，請重新執行此儲存格並選擇 O-A0038-003.xml")

# Use the name of the first uploaded file as the data path.
DATA_PATH = next(iter(uploaded))
SAVE_DIR  = '/content/'   # outputs are written to the working directory
print('Uploaded DATA_PATH =', DATA_PATH)


Saving O-A0038-003.xml to O-A0038-003.xml
Uploaded DATA_PATH = O-A0038-003.xml


In [3]:
# === 解析 XML（namespace-aware）→ 建立 df_full / df_cls / df_reg ===
import re
import numpy as np
import pandas as pd
from lxml import etree
from pathlib import Path

# Fixed grid specification (given by the assignment)
LON0, LAT0 = 120.00, 21.88
DLON, DLAT = 0.03, 0.03
NX, NY = 67, 120
INVALID = -999.0

DATA_PATH = Path(DATA_PATH)

# The document declares a default namespace, so XPath needs an explicit prefix.
ns = {'c': 'urn:cwa:gov:tw:cwacommon:0.1'}
tree = etree.parse(str(DATA_PATH))

# Grab the text of <Resource>/<Content> directly.
contents = tree.xpath('//c:dataset/c:Resource/c:Content/text()', namespaces=ns)
if len(contents) == 0:
    raise RuntimeError("找不到 Resource/Content 內容（請確認命名空間/XPath）")
content_str = contents[0]

# Pull out every numeric token (scientific notation included) and convert to float.
nums = list(map(float, re.findall(r'[-+]?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?', content_str)))
expected = NX * NY
if expected != len(nums):
    raise RuntimeError(f'筆數不符：期望 {expected}，實得 {len(nums)}')

# One row per latitude step, one column per longitude step.
vals = np.asarray(nums, dtype=float).reshape((NY, NX))

# Build the longitude / latitude coordinates of the grid.
lons = np.arange(NX) * DLON + LON0
lats = np.arange(NY) * DLAT + LAT0
lon_mesh, lat_mesh = np.meshgrid(lons, lats, indexing='xy')   # both (NY, NX)

# Flatten the grid into a long table, one row per grid point.
df_full = pd.DataFrame({
    'lon': lon_mesh.reshape(-1),
    'lat': lat_mesh.reshape(-1),
    'value': vals.reshape(-1),
})

# True where the grid point carries a real measurement (i.e. is not the -999 sentinel).
is_valid = df_full['value'].ne(INVALID)

# Classification data: valid -> 1, sentinel (-999) -> 0.
df_cls = df_full.assign(label=is_valid.astype(int))[['lon', 'lat', 'label']]

# Regression data: keep only the valid points.
df_reg = df_full.loc[is_valid, ['lon', 'lat', 'value']]

print("df_full:", df_full.shape, "df_cls:", df_cls.shape, "df_reg:", df_reg.shape)
print("有效/無效分佈：\n", df_cls['label'].value_counts())


df_full: (8040, 3) df_cls: (8040, 3) df_reg: (3495, 3)
有效/無效分佈：
 label
0    4545
1    3495
Name: count, dtype: int64


In [4]:
# === 訓練與評估：分類(Logistic, balanced) + 回歸(Polynomial Linear) ===
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    mean_absolute_error, mean_squared_error, r2_score, confusion_matrix
)

# --- Classification: logistic regression (course material) ---
# Features are (lon, lat); the target is the 0/1 validity label.
Xc = df_cls[['lon','lat']].to_numpy()
yc = df_cls['label'].to_numpy()
# Stratify so train and test keep the same valid/invalid proportion.
Xc_tr, Xc_te, yc_tr, yc_te = train_test_split(
    Xc, yc, test_size=0.2, random_state=42, stratify=yc
)

clf = make_pipeline(
    StandardScaler(),
    LogisticRegression(max_iter=1000, class_weight='balanced')
)
clf.fit(Xc_tr, yc_tr)
yc_pred = clf.predict(Xc_te)

cls_metrics = {
    "accuracy": float(accuracy_score(yc_te, yc_pred)),
    "precision": float(precision_score(yc_te, yc_pred, zero_division=0)),
    "recall": float(recall_score(yc_te, yc_pred, zero_division=0)),
    "f1": float(f1_score(yc_te, yc_pred, zero_division=0)),
}
print('Classification metrics:', cls_metrics)
print('Confusion matrix:\n', confusion_matrix(yc_te, yc_pred))

# --- Regression: polynomial linear regression (the safe in-course choice) ---
Xr = df_reg[['lon','lat']].to_numpy()
yr = df_reg['value'].to_numpy()
Xr_tr, Xr_te, yr_tr, yr_te = train_test_split(Xr, yr, test_size=0.2, random_state=42)

# The final step may be LinearRegression() or Ridge(alpha=1.0) to curb overfitting.
# The inputs are standardised *before* the polynomial expansion: raw longitudes
# are around 120, so their cubes are on the order of 1e6 and the expanded columns
# are nearly collinear. Centring/scaling first keeps the design matrix well
# conditioned. For ordinary least squares with an intercept this should leave the
# fitted surface unchanged up to floating-point error, and it puts the features on
# a comparable scale if Ridge is swapped in.
regr = Pipeline([
    ("scale", StandardScaler()),
    ("poly", PolynomialFeatures(degree=3, include_bias=False)),  # lower 3 to 2 to reduce overfitting
    ("lin", LinearRegression())
])
regr.fit(Xr_tr, yr_tr)
yr_pred = regr.predict(Xr_te)

mse = mean_squared_error(yr_te, yr_pred)
reg_metrics = {
    "MAE": float(mean_absolute_error(yr_te, yr_pred)),
    "RMSE": float(np.sqrt(mse)),   # manual sqrt stays compatible with older sklearn
    "R2": float(r2_score(yr_te, yr_pred)),
}
print('Regression metrics (Poly Linear):', reg_metrics)


Classification metrics: {'accuracy': 0.5740049751243781, 'precision': 0.508557457212714, 'recall': 0.5951359084406295, 'f1': 0.5484508899143046}
Confusion matrix:
 [[507 402]
 [283 416]]
Regression metrics (Poly Linear): {'MAE': 3.0333686910009003, 'RMSE': 4.445026177247194, 'R2': 0.41755749175704815}


In [5]:
# === 匯出 CSV 與指標摘要到 SAVE_DIR ===
import os
# Make sure the output directory exists before anything is written.
os.makedirs(SAVE_DIR, exist_ok=True)

# Datasets first, then the one-row metric summaries; each goes to its own CSV.
_exports = [
    (df_cls, "week4_classification.csv"),
    (df_reg, "week4_regression.csv"),
    (pd.DataFrame([cls_metrics]), "week4_cls_metrics.csv"),
    (pd.DataFrame([reg_metrics]), "week4_reg_metrics.csv"),
]
for _frame, _fname in _exports:
    _frame.to_csv(os.path.join(SAVE_DIR, _fname), index=False)

# Side-by-side regression predictions on the test split (a sample of up to 20 rows is saved).
out_reg = pd.DataFrame({
    "lon": Xr_te[:,0],
    "lat": Xr_te[:,1],
    "y_true": yr_te,
    "y_pred": yr_pred
})
n_sample = min(20, len(out_reg))
reg_sample = out_reg.sample(n_sample, random_state=42)
reg_sample.to_csv(os.path.join(SAVE_DIR, "week4_reg_sample_preds.csv"), index=False)

print("已輸出到：", SAVE_DIR)


已輸出到： /content/


In [6]:
# === Week4 交前最終驗證（請在同一個 SAVE_DIR 下執行）===
import os, math, numpy as np, pandas as pd
from collections import Counter

def check_exists(fn):
    """Return the path of ``fn`` inside ``SAVE_DIR``, asserting that it exists.

    Args:
        fn: File name, joined onto the module-level ``SAVE_DIR``.

    Returns:
        The joined path as a string.

    Raises:
        AssertionError: If nothing exists at the joined path.
    """
    full_path = os.path.join(SAVE_DIR, fn)
    found = os.path.exists(full_path)
    assert found, f"找不到檔案：{fn}（SAVE_DIR={SAVE_DIR}）"
    return full_path

# 1) Locate and load the exported files (every path is checked before any is read)
p_cls, p_reg, p_cm, p_rm = (
    check_exists(name)
    for name in (
        "week4_classification.csv",
        "week4_regression.csv",
        "week4_cls_metrics.csv",
        "week4_reg_metrics.csv",
    )
)

df_cls = pd.read_csv(p_cls)
df_reg = pd.read_csv(p_reg)
# Each metrics file holds a single row; turn it into a plain dict.
m_cls  = pd.read_csv(p_cm).iloc[0].to_dict()
m_reg  = pd.read_csv(p_rm).iloc[0].to_dict()

print("檔案讀取 OK")
print("df_cls.shape =", df_cls.shape, " / df_reg.shape =", df_reg.shape)
print("cls_metrics:", m_cls)
print("reg_metrics:", m_reg)

# 2) Column names and label values
assert df_cls.columns.tolist() == ["lon","lat","label"], "classification 欄位應為 lon, lat, label"
assert df_reg.columns.tolist() == ["lon","lat","value"], "regression 欄位應為 lon, lat, value"
assert set(np.unique(df_cls["label"])).issubset({0,1}), "label 只能是 0/1"

# 3) Row counts
EXPECTED = 67*120  # 8040
assert EXPECTED == len(df_cls), f"classification 應有 {EXPECTED} 列，實得 {len(df_cls)}"
assert EXPECTED >= len(df_reg), "regression 只保留有效值，筆數應 <= 8040"

# 4) Longitude / latitude range (the step size is checked further below)
lon = np.sort(df_cls["lon"].unique())
lat = np.sort(df_cls["lat"].unique())
# Range: compare the smallest and largest coordinate against the expected corners.
lon_ok = all(
    math.isclose(got, want, abs_tol=1e-6)
    for got, want in ((lon[0], 120.00), (lon[-1], 121.98))
)
lat_ok = all(
    math.isclose(got, want, abs_tol=1e-6)
    for got, want in ((lat[0], 21.88), (lat[-1], 25.45))
)
assert all((lon_ok, lat_ok)), f"經緯度範圍不符：lon[{lon[0]}, {lon[-1]}], lat[{lat[0]}, {lat[-1]}]"

# Step size: the most common difference between neighbouring values is taken as the step.
def step_mode(arr):
    """Return the most frequent consecutive difference in ``arr`` as a float.

    Differences are rounded to 6 decimals before counting so that
    floating-point noise does not split one step size into several buckets.
    """
    rounded = np.round(np.diff(arr), 6)
    tally = Counter(rounded)
    value, _count = tally.most_common(1)[0]
    return float(value)
# Grid step along each axis, and the number of distinct coordinates.
dlon = step_mode(lon)
dlat = step_mode(lat)
assert math.isclose(dlon, 0.03, rel_tol=0, abs_tol=1e-6), f"經度步距應為 0.03，實得 {dlon}"
assert math.isclose(dlat, 0.03, rel_tol=0, abs_tol=1e-6), f"緯度步距應為 0.03，實得 {dlat}"
assert len(lon)==67 and len(lat)==120, f"唯一經/緯度數量應為 67/120，實得 {len(lon)}/{len(lat)}"

# 5) Invalid-value handling
#   df_cls: label=0 marks the points whose value was -999; df_reg must contain no -999.
neg_mask = (df_cls["label"]==0)
assert not (df_reg["value"]==-999).any(), "regression 不應包含 -999"
# The regression table is meant to hold exactly the points labelled valid, so the
# number of label=1 rows must match its row count. Without this, a label column
# that disagrees with the filtered values would pass unnoticed.
n_valid = int((~neg_mask).sum())
assert len(df_reg) == n_valid, (
    f"regression 筆數（{len(df_reg)}）應等於 classification 中 label=1 的筆數（{n_valid}）"
)
print("無效值檢查 OK（regression 無 -999；classification 0/1 合理）")

# 6) Plausibility of the regression values (rough temperature range)
if not df_reg.empty:
    vmin, vmax = float(df_reg["value"].min()), float(df_reg["value"].max())
    print(f"回歸值範圍：[{vmin:.2f}, {vmax:.2f}] °C")
    assert -50 <= vmin <= 60 and -50 <= vmax <= 60, "溫度數值看起來不合理，請再確認解析流程"

# 7) Metric sanity check: every expected key must be present in the saved summaries
for k in ["accuracy","precision","recall","f1"]:
    assert k in m_cls, f"缺少分類指標 {k}"
for k in ["MAE","RMSE","R2"]:
    assert k in m_reg, f"缺少回歸指標 {k}"
print("指標鍵值存在 OK")

print("\n✅ 交前驗證通過。可以交件！")


檔案讀取 OK
df_cls.shape = (8040, 3)  / df_reg.shape = (3495, 3)
cls_metrics: {'accuracy': 0.5740049751243781, 'precision': 0.508557457212714, 'recall': 0.5951359084406295, 'f1': 0.5484508899143046}
reg_metrics: {'MAE': 3.0333686910009003, 'RMSE': 4.445026177247194, 'R2': 0.4175574917570481}
無效值檢查 OK（regression 無 -999；classification 0/1 合理）
回歸值範圍：[-1.90, 30.00] °C
指標鍵值存在 OK

✅ 交前驗證通過。可以交件！
