
# 2.3 — Từ hiểu dữ liệu đến bước chuẩn bị

Ở đây, nhóm sẽ tìm hiểu về phân phối và mối quan hệ trong dữ liệu, sau đó kiểm định dữ liệu rồi mới làm sạch trước khi xây dựng đặc trưng dựa trên dữ liệu đó.

In [None]:
import json, os, re
from pathlib import Path

def slugify(s: str) -> str:
    """Turn *s* into a lowercase, dash-separated slug.

    The value is stringified and lower-cased, every run of characters
    outside ``a-z0-9`` is replaced by a single dash, and dashes at either
    end are removed.
    """
    lowered = str(s).lower()
    dashed = re.sub(r"[^a-z0-9]+", "-", lowered)
    return dashed.strip("-")

# Optional JSON config; when present it supplies CITY, city_slug and DAYS.
CONFIG_PATH = "../.lab2_config.json"
CITY, city_slug, DAYS = None, None, None

if not os.path.exists(CONFIG_PATH):
    print("[config] Not found, will auto-detect from files")
else:
    with open(CONFIG_PATH, "r", encoding="utf-8") as fh:
        cfg = json.load(fh)
    CITY = cfg.get("CITY")
    DAYS = cfg.get("DAYS")
    # Prefer the slug stored in the config; otherwise derive it from CITY.
    city_slug = cfg.get("city_slug")
    if not city_slug:
        city_slug = slugify(CITY) if CITY else None
    print(f"[config] CITY={CITY} | city_slug={city_slug} | DAYS={DAYS}")


In [None]:
import pandas as pd, numpy as np, pathlib, re, os
from glob import glob
from datetime import datetime
from pathlib import Path

RAW_DIR = "../data/raw"


def slugify(s: str) -> str:
    """Return a lowercase, dash-separated slug of *s*.

    Every run of characters outside ``a-z0-9`` becomes a single dash and
    leading/trailing dashes are removed.
    """
    return re.sub(r"[^a-z0-9]+", "-", str(s).lower()).strip("-")


def clean_forecast(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw forecast frame.

    Steps: parse ``date`` (unparseable values become NaT), drop duplicate
    ``(city, date)`` rows keeping the first, sort by date, and clip
    ``wind_speed_10m_max`` to [0, 120] and ``precipitation_sum`` to [0, 200].
    The input frame is not modified.
    """
    dfc = df.copy()
    dfc["date"] = pd.to_datetime(dfc["date"], errors="coerce")
    dfc = dfc.drop_duplicates(["city", "date"]).sort_values("date").reset_index(drop=True)
    for col, hi in [("wind_speed_10m_max", 120), ("precipitation_sum", 200)]:
        dfc[col] = dfc[col].clip(lower=0, upper=hi)
    return dfc


def add_features(dfc: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of the cleaned frame with derived columns added.

    Adds ``temp_range`` (temp_max - temp_min), ``rain_flag`` (1 when
    precipitation_sum > 0, else 0) and ``dow`` (pandas day of week of
    ``date``). Expects ``date`` to already be a datetime column.
    """
    df_feat = dfc.copy()
    df_feat["temp_range"] = df_feat["temp_max"] - df_feat["temp_min"]
    df_feat["rain_flag"] = (df_feat["precipitation_sum"] > 0.0).astype(int)
    df_feat["dow"] = df_feat["date"].dt.dayofweek
    return df_feat


# CITY may have been set by an earlier cell; look it up through globals()
# so this cell also runs on its own without raising NameError.
CITY = globals().get("CITY")
city_slug = slugify(CITY) if CITY else None

# Prefer files of the selected city, otherwise fall back to any forecast file.
candidates = []
if city_slug:
    candidates = sorted(glob(f"{RAW_DIR}/{city_slug}/forecast_*.csv"))
if not candidates:
    candidates = sorted(glob(f"{RAW_DIR}/**/forecast_*.csv", recursive=True))

# Last path in sorted order.
src = candidates[-1] if candidates else None
print("Using source:", src)

if not src:
    print("No valid source found. Run 2.2 first or check internet.")
else:
    df = pd.read_csv(src)

    if not city_slug:
        # No city selected: take it from the data, else from the folder name.
        if "city" in df.columns and len(df):
            city_slug = slugify(df["city"].iloc[0])
        if not city_slug:
            # Also covers a city value that slugifies to an empty string.
            city_slug = Path(src).parent.name
    ts = datetime.now().strftime("%Y%m%d_%H%M")

    # ===== Schema check =====
    expected = {"date","temp_max","temp_min","precipitation_sum","wind_speed_10m_max","city","lat","lon","source"}
    missing = expected - set(df.columns)
    if missing:
        # Explicit raise instead of assert: asserts are skipped under -O.
        raise ValueError(f"Missing columns: {missing}")

    issues = [
        f"{col} not numeric"
        for col in ["temp_max","temp_min","precipitation_sum","wind_speed_10m_max","lat","lon"]
        if not pd.api.types.is_numeric_dtype(df[col])
    ]
    print("Schema OK." if not issues else f"Validation issues: {issues}")

    # Output folders are built only now that city_slug is final; building
    # them before the slug is resolved produced ".../None" directories.
    INTERIM_DIR = f"../data/interim/{city_slug}"
    PROCESSED_DIR = f"../data/processed/{city_slug}"
    Path(INTERIM_DIR).mkdir(parents=True, exist_ok=True)
    Path(PROCESSED_DIR).mkdir(parents=True, exist_ok=True)

    # ===== Save interim checked =====
    interim_path = f"{INTERIM_DIR}/checked_{ts}.csv"
    df.to_csv(interim_path, index=False)
    print("Saved interim:", interim_path)

    # ===== Cleaning =====
    dfc = clean_forecast(df)

    clean_path = f"{PROCESSED_DIR}/clean_{ts}.csv"
    dfc.to_csv(clean_path, index=False)
    print("Saved clean:", clean_path)

    # ===== Features =====
    df_feat = add_features(dfc)

    feat_path  = f"{PROCESSED_DIR}/features_{ts}.csv"
    df_feat.to_csv(feat_path, index=False)
    print("Saved features:", feat_path)

    # ===== Figures per city =====
    import importlib.util

    if importlib.util.find_spec("kaleido") is None:
        # Script-safe spelling of `%pip install -q kaleido`; get_ipython()
        # is provided by the IPython/Jupyter kernel. Only runs when the
        # package is missing, so repeated runs skip the pip call.
        get_ipython().run_line_magic("pip", "install -q kaleido")
    import plotly.express as px

    FIG_DIR = f"../reports/figures/{city_slug}"
    Path(FIG_DIR).mkdir(parents=True, exist_ok=True)

    # `date` is already datetime after cleaning. The string column is added
    # after the features CSV is written, so it is not part of that file.
    df_feat["date_str"] = df_feat["date"].dt.strftime("%Y-%m-%d")

    fig1 = px.bar(df_feat, x="date_str", y=["temp_max","temp_min"], barmode="group", title="Daily Temperatures")
    fig1.update_xaxes(type="category")
    fig1.write_image(os.path.join(FIG_DIR, f"temperatures_{ts}.png"))
    fig1.show()

    fig2 = px.line(df_feat, x="date_str", y="precipitation_sum", title="Daily Precipitation")
    fig2.update_xaxes(type="category")
    fig2.write_image(os.path.join(FIG_DIR, f"precipitation_{ts}.png"))
    fig2.show()

    fig3 = px.line(df_feat, x="date_str", y="wind_speed_10m_max", title="Max Wind Speed 10m")
    fig3.update_xaxes(type="category")
    fig3.write_image(os.path.join(FIG_DIR, f"wind_{ts}.png"))
    fig3.show()

    print("Saved figures to", FIG_DIR)
