In [18]:
# IMPORT
import pandas as pd
import numpy as np
from pathlib import Path

In [19]:
# DOMAIN MAPPING TABLE

location_map = {
    "pit_entrance": {
        "lat": -0.456000, "lon": 117.110000,
        "domain_type": "road_node",
        "domain_id": "rn_pit_entrance"
    },
    "active_pit": {
        "lat": -0.470000, "lon": 117.130000,
        "domain_type": "pit",
        "domain_id": "pit_1"
    },
    "rom": {
        "lat": -0.490000, "lon": 117.170000,
        "domain_type": "rom",
        "domain_id": "rom_1"
    },
    "stockpile": {
        "lat": -0.505000, "lon": 117.210000,
        "domain_type": "stockpile",
        "domain_id": "sp_1"
    },
    "jetty_primary": {
        "lat": -0.520000, "lon": 117.240000,
        "domain_type": "jetty",
        "domain_id": "jetty_A"
    },
    "jetty_secondary": {
        "lat": -0.525000, "lon": 117.255000,
        "domain_type": "jetty",
        "domain_id": "jetty_B"
    },
}

In [20]:
# PATH CONFIG
BASE = Path("../all_dataset")
CLEAN = BASE / "clean_dataset"
FE_DIR = BASE / "feature_dataset"
FE_DIR.mkdir(exist_ok=True, parents=True)

# LOAD DATA HARIAN
files = list(CLEAN.glob("weather*.csv"))
if not files:
    raise RuntimeError("[ERROR] Tidak ada data weather untuk diproses di clean_dataset")

df = pd.concat(
    [pd.read_csv(f, parse_dates=["date"], low_memory=False) for f in files],
    ignore_index=True
)

# VALIDASI KOLON
required_cols = [
    "date", "location",
    "rainfall_mm", "humidity_pct",
    "solar_radiation_wm2", "temp_c", "temp_max_c", "temp_min_c",
    "wind_speed_10m_mps", "cloud_cover_pct"
]

missing = [c for c in required_cols if c not in df.columns]

if missing:
    raise RuntimeError(f"[ERROR] Kolom hilang di weather: {missing}")

df["date"] = pd.to_datetime(df["date"], errors="coerce")

In [21]:
# Pastikan week_start ada
if "week_start" not in df.columns:
    print("ℹ️ Kolom 'week_start' tidak ditemukan — membuat otomatis dari date...")
    df["week_start"] = df["date"] - pd.to_timedelta(df["date"].dt.weekday, unit="D")
else:
    df["week_start"] = pd.to_datetime(df["week_start"])


# -----------------------------------------------------
# 2) Pilih kolom relevan (sesuai dataset kamu)
# -----------------------------------------------------
selected = [
    "date", "week_start",
    "rainfall_mm",
    "temp_c",
    "wind_speed_10m_mps",
    "solar_radiation_wm2",
    "humidity_pct",
    "cloud_cover_pct"
]

df = df[selected]

# -----------------------------------------------------
# 3) Weekly Aggregation
# -----------------------------------------------------
df_weekly = df.groupby("week_start").agg({
    "rainfall_mm": ["sum", "max"],
    "temp_c": "mean",
    "wind_speed_10m_mps": "mean",
    "solar_radiation_wm2": "mean",
    "humidity_pct": "mean",
    "cloud_cover_pct": "mean"
}).reset_index()

df_weekly.columns = [
    "week_start",
    "rain_total_mm",
    "rain_peak_mm",
    "temp_avg_c",
    "wind_avg_mps",
    "radiation_avg_wm2",
    "humidity_avg_pct",
    "cloud_cover_avg_pct"
]

# -----------------------------------------------------
# 4) Derived Features (non-leakage, aman untuk ML)
# -----------------------------------------------------
df_weekly["heavy_rain_flag"] = (df_weekly["rain_total_mm"] > 40).astype(int)

df_weekly["storm_flag"] = (
    (df_weekly["wind_avg_mps"] > 8) |
    (df_weekly["rain_peak_mm"] > 25)
).astype(int)

# simple composite index
df_weekly["weather_index"] = (
    df_weekly["rain_total_mm"].fillna(0) * 0.4 +
    df_weekly["wind_avg_mps"].fillna(0) * 0.2 +
    (100 - df_weekly["radiation_avg_wm2"]).clip(lower=0) * 0.2 +
    df_weekly["cloud_cover_avg_pct"].fillna(0) * 0.2
)

# -----------------------------------------------------
# 5) Save
# -----------------------------------------------------
fe_file = FE_DIR / "weather_fe.csv"
df_weekly.to_csv(fe_file, index=False)
print(f"FE Weather saved to {fe_file}")

df_weekly.head()


ℹ️ Kolom 'week_start' tidak ditemukan — membuat otomatis dari date...
FE Weather saved to ..\all_dataset\feature_dataset\weather_fe.csv


Unnamed: 0,week_start,rain_total_mm,rain_peak_mm,temp_avg_c,wind_avg_mps,radiation_avg_wm2,humidity_avg_pct,cloud_cover_avg_pct,heavy_rain_flag,storm_flag,weather_index
0,2022-12-26,1.71,0.31,27.195,1.285,19.96,81.655,93.21,0,0,35.591
1,2023-01-02,136.98,9.77,26.464286,1.445,14.527143,86.024286,95.824286,1,0,91.340429
2,2023-01-09,125.91,8.67,26.664286,1.116429,14.942857,84.511429,89.354286,1,0,85.469571
3,2023-01-16,323.94,22.0,26.790714,0.961429,14.712857,84.744286,92.18,1,0,165.261714
4,2023-01-23,353.04,17.79,26.252143,1.302143,11.511429,87.300714,99.238571,1,0,179.021857


In [22]:
print(df.columns.tolist())


['date', 'week_start', 'rainfall_mm', 'temp_c', 'wind_speed_10m_mps', 'solar_radiation_wm2', 'humidity_pct', 'cloud_cover_pct']
