In [1]:
# IMPORT
import pandas as pd
import numpy as np
from pathlib import Path

In [2]:
# PATH CONFIG
# Cleaned inputs are read from CLEAN; engineered outputs are written to FE_DIR.
BASE = Path("../all_dataset")
CLEAN = BASE / "clean_dataset"
FE_DIR = BASE / "feature_dataset"
# Create the output directory (and any missing parents) up front; no-op if it exists.
FE_DIR.mkdir(parents=True, exist_ok=True)

# LOAD CLEAN ROAD DATA
road_file = CLEAN / "road.csv"
# Fail early with an explicit message rather than letting read_csv raise.
if not road_file.exists():
    raise RuntimeError(f"[ERROR] File road.csv tidak ditemukan di {CLEAN}")

df_road = pd.read_csv(road_file)

# COLUMN VALIDATION
# Columns that must be present in the road dataset before processing continues.
required_cols = [
    "road_id",
    "surface_type",
    "slope_pct",
    "curvature_level",
    "segment_type",
]
# Stop at the first required column that is absent from the loaded frame.
for col in required_cols:
    if col in df_road.columns:
        continue
    raise RuntimeError(f"[ERROR] Kolom '{col}' wajib ada di dataset road")

In [3]:
# 1. Drop columns that are not used as features.
# errors="ignore" keeps the cell working when some of these columns are absent.
# NOTE(review): "surface_type" is validated as required when the data is loaded
# but dropped here without being used — confirm that is intended.
drop_cols = [
    "segment_name", "surface_type",
    "lat_start", "lon_start", "lat_end", "lon_end",
    "elevation_start_m", "elevation_end_m"
]
df_road = df_road.drop(columns=drop_cols, errors="ignore")

# 2. road_id must be a string.
df_road["road_id"] = df_road["road_id"].astype(str)

# 3. Normalise categorical columns (only those actually present).
cat_cols = ["segment_type", "curvature_level", "roughness_index", "waterlogging_risk"]
for col in cat_cols:
    if col in df_road.columns:
        df_road[col] = df_road[col].astype("category")

# 4. OPTIONAL: simple difficulty score that tolerates missing columns/values.
#    score = 0.4 * slope_pct + sum(weight * category code) over the columns below.
# NOTE(review): category codes follow the sorted order of the category values,
# which is not necessarily a low -> high ordering — confirm, or use an ordered
# categorical with explicit categories.
code_weights = {
    "roughness_index": 0.3,
    "waterlogging_risk": 0.2,
    "curvature_level": 0.1,
}
difficulty = df_road["slope_pct"].fillna(0) * 0.4
for col, weight in code_weights.items():
    if col not in df_road.columns:
        # Optional column absent: it simply contributes nothing to the score.
        continue
    codes = df_road[col].cat.codes
    # pandas encodes missing categorical values as code -1 (not NaN), so
    # fillna(0) never fires on codes; map -1 to 0 explicitly so a missing value
    # does not subtract from the score.
    difficulty = difficulty + codes.where(codes >= 0, 0) * weight
df_road["difficulty_score"] = difficulty

# SAVE
fe_file = FE_DIR / "road_fe.csv"
df_road.to_csv(fe_file, index=False)
print(f" FE Road cleaned & saved to {fe_file}")


 FE Road cleaned & saved to ..\all_dataset\feature_dataset\road_fe.csv
