In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# =========================
# 1) Load data
# =========================
path = "/mnt/data/global_disaster_response_2018_2024.csv"
df = pd.read_csv(path)

# Regression target plus the 10 requested predictor columns.
target = "economic_loss_usd"
features = [
    # categorical
    "country",
    "disaster_type",
    # numeric
    "severity_index",
    "casualties",
    "response_time_hours",
    "aid_amount_usd",
    "response_efficiency_score",
    "recovery_days",
    # geographic coordinates
    "latitude",
    "longitude",
]

# Fail early if the CSV lacks any column the script relies on.
missing_cols = [col for col in (target, *features) if col not in df.columns]
if missing_cols:
    raise ValueError(f"Thiếu cột trong dataset: {missing_cols}")

# Work on an independent copy restricted to the required columns.
data = df.loc[:, features + [target]].copy()

# Every column except the two text columns is coerced to numeric; values that
# cannot be parsed become NaN and are later filled by the imputers.
num_cols_all = [
    col for col in features if col not in ("country", "disaster_type")
] + [target]
for c in num_cols_all:
    data[c] = pd.to_numeric(data[c], errors="coerce")

# Rows without a usable target cannot be used for training or evaluation.
data = data.loc[data[target].notna()].reset_index(drop=True)

X, y = data[features], data[target]

# =========================
# 2) Define transformers
# =========================

# (a) Log transform for three columns: log1p(x) is used to avoid log(0)
log_cols = ["casualties", "response_time_hours", "aid_amount_usd"]

def log1p_safe(X_array):
    """Return ``log(1 + x)`` element-wise, treating negative inputs as zero.

    The input is converted to a float array, every value below 0 is raised
    to 0, and ``np.log1p`` is applied to the result.
    """
    return np.log1p(np.clip(np.asarray(X_array, dtype=float), 0, None))

# Median-impute, apply the clipped log1p, then standardize.
log_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("log", FunctionTransformer(func=log1p_safe, feature_names_out="one-to-one")),
        ("scaler", StandardScaler()),
    ]
)

# (b) Sine/cosine encoding for latitude/longitude
latlon_cols = ["latitude", "longitude"]

def latlon_sincos(X_array):
    """Encode latitude/longitude (in degrees) as sine/cosine pairs.

    Column 0 of the input is read as latitude and column 1 as longitude.
    For ``n`` input rows the result has shape ``(n, 4)`` with columns
    ``[sin(lat), cos(lat), sin(lon), cos(lon)]``.
    """
    radians = np.deg2rad(np.asarray(X_array, dtype=float))
    lat, lon = radians[:, 0], radians[:, 1]
    encoded = (np.sin(lat), np.cos(lat), np.sin(lon), np.cos(lon))
    return np.column_stack(encoded)

def latlon_sincos_feature_names(transformer, input_features):
    """Return output column names for the sine/cosine encoding step.

    scikit-learn calls this as ``feature_names_out(transformer, input_features)``.
    The names follow the column order produced by ``latlon_sincos``:
    sin/cos of the first input column, then sin/cos of the second.
    """
    lat_name, lon_name = input_features[0], input_features[1]
    return [
        f"{lat_name}_sin",
        f"{lat_name}_cos",
        f"{lon_name}_sin",
        f"{lon_name}_cos",
    ]


# Median-impute the coordinates, encode them as sine/cosine pairs, then
# standardize. The encoding turns 2 columns into 4, so "one-to-one" naming does
# not apply; a naming callable is supplied instead so that
# get_feature_names_out() works on this pipeline and on any ColumnTransformer
# that contains it.
latlon_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("sincos", FunctionTransformer(
        latlon_sincos,
        feature_names_out=latlon_sincos_feature_names,
    )),
    ("scaler", StandardScaler()),
])

# (c) Remaining numeric columns (no log transform, no lat/lon encoding)
num_cols_rest = ["severity_index", "response_efficiency_score", "recovery_days"]
num_rest_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ]
)

# (d) Categorical columns: fill gaps with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at transform time.
cat_cols = ["country", "disaster_type"]
cat_transformer = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Route each column group through its own sub-pipeline; columns that are not
# listed are dropped.
preprocess = ColumnTransformer(
    [
        ("cat", cat_transformer, cat_cols),
        ("lognum", log_transformer, log_cols),
        ("latlon", latlon_transformer, latlon_cols),
        ("num", num_rest_transformer, num_cols_rest),
    ],
    remainder="drop",
)

# Full model: preprocessing followed by ordinary least-squares regression.
model = Pipeline([("preprocess", preprocess), ("lr", LinearRegression())])

# =========================
# 3) Train / Evaluate
# =========================
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

# The `squared` keyword of mean_squared_error was deprecated in scikit-learn 1.4
# and removed in 1.6, so the root is taken explicitly to stay compatible with
# both older and newer releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:,.4f}")
print(f"R2:   {r2:,.4f}")

# =========================
# 4) Test: feed one sample row into the fitted model and predict
# =========================
sample_row = dict(
    country="Vietnam",
    disaster_type="Flood",
    severity_index=7.2,
    casualties=35,
    response_time_hours=18,
    aid_amount_usd=150000,
    response_efficiency_score=0.78,
    recovery_days=45,
    latitude=16.0471,
    longitude=108.2062,
)

# A one-row DataFrame keeps the column names the pipeline selects by.
sample_df = pd.DataFrame([sample_row])
pred_loss = model.predict(sample_df)[0]

print(
    "\n=== Sample prediction ===",
    "Input sample:",
    sample_df,
    f"Predicted economic_loss_usd: {pred_loss:,.2f}",
    sep="\n",
)
