# In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score

from xgboost import XGBRegressor


# ===== 1) Load data =====
path = "/mnt/data/global_disaster_response_2018_2024.csv"
df = pd.read_csv(path)

# Keep only the columns used below: two categorical features, eight numeric
# features, and the regression target.
feature_cols = [
    "country",
    "disaster_type",
    "severity_index",
    "casualties",
    "response_time_hours",
    "aid_amount_usd",
    "response_efficiency_score",
    "recovery_days",
    "latitude",
    "longitude",
]
target_col = "economic_loss_usd"

df = df[feature_cols + [target_col]].copy()

# Force every numeric column (target included) to a numeric dtype; values that
# cannot be parsed become NaN and are handled by the steps below.
num_cols = [
    "severity_index",
    "casualties",
    "response_time_hours",
    "aid_amount_usd",
    "response_efficiency_score",
    "recovery_days",
    "latitude",
    "longitude",
]
numeric_like = num_cols + [target_col]
df[numeric_like] = df[numeric_like].apply(pd.to_numeric, errors="coerce")

# Rows without a target value cannot be used for training or evaluation.
df = df.dropna(subset=[target_col])

# Simple feature imputation: a sentinel category for the text columns and the
# per-column median for the numeric ones (swap in another strategy if needed).
# NOTE(review): the medians are computed on all remaining rows, i.e. before the
# train/test split done later in this script, so held-out rows influence the
# fill values -- consider imputing inside the pipeline instead.
text_cols = ["country", "disaster_type"]
df[text_cols] = df[text_cols].fillna("Unknown")
df[num_cols] = df[num_cols].fillna(df[num_cols].median())


# ===== 2) Define transforms =====
# Column groups; each group is routed to its own preprocessing branch.
cat_features = [
    "country",
    "disaster_type",
]
log_features = [
    "casualties",
    "response_time_hours",
    "aid_amount_usd",
]
other_num_features = [
    "severity_index",
    "response_efficiency_score",
    "recovery_days",
]

def latlon_sincos_transform(X):
    """Encode latitude/longitude (in degrees) as sine/cosine pairs.

    Parameters
    ----------
    X : array-like of shape (n, 2)
        Column 0 is latitude and column 1 is longitude, both in degrees.
        The input is converted to a float NumPy array.

    Returns
    -------
    numpy.ndarray of shape (n, 4)
        Columns are ``sin(lat), cos(lat), sin(lon), cos(lon)``.
    """
    coords = np.asarray(X, dtype=float)
    lat_rad, lon_rad = np.deg2rad(coords[:, 0]), np.deg2rad(coords[:, 1])
    encoded = (
        np.sin(lat_rad),
        np.cos(lat_rad),
        np.sin(lon_rad),
        np.cos(lon_rad),
    )
    return np.column_stack(encoded)

def _latlon_feature_names(transformer, input_features):
    """Name the four columns produced by ``latlon_sincos_transform``.

    The transform turns 2 input columns into 4 outputs, so the
    ``"one-to-one"`` naming mode would report the wrong number of feature
    names. scikit-learn calls this with the ``FunctionTransformer`` instance
    and the input feature names; the first two names are taken to be the
    latitude and longitude columns, in that order.
    """
    lat_name, lon_name = input_features[0], input_features[1]
    return np.asarray(
        [
            f"sin_{lat_name}",
            f"cos_{lat_name}",
            f"sin_{lon_name}",
            f"cos_{lon_name}",
        ],
        dtype=object,
    )


def _log1p_nonnegative(x):
    """Clip negative values to 0, then apply ``log(1 + x)`` element-wise.

    Defined as a named function (rather than a lambda) so that the fitted
    pipeline can be pickled.
    """
    return np.log1p(np.maximum(x, 0))


latlon_transformer = FunctionTransformer(
    latlon_sincos_transform, feature_names_out=_latlon_feature_names
)
log_transformer = FunctionTransformer(
    _log1p_nonnegative, feature_names_out="one-to-one"
)

# Route each column group to its own transform; any column not listed here is
# dropped.
preprocess = ColumnTransformer(
    transformers=[
        # Categories unseen during fit are ignored at transform time.
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_features),
        ("log", log_transformer, log_features),
        ("latlon", latlon_transformer, ["latitude", "longitude"]),
        ("num", "passthrough", other_num_features),
    ],
    remainder="drop",
)

# ===== 3) Model =====
# XGBoost regressor with fixed hyperparameters (no tuning is done here).
xgb = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=800,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.85,
    colsample_bytree=0.85,
    reg_alpha=0.0,
    reg_lambda=1.0,
    n_jobs=-1,
    random_state=42,
)

# Preprocessing and the regressor are chained so that fit/predict accept the
# raw feature DataFrame directly.
model = Pipeline([("preprocess", preprocess), ("model", xgb)])

# ===== 4) Train / Test =====
X = df[feature_cols]
y = df[target_col]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)

pred = model.predict(X_test)
# Take the square root explicitly instead of passing ``squared=False``: that
# keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where it
# raises a TypeError. This form works on every scikit-learn version.
rmse = float(np.sqrt(mean_squared_error(y_test, pred)))
r2 = r2_score(y_test, pred)

print(f"RMSE: {rmse:,.4f}")
print(f"R2  : {r2:,.4f}")


# ===== 5) Try a prediction on a single sample row =====
# Option 1: reuse a real row from the test split. Indexing with a list keeps
# the result a one-row DataFrame, which is what the pipeline expects.
sample_row = X_test.iloc[[0]].copy()
sample_pred = model.predict(sample_row)[0]

print("\n--- SAMPLE (from X_test) ---")
print(sample_row)
print("Predicted economic_loss_usd:", sample_pred)

# Option 2: hand-written record; it must supply all 10 feature columns.
manual_record = {
    "country": "Viet Nam",
    "disaster_type": "Flood",
    "severity_index": 7.2,
    "casualties": 15,
    "response_time_hours": 12,
    "aid_amount_usd": 250000,
    "response_efficiency_score": 0.78,
    "recovery_days": 45,
    "latitude": 21.0278,
    "longitude": 105.8342,
}
new_sample = pd.DataFrame([manual_record])

new_pred = model.predict(new_sample)[0]

print("\n--- NEW SAMPLE (manual) ---")
print(new_sample)
print("Predicted economic_loss_usd:", new_pred)
