In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score


# =====================
# 1) Load data
# =====================
# Name of the column the model is trained to predict.
TARGET = "economic_loss_usd"

# Raw input columns handed to the pipeline.
FEATURES = [
    "country",
    "disaster_type",
    "severity_index",
    "casualties",
    "response_time_hours",
    "aid_amount_usd",
    "response_efficiency_score",
    "recovery_days",
    "latitude",
    "longitude",
]

# Change the path here if the CSV lives somewhere else.
df = pd.read_csv("global_disaster_response_2018_2024.csv")

# Keep only the modelling columns and discard rows without a label;
# a row with a missing target cannot be used for training or evaluation.
df_model = df[[*FEATURES, TARGET]].dropna(subset=[TARGET]).copy()


# =====================
# 2) Feature engineering (Sin/Cos + Log)
# =====================
def add_trig_and_log(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` with trig-encoded coordinates and log-scaled columns.

    The input frame is never modified. The following steps are applied to a copy:

    1. The eight numeric feature columns are coerced with ``pd.to_numeric``;
       entries that cannot be parsed become NaN.
    2. ``lat_sin``, ``lat_cos``, ``lon_sin`` and ``lon_cos`` are added, computed
       from ``latitude`` / ``longitude`` after converting degrees to radians.
    3. ``casualties``, ``response_time_hours`` and ``aid_amount_usd`` are
       replaced by ``log1p`` of their value. Negative entries are turned into
       NaN first: ``log1p(-1)`` is ``-inf`` and ``log1p`` of anything below -1
       is NaN with a RuntimeWarning, and an infinite value is not repaired by
       NaN imputation.
    4. The raw ``latitude`` and ``longitude`` columns are dropped.

    Parameters
    ----------
    X : pd.DataFrame
        Must contain every column listed in ``num_cols_to_cast`` below. Any
        other column is passed through unchanged.

    Returns
    -------
    pd.DataFrame
        A new frame without ``latitude`` / ``longitude`` and with the four
        trig columns appended.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``X``.
    """
    X = X.copy()

    # Force numeric dtype; dirty text values become NaN instead of raising.
    num_cols_to_cast = [
        "severity_index", "casualties", "response_time_hours", "aid_amount_usd",
        "response_efficiency_score", "recovery_days", "latitude", "longitude"
    ]
    for c in num_cols_to_cast:
        X[c] = pd.to_numeric(X[c], errors="coerce")

    # Sin/cos encoding of latitude/longitude (degrees -> radians).
    lat_rad = np.deg2rad(X["latitude"])
    lon_rad = np.deg2rad(X["longitude"])
    X["lat_sin"] = np.sin(lat_rad)
    X["lat_cos"] = np.cos(lat_rad)
    X["lon_sin"] = np.sin(lon_rad)
    X["lon_cos"] = np.cos(lon_rad)

    # log1p for the skewed columns. Negative inputs are masked to NaN first so
    # the result never contains -inf (NaN comparisons are False, so existing
    # NaNs simply stay NaN).
    for col in ["casualties", "response_time_hours", "aid_amount_usd"]:
        non_negative = X[col].where(X[col] >= 0)
        X[col] = np.log1p(non_negative)

    # Drop the raw coordinates; they are replaced by the sin/cos columns.
    X = X.drop(columns=["latitude", "longitude"])
    return X

# Wrap the feature-engineering function so it can run as a pipeline step.
feat_engineering = FunctionTransformer(add_trig_and_log, validate=False)


# =====================
# 3) Preprocess + Model
# =====================
# Column groups as they exist AFTER feature engineering: the raw
# latitude/longitude are gone and the four sin/cos columns are present.
cat_cols = ["country", "disaster_type"]
num_cols = [
    "severity_index",
    "casualties",
    "response_time_hours",
    "aid_amount_usd",
    "response_efficiency_score",
    "recovery_days",
    "lat_sin",
    "lat_cos",
    "lon_sin",
    "lon_cos",
]

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at predict time.
categorical_branch = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Numeric branch: fill gaps with the column median.
numeric_branch = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

# Any column not listed in cat_cols / num_cols is dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("cat", categorical_branch, cat_cols),
        ("num", numeric_branch, num_cols),
    ],
    remainder="drop",
)

# Fixed random_state for repeatable fits; n_jobs=-1 uses all available cores.
model = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)

# Full chain: feature engineering -> preprocessing -> random forest.
pipe = Pipeline(steps=[
    ("feat", feat_engineering),
    ("prep", preprocess),
    ("rf", model),
])


# =====================
# 4) Train/Test + Evaluate
# =====================
X, y = df_model[FEATURES], df_model[TARGET]

# 80/20 hold-out split; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the whole pipeline on the training part only, then score the hold-out.
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

# RMSE is the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE = {rmse:,.2f}", f"R2   = {r2:.4f}", sep="\n")


# ==========================================================
# 5) TEST: feed one sample row into the model (2 approaches)
# ==========================================================

# ---- Approach A (recommended): take one real row from the test set ----
# Double brackets keep the selection a one-row DataFrame, which is what
# the pipeline expects as input.
sample_row = X_test.iloc[[0]].copy()
sample_pred = pipe.predict(sample_row)[0]

print(
    "\n--- Sample row (from X_test) ---",
    sample_row.to_dict(orient="records")[0],
    sep="\n",
)
print("Predicted economic_loss_usd:", sample_pred)




RMSE = 2,633,893.47
R2   = 0.3471

--- Sample row (from X_test) ---
{'country': 'Italy', 'disaster_type': 'Tornado', 'severity_index': 8.6, 'casualties': 180, 'response_time_hours': 9.89, 'aid_amount_usd': 229132.37, 'response_efficiency_score': 86.97, 'recovery_days': 90, 'latitude': -1.115, 'longitude': -0.112}
Predicted economic_loss_usd: 8659532.391566671

--- New custom row ---
{'country': 'Viet Nam', 'disaster_type': 'Flood', 'severity_index': 7.2, 'casualties': 35, 'response_time_hours': 18, 'aid_amount_usd': 2500000, 'response_efficiency_score': 0.72, 'recovery_days': 45, 'latitude': 10.8231, 'longitude': 106.6297}
Predicted economic_loss_usd: 8363774.443066665


In [12]:
# ---- Approach B: build a brand-new row by hand (edit the values freely) ----
# The keys must match the raw feature columns the pipeline was fitted on.
# NOTE(review): the X_test sample printed earlier in this notebook shows
# response_efficiency_score = 86.97, so 0.34 here may be on a different
# scale — confirm before trusting this prediction.
custom_record = {
    "country": "Brazil",
    "disaster_type": "Tornado",
    "severity_index": 9.0,
    "casualties": 1200,
    "response_time_hours": 18,
    "aid_amount_usd": 250000000,
    "response_efficiency_score": 0.34,
    "recovery_days": 90,
    "latitude": 10.8231,
    "longitude": 106.6297,
}
new_row = pd.DataFrame([custom_record])

new_pred = pipe.predict(new_row)[0]

print(
    "\n--- New custom row ---",
    new_row.to_dict(orient="records")[0],
    sep="\n",
)
print("Predicted economic_loss_usd:", new_pred)


--- New custom row ---
{'country': 'Brazil', 'disaster_type': 'Tornado', 'severity_index': 9.0, 'casualties': 1200, 'response_time_hours': 18, 'aid_amount_usd': 250000000, 'response_efficiency_score': 0.34, 'recovery_days': 90, 'latitude': 10.8231, 'longitude': 106.6297}
Predicted economic_loss_usd: 9916438.548466668
