In [46]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# Column configuration: the regression target and the raw input features.
target_col = "economic_loss_usd"
raw_features = [
    # categorical descriptors
    "country",
    "disaster_type",
    # numeric measurements
    "severity_index",
    "casualties",
    "response_time_hours",
    "aid_amount_usd",
    "response_efficiency_score",
    "recovery_days",
    # coordinates in degrees (sin/cos-encoded in a later cell)
    "latitude",
    "longitude",
]

# Read the CSV and keep only the feature columns plus the target, as an
# independent copy so later edits never touch the frame returned by read_csv.
df = (
    pd.read_csv("global_disaster_response_2018_2024.csv")
    .loc[:, raw_features + [target_col]]
    .copy()
)

In [None]:
# =========================
# 2) Sin-Cos transform for lat/lon
# =========================
def add_latlon_sincos(
    dataframe: pd.DataFrame,
    lat_col: str = "latitude",
    lon_col: str = "longitude",
) -> pd.DataFrame:
    """Replace degree-valued latitude/longitude columns with sin/cos encodings.

    The two coordinate columns are coerced to numeric (values that cannot be
    parsed become NaN), converted from degrees to radians, and expanded into
    four columns: ``lat_sin``, ``lat_cos``, ``lon_sin`` and ``lon_cos``. The
    original coordinate columns are removed from the result.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Input frame containing the coordinate columns. It is not modified.
    lat_col : str, default "latitude"
        Name of the latitude column, in degrees.
    lon_col : str, default "longitude"
        Name of the longitude column, in degrees.

    Returns
    -------
    pd.DataFrame
        A new frame with every non-coordinate column in its original order,
        followed by ``lat_sin``, ``lat_cos``, ``lon_sin``, ``lon_cos``
        (the output names are fixed regardless of ``lat_col``/``lon_col``).

    Raises
    ------
    KeyError
        If ``lat_col`` or ``lon_col`` is not a column of ``dataframe``.
    """
    missing = [col for col in (lat_col, lon_col) if col not in dataframe.columns]
    if missing:
        raise KeyError(f"add_latlon_sincos: missing coordinate column(s): {missing}")

    # errors="coerce" turns unparseable entries into NaN, which then propagates
    # through deg2rad/sin/cos instead of raising.
    lat_rad = np.deg2rad(pd.to_numeric(dataframe[lat_col], errors="coerce"))
    lon_rad = np.deg2rad(pd.to_numeric(dataframe[lon_col], errors="coerce"))

    # drop() and assign() each return a new frame, so the caller's data is untouched.
    return dataframe.drop(columns=[lat_col, lon_col]).assign(
        lat_sin=np.sin(lat_rad),
        lat_cos=np.cos(lat_rad),
        lon_sin=np.sin(lon_rad),
        lon_cos=np.cos(lon_rad),
    )

# Feature-engineered frame: latitude/longitude replaced by their sin/cos encodings.
df_fe = df.pipe(add_latlon_sincos)

In [None]:
# =========================
# 3) X/y + clean target
# =========================
# Coerce the target to numeric; unparseable values become NaN and are
# filtered out of both X and y below.
y = pd.to_numeric(df_fe[target_col], errors="coerce")
mask = y.notna()

# Keep only the rows that have a usable target value.
X = df_fe.drop(columns=[target_col]).loc[mask].copy()
y = y.loc[mask].copy()

# Two categorical columns; every remaining column of X is treated as numeric
# (the raw numeric features plus the sin/cos encodings).
cat_cols = ["country", "disaster_type"]
num_cols = [col for col in X.columns if col not in cat_cols]


In [None]:
# =========================
# 4) Preprocess + Model
# =========================
# Numeric columns: fill missing values with the column median.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown="ignore" avoids an error when a category
# not seen during fit shows up at predict time.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own transformer (numeric block first).
preprocess = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
])

rfr = RandomForestRegressor(
    # 500 trees (original note: use 500 if your machine is powerful and you
    # want better results).
    n_estimators=500,
    # Tree-growth settings.
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    # Fixed seed; n_jobs=-1 is passed to parallelise training.
    random_state=42,
    n_jobs=-1,
)

# End-to-end estimator: preprocessing followed by the random forest.
model = Pipeline([
    ("preprocess", preprocess),
    ("rfr", rfr),
])

In [None]:
# =========================
# 5) Train + Evaluate
# =========================
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the whole pipeline (preprocessing + forest) on the training rows only.
model.fit(X_train, y_train)

# Score on the held-out rows. RMSE is taken as the square root of the MSE.
pred = model.predict(X_test)
rmse = mean_squared_error(y_test, pred) ** 0.5
r2 = r2_score(y_test, pred)

# The status marker was stored as mojibake ("âœ…"); restored to the intended emoji.
print("✅ RandomForestRegressor trained")
print(f"RMSE: {rmse:,.2f}")
print(f"R2:   {r2:.4f}")

In [45]:
# =========================
# 6) TEST: feed one sample row into the model to predict
# =========================
# NOTE(review): longitude 106.6297 is in the eastern hemisphere, while the
# country is given as Brazil — confirm this combination is intentional.
sample_raw = {
    "country": "Brazil",
    "disaster_type": "Flood",
    "severity_index": 0.9,
    "casualties": 1000,
    "response_time_hours": 18,
    "aid_amount_usd": 250000,
    "response_efficiency_score": 0.81,
    "recovery_days": 45,
    "latitude": 10.8231,
    "longitude": 106.6297,
}

# Build a one-row frame and apply the same lat/lon sin/cos encoding that was
# applied to the training data, so the column set matches what the model expects.
sample_df = pd.DataFrame([sample_raw])
sample_df_fe = add_latlon_sincos(sample_df)

# predict() returns one value per row; take the single prediction.
sample_pred = model.predict(sample_df_fe)[0]
# The heading marker was stored as mojibake ("ðŸ”Ž"); restored to the intended emoji.
print("\n🔎 Sample prediction")
print(sample_raw)
print(f"Predicted economic_loss_usd = {sample_pred:,.2f}")


🔎 Sample prediction
{'country': 'Brazil', 'disaster_type': 'Flood', 'severity_index': 0.9, 'casualties': 1000, 'response_time_hours': 18, 'aid_amount_usd': 250000, 'response_efficiency_score': 0.81, 'recovery_days': 45, 'latitude': 10.8231, 'longitude': 106.6297}
Predicted economic_loss_usd = 1,126,590.61
