In [58]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score


In [59]:
# ƒë·ªçc file csv b·ªè ng√†y th√°ng
df = pd.read_csv("global_disaster_response_2018_2024.csv")

target_col = "economic_loss_usd"
raw_features = [
    "country",
    "disaster_type",
    "severity_index",
    "casualties",
    "response_time_hours",
    "aid_amount_usd",
    "response_efficiency_score",
    "recovery_days",
    "latitude",
    "longitude",
]

df = df[raw_features + [target_col]].copy()

In [60]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 11 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   country                    50000 non-null  object 
 1   disaster_type              50000 non-null  object 
 2   severity_index             50000 non-null  float64
 3   casualties                 50000 non-null  int64  
 4   response_time_hours        50000 non-null  float64
 5   aid_amount_usd             50000 non-null  float64
 6   response_efficiency_score  50000 non-null  float64
 7   recovery_days              50000 non-null  int64  
 8   latitude                   50000 non-null  float64
 9   longitude                  50000 non-null  float64
 10  economic_loss_usd          50000 non-null  float64
dtypes: float64(7), int64(2), object(2)
memory usage: 4.2+ MB


In [61]:
pd.set_option('display.float_format', '{:.2f}'.format)
df.describe()

Unnamed: 0,severity_index,casualties,response_time_hours,aid_amount_usd,response_efficiency_score,recovery_days,latitude,longitude,economic_loss_usd
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,5.02,100.59,12.18,250000.33,87.57,49.68,0.23,0.09,5068593.45
std,1.94,65.05,9.26,143227.51,10.19,20.1,34.76,98.38,3268540.52
min,1.0,0.0,1.0,16.6,29.75,2.0,-59.99,-170.0,527.39
25%,3.66,51.0,6.27,142966.34,83.06,36.0,-29.85,-85.55,2585513.48
50%,4.99,91.0,10.51,230536.47,89.18,49.0,0.27,0.39,4548350.74
75%,6.34,138.0,15.45,335225.93,94.7,63.0,30.49,85.67,6950614.61
max,10.0,524.0,63.1,1126465.23,100.0,112.0,60.0,170.0,24456237.87


 0   country                    Qu·ªëc gia
 1   disaster_type              Lo·∫°i th·∫£m h·ªça 
 2   severity_index             m·ª©c ƒë·ªô th·∫£m h·ªça
 3   casualties                 ng∆∞·ªùi b·ªã ·∫£nh h∆∞·ªüng 
 4   response_time_hours        th·ªùi gian ph·∫£n ·ª©ng
 5   aid_amount_usd             t·ªïng vi·ªán tr·ª£ qu·ªëc t·∫ø
 6   response_efficiency_score  ƒëi·ªÉm hi·ªáu qu·∫£ ph·∫£n ·ª©ng
 7   recovery_days              ng√†y h·ªìi ph·ª•c 
 8   latitude                   vƒ© ƒë·ªô
 9   longitude                  kinh ƒë·ªô
 10  economic_loss_usd          thi·ªát h·∫°i kinh t·∫ø

In [None]:

#Sin-Cos transform vƒ© ƒë·ªô v√† kinh ƒë·ªô
def add_latlon_sincos(dataframe: pd.DataFrame) -> pd.DataFrame:
    df_ = dataframe.copy()

    df_["latitude"] = pd.to_numeric(df_["latitude"], errors="coerce")
    df_["longitude"] = pd.to_numeric(df_["longitude"], errors="coerce")

    lat_rad = np.deg2rad(df_["latitude"])
    lon_rad = np.deg2rad(df_["longitude"])

    df_["lat_sin"] = np.sin(lat_rad)
    df_["lat_cos"] = np.cos(lat_rad)
    df_["lon_sin"] = np.sin(lon_rad)
    df_["lon_cos"] = np.cos(lon_rad)

    return df_.drop(columns=["latitude", "longitude"])

df_fe = add_latlon_sincos(df)

In [63]:
# X/y + clean target
X = df_fe.drop(columns=[target_col])
y = pd.to_numeric(df_fe[target_col], errors="coerce")

mask = y.notna()
X = X.loc[mask].copy()
y = y.loc[mask].copy()

cat_cols = ["country", "disaster_type"]
num_cols = [c for c in X.columns if c not in cat_cols] 


In [64]:
# x·ª≠ l√Ω l·∫°i v√† training Random Forest Regression
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, num_cols),
        ("cat", categorical_transformer, cat_cols),
    ]
)

rfr = RandomForestRegressor(
    n_estimators=500,     
    random_state=42,
    n_jobs=-1,
    max_depth=4,
    min_samples_split=2,
    min_samples_leaf=1,
)

model = Pipeline(steps=[
    ("preprocess", preprocess),
    ("rfr", rfr),
])

In [65]:
#Train + Evaluate
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model.fit(X_train, y_train)

pred = model.predict(X_test)
rmse = mean_squared_error(y_test, pred) ** 0.5
r2 = r2_score(y_test, pred)

print("RandomForestRegressor trained")
print(f"RMSE: {rmse:,.2f}")
print(f"R2:   {r2:.4f}")

RandomForestRegressor trained
RMSE: 2,607,285.40
R2:   0.3602


In [66]:
# TEST
sample_raw = {
    "country": "Brazil",
    "disaster_type": "Flood",
    "severity_index": 0.9,
    "casualties": 1000,
    "response_time_hours": 18,
    "aid_amount_usd": 250000,
    "response_efficiency_score": 0.81,
    "recovery_days": 45,
    "latitude": 10.8231,
    "longitude": 106.6297,
}

sample_df = pd.DataFrame([sample_raw])
sample_df_fe = add_latlon_sincos(sample_df)

sample_pred = model.predict(sample_df_fe)[0]
print("\nüîé Sample prediction")
print(sample_raw)
print(f"Predicted economic_loss_usd = {sample_pred:,.2f}")


üîé Sample prediction
{'country': 'Brazil', 'disaster_type': 'Flood', 'severity_index': 0.9, 'casualties': 1000, 'response_time_hours': 18, 'aid_amount_usd': 250000, 'response_efficiency_score': 0.81, 'recovery_days': 45, 'latitude': 10.8231, 'longitude': 106.6297}
Predicted economic_loss_usd = 1,132,624.57
