In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score


In [3]:
# Read the raw disaster-response records; the path is relative to the
# notebook's working directory.
DATA_FILE = "global_disaster_response_2018_2024.csv"
df = pd.read_csv(DATA_FILE)

# Preview the first rows as the cell output.
df.head()


Unnamed: 0,date,country,disaster_type,severity_index,casualties,economic_loss_usd,response_time_hours,aid_amount_usd,response_efficiency_score,recovery_days,latitude,longitude
0,31/1/2021,Brazil,Earthquake,5.99,111,7934365.71,15.62,271603.79,83.21,67,-30.613,-122.557
1,23/12/2018,Brazil,Extreme Heat,6.53,100,8307648.99,5.03,265873.81,96.18,55,10.859,-159.194
2,10/8/2020,India,Hurricane,1.55,22,765136.99,32.54,49356.49,60.4,22,0.643,-160.978
3,15/9/2022,Indonesia,Extreme Heat,4.55,94,1308251.31,7.83,237512.88,86.41,47,-33.547,30.35
4,28/9/2022,United States,Wildfire,3.8,64,2655864.36,21.9,188910.69,72.81,42,-19.17,-117.137


In [4]:
# Predictor columns: two categorical ones followed by six numeric ones.
features = [
    "country", "disaster_type",
    "severity_index", "casualties", "response_time_hours",
    "aid_amount_usd", "response_efficiency_score", "recovery_days",
]

# Column the regression is trained to predict.
target = "economic_loss_usd"

# Keep only the modelling columns and discard every row that has a missing
# value in any of them.
model_columns = [*features, target]
df_model = df[model_columns].dropna()


In [5]:
# Columns that get a log1p transform before modelling.
log_cols = ["casualties", "response_time_hours", "aid_amount_usd"]

# Always read the untransformed values from the raw `df` (restricted to the
# rows that survived the dropna in `df_model`) instead of from `df_model`
# itself. This keeps the cell idempotent: re-running it yields log1p(raw)
# every time rather than stacking a second log on already-transformed data.
for col in log_cols:
    df_model[col] = np.log1p(df.loc[df_model.index, col])


In [6]:
# Separate the predictors from the target column.
X, y = df_model[features], df_model[target]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
split = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split


In [9]:
# Columns that are one-hot encoded.
categorical_features = ["country", "disaster_type"]

# Columns that are forwarded to the regressor unchanged.
numeric_features = [
    "severity_index", "casualties", "response_time_hours",
    "aid_amount_usd", "response_efficiency_score", "recovery_days",
]

# handle_unknown="ignore": a category not seen during fit is encoded as an
# all-zero vector at transform time instead of raising an error.
one_hot = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    [
        ("cat", one_hot, categorical_features),
        ("num", "passthrough", numeric_features),
    ]
)


In [11]:
# Full estimator: column preprocessing followed by a linear regression, so
# the same transformations are applied at fit and predict time.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("regressor", LinearRegression()),
]
model = Pipeline(pipeline_steps)


In [12]:
# Fit the preprocessing step and the regressor on the training split only.
model.fit(X=X_train, y=y_train)


In [13]:
# Predict on the held-out rows.
y_pred = model.predict(X_test)

# RMSE is reported in the target's own units (USD); R2 is unitless.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:,.2f}")
print(f"R2 Score: {r2:.4f}")


RMSE: 2,606,949.72
R2 Score: 0.3603


In [20]:
# One hand-built observation to score with the fitted model. The values match
# row 0 of the df.head() preview, whose actual economic loss is
# 7,934,365.71 USD (the value the prediction should be compared against).
sample_row = {
    "country": "Brazil",
    "disaster_type": "Earthquake",
    "severity_index": 5.99,
    "casualties": 111,
    "response_time_hours": 15.62,
    "aid_amount_usd": 271603.79,
    "response_efficiency_score": 83.21,
    "recovery_days": 67,
}
sample_data = pd.DataFrame([sample_row])

In [21]:
# Apply the same log1p transform used on the training columns so the sample
# is on the scale the model was fitted with. Run this exactly once per
# freshly built sample_data: it transforms whatever values are currently there.
sample_data = sample_data.assign(
    **{col: np.log1p(sample_data[col]) for col in log_cols}
)


In [22]:
# Score the prepared single-row sample; predict returns one value per row,
# so element 0 is the estimate for that row.
predicted_loss = model.predict(X=sample_data)

print(f"Dự đoán economic_loss_usd: {predicted_loss[0]:,.2f} USD")


Dự đoán economic_loss_usd: 6,063,819.63 USD


In [None]:
import matplotlib.pyplot as plt

# Predicted-vs-actual scatter for the held-out test set.
fig, ax = plt.subplots(figsize=(6, 6))

# The test-set predictions are stored in `y_pred` by the evaluation cell.
# The previous `pred` name is not defined in any visible cell and raised a
# NameError on a fresh kernel.
ax.scatter(y_test, y_pred, alpha=0.4)

# Dashed identity line (predicted == actual) across the observed range of
# the true values; points on the line are perfect predictions.
actual_range = [y_test.min(), y_test.max()]
ax.plot(actual_range, actual_range, color="red", linestyle="--")

ax.set_xlabel("Thiệt hại kinh tế thực tế (USD)")
ax.set_ylabel("Thiệt hại kinh tế dự đoán (x10tr USD)")
ax.set_title("Thiệt hại kinh tế dự đoán so với thực tế (x10tr USD)")
fig.tight_layout()
plt.show()