# Load data

In [None]:
import pandas as pd
import numpy as np


def load_data(
    path: str,
    dayfirst: bool = True
) -> pd.DataFrame:
    """Read a CSV file and coerce its known columns to usable dtypes.

    Steps performed, in order:
      1. Read the CSV at ``path`` with ``pd.read_csv``.
      2. Strip leading/trailing whitespace from every column name.
      3. If a ``date`` column exists, parse it with ``pd.to_datetime``.
      4. Convert each known numeric column that exists with ``pd.to_numeric``.

    Both conversions use ``errors="coerce"``, so values that cannot be
    parsed become missing (``NaT`` / ``NaN``) instead of raising.

    Parameters
    ----------
    path : str
        Location of the CSV file, forwarded unchanged to ``pd.read_csv``.
    dayfirst : bool, default True
        Forwarded to ``pd.to_datetime``; when True, ambiguous dates are
        read as day-before-month.

    Returns
    -------
    pd.DataFrame
        The loaded frame with cleaned column names and converted dtypes.
        Columns not listed here are returned as ``pd.read_csv`` produced them.
    """
    # Columns converted to numeric dtype whenever they are present.
    numeric_columns = (
        "severity_index",
        "casualties",
        "economic_loss_usd",
        "response_time_hours",
        "aid_amount_usd",
        "response_efficiency_score",
        "recovery_days",
        "latitude",
        "longitude",
    )

    # Load the raw data.
    frame = pd.read_csv(path)

    # Normalize column names so the lookups below ignore stray whitespace.
    frame.columns = frame.columns.str.strip()

    # Parse the date column; unparseable entries become NaT.
    if "date" in frame.columns:
        parsed_dates = pd.to_datetime(
            frame["date"], dayfirst=dayfirst, errors="coerce"
        )
        frame["date"] = parsed_dates

    # Convert numeric columns; unparseable entries become NaN.
    available = [name for name in numeric_columns if name in frame.columns]
    for name in available:
        frame[name] = pd.to_numeric(frame[name], errors="coerce")

    return frame


Unnamed: 0,date,country,disaster_type,severity_index,casualties,economic_loss_usd,response_time_hours,aid_amount_usd,response_efficiency_score,recovery_days,latitude,longitude
2,2021-01-31,Brazil,Earthquake,5.99,111,7934365.71,15.62,271603.79,83.21,67,-30.613,-122.557
3,2018-12-23,Brazil,Extreme Heat,6.53,100,8307648.99,5.03,265873.81,96.18,55,10.859,-159.194
4,2020-08-10,India,Hurricane,1.55,22,765136.99,32.54,49356.49,60.4,22,0.643,-160.978
5,2022-09-15,Indonesia,Extreme Heat,4.55,94,1308251.31,7.83,237512.88,86.41,47,-33.547,30.35
6,2022-09-28,United States,Wildfire,3.8,64,2655864.36,21.9,188910.69,72.81,42,-19.17,-117.137


In [3]:
# Structural overview: frame dimensions, then per-column dtypes
display(df.shape, df.dtypes)

# Missing values per column, most affected first; only columns with gaps are shown
missing = (
    df.isna()
    .sum()
    .sort_values(ascending=False)
)
display(missing.loc[missing > 0])

# Duplicates: number of rows that exactly repeat an earlier row
print("Duplicate rows:", df.duplicated().sum())

# Descriptive statistics for the numeric columns, transposed to one row per column
display(df.describe(include=[np.number]).T)


(50000, 12)

date                         datetime64[ns]
country                              object
disaster_type                        object
severity_index                      float64
casualties                            int64
economic_loss_usd                   float64
response_time_hours                 float64
aid_amount_usd                      float64
response_efficiency_score           float64
recovery_days                         int64
latitude                            float64
longitude                           float64
dtype: object

Series([], dtype: int64)

Duplicate rows: 0


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
severity_index,50000.0,5.015769,1.942843,1.0,3.66,4.99,6.34,10.0
casualties,50000.0,100.5911,65.05206,0.0,51.0,91.0,138.0,524.0
economic_loss_usd,50000.0,5068593.0,3268541.0,527.39,2585513.0,4548351.0,6950615.0,24456240.0
response_time_hours,50000.0,12.18303,9.259081,1.0,6.27,10.51,15.45,63.1
aid_amount_usd,50000.0,250000.3,143227.5,16.6,142966.3,230536.5,335225.9,1126465.0
response_efficiency_score,50000.0,87.57402,10.18896,29.75,83.06,89.18,94.7,100.0
recovery_days,50000.0,49.68256,20.09894,2.0,36.0,49.0,63.0,112.0
latitude,50000.0,0.2312053,34.75966,-59.994,-29.84825,0.2675,30.48825,59.997
longitude,50000.0,0.08988896,98.38021,-169.998,-85.55175,0.3905,85.665,169.999
