In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

importing the disaster half of the data

In [33]:
df = pd.read_csv('data_with_coordinates.csv')

we analyze what we have so far

In [34]:
df.head()
df.info()
df.describe()
df.columns

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61702 entries, 0 to 61701
Data columns (total 49 columns):
 #   Column                                     Non-Null Count  Dtype  
---  ------                                     --------------  -----  
 0   DisNo.                                     61702 non-null  object 
 1   Historic                                   61702 non-null  object 
 2   Classification Key                         61702 non-null  object 
 3   Disaster Group                             61702 non-null  object 
 4   Disaster Subgroup                          61702 non-null  object 
 5   Disaster Type                              61702 non-null  object 
 6   Disaster Subtype                           61702 non-null  object 
 7   External IDs                               18009 non-null  object 
 8   Event Name                                 15685 non-null  object 
 9   ISO                                        61702 non-null  object 
 10  Country               

Index(['DisNo.', 'Historic', 'Classification Key', 'Disaster Group',
       'Disaster Subgroup', 'Disaster Type', 'Disaster Subtype',
       'External IDs', 'Event Name', 'ISO', 'Country', 'Subregion', 'Region',
       'Location', 'Origin', 'Associated Types', 'OFDA/BHA Response', 'Appeal',
       'Declaration', 'AID Contribution ('000 US$)', 'Magnitude',
       'Magnitude Scale', 'River Basin', 'Start Year', 'Start Month',
       'Start Day', 'End Year', 'End Month', 'End Day', 'Total Deaths',
       'No. Injured', 'No. Affected', 'No. Homeless', 'Total Affected',
       'Reconstruction Costs ('000 US$)',
       'Reconstruction Costs, Adjusted ('000 US$)',
       'Insured Damage ('000 US$)', 'Insured Damage, Adjusted ('000 US$)',
       'Total Damage ('000 US$)', 'Total Damage, Adjusted ('000 US$)', 'CPI',
       'Admin Units', 'Entry Date', 'Last Update', 'location_query',
       'Latitude_cached', 'Longitude_cached', 'Latitude', 'Longitude'],
      dtype='object')

Now we do some data cleaning before trying to merge the data

In [35]:
df = df.dropna(subset=['Latitude', 'Longitude'])
columns_to_drop = [
    'External IDs', 'OFDA/BHA Response', 'Appeal', 'Declaration',
    'Insured Damage (\'000 US$)', 'Insured Damage, Adjusted (\'000 US$)',
    'Reconstruction Costs (\'000 US$)', 'Reconstruction Costs, Adjusted (\'000 US$)',
    'Total Damage (\'000 US$)', 'Total Damage, Adjusted (\'000 US$)',
    'Admin Units', 'Entry Date', 'Last Update', 'DisNo.', 'Historic', 'Magnitude Scale', 'Origin', 'Associated Types', 'River Basin', 'CPI', 'Magnitude', 'Classification Key', 'Total Deaths', 'Total Affected','No. Injured', 'No. Affected', 'No. Homeless', 'Latitude_cached', 'location_query', 'Longitude_cached', 'Disaster Group', 'Disaster Subgroup', 'Event Name', 'ISO', 'AID Contribution (\'000 US$)'
]

df = df.drop(columns=columns_to_drop, errors='ignore')  # ignore if any column is missing
df.head()
df.to_csv('cleaned_coordinate_data.csv', index=False)
df.columns

Index(['Disaster Type', 'Disaster Subtype', 'Country', 'Subregion', 'Region',
       'Location', 'Start Year', 'Start Month', 'Start Day', 'End Year',
       'End Month', 'End Day', 'Latitude', 'Longitude'],
      dtype='object')

In [36]:
print(df['Disaster Subtype'].value_counts())

Disaster Subtype
Riverine flood                      8111
Tropical cyclone                    5384
Flood (General)                     4805
Drought                             2530
Flash flood                         2262
Bacterial disease                   1552
Cold wave                           1539
Viral disease                       1386
Storm (General)                     1267
Blizzard/Winter storm               1241
Road                                1097
Tornado                             1008
Heat wave                            973
Severe weather                       897
Lightning/Thunderstorms              849
Extra-tropical storm                 656
Ground movement                      578
Severe winter conditions             544
Forest fire                          518
Landslide (wet)                      504
Water                                479
Fire (Miscellaneous)                 361
Explosion (Industrial)               356
Air                                  228

In [37]:
# Normalize the 'Disaster Subtype' column (convert to lowercase, strip spaces)
df['Disaster Subtype'] = df['Disaster Subtype'].astype(str).str.strip().str.lower()

# Define allowed subtypes (also normalized to lowercase and stripped)
valid_subtypes = [s.lower().strip() for s in [
    # Floods
    'Riverine flood',
    'Flood (General)',
    'Flash flood',
    'Coastal flood',

    # Storms
    'Tropical cyclone',
    'Storm (General)',
    'Lightning/Thunderstorms',
    'Hail',
    'Severe weather',

    # Temperature Events
    'Heat wave',
    'Cold wave',

    # Drought
    'Drought',

    # Optional Fire-related
    'Forest fire',
    'Wildfire (General)',
    'Land fire (Brush, Bush, Pasture)'
]]

# Drop all rows not in the valid_subtypes list — IN PLACE
df.drop(df[~df['Disaster Subtype'].isin(valid_subtypes)].index, inplace=True)

# Optional: reset index
df.reset_index(drop=True, inplace=True)
df


Unnamed: 0,Disaster Type,Disaster Subtype,Country,Subregion,Region,Location,Start Year,Start Month,Start Day,End Year,End Month,End Day,Latitude,Longitude
0,Drought,drought,Djibouti,Sub-Saharan Africa,Africa,Ali Sabieh,2001,6.0,,2001,,,11.163069,42.837278
1,Drought,drought,Djibouti,Sub-Saharan Africa,Africa,Dikhil,2001,6.0,,2001,,,11.428370,42.063977
2,Drought,drought,Djibouti,Sub-Saharan Africa,Africa,Djibouti,2001,6.0,,2001,,,11.814597,42.845306
3,Drought,drought,Djibouti,Sub-Saharan Africa,Africa,Obock,2001,6.0,,2001,,,11.964015,43.292228
4,Drought,drought,Sudan,Northern Africa,Africa,Northern Darfur,2000,1.0,,2001,,,9.203465,26.916471
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29866,Drought,drought,Kenya,Sub-Saharan Africa,Africa,Mandera,2025,1.0,,2025,,,3.228533,40.705615
29867,Drought,drought,Kenya,Sub-Saharan Africa,Africa,Marsabit,2025,1.0,,2025,,,2.857958,37.715489
29868,Drought,drought,Kenya,Sub-Saharan Africa,Africa,Turkana,2025,1.0,,2025,,,3.525895,36.074295
29869,Drought,drought,Kenya,Sub-Saharan Africa,Africa,Samburu,2025,1.0,,2025,,,1.539446,36.942166


kept data for africa only for simplicity....if we get to get more data i'll add the other surrounding areas 

In [38]:
# Normalize 'Region' column: lowercase and strip
df['Region'] = df['Region'].astype(str).str.strip().str.lower()

# Drop rows where Region is not 'africa'
df.drop(df[df['Region'] != 'africa'].index, inplace=True)

# Optional: reset index
df.reset_index(drop=True, inplace=True)
df.head()

Unnamed: 0,Disaster Type,Disaster Subtype,Country,Subregion,Region,Location,Start Year,Start Month,Start Day,End Year,End Month,End Day,Latitude,Longitude
0,Drought,drought,Djibouti,Sub-Saharan Africa,africa,Ali Sabieh,2001,6.0,,2001,,,11.163069,42.837278
1,Drought,drought,Djibouti,Sub-Saharan Africa,africa,Dikhil,2001,6.0,,2001,,,11.42837,42.063977
2,Drought,drought,Djibouti,Sub-Saharan Africa,africa,Djibouti,2001,6.0,,2001,,,11.814597,42.845306
3,Drought,drought,Djibouti,Sub-Saharan Africa,africa,Obock,2001,6.0,,2001,,,11.964015,43.292228
4,Drought,drought,Sudan,Northern Africa,africa,Northern Darfur,2000,1.0,,2001,,,9.203465,26.916471


Filling Missing Values in the dates columns

In [39]:
import pandas as pd
import numpy as np

# Forward-fill start date fields (no warnings now)
df[['Start Year', 'Start Month', 'Start Day']] = df[['Start Year', 'Start Month', 'Start Day']].ffill()

# Forward-fill end date fields
df[['End Year', 'End Month', 'End Day']] = df[['End Year', 'End Month', 'End Day']].ffill()

# Ensure types are numeric
date_cols = ['Start Year', 'Start Month', 'Start Day', 'End Year', 'End Month', 'End Day']
df[date_cols] = df[date_cols].apply(pd.to_numeric, errors='coerce')

# Adjust end dates to be >= start dates
for i in df.index:
    if pd.notnull(df.at[i, 'End Year']) and df.at[i, 'End Year'] < df.at[i, 'Start Year']:
        df.at[i, 'End Year'] = df.at[i, 'Start Year']

    if df.at[i, 'End Year'] == df.at[i, 'Start Year']:
        if pd.notnull(df.at[i, 'End Month']) and df.at[i, 'End Month'] < df.at[i, 'Start Month']:
            df.at[i, 'End Month'] = df.at[i, 'Start Month']

    if (df.at[i, 'End Year'] == df.at[i, 'Start Year'] and
        df.at[i, 'End Month'] == df.at[i, 'Start Month']):
        if pd.notnull(df.at[i, 'End Day']) and df.at[i, 'End Day'] < df.at[i, 'Start Day']:
            df.at[i, 'End Day'] = df.at[i, 'Start Day']

df.dropna(subset=['Start Year', 'Start Month', 'Start Day', 'End Year', 'End Month', 'End Day'], inplace=True)
df.reset_index(drop=True, inplace=True)
df.to_csv('cleaned_data.csv', index=False)

df.head()

Unnamed: 0,Disaster Type,Disaster Subtype,Country,Subregion,Region,Location,Start Year,Start Month,Start Day,End Year,End Month,End Day,Latitude,Longitude
0,Flood,riverine flood,Angola,Sub-Saharan Africa,africa,Benguela province),2000,1.0,8.0,2000,1.0,15.0,-12.910466,14.035661
1,Flood,riverine flood,Angola,Sub-Saharan Africa,africa,Kuanza Norte province),2000,1.0,8.0,2000,1.0,15.0,-9.029675,15.092632
2,Flood,riverine flood,Mozambique,Sub-Saharan Africa,africa,Matutuine,2000,1.0,26.0,2000,3.0,27.0,-26.459698,32.574586
3,Flood,riverine flood,Mozambique,Sub-Saharan Africa,africa,Manhica,2000,1.0,26.0,2000,3.0,27.0,-25.288938,32.88279
4,Flood,riverine flood,Mozambique,Sub-Saharan Africa,africa,Magude,2000,1.0,26.0,2000,3.0,27.0,-24.759805,32.438285


In [40]:
print(df['Disaster Subtype'].value_counts())


Disaster Subtype
riverine flood                      1518
flood (general)                     1086
drought                              761
tropical cyclone                     506
flash flood                          416
storm (general)                      122
lightning/thunderstorms               79
coastal flood                         66
cold wave                             43
severe weather                        41
wildfire (general)                    33
forest fire                           23
land fire (brush, bush, pasture)      13
heat wave                              4
hail                                   1
Name: count, dtype: int64


As you can see above the dataset is very inbalanced, we need to do some undersampling and maybe even drop some of the minorities completely  

In [45]:
import pandas as pd
from sklearn.utils import resample

# Filter for only disaster subtypes of interest
target_subtypes = [
    'riverine flood', 'flood (general)', 'drought', 'tropical cyclone', 'flash flood',
    'storm (general)', 'lightning/thunderstorms', 'coastal flood', 'cold wave',
    'severe weather', 'wildfire (general)', 'forest fire', 'heat wave'
]

filtered_df = df[df['Disaster Subtype'].isin(target_subtypes)].copy()

# Drop rows with missing Start Year if any
filtered_df = filtered_df.dropna(subset=['Start Year'])

# Count number of rows per (Disaster Subtype, Start Year)
combo_counts = filtered_df.groupby(['Disaster Subtype', 'Start Year']).size()

# Keep only combinations that appear at least N times
MIN_COMBO_COUNT = 3
valid_combos = combo_counts[combo_counts >= MIN_COMBO_COUNT].index

# Filter the DataFrame to keep only those combinations
df_combo_filtered = filtered_df.set_index(['Disaster Subtype', 'Start Year']).loc[valid_combos].reset_index()

# Get the minimum number of rows among the remaining valid combinations
min_combo_size = (
    df_combo_filtered
    .groupby(['Disaster Subtype', 'Start Year'])
    .size()
    .min()
)

# Sample each group to min size
balanced_subtype_year = (
    df_combo_filtered
    .groupby(['Disaster Subtype', 'Start Year'], group_keys=False)
    .apply(lambda x: x.sample(n=min_combo_size, random_state=42))
    .reset_index(drop=True)
)
balanced_subtype_year
print(balanced_subtype_year['Disaster Subtype'].value_counts())
balanced_subtype_year.to_csv('cleaned_data2.csv', index=False)


Disaster Subtype
drought                    78
riverine flood             63
tropical cyclone           63
flash flood                60
flood (general)            51
storm (general)            45
lightning/thunderstorms    21
severe weather             15
cold wave                  12
coastal flood               9
wildfire (general)          9
forest fire                 6
Name: count, dtype: int64


  .apply(lambda x: x.sample(n=min_combo_size, random_state=42))


We try to enrich the data

In [46]:
import pandas as pd
import requests

# Load your balanced DataFrame
df = balanced_subtype_year.copy()

# Meteostat API credentials
API_HOST = "meteostat.p.rapidapi.com"
API_KEY = "90eb1cca2cmshfd32b674a76100cp1c5affjsn11d06d791ee1"

headers = {
    "x-rapidapi-host": API_HOST,
    "x-rapidapi-key": API_KEY
}

def build_date(year, month, day):
    try:
        return f"{int(year):04d}-{int(month):02d}-{int(day):02d}"
    except:
        return None

def find_nearest_station(lat, lon):
    url = f"https://{API_HOST}/stations/nearby?lat={lat}&lon={lon}"
    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        stations = response.json().get('data', [])
        return stations[0]['id'] if stations else None
    except Exception as e:
        print(f"Station fetch error at {lat}, {lon}: {e}")
        return None

def fetch_weather_data(station_id, start_date, end_date):
    url = f"https://{API_HOST}/stations/daily"
    params = {
        "station": station_id,
        "start": start_date,
        "end": end_date,
        "units": "metric"
    }
    try:
        response = requests.get(url, headers=headers, params=params, timeout=10)
        response.raise_for_status()
        return response.json().get('data', [])
    except Exception as e:
        print(f"Weather fetch error for station {station_id} from {start_date} to {end_date}: {e}")
        return []

def aggregate_weather(data):
    if not data:
        return {}
    df = pd.DataFrame(data)
    return {
        "avg_temp": df['tavg'].mean(),
        "max_temp": df['tmax'].max(),
        "min_temp": df['tmin'].min(),
        "total_precip": df['prcp'].sum(),
        "avg_wind_speed": df['wspd'].mean(),
        "sunshine_total": df['tsun'].sum() if 'tsun' in df.columns else None
    }

# Process and enrich
enriched_rows = []

for idx, row in df.iterrows():
    lat, lon = row["Latitude"], row["Longitude"]
    start_date = build_date(row["Start Year"], row["Start Month"], row["Start Day"])
    end_date = build_date(row["End Year"], row["End Month"], row["End Day"])

    if not start_date or not end_date:
        print(f"Invalid date for row {idx}")
        continue

    station_id = find_nearest_station(lat, lon)
    if not station_id:
        print(f"No station found near {lat}, {lon}")
        continue

    weather_data = fetch_weather_data(station_id, start_date, end_date)
    weather_summary = aggregate_weather(weather_data)

    enriched_row = row.to_dict()
    enriched_row.update(weather_summary)
    enriched_row["disaster_occurred"] = 1  # Binary label
    enriched_rows.append(enriched_row)

# Save enriched data
enriched_df = pd.DataFrame(enriched_rows)
enriched_df.to_csv("disaster_enriched.csv", index=False)


No station found near -17.7981948, 35.0884236
No station found near -16.8519695, 36.8785266
No station found near 13.4820706, -7.6096079
No station found near -4.072751, 35.8521735
No station found near 1.5394463, 36.942166
No station found near -4.072751, 35.8521735
No station found near 8.0, 32.0
No station found near 10.4386899, 47.5924085
No station found near 3.0, 42.0
No station found near 8.3676771, 49.083416
No station found near 15.2264398, 15.3151191
No station found near -12.7287195, 21.2737637
No station found near 3.9991789, 43.9995531
No station found near 9.912471, 49.392509
No station found near 1.00606, 38.7478954
No station found near 18.2612839, -73.8444985
No station found near 14.9320767, 3.4298122
No station found near 1.00606, 38.7478954
No station found near -1.5365119, 39.5508374
No station found near 1.5394463, 36.942166
No station found near 1.3453924, 42.0252942
No station found near 11.42837, 42.0639774
No station found near 17.1930577, 21.5813259
No statio

KeyboardInterrupt: 

In [None]:
import pandas as pd
import requests
from io import StringIO

# Load disaster CSV (skip first 53 data rows, keeping header)
df = pd.read_csv("cleaned_data2.csv", skiprows=range(1, 54))  # row 1 = header

#  Visual Crossing API key
API_KEY = "SNHPPP53MZDWLZ3YY5T3TEPLS"

# Weather elements to fetch
ELEMENTS = "datetime,temp,humidity,precip,windspeed,pressure"

# Helper: Build YYYY-MM-DD date from row
def build_date(row):
    return f"{int(row['Start Year'])}-{int(row['Start Month']):02d}-{int(row['Start Day']):02d}"

# Fetch weather data from Visual Crossing CSV API
def fetch_weather_csv(lat, lon, date):
    url = (
        f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/"
        f"{lat},{lon}/{date}?unitGroup=metric&elements={ELEMENTS}&include=days"
        f"&key={API_KEY}&contentType=csv"
    )
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        df_weather = pd.read_csv(StringIO(response.text))
        if not df_weather.empty:
            return df_weather.iloc[0].to_dict()  # return first day's weather
    except Exception as e:
        print(f"Failed for {lat},{lon} on {date}: {e}")
    return {}

# Enrich the DataFrame
enriched_rows = []
for idx, row in df.iterrows():
    lat, lon = row["Latitude"], row["Longitude"]
    date = build_date(row)
    weather = fetch_weather_csv(lat, lon, date)
    if weather:
        enriched_row = row.to_dict()
        enriched_row.update({
            "weather_date": weather.get("datetime"),
            "temp": weather.get("temp"),
            "humidity": weather.get("humidity"),
            "precip": weather.get("precip"),
            "windspeed": weather.get("windspeed"),
            "pressure": weather.get("pressure")
        })
        enriched_rows.append(enriched_row)

# Save enriched data
enriched_df = pd.DataFrame(enriched_rows)
enriched_df.to_csv("disaster_weather_enriched.csv", index=False)
print("✅ Weather enrichment complete (using CSV API).")


Failed for 3.0,42.0 on 2010-02-15: 429 Client Error:  for url: https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/3.0,42.0/2010-02-15?unitGroup=metric&elements=datetime,temp,humidity,precip,windspeed,pressure&include=days&key=SNHPPP53MZDWLZ3YY5T3TEPLS&contentType=csv
Failed for -3.0214816,29.6458034 on 2011-08-15: 429 Client Error:  for url: https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/-3.0214816,29.6458034/2011-08-15?unitGroup=metric&elements=datetime,temp,humidity,precip,windspeed,pressure&include=days&key=SNHPPP53MZDWLZ3YY5T3TEPLS&contentType=csv
Failed for 8.3676771,49.083416 on 2011-01-15: 429 Client Error:  for url: https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/8.3676771,49.083416/2011-01-15?unitGroup=metric&elements=datetime,temp,humidity,precip,windspeed,pressure&include=days&key=SNHPPP53MZDWLZ3YY5T3TEPLS&contentType=csv
Failed for 38.6280278,-90.1910154 on 2011-12-31: 

KeyboardInterrupt: 