In [72]:
import numpy as np
import pandas as pd

In [73]:
# Read the three raw CSV inputs (daily weather, typhoon impact, typhoon
# duration/dates) and report their shapes as a quick sanity check.
print("Loading weather data...")
weather_data = pd.read_csv('../../data/Daily Data.csv')

print("Loading typhoon impact data...")
impact_data = pd.read_csv('../../data/data_withno_weatherdata.csv')

print("Loading typhoon duration/dates data...")
duration_data = pd.read_csv('../../data/Duration of typhoon.csv')

# One print call, one line per argument; the leading "\n" on the last
# argument produces the blank separator line before the success message.
print(
    f"Weather data shape: {weather_data.shape}",
    f"Impact data shape: {impact_data.shape}",
    f"Duration data shape: {duration_data.shape}",
    "\nDatasets loaded successfully!",
    sep="\n",
)

Loading weather data...
Loading typhoon impact data...
Loading typhoon duration/dates data...
Weather data shape: (1828, 90)
Impact data shape: (1776, 26)
Duration data shape: (84, 7)

Datasets loaded successfully!


## Drop unused columns from impact_data

In [74]:
# List the impact table's column labels to see which fields are available.
impact_data.columns

Index(['Typhoon Name', 'Year', 'Region', 'Province', 'City/Municipality',
       'Families', 'Person', 'Brgy', 'Dead', 'Injured/Ill', 'Missing',
       'Totally', 'Partially', 'Total', 'Quantity', 'Cost', 'Type', 'Category',
       'Nearest_Station', 'Station_Province', 'Distance_km',
       'Max_24hr_Rainfall_mm', 'Total_Storm_Rainfall_mm', 'Min_Pressure_hPa',
       'Max_Sustained_Wind_kph', 'Duration_in_PAR_Hours'],
      dtype='object')

In [75]:
# Preview the first five impact records as loaded from disk.
impact_data.head(n=5)

Unnamed: 0,Typhoon Name,Year,Region,Province,City/Municipality,Families,Person,Brgy,Dead,Injured/Ill,...,Type,Category,Nearest_Station,Station_Province,Distance_km,Max_24hr_Rainfall_mm,Total_Storm_Rainfall_mm,Min_Pressure_hPa,Max_Sustained_Wind_kph,Duration_in_PAR_Hours
0,BETTY,2023,2,BATANES,BASCO,3608.0,11120.0,6.0,0.0,0.0,...,['FAMILY FOOD PACK'],['FAMILY FOOD PACK'],BASCO,BATANES,2.497504,,,,,133.0
1,BETTY,2023,2,BATANES,ITBAYAT,968.0,3028.0,5.0,0.0,0.0,...,['FAMILY FOOD PACK'],['FAMILY FOOD PACK'],ITBAYAT,BATANES,3.204943,,,,,133.0
2,BETTY,2023,2,BATANES,IVANA,444.0,1532.0,4.0,0.0,0.0,...,['FAMILY FOOD PACK'],['FAMILY FOOD PACK'],BASCO,BATANES,9.470554,,,,,133.0
3,BETTY,2023,2,BATANES,MAHATAO,575.0,1792.0,4.0,0.0,0.0,...,['FAMILY FOOD PACK'],['FAMILY FOOD PACK'],BASCO,BATANES,4.890816,,,,,133.0
4,BETTY,2023,2,BATANES,SABTANG,575.0,1955.0,6.0,0.0,0.0,...,['FAMILY FOOD PACK'],['FAMILY FOOD PACK'],BASCO,BATANES,19.891231,,,,,133.0


In [76]:
# Drop 'Type'/'Category' and the four weather columns from the impact table;
# the weather columns are rebuilt further down in this notebook from the
# duration table and the daily station data.
# Reassigning (rather than inplace=True) with errors='ignore' keeps the cell
# safe to re-run: a second execution is a no-op instead of a KeyError on the
# columns that are already gone.
impact_data = impact_data.drop(
    columns=[
        'Type',
        'Category',
        'Max_24hr_Rainfall_mm',
        'Total_Storm_Rainfall_mm',
        'Min_Pressure_hPa',
        'Max_Sustained_Wind_kph',
    ],
    errors='ignore',
)

In [77]:
# Preview the impact table again to confirm the columns were removed.
impact_data.head(n=5)

Unnamed: 0,Typhoon Name,Year,Region,Province,City/Municipality,Families,Person,Brgy,Dead,Injured/Ill,Missing,Totally,Partially,Total,Quantity,Cost,Nearest_Station,Station_Province,Distance_km,Duration_in_PAR_Hours
0,BETTY,2023,2,BATANES,BASCO,3608.0,11120.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,3608.0,2646179.36,BASCO,BATANES,2.497504,133.0
1,BETTY,2023,2,BATANES,ITBAYAT,968.0,3028.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,966.0,494592.0,ITBAYAT,BATANES,3.204943,133.0
2,BETTY,2023,2,BATANES,IVANA,444.0,1532.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,444.0,227328.0,BASCO,BATANES,9.470554,133.0
3,BETTY,2023,2,BATANES,MAHATAO,575.0,1792.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,575.0,291082.96,BASCO,BATANES,4.890816,133.0
4,BETTY,2023,2,BATANES,SABTANG,575.0,1955.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,575.0,296521.75,BASCO,BATANES,19.891231,133.0


## Join Impact Data with Duration Data

In [78]:
# Compare typhoon names across the two tables before joining.
# Names are normalised the same way as the join keys built in the next cell
# (trimmed, upper-cased) so this diagnostic agrees with the merge, and missing
# names are dropped so sorted() never has to compare NaN with a string.
# NOTE: this is a name-only comparison; the actual join also matches on year.
print("Unique typhoons in impact_data:")
impact_typhoons = set(
    impact_data['Typhoon Name'].dropna().str.strip().str.upper().unique()
)
print(f"Count: {len(impact_typhoons)}")
print(sorted(impact_typhoons))

print("\nUnique typhoons in duration_data:")
duration_typhoons = set(
    duration_data['TC NAME'].dropna().str.strip().str.upper().unique()
)
print(f"Count: {len(duration_typhoons)}")
print(sorted(duration_typhoons))

# Check which typhoons match
matching_typhoons = impact_typhoons.intersection(duration_typhoons)
missing_in_duration = impact_typhoons - duration_typhoons
missing_in_impact = duration_typhoons - impact_typhoons

print(f"\nMatching typhoons: {len(matching_typhoons)}")
print(f"Missing in duration_data: {len(missing_in_duration)} - {sorted(missing_in_duration)}")
print(f"Missing in impact_data: {len(missing_in_impact)} - {sorted(missing_in_impact)}")

Unique typhoons in impact_data:
Count: 25
['AGATON', 'AMANG', 'AMBO', 'BETTY', 'DANTE', 'DODONG', 'ESTER', 'FABIAN', 'FLORITA', 'HENRY', 'JENNY', 'JOLINA', 'KABAYAN', 'KARDING', 'KIKO', 'MARING', 'MAYMAY', 'NENENG', 'OBET', 'ODETTE', 'OFEL', 'PAENG', 'PEPITO', 'QUINTA', 'ROLLY']

Unique typhoons in duration_data:
Count: 69
['AGATON', 'AGHON', 'AMANG', 'AMBO', 'AURING', 'BASYANG', 'BETTY', 'BISING', 'BUTCHOY', 'CALOY', 'CARINA', 'CHEDENG', 'CRISING', 'DANTE', 'DINDO', 'DODONG', 'DOMENG', 'EGAY', 'EMONG', 'ENTENG', 'ESTER', 'FABIAN', 'FALCON', 'FERDIE', 'FLORITA', 'GARDO', 'GENER', 'GORING', 'GORIO', 'HANNA', 'HELEN', 'HENRY', 'HUANING', 'IGME', 'INDAY', 'INENG', 'ISANG', 'JENNY', 'JOLINA', 'JOSIE', 'JULIAN', 'KABAYAN', 'KARDING', 'KIKO', 'KRISTINE', 'LANNIE', 'LEON', 'LUIS', 'MARCE', 'MARING', 'MAYMAY', 'NANDO', 'NENENG', 'NIKA', 'OBET', 'ODETTE', 'OFEL', 'PAENG', 'PEPITO', 'QUEENIE', 'QUERUBIN', 'QUINTA', 'ROLLY', 'ROMINA', 'ROSAL', 'SIONY', 'TONYO', 'ULYSSES', 'VICKY']

Matching typho

In [79]:
# Build working copies carrying normalised join keys and parsed PAR dates.
# The raw frames are left untouched; .assign returns a new frame each time.

# Typhoon names: trimmed and upper-cased so both tables use the same key form.
impact_data_clean = impact_data.assign(
    Typhoon_Name_Clean=impact_data['Typhoon Name'].str.strip().str.upper()
)

# Duration table: same name normalisation, plus the PAR begin/end strings
# parsed as month/day/year; anything unparseable becomes NaT (errors='coerce').
duration_data_clean = duration_data.assign(
    TC_Name_Clean=duration_data['TC NAME'].str.strip().str.upper(),
    PAR_START=pd.to_datetime(duration_data['PAR BEG'], format='%m/%d/%Y', errors='coerce'),
    PAR_END=pd.to_datetime(duration_data['PAR END'], format='%m/%d/%Y', errors='coerce'),
)

print("Data preparation completed!")
# Count the rows where both PAR dates parsed successfully.
print(f"Duration data with valid dates: {len(duration_data_clean.dropna(subset=['PAR_START', 'PAR_END']))}")

Data preparation completed!
Duration data with valid dates: 84


In [80]:
# Left-join the duration table onto the impact records by (year, typhoon name).
# Only the columns needed downstream are taken from the duration table.
merged_data = impact_data_clean.merge(
    duration_data_clean[['YEAR', 'TC_Name_Clean', 'PAR_START', 'PAR_END', 'MSW', 'TYPE']],
    left_on=['Year', 'Typhoon_Name_Clean'],
    right_on=['YEAR', 'TC_Name_Clean'],
    how='left'
)

# A left join keeps every impact record exactly once unless the duration table
# has duplicate (YEAR, name) keys, in which case rows silently multiply.
# Fail loudly here instead of carrying duplicated impact figures forward.
assert len(merged_data) == len(impact_data_clean), (
    f"Merge changed the row count from {len(impact_data_clean)} to {len(merged_data)}; "
    "check duration_data for duplicate (YEAR, TC NAME) entries"
)

print(f"Original impact data: {len(impact_data_clean)} records")
print(f"Merged data: {len(merged_data)} records")

# Check how many records got duration data (unmatched rows have NaT dates)
records_with_duration = merged_data['PAR_START'].notna().sum()
print(f"Records with duration data: {records_with_duration}")
print(f"Records without duration data: {len(merged_data) - records_with_duration}")

# Display sample of merged data
print("\nSample of merged data:")
sample_cols = ['Typhoon Name', 'Year', 'Province', 'PAR_START', 'PAR_END', 'MSW', 'TYPE']
available_cols = [col for col in sample_cols if col in merged_data.columns]
print(merged_data[available_cols].head(10))

Original impact data: 1776 records
Merged data: 1776 records
Records with duration data: 1776
Records without duration data: 0

Sample of merged data:
  Typhoon Name  Year Province  PAR_START    PAR_END  MSW TYPE
0        BETTY  2023  BATANES 2023-05-27 2023-06-01  195  STY
1        BETTY  2023  BATANES 2023-05-27 2023-06-01  195  STY
2        BETTY  2023  BATANES 2023-05-27 2023-06-01  195  STY
3        BETTY  2023  BATANES 2023-05-27 2023-06-01  195  STY
4        BETTY  2023  BATANES 2023-05-27 2023-06-01  195  STY
5        BETTY  2023  BATANES 2023-05-27 2023-06-01  195  STY
6        HENRY  2022  BATANES 2022-08-31 2022-09-04  195  STY
7        JENNY  2023  BATANES 2023-09-29 2023-10-06  175   TY
8        JENNY  2023  BATANES 2023-09-29 2023-10-06  175   TY
9        JENNY  2023  BATANES 2023-09-29 2023-10-06  175   TY


In [81]:
# Tidy the merged table: discard the helper join keys, then give the columns
# that came from the duration table descriptive names.
column_renames = {
    'TYPE': 'Typhoon_Type',
    'MSW': 'Max_Sustained_Wind_kph',
    # NOTE(review): the merge above does not select a 'DURATION_HOURS' column,
    # so this entry looks like a no-op; rename() ignores missing labels.
    'DURATION_HOURS': 'Duration_in_PAR_Hours'
}
final_merged_data = (
    merged_data
    .drop(columns=['Typhoon_Name_Clean', 'TC_Name_Clean', 'YEAR'], errors='ignore')
    .rename(columns=column_renames)
)

print(f"Final merged dataset shape: {final_merged_data.shape}")
print(f"\nColumns in final dataset: {list(final_merged_data.columns)}")

# Row counts per typhoon type, when that column is present
if 'Typhoon_Type' in final_merged_data:
    print("\nDistribution by Typhoon Type:")
    print(final_merged_data['Typhoon_Type'].value_counts())

# Descriptive statistics for the duration / wind columns that are present
numeric_cols = ['Duration_in_PAR_Hours', 'Max_Sustained_Wind_kph']
available_numeric = [col for col in numeric_cols if col in final_merged_data]
if len(available_numeric) > 0:
    print("\nSummary statistics for new columns:")
    print(final_merged_data[available_numeric].describe())

Final merged dataset shape: (1776, 24)

Columns in final dataset: ['Typhoon Name', 'Year', 'Region', 'Province', 'City/Municipality', 'Families', 'Person', 'Brgy', 'Dead', 'Injured/Ill', 'Missing', 'Totally', 'Partially', 'Total', 'Quantity', 'Cost', 'Nearest_Station', 'Station_Province', 'Distance_km', 'Duration_in_PAR_Hours', 'PAR_START', 'PAR_END', 'Max_Sustained_Wind_kph', 'Typhoon_Type']

Distribution by Typhoon Type:
Typhoon_Type
STY    504
STS    503
TY     468
TS     234
TD      67
Name: count, dtype: int64

Summary statistics for new columns:
       Duration_in_PAR_Hours  Max_Sustained_Wind_kph
count            1776.000000             1776.000000
mean              117.337669              139.304617
std                37.039775               50.960721
min                39.000000               45.000000
25%                91.000000              110.000000
50%               122.000000              120.000000
75%               124.700000              195.000000
max               

In [82]:
# Preview the first five rows of the cleaned, merged table.
final_merged_data.head(n=5)

Unnamed: 0,Typhoon Name,Year,Region,Province,City/Municipality,Families,Person,Brgy,Dead,Injured/Ill,...,Quantity,Cost,Nearest_Station,Station_Province,Distance_km,Duration_in_PAR_Hours,PAR_START,PAR_END,Max_Sustained_Wind_kph,Typhoon_Type
0,BETTY,2023,2,BATANES,BASCO,3608.0,11120.0,6.0,0.0,0.0,...,3608.0,2646179.36,BASCO,BATANES,2.497504,133.0,2023-05-27,2023-06-01,195,STY
1,BETTY,2023,2,BATANES,ITBAYAT,968.0,3028.0,5.0,0.0,0.0,...,966.0,494592.0,ITBAYAT,BATANES,3.204943,133.0,2023-05-27,2023-06-01,195,STY
2,BETTY,2023,2,BATANES,IVANA,444.0,1532.0,4.0,0.0,0.0,...,444.0,227328.0,BASCO,BATANES,9.470554,133.0,2023-05-27,2023-06-01,195,STY
3,BETTY,2023,2,BATANES,MAHATAO,575.0,1792.0,4.0,0.0,0.0,...,575.0,291082.96,BASCO,BATANES,4.890816,133.0,2023-05-27,2023-06-01,195,STY
4,BETTY,2023,2,BATANES,SABTANG,575.0,1955.0,6.0,0.0,0.0,...,575.0,296521.75,BASCO,BATANES,19.891231,133.0,2023-05-27,2023-06-01,195,STY


## Extract Extreme Weather Data During Typhoon Periods

In [83]:
# Peek at the weather table layout (first 20 column labels only).
print("Weather data columns:")
print(list(weather_data.columns[:20]))

# Derive a parsed 'Date' column from 'Date(UTC)' on a fresh frame and discard
# rows whose date could not be parsed (errors='coerce' turns those into NaT).
# NOTE(review): the saved output shows a warning pointing at this to_datetime
# call - presumably about date-format inference; consider passing an explicit
# format once the file's date layout is confirmed.
weather_data_clean = (
    weather_data
    .assign(Date=pd.to_datetime(weather_data['Date(UTC)'], errors='coerce'))
    .dropna(subset=['Date'])
)

print(f"\nWeather data date range: {weather_data_clean['Date'].min()} to {weather_data_clean['Date'].max()}")
print(f"Total weather records: {len(weather_data_clean)}")

Weather data columns:
['Date(UTC)', 'Aparri, Cagayan Prec.Sum.Dly [mm]', 'Aparri, Cagayan Press.QFF.Min.Dly [hPa]', 'Aparri, Cagayan Wind.Speed.Dly [m/s]', 'Aparri, Cagayan Wind.Dir.Prevailing.Dly [deg.]', 'Baler (Radar), Aurora Prec.Sum.Dly [mm]', 'Baler (Radar), Aurora Press.QFF.Min.Dly [hPa]', 'Baler (Radar), Aurora Wind.Speed.Dly [m/s]', 'Baler (Radar), Aurora Wind.Dir.Prevailing.Dly [deg.]', 'Basco (Radar), Batanes Prec.Sum.Dly [mm]', 'Basco (Radar), Batanes Press.QFF.Min.Dly [hPa]', 'Basco (Radar), Batanes Wind.Speed.Dly [m/s]', 'Basco (Radar), Batanes Wind.Dir.Prevailing.Dly [deg.]', 'Borongan, Eastern Samar Prec.Sum.Dly [mm]', 'Borongan, Eastern Samar Press.QFF.Min.Dly [hPa]', 'Borongan, Eastern Samar Wind.Speed.Dly [m/s]', 'Borongan, Eastern Samar Wind.Dir.Prevailing.Dly [deg.]', 'Calayan, Cagayan Prec.Sum.Dly [mm]', 'Calayan, Cagayan Press.QFF.Min.Dly [hPa]', 'Calayan, Cagayan Wind.Speed.Dly [m/s]']

Weather data date range: 2020-01-01 00:00:00 to 2024-12-31 00:00:00
Total we

  weather_data_clean['Date'] = pd.to_datetime(weather_data_clean['Date(UTC)'], errors='coerce')


In [84]:
# Function to extract station names and their weather data columns
def extract_weather_stations_and_columns(weather_df):
    """Map each weather station to the columns holding its daily metrics.

    Column labels are expected to look like
    ``"<station>, <rest of label incl. metric>"``, e.g.
    ``"Aparri, Cagayan Prec.Sum.Dly [mm]"``. The text before the first comma
    is used as the station key; the remainder is searched for a known metric
    marker.

    Parameters
    ----------
    weather_df : pandas.DataFrame
        Frame whose column labels follow the pattern above. The ``'Date(UTC)'``
        and ``'Date'`` columns, labels without a comma, and non-string labels
        are ignored.

    Returns
    -------
    dict
        ``{station: {'rainfall': col_or_None, 'pressure': col_or_None,
        'wind_speed': col_or_None}}``. A station appears as soon as any of
        its columns is seen, even if none carries a recognised metric.
    """
    # (marker searched for in the label, key in the per-station dict);
    # the first marker that matches wins, mirroring an if/elif chain.
    metric_markers = (
        ('Prec.Sum.Dly [mm]', 'rainfall'),
        ('Press.QFF.Min.Dly [hPa]', 'pressure'),
        ('Wind.Speed.Dly [m/s]', 'wind_speed'),
    )

    stations_info = {}

    for col in weather_df.columns:
        # Non-string labels cannot follow the "<station>, ..." pattern.
        if not isinstance(col, str):
            continue
        if col in ('Date(UTC)', 'Date') or ',' not in col:
            continue

        # Split on the FIRST comma only, so a remainder that itself contains a
        # comma (e.g. "Station, Province, Region Prec.Sum.Dly [mm]") still
        # carries the metric marker instead of being cut off.
        station, remainder = col.split(',', 1)
        station = station.strip()

        station_columns = stations_info.setdefault(
            station,
            {'rainfall': None, 'pressure': None, 'wind_speed': None},
        )

        for marker, metric_key in metric_markers:
            if marker in remainder:
                station_columns[metric_key] = col
                break

    return stations_info

# Build the station -> metric-column lookup from the cleaned weather table
stations_info = extract_weather_stations_and_columns(weather_data_clean)

print(f"Found {len(stations_info)} weather stations:")
# Preview only the first 10 stations; a check mark means a column was found
# for that metric, a cross means none was. (The marks were previously
# mis-encoded and printed as garbled characters.)
for i, (station, metrics) in enumerate(list(stations_info.items())[:10], 1):
    print(f"{i:2d}. {station}:")
    print(f"    Rainfall: {'✓' if metrics['rainfall'] else '✗'}")
    print(f"    Pressure: {'✓' if metrics['pressure'] else '✗'}")
    print(f"    Wind Speed: {'✓' if metrics['wind_speed'] else '✗'}")

if len(stations_info) > 10:
    print(f"... and {len(stations_info) - 10} more stations")

Found 22 weather stations:
 1. Aparri:
    Rainfall: ‚úì
    Pressure: ‚úì
    Wind Speed: ‚úì
 2. Baler (Radar):
    Rainfall: ‚úì
    Pressure: ‚úì
    Wind Speed: ‚úì
 3. Basco (Radar):
    Rainfall: ‚úì
    Pressure: ‚úì
    Wind Speed: ‚úì
 4. Borongan:
    Rainfall: ‚úì
    Pressure: ‚úì
    Wind Speed: ‚úì
 5. Calayan:
    Rainfall: ‚úì
    Pressure: ‚úì
    Wind Speed: ‚úì
 6. Casiguran:
    Rainfall: ‚úì
    Pressure: ‚úì
    Wind Speed: ‚úì
 7. Catarman:
    Rainfall: ‚úì
    Pressure: ‚úì
    Wind Speed: ‚úì
 8. Catbalogan:
    Rainfall: ‚úì
    Pressure: ‚úì
    Wind Speed: ‚úì
 9. Clark Airport (DMIA):
    Rainfall: ‚úì
    Pressure: ‚úì
    Wind Speed: ‚úì
10. CLSU Mu√±oz:
    Rainfall: ‚úì
    Pressure: ‚úì
    Wind Speed: ‚úì
... and 12 more stations


In [85]:
def extract_extreme_weather_for_typhoon(weather_df, station_name, start_date, end_date, stations_info):
    """
    Summarise one station's rainfall and pressure over an inclusive date window.

    Parameters:
    - weather_df: frame with a 'Date' column plus the per-station metric columns
    - station_name: key into stations_info
    - start_date, end_date: window bounds, both inclusive, compared against 'Date'
    - stations_info: {station: {'rainfall': col, 'pressure': col, ...}} lookup

    Returns a dict with:
    - Max_24hr_Rainfall_mm: largest single daily rainfall value in the window
    - Total_Storm_Rainfall_mm: sum of the daily rainfall values in the window
    - Min_Pressure_hPa: smallest pressure value in the window
    - records_found: number of weather rows inside the window
    - days_covered: calendar length of the window (end - start, plus one day)

    The three measurements stay NaN when the station is unknown, the window has
    no rows, the column is missing, or no value survives the validity filters.
    """
    summary = {
        'Max_24hr_Rainfall_mm': np.nan,
        'Total_Storm_Rainfall_mm': np.nan,
        'Min_Pressure_hPa': np.nan,
        'records_found': 0,
        'days_covered': 0
    }

    # Unknown station: nothing to look up.
    if station_name not in stations_info:
        return summary
    column_lookup = stations_info[station_name]

    # Rows whose date falls inside [start_date, end_date].
    in_window = weather_df['Date'].between(start_date, end_date)
    window = weather_df.loc[in_window]
    if len(window) == 0:
        return summary

    summary['records_found'] = len(window)
    summary['days_covered'] = (end_date - start_date).days + 1

    # Rainfall: coerce to numbers, keep non-negative readings only
    # (the comparison also discards NaN).
    rain_col = column_lookup['rainfall']
    if rain_col and rain_col in window.columns:
        rain = pd.to_numeric(window[rain_col], errors='coerce')
        rain = rain[rain >= 0]
        if len(rain) > 0:
            summary['Max_24hr_Rainfall_mm'] = rain.max()
            summary['Total_Storm_Rainfall_mm'] = rain.sum()

    # Pressure: coerce to numbers, keep readings within 800-1100 hPa inclusive.
    press_col = column_lookup['pressure']
    if press_col and press_col in window.columns:
        press = pd.to_numeric(window[press_col], errors='coerce')
        press = press[press.between(800, 1100)]
        if len(press) > 0:
            summary['Min_Pressure_hPa'] = press.min()

    return summary

# Smoke-test the extractor on the first known station over a fixed 5-day window
if stations_info:
    test_station = next(iter(stations_info))
    test_start, test_end = pd.Timestamp('2020-10-01'), pd.Timestamp('2020-10-05')

    test_result = extract_extreme_weather_for_typhoon(
        weather_data_clean,
        test_station,
        test_start,
        test_end,
        stations_info,
    )

    # Print every field of the result dict, one per line
    print(f"Test extraction for {test_station}:")
    for key, value in test_result.items():
        print(f"  {key}: {value}")

Test extraction for Aparri:
  Max_24hr_Rainfall_mm: 23.0
  Total_Storm_Rainfall_mm: 70.1
  Min_Pressure_hPa: 1005.0
  records_found: 5
  days_covered: 5


In [86]:
# Create mapping between impact data stations and weather data stations
print("Creating station mapping...")

# Get unique stations from impact data
impact_stations = final_merged_data['Nearest_Station'].dropna().unique()
weather_stations = list(stations_info.keys())

print(f"Impact data stations: {len(impact_stations)}")
print(f"Weather data stations: {len(weather_stations)}")

# Normalised (trimmed, upper-cased) weather name -> original weather name.
# setdefault keeps the first station when two names normalise identically,
# and dict order preserves the original weather_stations order.
weather_by_clean_name = {}
for weather_station in weather_stations:
    weather_by_clean_name.setdefault(weather_station.strip().upper(), weather_station)

# Create mapping dictionary
station_mapping = {}

for impact_station in impact_stations:
    impact_clean = impact_station.strip().upper()

    # Pass 1: exact match always wins. Previously exact and substring tests ran
    # in the same pass, so an earlier station that merely contained the name
    # could be chosen over a later exact match.
    if impact_clean in weather_by_clean_name:
        station_mapping[impact_station] = weather_by_clean_name[impact_clean]
        continue

    # Pass 2: fall back to the first weather station whose name contains, or is
    # contained in, the impact station name (e.g. BASCO -> Basco (Radar)).
    for weather_clean, weather_station in weather_by_clean_name.items():
        if impact_clean in weather_clean or weather_clean in impact_clean:
            station_mapping[impact_station] = weather_station
            break

print(f"\nStation mappings found: {len(station_mapping)}")
print("Sample mappings:")
for i, (impact, weather) in enumerate(list(station_mapping.items())[:10], 1):
    print(f"  {i:2d}. {impact} -> {weather}")

if len(station_mapping) > 10:
    print(f"  ... and {len(station_mapping) - 10} more mappings")

# Show unmapped stations (these rows will get no weather values later on)
unmapped = [s for s in impact_stations if s not in station_mapping]
if unmapped:
    print(f"\nUnmapped impact stations ({len(unmapped)}): {unmapped[:5]}")
    if len(unmapped) > 5:
        print(f"  ... and {len(unmapped) - 5} more")

Creating station mapping...
Impact data stations: 22
Weather data stations: 22

Station mappings found: 20
Sample mappings:
   1. BASCO -> Basco (Radar)
   2. ITBAYAT -> Itbayat
   3. CALAYAN -> Calayan
   4. APARRI -> Aparri
   5. TUGUEGARAO -> Tuguegarao City
   6. CASIGURAN -> Casiguran
   7. BALER -> Baler (Radar)
   8. CLARK AIRPORT -> Clark Airport (DMIA)
   9. IBA -> Iba
  10. LEGAZPI CITY -> Legazpi City
  ... and 10 more mappings

Unmapped impact stations (2): ['CABANATUAN', 'SUBIC BAY']


In [87]:
# Apply weather extraction to the merged dataset
print("Extracting extreme weather data for each typhoon impact...")

# Process in batches for progress tracking
batch_size = 100
total_records = len(final_merged_data)
successful_extractions = 0
failed_extractions = 0

# Work purely by position: inputs are snapshotted into lists and results are
# collected into lists, then written back as whole columns. This avoids mixing
# positional .iloc reads with label-based .at writes, which only agree while
# the frame has a default 0..n-1 index.
nearest_stations = final_merged_data['Nearest_Station'].tolist()
par_starts = final_merged_data['PAR_START'].tolist()
par_ends = final_merged_data['PAR_END'].tolist()

# Defaults match a record for which nothing could be extracted.
max_24hr_values = [np.nan] * total_records
total_rain_values = [np.nan] * total_records
min_pressure_values = [np.nan] * total_records
records_found_values = [0] * total_records
days_covered_values = [0] * total_records
station_mapped_flags = [0] * total_records

# Many impact records share the same (station, PAR window); extract each
# combination once and reuse the result.
extraction_cache = {}

print(f"Processing {total_records} records in batches of {batch_size}...")

for i in range(0, total_records, batch_size):
    batch_end = min(i + batch_size, total_records)

    for idx in range(i, batch_end):
        impact_station = nearest_stations[idx]
        par_start = par_starts[idx]
        par_end = par_ends[idx]

        # Skip if missing required data
        if pd.isna(impact_station) or pd.isna(par_start) or pd.isna(par_end):
            failed_extractions += 1
            continue

        # Skip if the impact station has no weather-station counterpart
        if impact_station not in station_mapping:
            failed_extractions += 1
            continue

        weather_station = station_mapping[impact_station]
        station_mapped_flags[idx] = 1

        cache_key = (weather_station, par_start, par_end)
        if cache_key not in extraction_cache:
            extraction_cache[cache_key] = extract_extreme_weather_for_typhoon(
                weather_data_clean, weather_station, par_start, par_end, stations_info
            )
        weather_result = extraction_cache[cache_key]

        max_24hr_values[idx] = weather_result['Max_24hr_Rainfall_mm']
        total_rain_values[idx] = weather_result['Total_Storm_Rainfall_mm']
        min_pressure_values[idx] = weather_result['Min_Pressure_hPa']
        records_found_values[idx] = weather_result['records_found']
        days_covered_values[idx] = weather_result['days_covered']

        # A record counts as successful when the PAR window contained any
        # weather rows at all (individual metrics may still be NaN).
        if weather_result['records_found'] > 0:
            successful_extractions += 1
        else:
            failed_extractions += 1

    # Progress update every fifth batch
    if (i // batch_size + 1) % 5 == 0:
        progress = (batch_end / total_records) * 100
        print(f"  Progress: {progress:.1f}% ({batch_end}/{total_records} records)")

# Write the collected results back as whole columns (list assignment is
# positional, so it is independent of the frame's index labels). Overwriting
# the columns wholesale also keeps the cell idempotent on re-run.
final_merged_data['Max_24hr_Rainfall_mm'] = max_24hr_values
final_merged_data['Total_Storm_Rainfall_mm'] = total_rain_values
final_merged_data['Min_Pressure_hPa'] = min_pressure_values
final_merged_data['Weather_Records_Found'] = records_found_values
final_merged_data['Weather_Days_Covered'] = days_covered_values
final_merged_data['Weather_Station_Mapped'] = station_mapped_flags

# Guard the percentage against an empty table
success_rate = (successful_extractions / total_records) * 100 if total_records else 0.0

print(f"\nWeather extraction completed!")
print(f"Successful extractions: {successful_extractions}")
print(f"Failed extractions: {failed_extractions}")
print(f"Success rate: {success_rate:.1f}%")

Extracting extreme weather data for each typhoon impact...
Processing 1776 records in batches of 100...
  Progress: 28.2% (500/1776 records)
  Progress: 56.3% (1000/1776 records)
  Progress: 84.5% (1500/1776 records)

Weather extraction completed!
Successful extractions: 1613
Failed extractions: 163
Success rate: 90.8%


## Analyze Extracted Weather Data

In [88]:
# Analyze the extracted weather data
print("=== EXTREME WEATHER DATA ANALYSIS ===")

# Basic statistics for the three extracted metrics
weather_cols = ['Max_24hr_Rainfall_mm', 'Total_Storm_Rainfall_mm', 'Min_Pressure_hPa']
print("\nWeather data summary:")
print(final_merged_data[weather_cols].describe())

# Coverage statistics
total_records = len(final_merged_data)
mapped_stations = final_merged_data['Weather_Station_Mapped'].sum()
# Count the RECORDS that found at least one weather row. Summing
# 'Weather_Records_Found' (as done previously) adds up per-record row counts,
# which is not a number of records, and the value was never reported.
records_with_data = int((final_merged_data['Weather_Records_Found'] > 0).sum())

print(f"\n=== COVERAGE STATISTICS ===")
print(f"Total impact records: {total_records:,}")
print(f"Records with mapped weather stations: {mapped_stations:,} ({mapped_stations/total_records*100:.1f}%)")
print(f"Records with weather observations in the PAR window: {records_with_data:,} ({records_with_data/total_records*100:.1f}%)")

# Check data availability for each weather metric
for col in weather_cols:
    non_null = final_merged_data[col].notna().sum()
    print(f"{col}: {non_null:,} records ({non_null/total_records*100:.1f}%)")

# Show some extreme values
print(f"\n=== EXTREME VALUES RECORDED ===")

# Highest rainfall values / lowest pressure across all records
max_24hr = final_merged_data['Max_24hr_Rainfall_mm'].max()
max_total = final_merged_data['Total_Storm_Rainfall_mm'].max()
min_pressure = final_merged_data['Min_Pressure_hPa'].min()

# max()/min() return NaN when a column has no values; skip those lines
if not pd.isna(max_24hr):
    print(f"Highest 24hr rainfall: {max_24hr:.1f} mm")
if not pd.isna(max_total):
    print(f"Highest total storm rainfall: {max_total:.1f} mm")
if not pd.isna(min_pressure):
    print(f"Lowest pressure recorded: {min_pressure:.1f} hPa")

# Show sample of records with complete weather data (all three metrics present)
complete_weather = final_merged_data[
    final_merged_data['Max_24hr_Rainfall_mm'].notna() &
    final_merged_data['Total_Storm_Rainfall_mm'].notna() &
    final_merged_data['Min_Pressure_hPa'].notna()
]

print(f"\n=== RECORDS WITH COMPLETE WEATHER DATA ===")
print(f"Records with all weather metrics: {len(complete_weather):,} ({len(complete_weather)/total_records*100:.1f}%)")

if len(complete_weather) > 0:
    print("\nSample records with complete weather data:")
    sample_cols = ['Typhoon Name', 'Year', 'Province', 'Nearest_Station', 'PAR_START', 'PAR_END',
                   'Max_24hr_Rainfall_mm', 'Total_Storm_Rainfall_mm', 'Min_Pressure_hPa']
    available_cols = [col for col in sample_cols if col in complete_weather.columns]
    print(complete_weather[available_cols].head())

=== EXTREME WEATHER DATA ANALYSIS ===

Weather data summary:
       Max_24hr_Rainfall_mm  Total_Storm_Rainfall_mm  Min_Pressure_hPa
count           1613.000000              1613.000000       1612.000000
mean             105.303125               169.334619        995.586104
std               71.217412               105.491013         10.315003
min                0.000000                 0.000000        924.900000
25%               50.000000                92.700000        991.700000
50%              102.100000               154.610000        998.100000
75%              138.400000               234.500000       1001.850000
max              510.000000               770.500000       1008.600000

=== COVERAGE STATISTICS ===
Total impact records: 1,776
Records with mapped weather stations: 1,613 (90.8%)
Max_24hr_Rainfall_mm: 1,613 records (90.8%)
Total_Storm_Rainfall_mm: 1,613 records (90.8%)
Min_Pressure_hPa: 1,612 records (90.8%)

=== EXTREME VALUES RECORDED ===
Highest 24hr rainfall: 510.

## Export Final Dataset with Weather Data

In [89]:
# Prepare final dataset for export
print("Preparing final dataset for export...")

# Reorder columns to group related data together. Names listed here that are
# not present in the frame (e.g. 'Type'/'Category', dropped earlier in this
# notebook) are filtered out below.
# Basic typhoon info
basic_cols = ['Typhoon Name', 'Year', 'Region', 'Province', 'City/Municipality']
# Impact data
impact_cols = ['Families', 'Person', 'Brgy', 'Dead', 'Injured/Ill', 'Missing',
               'Totally', 'Partially', 'Total', 'Quantity', 'Cost', 'Type', 'Category']
# Station info
station_cols = ['Nearest_Station', 'Station_Province', 'Distance_km']
# Typhoon characteristics
typhoon_cols = ['PAR_START', 'PAR_END', 'Duration_in_PAR_Hours', 'Max_Sustained_Wind_kph', 'Typhoon_Type']
# Weather data
weather_cols = ['Max_24hr_Rainfall_mm', 'Total_Storm_Rainfall_mm', 'Min_Pressure_hPa']
# Metadata
meta_cols = ['Weather_Station_Mapped', 'Weather_Records_Found', 'Weather_Days_Covered']

# Get only columns that exist in the dataset
all_desired_cols = basic_cols + impact_cols + station_cols + typhoon_cols + weather_cols + meta_cols
final_columns = [col for col in all_desired_cols if col in final_merged_data.columns]

# Append any columns not named above so nothing is lost in the export
remaining_cols = [col for col in final_merged_data.columns if col not in final_columns]
final_columns.extend(remaining_cols)

# Reorder the dataset
final_export_data = final_merged_data[final_columns].copy()

print(f"Final dataset shape: {final_export_data.shape}")
print(f"Final columns: {final_export_data.columns.tolist()}")

# Export to CSV
output_filename = 'typhoon_impact_with_extreme_weather.csv'
try:
    final_export_data.to_csv(f'../../data/{output_filename}', index=False)
    print(f"\nDataset successfully exported to: ../../data/{output_filename}")
except OSError as export_error:
    # Only filesystem problems (missing directory, permissions, ...) trigger
    # the fallback; a bare `except:` here would also have swallowed
    # KeyboardInterrupt and genuine bugs. The reason is reported so the
    # changed location is not a surprise.
    print(f"\nCould not write to ../../data/ ({export_error}); falling back to the current directory.")
    final_export_data.to_csv(output_filename, index=False)
    print(f"\nDataset exported to current directory: {output_filename}")

print(f"\nTotal records exported: {len(final_export_data):,}")
print(f"Records with weather data: {final_export_data['Weather_Station_Mapped'].sum():,}")

Preparing final dataset for export...
Final dataset shape: (1776, 30)
Final columns: ['Typhoon Name', 'Year', 'Region', 'Province', 'City/Municipality', 'Families', 'Person', 'Brgy', 'Dead', 'Injured/Ill', 'Missing', 'Totally', 'Partially', 'Total', 'Quantity', 'Cost', 'Nearest_Station', 'Station_Province', 'Distance_km', 'PAR_START', 'PAR_END', 'Duration_in_PAR_Hours', 'Max_Sustained_Wind_kph', 'Typhoon_Type', 'Max_24hr_Rainfall_mm', 'Total_Storm_Rainfall_mm', 'Min_Pressure_hPa', 'Weather_Station_Mapped', 'Weather_Records_Found', 'Weather_Days_Covered']

Dataset successfully exported to: ../../data/typhoon_impact_with_extreme_weather.csv

Total records exported: 1,776
Records with weather data: 1,613


In [90]:
# Final summary of the exported dataset.
# The section icons and the closing check mark were mis-encoded in the string
# literals and printed as garbled characters; they are restored here.
print("="*80)
print("                    FINAL PROCESSING SUMMARY")
print("="*80)

total_rows = len(final_export_data)

print(f"📊 DATASET OVERVIEW:")
print(f"   Total typhoon impact records: {total_rows:,}")
print(f"   Unique typhoons: {final_export_data['Typhoon Name'].nunique()}")
print(f"   Year range: {final_export_data['Year'].min()}-{final_export_data['Year'].max()}")
print(f"   Provinces affected: {final_export_data['Province'].nunique()}")

weather_coverage = final_export_data['Weather_Station_Mapped'].sum()
# Guard the percentages against an empty table
coverage_pct = (weather_coverage / total_rows) * 100 if total_rows else 0.0

print(f"\n🌦️  EXTREME WEATHER DATA:")
print(f"   Records with weather stations mapped: {weather_coverage:,} ({coverage_pct:.1f}%)")

# Non-null count and share for each extracted metric
for col, desc in [
    ('Max_24hr_Rainfall_mm', 'Maximum 24-hour rainfall'),
    ('Total_Storm_Rainfall_mm', 'Total storm rainfall'),
    ('Min_Pressure_hPa', 'Minimum pressure')
]:
    non_null = final_export_data[col].notna().sum()
    pct = (non_null / total_rows) * 100 if total_rows else 0.0
    print(f"   {desc}: {non_null:,} records ({pct:.1f}%)")

# Each extreme is printed only when its column holds at least one value
print(f"\n📈 EXTREME VALUES CAPTURED:")
if not final_export_data['Max_24hr_Rainfall_mm'].isna().all():
    max_rain = final_export_data['Max_24hr_Rainfall_mm'].max()
    print(f"   Highest 24hr rainfall: {max_rain:.1f} mm")

if not final_export_data['Total_Storm_Rainfall_mm'].isna().all():
    max_total = final_export_data['Total_Storm_Rainfall_mm'].max()
    print(f"   Highest total rainfall: {max_total:.1f} mm")

if not final_export_data['Min_Pressure_hPa'].isna().all():
    min_press = final_export_data['Min_Pressure_hPa'].min()
    print(f"   Lowest pressure: {min_press:.1f} hPa")

print(f"\n💾 OUTPUT:")
print(f"   File: {output_filename}")
# This figure is the DataFrame's memory footprint, not the size of the CSV on
# disk, so it is labelled accordingly.
print(f"   In-memory size: {final_export_data.memory_usage(deep=True).sum() / 1024 / 1024:.1f} MB")

print("="*80)
print("✅ PROCESSING COMPLETED SUCCESSFULLY!")
print("   Dataset now includes extreme weather values during typhoon periods.")
print("="*80)

                    FINAL PROCESSING SUMMARY
üìä DATASET OVERVIEW:
   Total typhoon impact records: 1,776
   Unique typhoons: 25
   Year range: 2020-2023
   Provinces affected: 24

üå¶Ô∏è  EXTREME WEATHER DATA:
   Records with weather stations mapped: 1,613 (90.8%)
   Maximum 24-hour rainfall: 1,613 records (90.8%)
   Total storm rainfall: 1,613 records (90.8%)
   Minimum pressure: 1,612 records (90.8%)

üìà EXTREME VALUES CAPTURED:
   Highest 24hr rainfall: 510.0 mm
   Highest total rainfall: 770.5 mm
   Lowest pressure: 924.9 hPa

üíæ OUTPUT:
   File: typhoon_impact_with_extreme_weather.csv
   Size: 1.0 MB
‚úÖ PROCESSING COMPLETED SUCCESSFULLY!
   Dataset now includes extreme weather values during typhoon periods.


In [91]:
# Preview the first five rows of the exported table, weather columns included.
final_export_data.head(n=5)

Unnamed: 0,Typhoon Name,Year,Region,Province,City/Municipality,Families,Person,Brgy,Dead,Injured/Ill,...,PAR_END,Duration_in_PAR_Hours,Max_Sustained_Wind_kph,Typhoon_Type,Max_24hr_Rainfall_mm,Total_Storm_Rainfall_mm,Min_Pressure_hPa,Weather_Station_Mapped,Weather_Records_Found,Weather_Days_Covered
0,BETTY,2023,2,BATANES,BASCO,3608.0,11120.0,6.0,0.0,0.0,...,2023-06-01,133.0,195,STY,25.6,39.0,977.1,1,6,6
1,BETTY,2023,2,BATANES,ITBAYAT,968.0,3028.0,5.0,0.0,0.0,...,2023-06-01,133.0,195,STY,20.3,32.71,1000.5,1,6,6
2,BETTY,2023,2,BATANES,IVANA,444.0,1532.0,4.0,0.0,0.0,...,2023-06-01,133.0,195,STY,25.6,39.0,977.1,1,6,6
3,BETTY,2023,2,BATANES,MAHATAO,575.0,1792.0,4.0,0.0,0.0,...,2023-06-01,133.0,195,STY,25.6,39.0,977.1,1,6,6
4,BETTY,2023,2,BATANES,SABTANG,575.0,1955.0,6.0,0.0,0.0,...,2023-06-01,133.0,195,STY,25.6,39.0,977.1,1,6,6
