# Data wrangling

In [17]:
import kagglehub
import pandas as pd
import numpy as np
from kagglehub import KaggleDatasetAdapter

# Name of the disaster-type encoding scheme.
# NOTE(review): no cell visible here reads this value - confirm it is still needed.
encoding_type='dummies'

## 1. Download disaster data from Kaggle

In [18]:
# Read the disaster declarations CSV from the Kaggle dataset into a pandas
# DataFrame. NOTE(review): the dataset handle carries no version suffix, so
# this presumably loads whichever version is current - confirm.
raw_data_df=kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    'headsortails/us-natural-disaster-declarations',
    'us_disaster_declarations.csv',
)

# Preview the first five rows, flipped so that each column gets its own line
raw_data_df.head().T

Unnamed: 0,0,1,2,3,4
fema_declaration_string,DR-1-GA,DR-2-TX,DR-3-LA,DR-4-MI,DR-5-MT
disaster_number,1,2,3,4,5
state,GA,TX,LA,MI,MT
declaration_type,DR,DR,DR,DR,DR
declaration_date,1953-05-02T00:00:00Z,1953-05-15T00:00:00Z,1953-05-29T00:00:00Z,1953-06-02T00:00:00Z,1953-06-06T00:00:00Z
fy_declared,1953,1953,1953,1953,1953
incident_type,Tornado,Tornado,Flood,Tornado,Flood
declaration_title,Tornado,Tornado & Heavy Rainfall,Flood,Tornado,Floods
ih_program_declared,0,0,0,0,0
ia_program_declared,1,1,1,1,1


In [19]:
# Inspect column dtypes and non-null counts of the raw download
raw_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64092 entries, 0 to 64091
Data columns (total 23 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   fema_declaration_string     64092 non-null  object
 1   disaster_number             64092 non-null  int64 
 2   state                       64092 non-null  object
 3   declaration_type            64092 non-null  object
 4   declaration_date            64092 non-null  object
 5   fy_declared                 64092 non-null  int64 
 6   incident_type               64092 non-null  object
 7   declaration_title           64092 non-null  object
 8   ih_program_declared         64092 non-null  int64 
 9   ia_program_declared         64092 non-null  int64 
 10  pa_program_declared         64092 non-null  int64 
 11  hm_program_declared         64092 non-null  int64 
 12  incident_begin_date         64092 non-null  object
 13  incident_end_date           55682 non-null  ob

In [20]:
# Keep an unmodified copy of the download on disk in parquet format
raw_data_df.to_parquet('../data/raw_disaster_data.parquet')

## 2. Data shape formatting

Now the real work begins - we need to set this dataset up for modeling as a multi-label time-series prediction problem. To make our lives a little easier, we will treat each county as an independent sample (this is likely not strictly true, especially as weather-based disasters may tend to co-occur geographically). Doing so increases the number of observations and simplifies the input. Each individual time point will be a vector of dummy encoded disasters, including one feature for 'no-disaster'. This will give us n different time series for the n different counties. Care will need to be taken when splitting/sampling and generating batches not to cross between the time series from different counties. (Note: the cells below aggregate by state rather than by county, and combine the target disaster types into a single monthly incident count.)

Let's dig in!

### 2.1. Feature selection

In [21]:
# Build the working dataframe from just the three columns used downstream:
# parse the incident start date strings into datetimes and number the rows
# from zero. The source dataframe is left untouched.
working_df=(
    raw_data_df[['incident_begin_date','state','incident_type']]
    .assign(incident_begin_date=lambda df: pd.to_datetime(df['incident_begin_date']))
    .reset_index(drop=True)
)
working_df.head()

Unnamed: 0,incident_begin_date,state,incident_type
0,1953-05-02 00:00:00+00:00,GA,Tornado
1,1953-05-15 00:00:00+00:00,TX,Tornado
2,1953-05-29 00:00:00+00:00,LA,Flood
3,1953-06-02 00:00:00+00:00,MI,Tornado
4,1953-06-06 00:00:00+00:00,MT,Flood


In [22]:
# Confirm the date column was parsed to a datetime dtype and nothing is null
working_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64092 entries, 0 to 64091
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   incident_begin_date  64092 non-null  datetime64[ns, UTC]
 1   state                64092 non-null  object             
 2   incident_type        64092 non-null  object             
dtypes: datetime64[ns, UTC](1), object(2)
memory usage: 1.5+ MB


### 2.2. Disaster encoding

In [23]:
# Disaster types that are pooled together into the single 'incidents' target
target_disasters=['Severe Storm', 'Hurricane', 'Flood', 'Coastal Storm']

# One-hot encode the disaster types: one 0/1 column per incident type
disaster_dummies_df=pd.get_dummies(working_df['incident_type'], dtype=int)

# Sum across the target columns, giving 1 for rows whose incident type is a
# target and 0 otherwise. Reindexing first means a target type that never
# occurs in the data contributes a column of zeros instead of a KeyError.
incidents=disaster_dummies_df.reindex(columns=target_disasters, fill_value=0).sum(axis=1)

# Add the disaster sums to the data frame as a new feature
working_df['incidents']=incidents

# Drop the string incident_type column, now that it has been encoded
working_df=working_df.drop(columns='incident_type')

# Count how many disasters we have so that we can check later to make
# sure we didn't cause any issues in our data manipulations
starting_disaster_count=working_df['incidents'].sum()

print(f"Have {starting_disaster_count} total disaster incidents")

# The preview has to be the last expression of the cell to be rendered
working_df.head(20)

Have 41865 total disaster incidents


Now, we need to regularize the time series to a frequency of months across the span of years within each state.

### 2.3. Time series regularization

In [24]:
def sum_months(group: pd.DataFrame) -> pd.DataFrame:
    '''Takes a yearly groupby object and sums features over months'''

    return group.resample('ME').sum()


def resample_months(group: pd.DataFrame) -> pd.DataFrame:
    '''Takes working dataframe and resamples frequency to months.
    Returns updated dataframe'''

    # Set 'incident_begin_date' as datetime axis
    group=group.set_index('incident_begin_date')

    # Sum the disasters in each month by year. This removes duplicates where
    # there was more than one disaster in a month.
    group=group.groupby(group.index.year, group_keys=False).apply(sum_months)

    # Resample to monthly frequency
    group=group.resample('ME').asfreq()

    # Fill missing values with 0
    group=group.fillna(0)

    # Convert everything to int
    group=group.astype(int)

    # Reset the index, preserving the `incident_begin_date`
    group.reset_index(inplace=True, drop=False)

    return group

# Build one regular monthly series per state. The state label is kept as the
# outer index level of the result; the grouping column itself is not handed
# to the resampling function.
resampled_working_df=(
    working_df
    .groupby('state', group_keys=True)
    .apply(resample_months, include_groups=False)
)
resampled_working_df.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,incident_begin_date,incidents
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
AK,0,1953-10-31 00:00:00+00:00,0
AK,1,1953-11-30 00:00:00+00:00,0
AK,2,1953-12-31 00:00:00+00:00,0
AK,3,1954-01-31 00:00:00+00:00,0
AK,4,1954-02-28 00:00:00+00:00,0


In [25]:
resampled_working_df.info()

# Total the resampled frame (not the pre-resampling one) so that this line
# actually shows whether any incidents were lost during regularization
print(f"\nHave {resampled_working_df['incidents'].sum()} total disaster incidents")

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 43639 entries, ('AK', np.int64(0)) to ('WY', np.int64(686))
Data columns (total 2 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   incident_begin_date  43639 non-null  datetime64[ns, UTC]
 1   incidents            43639 non-null  int64              
dtypes: datetime64[ns, UTC](1), int64(1)
memory usage: 851.1+ KB

Have 41865 total disaster incidents


### 2.4. Clean up and finalize features

In [26]:
# Flatten the (state, row number) MultiIndex: 'state' becomes an ordinary
# column, and the unnamed per-state row counter - which reset_index exposes
# as 'level_1' - is discarded
resampled_working_df=resampled_working_df.reset_index().drop(columns='level_1')

In [27]:
# Split the month-end timestamp into integer 'year' and 'month' features,
# then discard the timestamp column itself
resampled_working_df=(
    resampled_working_df
    .assign(
        year=lambda df: df['incident_begin_date'].dt.year.astype(int),
        month=lambda df: df['incident_begin_date'].dt.month.astype(int),
    )
    .drop(columns='incident_begin_date')
)
resampled_working_df.head()

Unnamed: 0,state,incidents,year,month
0,AK,0,1953,10
1,AK,0,1953,11
2,AK,0,1953,12
3,AK,0,1954,1
4,AK,0,1954,2


In [28]:
# Check the final column set and dtypes
resampled_working_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43639 entries, 0 to 43638
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   state      43639 non-null  object
 1   incidents  43639 non-null  int64 
 2   year       43639 non-null  int64 
 3   month      43639 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 1.3+ MB


In [29]:
# Summary statistics for the numeric columns
resampled_working_df.describe()

Unnamed: 0,incidents,year,month
count,43639.0,43639.0,43639.0
mean,0.959348,1989.346112,6.504709
std,7.79039,18.875705,3.450543
min,0.0,1953.0,1.0
25%,0.0,1973.0,4.0
50%,0.0,1990.0,7.0
75%,0.0,2006.0,10.0
max,509.0,2023.0,12.0


### 2.5. Sanity check incident sum

In [30]:
# Total the incidents in the *resampled* data. Taking this from working_df
# would compare the pre-resampling frame with itself and could never fail.
incident_count=resampled_working_df['incidents'].sum()

print(f'Final disaster count matches original disaster count: {incident_count == starting_disaster_count}')

# Stop here if the resampling gained or lost incidents
assert incident_count == starting_disaster_count, 'Resampling changed the total number of disaster incidents'

Final disaster count matches original disaster count: True


Note: switching to state as location aggregation level, dropping and combining disaster types and resampling with a frequency of months made our dataset size problem go away. Nice.

## 3. Save the resampled data

### 3.1. Complete dataset

In [31]:
# Save the complete dataset: one row per state and month, with the columns
# state, incidents, year and month
resampled_working_df.to_parquet('../data/resampled_disaster_data_all.parquet')

### 3.2. 1998 to present

In [32]:
# Also save a subset covering 1998 onwards. Filtering and re-indexing in one
# chain yields an independent dataframe, rather than modifying a filtered
# slice of the full dataset in place.
sample_resampled_working_df=(
    resampled_working_df
    .loc[resampled_working_df['year'] >= 1998]
    .reset_index(drop=True)
)
sample_resampled_working_df.to_parquet('../data/resampled_disaster_data_1998-current.parquet')
sample_resampled_working_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16384 entries, 0 to 16383
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   state      16384 non-null  object
 1   incidents  16384 non-null  int64 
 2   year       16384 non-null  int64 
 3   month      16384 non-null  int64 
dtypes: int64(3), object(1)
memory usage: 512.1+ KB
