# Data wrangling

In [2]:
import kagglehub
import pandas as pd
from kagglehub import KaggleDatasetAdapter

## 1. Download disaster data from Kaggle

In [3]:
# Load a DataFrame with a specific version of a CSV
raw_data_df=kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    'headsortails/us-natural-disaster-declarations',
    'us_disaster_declarations.csv',
)

In [4]:
raw_data_df.head().transpose()

Unnamed: 0,0,1,2,3,4
fema_declaration_string,DR-1-GA,DR-2-TX,DR-3-LA,DR-4-MI,DR-5-MT
disaster_number,1,2,3,4,5
state,GA,TX,LA,MI,MT
declaration_type,DR,DR,DR,DR,DR
declaration_date,1953-05-02T00:00:00Z,1953-05-15T00:00:00Z,1953-05-29T00:00:00Z,1953-06-02T00:00:00Z,1953-06-06T00:00:00Z
fy_declared,1953,1953,1953,1953,1953
incident_type,Tornado,Tornado,Flood,Tornado,Flood
declaration_title,Tornado,Tornado & Heavy Rainfall,Flood,Tornado,Floods
ih_program_declared,0,0,0,0,0
ia_program_declared,1,1,1,1,1


In [5]:
raw_data_df.to_parquet('../data/raw_disaster_data.parquet')

## 2. Download FIPS code database from census.gov

In [6]:
fips_df=pd.read_csv('https://www2.census.gov/geo/docs/reference/codes2020/national_county2020.txt', sep='|')
fips_df.head()

Unnamed: 0,STATE,STATEFP,COUNTYFP,COUNTYNS,COUNTYNAME,CLASSFP,FUNCSTAT
0,AL,1,1,161526,Autauga County,H1,A
1,AL,1,3,161527,Baldwin County,H1,A
2,AL,1,5,161528,Barbour County,H1,A
3,AL,1,7,161529,Bibb County,H1,A
4,AL,1,9,161530,Blount County,H1,A


In [7]:
# Save the raw data
fips_df.to_parquet('../data/fips_codes.parquet')

## 3. Add county name based on FIPS code

To add the county name, we need to concatenate the `STATEFP` and `COUNTYFP` columns in the census data, then use that string to translate between the `fips` column in the disaster data and the `COUNTYNAME` column in the census data. Let's put the logic in a function for easy refactoring.

In [8]:
def decode_fips(fips_df: pd.DataFrame, raw_data_df: pd.DataFrame) -> pd.DataFrame:
    '''Takes census.gov FIPS dataframe and disaster dataframe, adds human readable 
    county column to disaster data using the census data as look-up table returns 
    updated disaster dataframe'''

    # First extract the state and county FIPS codes
    state_fp=fips_df['STATEFP'].to_list()
    county_fp=fips_df['COUNTYFP'].to_list()

    # Left zero pad state and county FIPS codes to two and three digits respectively
    state_fp=[str(n).zfill(2) for n in state_fp]
    county_fp=[str(n).zfill(3) for n in county_fp]

    # Concatenate the state and county codes to get the full FIPS code
    fips=[i+j for i,j in zip(state_fp, county_fp)]

    # Make a dictionary to translate FIPS county codes to county names
    fips_lookup=dict(zip(fips, fips_df['COUNTYNAME']))

    # Add a new column to the raw disaster data containing the FIPS code to be translated
    data_df=raw_data_df.copy()
    data_df['county_name']=raw_data_df['fips'].apply(str)

    # Translate the column values from FIPS to county name, using 'Unknown' for County if we don't
    # have the FIPS code in our dict
    data_df['county_name']=data_df['county_name'].map(fips_lookup).fillna('Unknown')

    return data_df

In [9]:
# Do the decoding
data_df=decode_fips(fips_df, raw_data_df)

## 4. Save the data

In [10]:
# Take a look at what we have
data_df.tail().transpose()

Unnamed: 0,64087,64088,64089,64090,64091
fema_declaration_string,DR-4696-ME,DR-4697-MS,DR-4697-MS,DR-4697-MS,DR-4697-MS
disaster_number,4696,4697,4697,4697,4697
state,ME,MS,MS,MS,MS
declaration_type,DR,DR,DR,DR,DR
declaration_date,2023-03-22T00:00:00Z,2023-03-26T00:00:00Z,2023-03-26T00:00:00Z,2023-03-26T00:00:00Z,2023-03-26T00:00:00Z
fy_declared,2023,2023,2023,2023,2023
incident_type,Severe Storm,Severe Storm,Severe Storm,Severe Storm,Severe Storm
declaration_title,Severe Storm And Flooding,"Severe Storms, Straight-Line Winds, And Tornadoes","Severe Storms, Straight-Line Winds, And Tornadoes","Severe Storms, Straight-Line Winds, And Tornadoes","Severe Storms, Straight-Line Winds, And Tornadoes"
ih_program_declared,0,1,1,1,1
ia_program_declared,0,0,0,0,0


In [11]:
data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64092 entries, 0 to 64091
Data columns (total 24 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   fema_declaration_string     64092 non-null  object
 1   disaster_number             64092 non-null  int64 
 2   state                       64092 non-null  object
 3   declaration_type            64092 non-null  object
 4   declaration_date            64092 non-null  object
 5   fy_declared                 64092 non-null  int64 
 6   incident_type               64092 non-null  object
 7   declaration_title           64092 non-null  object
 8   ih_program_declared         64092 non-null  int64 
 9   ia_program_declared         64092 non-null  int64 
 10  pa_program_declared         64092 non-null  int64 
 11  hm_program_declared         64092 non-null  int64 
 12  incident_begin_date         64092 non-null  object
 13  incident_end_date           55682 non-null  ob

In [12]:
# Save the data
data_df.to_parquet('../data/disaster_data.parquet')

## 5. Data shape formatting

Now the real work begins - we need to set this dataset up for modeling as a multi-label time-series prediction problem. To make our lives a little easier, we will treat each county as an independent sample (this is likely not strictly true, as especially weather based disasters may tend to co-occur geographically). Doing so increases the number of observations and simplifies the input. Each individual time point will be a vector of dummy encoded disasters, including one feature for 'no-disaster'. This will give is n different time series for the n different counties. Care will need to be taken when splitting/sampling and generating batches not to cross between the time series from different counties.

Let's dig in!

### 5.1. Feature selection/engineering

In [13]:
# Get only the features we are going to work with into a new dataframe
working_df=data_df[['incident_begin_date','fips','incident_type']].copy()

# Convert 'incident_begin_date' to month and year columns and set as index
working_df['incident_begin_date']=pd.to_datetime(working_df['incident_begin_date'])

# Remove duplicates
working_df=working_df.drop_duplicates(keep='first')

# Fix the index
working_df.reset_index(inplace=True, drop=True)

# # Extract month and year from the dataetime series and add them back to the dataframe
# working_df['year'] = datetime_series.dt.year
# working_df['month'] = datetime_series.dt.month
working_df.head()

Unnamed: 0,incident_begin_date,fips,incident_type
0,1953-05-02 00:00:00+00:00,13000,Tornado
1,1953-05-15 00:00:00+00:00,48000,Tornado
2,1953-05-29 00:00:00+00:00,22000,Flood
3,1953-06-02 00:00:00+00:00,26000,Tornado
4,1953-06-06 00:00:00+00:00,30000,Flood


In [17]:
# Encode the disaster types to dummies
disaster_dummies=pd.get_dummies(working_df['incident_type'], dtype=int)

# Drop some disaster types - the dataset is big enough that it causes out of memory
# crashes in the Codespace. But more than that, we have some disaster types
# the are rare/hard to predict. Let's focus on the wether based types and get
# rid of the rest
disaster_drops=['Biological', 'Chemical', 'Earthquake', 'Fishing Losses', 'Human Cause', 'Other', 'Terrorist', 'Toxic Substances', 'Volcanic Eruption']
disaster_dummies.drop(disaster_drops, axis=1, inplace=True)

# We also have 4 disaster types that are winter weather relates, let's combine those
disaster_dummies['Winter weather']=disaster_dummies['Severe Ice Storm'] + disaster_dummies['Snowstorm'] + disaster_dummies['Freezing'] + disaster_dummies['Winter Storm']
disaster_dummies.drop(['Severe Ice Storm','Snowstorm','Freezing','Winter Storm'], axis=1, inplace=True)

disaster_dummies.head().transpose()

Unnamed: 0,0,1,2,3,4
Coastal Storm,0,0,0,0,0
Dam/Levee Break,0,0,0,0,0
Drought,0,0,0,0,0
Fire,0,0,0,0,0
Flood,0,0,1,0,1
Hurricane,0,0,0,0,0
Mud/Landslide,0,0,0,0,0
Severe Storm,0,0,0,0,0
Tornado,1,1,0,1,0
Tropical Storm,0,0,0,0,0


In [18]:
working_df=pd.concat([working_df.reset_index(drop=True), disaster_dummies.reset_index(drop=True)], axis=1)
working_df.head().transpose()

Unnamed: 0,0,1,2,3,4
incident_begin_date,1953-05-02 00:00:00+00:00,1953-05-15 00:00:00+00:00,1953-05-29 00:00:00+00:00,1953-06-02 00:00:00+00:00,1953-06-06 00:00:00+00:00
fips,13000,48000,22000,26000,30000
incident_type,Tornado,Tornado,Flood,Tornado,Flood
Coastal Storm,0,0,0,0,0
Dam/Levee Break,0,0,0,0,0
Drought,0,0,0,0,0
Fire,0,0,0,0,0
Flood,0,0,1,0,1
Hurricane,0,0,0,0,0
Mud/Landslide,0,0,0,0,0


Now, we need to regularize the time series to a frequency of months across the span of years within each county. As we do so, there will be months with no disaster that will need to be filled in with a 'none' value.

## 5.2. Time series regularization

In [15]:
def sum_months(group: pd.DataFrame) -> pd.DataFrame:
    '''Takes a yearly groupby object and sums features over months'''

    group=group.resample('ME').sum()

    return group

def resample_months(group: pd.DataFrame) -> pd.DataFrame:
    '''Takes working dataframe and resamples frequency to months.
    Returns updated dataframe'''

    # Set 'incident_begin_date' as datetime axis
    group=group.set_index('incident_begin_date')

    print(group.head())

    # Sum the disasters in each month by year. This removes duplicates where
    # there was more than on disaster in a month.
    group=group.groupby(group.index.year, group_keys=False).apply(sum_months)

    # Resample to monthly frequency
    group=group.resample('D').asfreq()

    # Fill missing values with 0
    group=group.fillna(0)

    # Reset the index, preserving the `incident_begin_date`
    group.reset_index(inplace=True, drop=False)

    print(group.head())

    return group

: 

In [None]:
resampled_working_df=working_df.groupby('fips', group_keys=True).apply(resample_months, include_groups=False)

                          incident_type  Biological  Chemical  Coastal Storm  \
incident_begin_date                                                            
1961-02-27 00:00:00+00:00         Flood           0         0              0   
2017-09-08 00:00:00+00:00     Hurricane           0         0              0   
2017-10-06 00:00:00+00:00     Hurricane           0         0              0   
2018-10-10 00:00:00+00:00     Hurricane           0         0              0   
2020-01-20 00:00:00+00:00    Biological           1         0              0   

                           Dam/Levee Break  Drought  Earthquake  Fire  \
incident_begin_date                                                     
1961-02-27 00:00:00+00:00                0        0           0     0   
2017-09-08 00:00:00+00:00                0        0           0     0   
2017-10-06 00:00:00+00:00                0        0           0     0   
2018-10-10 00:00:00+00:00                0        0           0     0   
2

In [None]:
resampled_working_df.head()

In [None]:
working_df.info()