# Data wrangling

In [1]:
import kagglehub
import pandas as pd
from kagglehub import KaggleDatasetAdapter

## 1. Download disaster data from Kaggle

In [2]:
# Load the disaster declarations CSV from Kaggle into a pandas DataFrame.
# NOTE(review): the dataset handle carries no version suffix, so this presumably
# pulls the latest version rather than a pinned one - confirm if reproducibility matters
raw_data_df=kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    'headsortails/us-natural-disaster-declarations',
    'us_disaster_declarations.csv',
)

In [None]:
# Preview the first rows, transposed so that each column is shown as a row
raw_data_df.head().transpose()

In [4]:
# Save the unmodified disaster data as parquet
raw_data_df.to_parquet('../data/raw_disaster_data.parquet')

## 2. Download FIPS code database from census.gov

In [None]:
# Read the pipe-delimited 2020 national county code table from census.gov
# NOTE(review): with default dtypes the numeric-looking code columns are presumably
# parsed as integers, which drops their leading zeros - confirm
fips_df=pd.read_csv('https://www2.census.gov/geo/docs/reference/codes2020/national_county2020.txt', sep='|')
fips_df.head()

In [6]:
# Save the unmodified census FIPS table as parquet
fips_df.to_parquet('../data/fips_codes.parquet')

## 3. Add county name based on FIPS code

To add the county name, we need to concatenate the `STATEFP` and `COUNTYFP` columns in the census data, then use that string to translate between the `fips` column in the disaster data and the `COUNTYNAME` column in the census data. Let's put the logic in a function for easy refactoring.

In [7]:
def decode_fips(fips_df: pd.DataFrame, raw_data_df: pd.DataFrame) -> pd.DataFrame:
    '''Adds a human readable county name column to the disaster data.

    The census.gov FIPS table is used as a look-up table: its state and county
    codes are zero padded and concatenated into a five character FIPS code,
    which is matched against the (equally zero padded) 'fips' column of the
    disaster data.

    Args:
        fips_df: census.gov FIPS dataframe with 'STATEFP', 'COUNTYFP' and
            'COUNTYNAME' columns.
        raw_data_df: disaster dataframe with a 'fips' column. It is not modified.

    Returns:
        Copy of raw_data_df with an added 'county_name' column. Rows whose FIPS
        code is not present in the census table get the value 'Unknown'.
    '''

    # Left zero pad state and county FIPS codes to two and three digits respectively,
    # then concatenate them to get the full five digit FIPS code
    state_fp=fips_df['STATEFP'].astype(str).str.zfill(2)
    county_fp=fips_df['COUNTYFP'].astype(str).str.zfill(3)
    full_fips=state_fp + county_fp

    # Make a dictionary to translate FIPS county codes to county names
    fips_lookup=dict(zip(full_fips, fips_df['COUNTYNAME']))

    # The disaster FIPS codes need the same left zero padding as the look-up keys:
    # a code stored as the integer 1001 would otherwise become '1001', never match
    # the key '01001' and be reported as 'Unknown'
    disaster_fips=raw_data_df['fips'].astype(str).str.zfill(5)

    # Translate FIPS to county name on a copy of the input, using 'Unknown' for
    # county if we don't have the FIPS code in our dict
    data_df=raw_data_df.copy()
    data_df['county_name']=disaster_fips.map(fips_lookup).fillna('Unknown')

    return data_df

In [8]:
# Do the decoding: add the human readable 'county_name' column to the disaster data
data_df=decode_fips(fips_df, raw_data_df)

## 4. Save the data

In [None]:
# Take a look at what we have: last rows, transposed so that each column is shown as a row
data_df.tail().transpose()

In [None]:
# Check column dtypes and non-null counts
data_df.info()

In [11]:
# Save the disaster data, now including the 'county_name' column, as parquet
data_df.to_parquet('../data/disaster_data.parquet')

## 5. Data shape formatting

Now the real work begins - we need to set this dataset up for modeling as a multi-label time-series prediction problem. To make our lives a little easier, we will treat each county as an independent sample (this is likely not strictly true, since weather-based disasters in particular may tend to co-occur geographically). Doing so increases the number of observations and simplifies the input. Each individual time point will be a vector of dummy-encoded disasters, including one feature for 'no-disaster'. This will give us n different time series for the n different counties. Care will need to be taken when splitting/sampling and generating batches not to cross between the time series from different counties.

Let's dig in!

### 5.1. Feature selection/engineering

In [None]:
# Build the working dataframe in one pass:
#   1. keep only the features we are going to work with
#   2. parse 'incident_begin_date' into a datetime column
#   3. remove exact duplicate rows, keeping the first occurrence
#   4. renumber the index
working_df=(
    data_df[['incident_begin_date','fips','incident_type']]
    .assign(incident_begin_date=lambda frame: pd.to_datetime(frame['incident_begin_date']))
    .drop_duplicates(keep='first')
    .reset_index(drop=True)
)

# TODO: extract year and month columns from 'incident_begin_date' if they turn out to be needed
working_df.head()

In [None]:
# Dummy encode the incident type: one integer column per distinct 'incident_type' value
disaster_dummies=pd.get_dummies(working_df['incident_type'], dtype=int)
disaster_dummies.head()

In [None]:
# Append the dummy encoded disaster columns to the working dataframe. Any dummy
# columns already present are dropped first, so re-running this cell does not
# keep appending duplicate columns to working_df
working_df=pd.concat(
    [
        working_df.drop(columns=disaster_dummies.columns, errors='ignore').reset_index(drop=True),
        disaster_dummies.reset_index(drop=True),
    ],
    axis=1,
)
working_df.head().transpose()

Now, we need to regularize the time series to a frequency of months across the span of years within each county. As we do so, there will be months with no disaster that will need to be filled in with a 'none' value.

### 5.2. Time series regularization

In [29]:
def sum_months(group: pd.DataFrame) -> pd.DataFrame:
    '''Takes a yearly groupby object and sums features over months'''

    group=group.resample('ME').sum()

    return group

def resample_months(group: pd.DataFrame) -> pd.DataFrame:
    '''Takes working dataframe and resamples frequency to months.
    Returns updated dataframe'''

    # Set 'incident_begin_date' as datetime axis
    group=group.set_index('incident_begin_date')

    print(group.head())

    # Sum the disasters in each month by year. This removes duplicates where
    # there was more than on disaster in a month.
    group=group.groupby(group.index.year, group_keys=False).apply(sum_months)


    # Resample to monthly frequency
    group=group.resample('D').asfreq()

    # Fill missing values 'no_disaster'
    group['incident_type']=group['incident_type'].fillna(0)

    # Reset the index, preserving the `incident_begin_date`
    group.reset_index(inplace=True, drop=False)

    print(group.head())

    return group

In [None]:
# Regularize each county's records separately. The 'fips' group key is excluded from
# the frame handed to resample_months (include_groups=False) and is added back as the
# outer index level of the combined result (group_keys=True)
resampled_working_df=working_df.groupby('fips', group_keys=True).apply(resample_months, include_groups=False)

In [None]:
# Preview the first rows of the regularized time series
resampled_working_df.head()

In [None]:
# Column dtypes and non-null counts of the working dataframe
# NOTE(review): this inspects working_df, not resampled_working_df - confirm which one was intended
working_df.info()