In [2]:
import pandas as pd

## 1. Data Preparation

### 1.1. Load the Data

In [15]:
# Read the disaster declarations CSV into the working frame
df = pd.read_csv(filepath_or_buffer='../data/us_disaster_declarations.csv')

### 1.2. Date Encoding

In [16]:
# Parse the 'incident_begin_date' strings (e.g. 2023-03-24T00:00:00Z pattern)
# into a datetime column called 'incident_dtm'
df['incident_dtm'] = pd.to_datetime(
    arg=df['incident_begin_date'],
    format='%Y-%m-%dT%H:%M:%SZ',
)

In [11]:
# 'incident_dtm' is already a datetime column, so the calendar parts can be read
# straight from the .dt accessor (no re-parsing and no format string needed).

# Create 'month' column (1-12) from 'incident_dtm'
df['month'] = df['incident_dtm'].dt.month

# Create 'year' column from 'incident_dtm'
df['year'] = df['incident_dtm'].dt.year

In [9]:
# Persist the frame with the derived date columns
# Contents: original df plus incident_dtm / year / month
df.to_parquet(path='../data/dtm_df.parquet')

In [45]:
# Keep only rows whose 'year' is 2009 or later
df = df.loc[df['year'].ge(2009)]

### 1.3. Clean States

In [17]:
# Postal codes of the 50 US states; rows for any other code (territories etc.) are removed
mainland_states = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA", "HI", "ID", "IL",
    "IN", "IA", "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT",
    "NE", "NV", "NH", "NJ", "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI",
    "SC", "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY",
]

df = df.loc[df['state'].isin(mainland_states)]

### 1.4. Clean Incident Types

In [47]:
# Incident types excluded from the analysis
disaster_drops = ['Biological', 'Snowstorm', 'Fire', 'Severe Ice Storm', 'Tornado', 'Drought', 'Coastal Storm',
                  'Other', 'Freezing', 'Earthquake', 'Typhoon', 'Tropical Storm', 'Volcanic Eruption', 'Winter Storm',
                  'Fishing Losses', 'Mud/Landslide', 'Dam/Levee Break', 'Toxic Substances', 'Tsunami', 'Chemical',
                  'Human Cause', 'Terrorist']

# Drop every listed incident type with one vectorised mask instead of
# re-filtering the whole frame once per type
df = df[~df['incident_type'].isin(disaster_drops)]

In [None]:
# Incident types to merge into a single category
common_disasters = ['Severe Storm', 'Hurricane', 'Flood']

# Relabel all of the listed incident types as 'Common Disasters' in one assignment
df.loc[df['incident_type'].isin(common_disasters), 'incident_type'] = 'Common Disasters'

In [None]:
# Incident types to merge into a single winter category
winter_weather = ['Severe Ice Storm', 'Snowstorm', 'Freezing', 'Winter Storm']

# Relabel all of the listed incident types as 'Winter Weather' in one assignment.
# NOTE(review): all four of these types also appear in the `disaster_drops` list in
# section 1.4; if that cell has already run, no rows match here and this is a no-op.
df.loc[df['incident_type'].isin(winter_weather), 'incident_type'] = 'Winter Weather'

### 1.5. Clean Columns

In [20]:
# Narrow the frame to the three columns kept for the rest of the notebook:
# 'incident_dtm', 'incident_type' and 'state'
subset_df = df.loc[:, ['incident_dtm', 'incident_type', 'state']]

#subset_df.set_index('incident_dtm', inplace=True)

Unnamed: 0,incident_dtm,incident_type,state
0,1953-05-02,Tornado,GA
1,1953-05-15,Tornado,TX
2,1953-05-29,Flood,LA
3,1953-06-02,Tornado,MI
4,1953-06-06,Flood,MT
...,...,...,...
64087,2022-12-23,Severe Storm,ME
64088,2023-03-24,Severe Storm,MS
64089,2023-03-24,Severe Storm,MS
64090,2023-03-24,Severe Storm,MS


In [19]:
# Persist the three-column subset
# Contents: incident_dtm / incident_type / state
subset_df.to_parquet(path='../data/subset_df.parquet')

### 1.6. Monthly Aggregation and 'No Disaster' Column

In [49]:
# Monthly aggregation and "no disaster" feature

# Count rows per calendar month. `on='incident_dtm'` bins on the datetime column,
# because subset_df still has its default (non-datetime) index at this point.
# 'ME' (month end) replaces the deprecated 'M' alias.
monthly_disasters = subset_df.resample('ME', on='incident_dtm').size()
monthly_disasters_df = monthly_disasters.to_frame(name='disaster_count')

# Binary flag: 0 when the month has no rows, 1 otherwise
monthly_disasters_df['disaster?'] = monthly_disasters_df['disaster_count'].ne(0).astype(int)

  monthly_disasters=subset_df.resample('M').size()


In [41]:
# Persist the monthly aggregate
# Contents: incident_dtm (index) / disaster_count / disaster?
monthly_disasters_df.to_parquet(path='../data/clean_state_type_df.parquet')

In [50]:
# Persist the monthly aggregate under the 2009-2023 file name
# Contents: incident_dtm (index) / disaster_count / disaster?
# NOTE(review): this writes the same `monthly_disasters_df` variable as the
# 'clean_state_type_df.parquet' save; presumably it is meant to run only after
# the year >= 2009 filter has been applied upstream - confirm before relying on it.
monthly_disasters_df.to_parquet(path='../data/clean_state_type_09_df.parquet')

### 1.7. Disaster Encoding

In [None]:
# One-hot encode 'incident_type': one integer indicator column per distinct type
disaster_dummies = pd.get_dummies(data=subset_df['incident_type'], dtype=int)

disaster_dummies.head(n=5)

In [None]:
# Attach the indicator columns to subset_df side by side (both re-indexed 0..n-1 so
# rows line up by position), then discard the now-redundant 'incident_type' column
subset_df = pd.concat(
    [subset_df.reset_index(drop=True), disaster_dummies.reset_index(drop=True)],
    axis=1,
)
subset_df.drop(columns='incident_type', inplace=True)
subset_df.head()

### 1.8. Time Axis Regularization/Resampling

In [None]:
def sum_months(group: pd.DataFrame) -> pd.DataFrame:
    '''Takes a yearly groupby object and sums features over months'''

    group=group.resample('ME').sum()

    return group

def resample_months(group: pd.DataFrame) -> pd.DataFrame:
    '''Takes working dataframe and resamples frequency to months.
    Returns updated dataframe'''

    # Set 'incident_dtm' as datetime axis
    group=group.set_index('incident_dtm')

    # Sum disasters in each month by year; removes duplicates where there was more than one disaster in a month
    group=group.groupby(group.index.year, group_keys=False).apply(sum_months)

    # Resample to monthly frequency
    group=group.resample('D').asfreq()

    # Fill missing values with 0
    group=group.fillna(0)

    # Convert everything to int
    group=group.astype(bool)

    # Reset the index, preserving the `incident_dtm`
    group.reset_index(inplace=True, drop=False)

    return group

# Apply resample_months to each state's rows separately; the state key is kept
# as the outer index level and excluded from the frame passed to the function
resampled_df = (
    subset_df
    .groupby('state', group_keys=True)
    .apply(resample_months, include_groups=False)
)

In [None]:
# Preview the first ten rows of the resampled frame
resampled_df.head(n=10)

### 1.9. Data Reshaping

In [None]:
# Build a (year, state, month)-indexed view of resampled_df.
#
# Two things made the earlier version of this cell fail:
#   * set_index(..., inplace=True) returns None, so `reshaped_df` was None
#   * 'state' is an index level of resampled_df (from the groupby), not a column,
#     so it has to be moved back into the columns before it can be used as a key
# resampled_df itself is left untouched.
reshaped_df = (
    resampled_df
    .reset_index(level='state')
    .assign(
        # 'incident_dtm' is already datetime, so read the parts from .dt directly
        month=lambda frame: frame['incident_dtm'].dt.month,
        year=lambda frame: frame['incident_dtm'].dt.year,
    )
    .set_index(['year', 'state', 'month'])
)

reshaped_df.head()

In [None]:
# Build test_df as a new frame (assign/set_index return copies) so that
# re-indexing it does not also re-index subset_df, which a plain
# `test_df = subset_df` alias followed by an in-place set_index would do.
test_df = (
    subset_df
    .assign(
        # Create 'month' and 'year' columns from the datetime column 'incident_dtm'
        month=lambda frame: frame['incident_dtm'].dt.month,
        year=lambda frame: frame['incident_dtm'].dt.year,
    )
    # Set 'year', 'state', and 'month' indices
    .set_index(['year', 'state', 'month'])
)

test_df.head()

In [None]:
# 'incident_dtm' is already a datetime column, so month and year come straight
# from the .dt accessor; the result is rebound rather than mutated in place.
subset_df = (
    subset_df
    .assign(
        # Create 'month' and 'year' columns from 'incident_dtm'
        month=lambda frame: frame['incident_dtm'].dt.month,
        year=lambda frame: frame['incident_dtm'].dt.year,
    )
    # Set 'year', 'state', and 'month' indices
    .set_index(['year', 'state', 'month'])
)

subset_df.head()

### 2.0. Feature Selection

In [None]:
# Remove non-weather disasters (include earthquakes, volcanic eruptions)
# Combine severe ice storm, snowstorm, freezing, and winter storm into 'winter weather'
#
# NOTE(review): every name used below must exist as a column of resampled_df,
# otherwise drop()/column lookup raises KeyError. The row filter in section 1.4
# removes these incident types before the dummy columns are created, so the
# columns may be absent - confirm which cells are meant to run before this one.

# Drop some disaster types - focus on weather-related disasters (include wildfires, earthquakes, and volcanic eruptions)
# This rebinds `disaster_drops`, replacing the longer list defined in section 1.4.
disaster_drops=['Biological', 'Chemical', 'Fishing Losses', 'Human Cause', 'Other', 'Terrorist', 'Toxic Substances']
resampled_df.drop(disaster_drops, axis=1, inplace=True)

# Combine winter-related weather disasters
# NOTE(review): the result of `+` depends on the column dtype (boolean columns
# presumably combine as a logical OR, integer columns as a count) - verify.
resampled_df['Winter weather']=resampled_df['Severe Ice Storm'] + resampled_df['Snowstorm'] + resampled_df['Freezing'] + resampled_df['Winter Storm']
# The four source columns are removed once they have been folded into 'Winter weather'
resampled_df.drop(['Severe Ice Storm','Snowstorm','Freezing','Winter Storm'], axis=1, inplace=True)
resampled_df.head()

In [None]:
# Move the index levels back into ordinary columns, then discard 'level_1'
# (presumably the unnamed positional level left over from the groupby-apply)
resampled_df.reset_index(drop=False, inplace=True)
resampled_df.drop(columns='level_1', inplace=True)

In [None]:
# Extract month and year from 'incident_dtm' and drop the dtm column.
# The datetime column produced by resample_months is named 'incident_dtm';
# no 'declaration_dtm' column is created anywhere in this notebook.
resampled_df['year'] = resampled_df['incident_dtm'].dt.year
resampled_df['month'] = resampled_df['incident_dtm'].dt.month
resampled_df.drop(columns='incident_dtm', inplace=True)
resampled_df.head()