In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime as dt
import tensorflow as tf
import urllib.request, json
import os

from math import sqrt
from sklearn.metrics import mean_squared_error
from pandas_datareader import data
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from statsmodels.tsa.arima.model import ARIMA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import FunctionTransformer

2025-01-16 00:25:01.270851: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-16 00:25:01.273721: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-16 00:25:01.280192: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1736987101.294818   18698 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1736987101.299291   18698 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-16 00:25:01.316466: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

## 1. Load and Prepare the Data

### 1.1. Load the Data

In [3]:
# Read the US disaster declarations CSV into the working dataframe
df = pd.read_csv(filepath_or_buffer='../data/us_disaster_declarations.csv')

### 1.2. Date Encoding

In [None]:
# Parse the 'incident_begin_date' strings into a datetime column named 'incident_dtm'
df['incident_dtm'] = pd.to_datetime(
    df['incident_begin_date'],
    format='%Y-%m-%dT%H:%M:%SZ',
)

In [None]:
# Derive 'month' and 'year' from the already-parsed 'incident_dtm' column.
# Re-parsing the raw 'incident_begin_date' strings with format='%b' or format='%Y'
# cannot match full timestamps, so the datetime accessor is used instead.

# Calendar month number (1-12) of the incident start
df['month'] = df['incident_dtm'].dt.month

# Calendar year of the incident start
df['year'] = df['incident_dtm'].dt.year

In [None]:
# Keep only declarations whose incident year is 2013 or later
df = df.loc[df['year'].ge(2013)]

### 1.3. Clean States

In [None]:
# Keep only rows whose 'state' code is one of the 50 US states listed here
# (the list includes AK and HI); any other code, e.g. a territory, is filtered out
mainland_states = [
    "AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE", "FL", "GA",
    "HI", "ID", "IL", "IN", "IA", "KS", "KY", "LA", "ME", "MD",
    "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ",
    "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC",
    "SD", "TN", "TX", "UT", "VT", "VA", "WA", "WV", "WI", "WY",
]

df = df.loc[df['state'].isin(mainland_states)]

### 1.4. Clean Incident Types

In [None]:
# Incident types excluded from the analysis
disaster_drops=['Biological', 'Snowstorm', 'Fire', 'Severe Ice Storm', 'Tornado', 'Drought', 'Coastal Storm', 
                'Other', 'Freezing', 'Earthquake', 'Typhoon', 'Tropical Storm', 'Volcanic Eruption', 'Winter Storm',
                'Fishing Losses', 'Mud/Landslide', 'Dam/Levee Break', 'Toxic Substances', 'Tsunami', 'Chemical', 'Human Cause', 'Terrorist']

# Remove every listed incident type with one membership mask instead of
# re-filtering (and re-copying) the whole frame once per type
df = df[~df['incident_type'].isin(disaster_drops)]

In [None]:
# Incident types to merge into a single category
common_disasters=['Severe Storm', 'Hurricane', 'Flood']

# Relabel every listed incident type as 'Common Disasters'
df.loc[df['incident_type'].isin(common_disasters), 'incident_type'] = 'Common Disasters'

In [None]:
# Incident types to merge into a single category
winter_weather=['Severe Ice Storm', 'Snowstorm', 'Freezing', 'Winter Storm']

# Relabel every listed incident type as 'Winter Weather'.
# NOTE(review): all four of these types also appear in `disaster_drops`, so they are
# removed from df before this cell runs and this relabel matches no rows. Remove them
# from `disaster_drops` if a 'Winter Weather' category is wanted; otherwise this cell
# can be deleted.
df.loc[df['incident_type'].isin(winter_weather), 'incident_type'] = 'Winter Weather'

### 1.5. Clean Columns

In [None]:
# Keep only the columns used downstream ('incident_dtm', 'incident_type', 'state')
# and index the rows by 'incident_dtm'
subset_df = df[['incident_dtm', 'incident_type', 'state']].set_index('incident_dtm')

### 1.6. Disaster Encoding

In [None]:
# One-hot encode 'incident_type' into integer indicator columns
disaster_dummies = pd.get_dummies(data=subset_df['incident_type'], dtype=int)

disaster_dummies.head()

In [None]:
# Attach the dummy columns to subset_df and remove the now-redundant 'incident_type'.
# The two frames are aligned by position, so both get a fresh integer index.
# When 'incident_dtm' is the index it is restored as a column rather than discarded:
# the resampling and reshaping cells below read subset_df['incident_dtm'].
subset_df = pd.concat(
    [
        subset_df.reset_index(drop=subset_df.index.name != 'incident_dtm'),
        disaster_dummies.reset_index(drop=True),
    ],
    axis=1,
)
subset_df = subset_df.drop(columns='incident_type')
subset_df.head()

### 1.7. Time Axis Regularization/Resampling

In [None]:
def sum_months(group: pd.DataFrame) -> pd.DataFrame:
    '''Takes a yearly groupby object and sums features over months'''

    group=group.resample('ME').sum()

    return group

def resample_months(group: pd.DataFrame) -> pd.DataFrame:
    '''Takes working dataframe and resamples frequency to months.
    Returns updated dataframe'''

    # Set 'incident_dtm' as datetime axis
    group=group.set_index('incident_dtm')

    # Sum disasters in each month by year; removes duplicates where there was more than one disaster in a month
    group=group.groupby(group.index.year, group_keys=False).apply(sum_months)

    # Resample to monthly frequency
    group=group.resample('D').asfreq()

    # Fill missing values with 0
    group=group.fillna(0)

    # Convert everything to int
    group=group.astype(bool)

    # Reset the index, preserving the `incident_dtm`
    group.reset_index(inplace=True, drop=False)

    return group

# Run the monthly resampling separately on each state's records
resampled_df = (
    subset_df
    .groupby('state', group_keys=True)
    .apply(resample_months, include_groups=False)
)

In [None]:
# Preview the first ten rows of the per-state resampled frame
resampled_df.head(n=10)

### 1.8. Data Reshaping

In [None]:
# Derive 'month' and 'year' from the 'incident_dtm' column of resampled_df
resampled_df['month'] = resampled_df['incident_dtm'].dt.month
resampled_df['year'] = resampled_df['incident_dtm'].dt.year

# 'state' is an index level of resampled_df (it comes from the per-state groupby),
# so it is moved back into a column before being used as an index key.
# set_index is called without inplace=True because the in-place form returns None,
# which would leave reshaped_df unusable.
reshaped_df = (
    resampled_df
    .reset_index(level='state')
    .set_index(['year', 'state', 'month'])
)

reshaped_df.head()

In [None]:
# Build test_df as a new frame so that subset_df is left untouched
# (a plain `test_df = subset_df` only creates a second name for the same object).
# 'month' and 'year' are read from the datetime column 'incident_dtm', then
# 'year', 'state', and 'month' become the index.
test_df = (
    subset_df
    .assign(
        month=subset_df['incident_dtm'].dt.month,
        year=subset_df['incident_dtm'].dt.year,
    )
    .set_index(['year', 'state', 'month'])
)

test_df.head()

In [None]:
# Derive 'month' and 'year' from the datetime column 'incident_dtm'
# (no string re-parsing is needed for a column that is already datetime)
subset_df['month'] = subset_df['incident_dtm'].dt.month
subset_df['year'] = subset_df['incident_dtm'].dt.year

# Set 'year', 'state', and 'month' indices
subset_df = subset_df.set_index(['year', 'state', 'month'])

subset_df.head()