In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import datetime

from math import sqrt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Read the US disaster declarations CSV (relative path) into the notebook's working DataFrame
df = pd.read_csv(filepath_or_buffer='../data/us_disaster_declarations.csv')

### Regularize Time Axis (Monthly) 

In [None]:
# Create resampled_df containing 'state', 'declaration_dtm', and 'incident_type'.
# 'declaration_dtm' is parsed here from the raw 'declaration_date' strings so this cell
# runs on a fresh kernel instead of depending on a column that is only created further down.
resampled_df=(
    df.assign(
        declaration_dtm=lambda d: pd.to_datetime(d['declaration_date'], format='%Y-%m-%dT%H:%M:%SZ')
    )[['state', 'declaration_dtm', 'incident_type']]
)

# Set index to declaration_dtm, group by 'state', and count the incidents in each
# month-end ('ME') bin. count() is used instead of sum() because summing the string
# 'incident_type' column does not yield a number of incidents.
(
    resampled_df
    .set_index('declaration_dtm')
    .groupby('state')['incident_type']
    .resample('ME')
    .count()
    .head()
)

In [None]:
# Check if dtm axis is regular - take the difference between consecutive timestamps.
# The timestamps live in the 'declaration_dtm' column (resampled_df is not indexed by
# date in the cell that builds it), so the column is differenced rather than the row index.
diff=np.diff(resampled_df['declaration_dtm'].to_numpy())

# Check that all of the differences are equal.
# With fewer than two rows there is nothing to compare, so the axis is reported as regular.
equally_spaced=bool(diff.size == 0 or np.all(diff==diff[0]))

print(f'Datetime axis is regular: {equally_spaced}')

In [None]:
# Check time axis in a plot (for year 1980 only)
year=resampled_df[resampled_df['declaration_dtm'].dt.year==1980]

# value_counts() orders its result by frequency; sort_index() restores chronological
# order so the x-axis actually reads as a time axis
counts=year['declaration_dtm'].value_counts().sort_index()

counts.plot(kind='bar')
plt.title('Incident Count by Date')
plt.xticks(rotation=45, ha='right')
plt.xlabel('Timepoint')
plt.ylabel('Count')
plt.show()

### Datetime (dtm) Conversions

In [7]:
# Convert the 'declaration_date' strings to datetimes in a new 'declaration_dtm' column
# (the format expects a date, a 'T' separator, a time, and a literal trailing 'Z')
df['declaration_dtm']=pd.to_datetime(df['declaration_date'], format='%Y-%m-%dT%H:%M:%SZ')

# Create subset_df containing 'state', 'declaration_dtm', and 'incident_type'.
# copy() makes subset_df an independent frame rather than a slice of df.
subset_df=df[['state', 'declaration_dtm', 'incident_type']].copy()

subset_df.head()

Unnamed: 0_level_0,state,declaration_dtm,incident_type
declaration_dtm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1953-05-02,GA,1953-05-02,Tornado
1953-05-15,TX,1953-05-15,Tornado
1953-05-29,LA,1953-05-29,Flood
1953-06-02,MI,1953-06-02,Tornado
1953-06-06,MT,1953-06-06,Flood


In [8]:
# Create 'month' column holding the month number of 'declaration_dtm'
# ('declaration_dtm' is already a datetime column, so the .dt accessor is used directly)
df['month']=df['declaration_dtm'].dt.month

# Create 'year' column holding the year of 'declaration_dtm'
df['year']=df['declaration_dtm'].dt.year

# Create 'date' column with year-month format of each incident, with day assigned to 1
df['date']=pd.to_datetime(df[['year', 'month']].assign(day=1))

# Create date_df with reformatted 'date', 'incident_type', and 'state'
date_df=df[['date', 'incident_type', 'state']]

date_df.head()

Unnamed: 0_level_0,date,incident_type,state
declaration_dtm,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1953-05-02,1953-05-01,Tornado,GA
1953-05-15,1953-05-01,Tornado,TX
1953-05-29,1953-05-01,Flood,LA
1953-06-02,1953-06-01,Tornado,MI
1953-06-06,1953-06-01,Flood,MT


### Incident Counts

In [22]:
# Count incidents per month-floored 'date'. date_df is the frame that carries the 'date'
# column at this point in the notebook; the subset_df built earlier does not have one.
count_date_df=date_df['date'].value_counts().reset_index()

count_date_df.head(30)


Unnamed: 0,date,count
0,2020-03-01,5842
1,2005-09-01,3607
2,2020-04-01,2138
3,2004-09-01,859
4,1993-03-01,824
5,2017-09-01,807
6,1996-01-01,764
7,2021-02-01,747
8,1999-09-01,634
9,2020-08-01,571


### Date Encoding

In [None]:
# Floor day of the month so that all incidents are reported on the same day monthly (1st of the month; for monthly counts)

# Create 'month' column holding the month number of 'declaration_dtm'
# ('declaration_dtm' is already a datetime column, so the .dt accessor is used directly)
df['month']=df['declaration_dtm'].dt.month

# Create 'year' column holding the year of 'declaration_dtm'
df['year']=df['declaration_dtm'].dt.year

# Create 'date' column with year-month format of each incident (day assigned 1st of month)
df['date']=pd.to_datetime(df[['year', 'month']].assign(day=1))

# Create 'doy' column to extract the day of the year.
# NOTE(review): this is taken from the month-floored 'date', so it is the day of the year
# of the 1st of each month, not of the original declaration day - confirm this is intended.
df['doy']=df['date'].dt.day_of_year

df.head()

In [None]:
# Create subset_df with reformatted 'date' and other necessary columns, plus an 'mo_count'
# column holding the number of disasters that share each row's 'date'.
# assign() returns a new frame, so the new column is not written into a slice of df.
subset_df=(
    df[['date', 'incident_type', 'declaration_title', 'state', 'doy', 'year']]
    .assign(mo_count=lambda d: d.groupby('date')['date'].transform('count'))
)

subset_df.head(10)

### Time Axis Regularization

In [None]:
# Regularity check for the datetime axis

# dtm_df is subset_df re-indexed by its 'date' column
dtm_df=subset_df.set_index(keys='date')

# Gaps between consecutive index timestamps
diff=np.diff(dtm_df.index.to_numpy())

# The axis is regular only when every gap matches the first one
equally_spaced=(diff==diff[0]).all()

print(f'Datetime axis is regular: {equally_spaced}')
dtm_df.head()

In [None]:
# Resample the time series into month-end bins and take the mean monthly count;
# interpolate() then fills the bins that come out empty (NaN mean).
# 'ME' is the month-end alias already used earlier in this notebook; the older 'M'
# spelling is deprecated in recent pandas releases.
resampled_df=dtm_df['mo_count'].resample('ME').mean().interpolate()

resampled_df.head()

### Baseline Model Performance

In [None]:
# Instantiate a linear model
linear_model=LinearRegression()

# Generate a uniform x variable representing the time steps (one step per row)
# NOTE(review): subset_df has one row per incident, not one per month - confirm whether
# the baseline should be fitted on the monthly resampled series instead.
x=np.arange(len(subset_df)).reshape(-1, 1)

# Target is the per-date incident count; the column is named 'mo_count' in subset_df
y=subset_df['mo_count']

# Fit the model
fit_result=linear_model.fit(x, y)

# Make predictions over the input domain
predictions=linear_model.predict(x)

# Add the linear model predictions back to the dataframe
# (assign() builds a new frame instead of writing into a possible slice)
subset_df=subset_df.assign(**{'Linear model': predictions})

# Calculate the root mean squared error
rmse=sqrt(mean_squared_error(y, predictions))

# Save the result for later
rmse_results={'Linear model': rmse}

# Display the result for the user
print(f'Root mean square error: {rmse:.1f}')

### Missing and/or Extreme Values

In [None]:
# Plot the per-date incident counts as a histogram, as for any numerical data.
# The count column in subset_df is named 'mo_count'.
plt.title('Distribution of Disaster Incidents')
plt.hist(subset_df['mo_count'])
plt.xlabel('Disaster Incidents')
plt.ylabel('Count')
plt.show()