In [10]:
%matplotlib inline
from db_scripts.focus_intersection import subset_floods, flood_df, subset_locations
from db_scripts.get_server_data import get_table_for_variable, get_db_table_as_df
import pandas as pd
import numpy as np

In [11]:
grouped = subset_floods.groupby('event')

In [12]:
event_total_flooded = subset_floods['event'].value_counts()

In [13]:
event_dates = grouped['_date'].unique()
for e in event_dates:
    for d in e:
        d = pd.to_datetime(d)
        d = d.strftime("%Y-%m-%d")
num_event_dates = grouped['_date'].nunique()
num_locations = grouped['location'].nunique()

In [14]:
event_df = pd.concat([event_dates, event_total_flooded, num_event_dates, num_locations], axis=1)
event_df.columns = ['dates', 'num_flooded', 'num_dates', 'num_locations']
event_df.reset_index(inplace=True)
event_df.head()

Unnamed: 0,index,dates,num_flooded,num_dates,num_locations
0,01/15/2016 (1/15/2016),[2016-01-15T00:00:00.000000000],1,1,1
1,09/02/15 (9/2/2015),[2015-09-02T00:00:00.000000000],1,1,1
2,7/10 Thunderstorms (7/10/2014),[2014-07-10T00:00:00.000000000],27,1,27
3,Bernie (Training) (7/25/2016),[2016-07-25T00:00:00.000000000],1,1,1
4,February 24th Storm (2/24/2016),[2016-02-24T00:00:00.000000000],1,1,1


In [15]:
s = pd.Series(event_df['index'])
s = s.str.replace('\(Training\)', 'Training')  # remove parenth around training to get right split in next line

event_date_names = s.str.split("(", expand=True)
event_names = event_date_names[0]
event_date = event_date_names[1].str.replace("\)", "")
event_date = pd.to_datetime(event_date)
event_df['event_name'] = event_names.str.strip()
event_df['event_date'] = event_date
del event_df['index']
event_df.set_index(['event_date', 'event_name'], inplace=True)
event_df

Unnamed: 0_level_0,Unnamed: 1_level_0,dates,num_flooded,num_dates,num_locations
event_date,event_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2016-01-15,01/15/2016,[2016-01-15T00:00:00.000000000],1,1,1
2015-09-02,09/02/15,[2015-09-02T00:00:00.000000000],1,1,1
2014-07-10,7/10 Thunderstorms,[2014-07-10T00:00:00.000000000],27,1,27
2016-07-25,Bernie Training,[2016-07-25T00:00:00.000000000],1,1,1
2016-02-24,February 24th Storm,[2016-02-24T00:00:00.000000000],1,1,1
2016-09-19,HEAVY RAIN,[2016-09-19T00:00:00.000000000],4,1,4
2016-09-02,HERMINE,[2016-09-03T00:00:00.000000000],35,1,35
2013-10-09,Heavy Rain,"[2013-10-08T00:00:00.000000000, 2013-10-09T00:...",6,3,6
2014-05-16,Heavy Rain,[2014-05-16T00:00:00.000000000],21,1,21
2016-10-05,Hurricane Matthew,"[2016-10-05T00:00:00.000000000, 2016-10-08T00:...",17,5,17


You wouldn't expect to see a different number of locations that were flooded ('num_locations') than number of flood occurences ('num_flooded') for an event that only happened on one day. But this happens on 2014-07-24 where there are 8 places where it flooded but only 7 location names. Let's check that out.

In [16]:
fl_724 = subset_floods[subset_floods['_date'] == '2014-07-24']
fl_724[fl_724['location'].duplicated(keep=False)]

Unnamed: 0.1,Unnamed: 0,﻿recid,location,event,eventType,xcoord,ycoord,dt,_date,_time
444,444,4264,HAMPTON BOULEVARD & W 21ST STREET,unnamed (7/24/2014),Flooded street,12125900.0,3484891.0,2014-07-24 20:29:25.000,2014-07-24,2014-07-24 20:29:25.000
445,445,4265,HAMPTON BOULEVARD & W 21ST STREET,unnamed (7/24/2014),Flooded underpass,12125900.0,3484891.0,2014-07-24 20:29:25.000,2014-07-24,2014-07-24 20:29:25.000


So _here's_ what is happening. The location name is the same in two rows but there are two different event types: "flooded street" and "flooded underpass."
Now that I think about it, that may explain all the differences between the num_location and num_flooded columns. Let's try another one.

In [17]:
irene = subset_floods[subset_floods['event'].str.contains('Irene')].sort_values('location')
irene[irene['location'].duplicated(keep=False)]

Unnamed: 0.1,Unnamed: 0,﻿recid,location,event,eventType,xcoord,ycoord,dt,_date,_time
182,182,1151,1000 BLOCK OF E VIRGINIA BEACH BOULEVARD,Irene (8/27/2011),Flooded underpass,12134230.0,3478210.0,2011-08-28 04:18:01.000,2011-08-28,2011-08-28 04:18:01.000
181,181,1150,1000 BLOCK OF E VIRGINIA BEACH BOULEVARD,Irene (8/27/2011),Flooded street,12134230.0,3478210.0,2011-08-28 04:18:01.000,2011-08-28,2011-08-28 04:18:01.000
104,104,926,E 21ST STREET & MONTICELLO AVENUE,Irene (8/27/2011),Flooded street,12131100.0,3482796.0,2011-08-27 06:08:00.000,2011-08-27,2011-08-27 06:08:00.000
185,185,1248,E 21ST STREET & MONTICELLO AVENUE,Irene (8/27/2011),Flooded underpass,12131100.0,3482796.0,2011-08-28 08:44:35.000,2011-08-28,2011-08-28 08:44:35.000


Looks like that's it. Which is not what I was hoping to show. I was thinking that that tell me something about the variety of locations that were flooded over the days but that's not the case.

Let's try this one more time with Hurricane Joaquin

In [18]:
jqn = flood_df[flood_df['event'].str.contains('Joaquin')]

In [19]:
jqn[jqn['location'].duplicated(keep=False)]

Unnamed: 0.1,Unnamed: 0,﻿recid,location,event,eventType,xcoord,ycoord,dt,_date,_time


So that is interesting. Even though for hurricanes Matthew and Joaquin, the seven and six days respectively, none
of the flooded locations were reported twice for one event. Very interesting. So to me, this means we really should be looking at these things by 'event' and not by '\_date'. It also means that the num_locations col doesn't add any information. So imma delete that.

In [20]:
del event_df['num_locations']

Now I want to explore these weird events that had dates in the 'event' column in the original data that were very different from the dates in the '\_date' column for the event. A good example for this is the "unnamed (2/25/2016)" event.

In [21]:
subset_floods[subset_floods['event'].str.contains('2/25/2016')]

Unnamed: 0.1,Unnamed: 0,﻿recid,location,event,eventType,xcoord,ycoord,dt,_date,_time
762,762,4817,BOUSH STREET & W OLNEY ROAD,unnamed (2/25/2016),Flooded street,12129210.0,3478803.0,2016-05-05 20:46:10.000,2016-05-05,2016-05-05 20:46:10.000
763,763,4818,900 BLOCK OF E CHARLOTTE STREET,unnamed (2/25/2016),Flooded street,12132230.0,3476292.0,2016-05-05 20:51:34.000,2016-05-05,2016-05-05 20:51:34.000
764,764,4819,LLEWELLYN AVENUE & W VIRGINIA BEACH BOULEVARD,unnamed (2/25/2016),Flooded street,12129060.0,3479121.0,2016-05-05 20:52:17.000,2016-05-05,2016-05-05 20:52:17.000
767,767,4823,DUKE STREET & W OLNEY ROAD,unnamed (2/25/2016),Flooded street,12128850.0,3478992.0,2016-05-31 08:45:33.000,2016-05-31,2016-05-31 08:45:33.000


The date in the 'event' column is 2/25/2016 but the dates in the '\_date' column are 5/5 and 5/31/2016. So which date should I be looking at? More specifically, which date should I be gathering data for to base the model off of?

In [22]:
variable_df = get_db_table_as_df('Variables')
variable_df

Unnamed: 0,VariableID,VariableCode,VariableName,VariableType,Units
0,1,00060,Discharge,Regular interval instantaneous value,ft3/s
1,2,72019,"Water level, depth LSD",Maximum,ft
2,3,72019,"Water level, depth LSD",Minimum,ft
3,4,hourly_height,Tide Level,hourly_height,ft
4,5,Rainfall,Rainfall Depth,Incremental,in
5,6,Shallow_well_depth,Shallow Well Depth in NAVD88,depth,ft


Let's look at some of these weird events.

In [23]:
def get_rain(date):
    rain_df = get_table_for_variable('rainfall')
    rain_dfa = rain_df.loc[date]
    return rain_dfa.groupby('SiteID')['Value'].sum()

In [24]:
print get_rain('2016-02-25')

SiteID
4.0    0.0
6.0    0.0
7.0    0.0
8.0    0.0
Name: Value, dtype: float64


When I tried to calculate the time between the 'event_date' and the 'dates' to see how far off these were I found that two events had the same 'event_date'. So I think it's appropriate to drop the 'unnamed' one based on the fact that the dates are further from it. I'll drop that above in the grouping section.

In [25]:
event_df.sort_index(inplace=True)
idx = pd.IndexSlice
event_df.loc[idx['2016-07-30', :], :]

Unnamed: 0_level_0,Unnamed: 1_level_0,dates,num_flooded,num_dates
event_date,event_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-07-30,Thunderstorm,[2016-07-30T00:00:00.000000000],3,1
2016-07-30,unnamed,"[2016-08-02T00:00:00.000000000, 2016-08-31T00:...",4,2


When I tried to calculate the time between the 'event_date' and the 'dates' to see how far off these were I found that two events had the same 'event_date'. So I think it's appropriate to drop the 'unnamed' one based on the fact that the dates are further from it.

In [26]:
i = event_df.loc[idx['2016-07-30', 'unnamed'],:].index
event_df.drop(i, inplace=True)

In [27]:
event_df

Unnamed: 0_level_0,Unnamed: 1_level_0,dates,num_flooded,num_dates
event_date,event_name,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2010-09-30,Nicole,"[2010-09-30T00:00:00.000000000, 2010-10-01T00:...",48,3
2011-08-27,Irene,"[2011-08-27T00:00:00.000000000, 2011-08-28T00:...",32,2
2012-10-28,Sandy,"[2012-10-28T00:00:00.000000000, 2012-10-29T00:...",45,2
2013-10-09,Heavy Rain,"[2013-10-08T00:00:00.000000000, 2013-10-09T00:...",6,3
2014-05-16,Heavy Rain,[2014-05-16T00:00:00.000000000],21,1
2014-06-19,Thunderstorms,[2014-06-20T00:00:00.000000000],5,1
2014-07-09,Thunderstorms,[2014-07-09T00:00:00.000000000],1,1
2014-07-10,7/10 Thunderstorms,[2014-07-10T00:00:00.000000000],27,1
2014-07-24,unnamed,[2014-07-24T00:00:00.000000000],8,1
2014-09-04,Thunderstorm,[2014-09-04T00:00:00.000000000],2,1


In [28]:
event_df.reset_index(inplace=True)
event_df.set_index('event_date', inplace=True)

In [29]:
days_away = []
max_days = []
for d in event_df.index:
    ar = event_df.loc[d, 'dates'] - np.datetime64(d)
    ar = ar.astype('timedelta64[D]')
    days = ar / np.timedelta64(1, 'D')
    days_away.append(days)
    max_days.append(days.max())
event_df['days_away_from_event'] = days_away
event_df['max_days_away'] = max_days
print event_df.shape
event_df

(34, 6)


Unnamed: 0_level_0,event_name,dates,num_flooded,num_dates,days_away_from_event,max_days_away
event_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-09-30,Nicole,"[2010-09-30T00:00:00.000000000, 2010-10-01T00:...",48,3,"[0.0, 1.0, 4.0]",4.0
2011-08-27,Irene,"[2011-08-27T00:00:00.000000000, 2011-08-28T00:...",32,2,"[0.0, 1.0]",1.0
2012-10-28,Sandy,"[2012-10-28T00:00:00.000000000, 2012-10-29T00:...",45,2,"[0.0, 1.0]",1.0
2013-10-09,Heavy Rain,"[2013-10-08T00:00:00.000000000, 2013-10-09T00:...",6,3,"[-1.0, 0.0, 1.0]",1.0
2014-05-16,Heavy Rain,[2014-05-16T00:00:00.000000000],21,1,[0.0],0.0
2014-06-19,Thunderstorms,[2014-06-20T00:00:00.000000000],5,1,[1.0],1.0
2014-07-09,Thunderstorms,[2014-07-09T00:00:00.000000000],1,1,[0.0],0.0
2014-07-10,7/10 Thunderstorms,[2014-07-10T00:00:00.000000000],27,1,[0.0],0.0
2014-07-24,unnamed,[2014-07-24T00:00:00.000000000],8,1,[0.0],0.0
2014-09-04,Thunderstorm,[2014-09-04T00:00:00.000000000],2,1,[0.0],0.0


In [30]:
get_rain('2014-11-01')

SiteID
6.0    0.02
7.0    0.04
8.0    0.00
Name: Value, dtype: float64

In [31]:
event_df[event_df['max_days_away']>10]
print event_df.loc['2016-09-20', 'dates']

['2016-09-21T00:00:00.000000000' '2016-09-22T00:00:00.000000000'
 '2016-10-05T00:00:00.000000000']


In [32]:
event_filt = event_df[event_df['max_days_away']<10]
print event_filt.shape
event_filt

(29, 6)


Unnamed: 0_level_0,event_name,dates,num_flooded,num_dates,days_away_from_event,max_days_away
event_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2010-09-30,Nicole,"[2010-09-30T00:00:00.000000000, 2010-10-01T00:...",48,3,"[0.0, 1.0, 4.0]",4.0
2011-08-27,Irene,"[2011-08-27T00:00:00.000000000, 2011-08-28T00:...",32,2,"[0.0, 1.0]",1.0
2012-10-28,Sandy,"[2012-10-28T00:00:00.000000000, 2012-10-29T00:...",45,2,"[0.0, 1.0]",1.0
2013-10-09,Heavy Rain,"[2013-10-08T00:00:00.000000000, 2013-10-09T00:...",6,3,"[-1.0, 0.0, 1.0]",1.0
2014-05-16,Heavy Rain,[2014-05-16T00:00:00.000000000],21,1,[0.0],0.0
2014-06-19,Thunderstorms,[2014-06-20T00:00:00.000000000],5,1,[1.0],1.0
2014-07-09,Thunderstorms,[2014-07-09T00:00:00.000000000],1,1,[0.0],0.0
2014-07-10,7/10 Thunderstorms,[2014-07-10T00:00:00.000000000],27,1,[0.0],0.0
2014-07-24,unnamed,[2014-07-24T00:00:00.000000000],8,1,[0.0],0.0
2014-09-04,Thunderstorm,[2014-09-04T00:00:00.000000000],2,1,[0.0],0.0


In [149]:
rain_df = get_table_for_variable('rainfall')
rain_grouped = rain_df.groupby('SiteID')
rain_daily = rain_grouped.resample('D').sum()
del rain_daily['SiteID']
by_date = rain_daily.groupby(level=['Datetime']).mean()
by_date.loc[event_df.loc['2016-10-05', 'dates']]

Unnamed: 0_level_0,ValueID,Value,VariableID,QCID
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2016-10-05,150099300.0,0.0,480.0,0.0
2016-10-08,148728600.0,7.6125,468.75,0.0
2016-10-09,180292100.0,2.633333,480.0,0.0
2016-10-10,180301400.0,0.0,480.0,0.0
2016-10-11,143623400.0,0.0,427.5,0.0


In [114]:
get_rain('2016-10-08')

SiteID
4.0    8.98
6.0    5.84
7.0    8.39
8.0    7.24
Name: Value, dtype: float64

In [160]:
df = rain_grouped.rolling(window=4).sum()

In [161]:
type(df)

pandas.core.frame.DataFrame

In [162]:
df.reset_index(level=0, drop=True, inplace=True)
df.resample('D').max()

Unnamed: 0_level_0,ValueID,Value,VariableID,SiteID,QCID
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-01,10295550.0,4.000000e-02,20.0,28.0,0.0
2010-01-02,10295934.0,0.000000e+00,20.0,28.0,0.0
2010-01-03,10296318.0,0.000000e+00,20.0,28.0,0.0
2010-01-04,10296702.0,1.000000e-01,20.0,28.0,0.0
2010-01-05,10297086.0,0.000000e+00,20.0,28.0,0.0
2010-01-06,10297470.0,0.000000e+00,20.0,28.0,0.0
2010-01-07,10297854.0,0.000000e+00,20.0,28.0,0.0
2010-01-08,10298238.0,1.000000e-02,20.0,28.0,0.0
2010-01-09,10298622.0,0.000000e+00,20.0,28.0,0.0
2010-01-10,10299006.0,0.000000e+00,20.0,28.0,0.0
