In [None]:
# import pandas as pd
import numpy as np

import plotly.figure_factory as ff

import sqlite3
import requests

from functools import reduce
from pandarallel import pandarallel

# Storm Events Data

[Storm Events Data](https://www.ncdc.noaa.gov/stormevents/)  

[Storm Events Data Export Format](https://github.com/jennyrhee/storm-events/blob/master/docs/storm-data-export-format.pdf)  
Documentation provided by NOAA

In [2]:
# Importing storm events data from database
conn = sqlite3.connect('../data/raw/storms.db')
c = conn.cursor()

query = '''
          SELECT BEGIN_YEARMONTH,
                 BEGIN_DAY,
                 END_YEARMONTH,
                 END_DAY,
                 CZ_NAME AS PARISH,
                 EVENT_TYPE,
                 BEGIN_LAT,
                 BEGIN_LON,
                 END_LAT,
                 END_LON,
                 INJURIES_DIRECT,
                 INJURIES_INDIRECT,
                 DEATHS_DIRECT,
                 DEATHS_INDIRECT,
                 DAMAGE_PROPERTY
            FROM details 
           WHERE STATE = 'LOUISIANA'
             AND YEAR >= 1996
        ORDER BY CZ_NAME,
                 BEGIN_YEARMONTH,
                 BEGIN_DAY;
        '''

results = c.execute(query)
storm_df = pd.DataFrame(results.fetchall())

# Change column names to lowercase
names = [description[0].lower() for description in results.description]
storm_df.columns = names

conn.close()

In [3]:
storm_df.head()

Unnamed: 0,begin_yearmonth,begin_day,end_yearmonth,end_day,parish,event_type,begin_lat,begin_lon,end_lat,end_lon,injuries_direct,injuries_indirect,deaths_direct,deaths_indirect,damage_property
0,199603,29,199603,29,ACADIA,Hail,30.23,-92.27,30.23,-92.27,0,0,0,0,
1,199603,29,199603,29,ACADIA,Hail,30.22,-92.37,30.22,-92.37,0,0,0,0,
2,199605,1,199605,31,ACADIA,Drought,,,,,0,0,0,0,
3,199608,13,199608,13,ACADIA,Thunderstorm Wind,30.23,-92.27,30.23,-92.27,0,0,0,0,10K
4,199610,25,199610,26,ACADIA,Flash Flood,,,,,0,0,0,0,20K


In [4]:
# Importing FIPS data from storm events database
conn = sqlite3.connect('../data/raw/storms.db')
c = conn.cursor()

query = '''
          SELECT STATE_FIPS,
                 CZ_FIPS,
                 CZ_NAME AS PARISH,
                 BEGIN_LAT AS LAT,
                 BEGIN_LON AS LON
            FROM details 
           WHERE STATE = 'LOUISIANA'
             AND YEAR >= 1996
        ORDER BY CZ_FIPS
        '''

results = c.execute(query)
fips_df = pd.DataFrame(results.fetchall())

# Change column names to lowercase
names = [description[0].lower() for description in results.description]
fips_df.columns = names

conn.close()

In [5]:
fips_df.head()

Unnamed: 0,state_fips,cz_fips,parish,lat,lon
0,22,1,ACADIA,30.4,-92.21
1,22,1,ACADIA,30.12,-92.5
2,22,1,ACADIA,30.12,-92.5
3,22,1,ACADIA,30.33,-92.49
4,22,1,ACADIA,30.21,-92.41


## Date

In [6]:
def separate(yearmonth: int):
    '''
    Separates the yearmonth column into their own respective columns
    Return year (int) and month (int)
    '''
    yearmonth = str(yearmonth)
    return int(yearmonth[:4]), int(yearmonth[-2:])


# Separate, drop yearmonth, then convert year and month to int
storm_df[['begin_year', 'begin_month']] = pd.DataFrame(storm_df.begin_yearmonth.apply(separate).tolist(), 
                                           index=storm_df.index)
storm_df[['end_year', 'end_month']] = pd.DataFrame(storm_df.end_yearmonth.apply(separate).tolist(), 
                                           index=storm_df.index)
storm_df = storm_df.drop(['begin_yearmonth', 'end_yearmonth'], axis=1)

# Convert separate year, month, day columns into datetime
storm_df['begin_date'] = pd.to_datetime(storm_df.begin_year*10000 + \
                                        storm_df.begin_month*100 + \
                                        storm_df.begin_day, \
                                        format='%Y%m%d')
storm_df['end_date'] = pd.to_datetime(storm_df.end_year*10000 + \
                                        storm_df.end_month*100 + \
                                        storm_df.end_day, \
                                        format='%Y%m%d')
storm_df = storm_df.drop(['begin_year', 'begin_month', 'begin_day',
                          'end_year', 'end_month', 'end_day'], axis=1)

# Change order of DataFrame
cols = storm_df.columns.tolist()
cols = cols[-2:] + cols[:-2]
storm_df = storm_df[cols]

In [7]:
storm_df.head()

Unnamed: 0,begin_date,end_date,parish,event_type,begin_lat,begin_lon,end_lat,end_lon,injuries_direct,injuries_indirect,deaths_direct,deaths_indirect,damage_property
0,1996-03-29,1996-03-29,ACADIA,Hail,30.23,-92.27,30.23,-92.27,0,0,0,0,
1,1996-03-29,1996-03-29,ACADIA,Hail,30.22,-92.37,30.22,-92.37,0,0,0,0,
2,1996-05-01,1996-05-31,ACADIA,Drought,,,,,0,0,0,0,
3,1996-08-13,1996-08-13,ACADIA,Thunderstorm Wind,30.23,-92.27,30.23,-92.27,0,0,0,0,10K
4,1996-10-25,1996-10-26,ACADIA,Flash Flood,,,,,0,0,0,0,20K


## Parish

In [8]:
# Capitalize first letter of each part of name
storm_df['parish'] = storm_df.parish.str.title()
fips_df['parish'] = fips_df.parish.str.title()

In [9]:
storm_df.parish.unique()

array(['Acadia', 'Allen', 'Ascension', 'Assumption', 'Avoyelles',
       'Beauregard', 'Bienville', 'Bossier', 'Caddo', 'Calcasieu',
       'Caldwell', 'Cameron', 'Catahoula', 'Claiborne', 'Concordia',
       'De Soto', 'East Baton Rouge', 'East Cameron', 'East Carroll',
       'East Feliciana', 'Evangeline', 'Franklin', 'Grant', 'Iberia',
       'Iberville', 'Jackson', 'Jefferson', 'Jefferson Davis', 'La Salle',
       'Lafayette', 'Lafourche', 'Lincoln', 'Livingston',
       'Lower Jefferson', 'Lower Lafourche', 'Lower Plaquemines',
       'Lower St. Bernard', 'Lower St. Martin', 'Lower Terrebonne',
       'Madison', 'Morehouse', 'Natchitoches', 'Northern Tangipahoa',
       'Orleans', 'Ouachita', 'Plaquemines', 'Pointe Coupee', 'Rapides',
       'Red River', 'Richland', 'Sabine', 'Southern Tangipahoa',
       'St. Bernard', 'St. Charles', 'St. Helena', 'St. James',
       'St. John The Baptist', 'St. Landry', 'St. Martin', 'St. Mary',
       'St. Tammany', 'Tangipahoa', 'Tensas', 'T

In [10]:
# Fix typos
storm_df['parish'] = storm_df.parish.replace({'St. John The Baptist': 'St. John the Baptist'})

In [11]:
# Find the FIPS code for cz_name that aren't actual parishes
check_fips = ['East Cameron', 'Lower Jefferson', 'Lower Lafourche', 'Lower Plaquemines', 
              'Lower St. Bernard', 'Lower St. Martin', 'Lower Terrebonne', 
              'Northern Tangipahoa', 'Sabine And Natchitoches', 'Southern Tangipahoa', 
              'Upper Jefferson', 'Upper Lafourche', 'Upper Plaquemines', 'Upper St. Bernard', 
              'Upper St. Martin', 'Upper Terrebonne', 'West Cameron']

for parish in check_fips:
    print(parish + ': ' + str(fips_df[fips_df.parish == parish]['cz_fips'].unique()) + '\n')

East Cameron: [74]

Lower Jefferson: [68]

Lower Lafourche: [67]

Lower Plaquemines: [69]

Lower St. Bernard: [70]

Lower St. Martin: [55]

Lower Terrebonne: [66]

Northern Tangipahoa: [71]

Sabine And Natchitoches: []

Southern Tangipahoa: [72]

Upper Jefferson: [61]

Upper Lafourche: [59]

Upper Plaquemines: [63]

Upper St. Bernard: [64]

Upper St. Martin: [45]

Upper Terrebonne: [65]

West Cameron: [73]



In [12]:
# Compare above FIPS with actual parish FIPS
parish_fips = ['Cameron', 'Jefferson', 'Lafourche', 'Plaquemines', 'St. Bernard', 'St. Martin', 
               'Terrebonne', 'Tangipahoa', 'Sabine', 'Natchitoches']

for parish in parish_fips:
    print(parish + ': ' + str(fips_df[fips_df.parish == parish]['cz_fips'].unique()) + '\n')

Cameron: [23 51]

Jefferson: [51]

Lafourche: [57]

Plaquemines: [75]

St. Bernard: [87]

St. Martin: [99]

Terrebonne: [109]

Tangipahoa: [ 38 105]

Sabine: [17 85]

Natchitoches: [18 69]



In [13]:
def find_parish(lat: float, lon: float, debug=False):
    '''
    Use coordinates2politics API to find parish based on coordinates
    Returns parish name (str) or None
    '''
    url = 'http://www.datasciencetoolkit.org/coordinates2politics/'
    coord = str(lat) + '%2c' + str(lon)
    r = requests.get(url + coord)
    try:
        # Print coordinates returned by API
        if debug:
            print(i)
            print(r.json()[0]['location'])
        # Iterate through each dict
        for d in r.json()[0]['politics']:
            # Add parish name if found
            if d['friendly_type'] == 'county':
                if debug:
                    print(d['name'])
                    print(d['code'])
                return d['name']
    # NoneType when API returns no politics data
    except TypeError:
        if debug:
            print('No data')
        return None


# Only one - Sabine and Natchitoches - use coordinates to find parish
find_parish(31.62, -93.43)

'Sabine'

In [14]:
# None of the FIPS codes are valid, so change to parish
parish_dict = {'East Cameron': 'Cameron', 
               'Lower Jefferson': 'Jefferson', 
               'Lower Lafourche': 'Lafourche', 
               'Lower Plaquemines': 'Plaquemines',
               'Lower St. Bernard': 'St. Bernard', 
               'Lower St. Martin': 'St. Martin',
               'Lower Terrebonne': 'Terrebonne', 
               'Northern Tangipahoa': 'Tangipahoa',
               'Sabine And Natchitoches': 'Sabine',
               'Southern Tangipahoa': 'Tangipahoa', 
               'Upper Jefferson': 'Jefferson',
               'Upper Lafourche': 'Lafourche',
               'Upper Plaquemines': 'Plaquemines', 
               'Upper St. Bernard': 'St. Bernard',
               'Upper St. Martin': 'St. Martin', 
               'Upper Terrebonne': 'Terrebonne',
               'West Cameron': 'Cameron'}

storm_df['parish'] = storm_df.parish.replace(parish_dict)

## FIPS

In [15]:
# Add column with FIPS code
parish_fips_dict = {'Acadia': '22001',
                    'Allen': '22003',
                    'Ascension': '22005',
                    'Assumption': '22007',
                    'Avoyelles': '22009',
                    'Beauregard': '22011',
                    'Bienville': '22013',
                    'Bossier': '22015',
                    'Caddo': '22017',
                    'Calcasieu': '22019',
                    'Caldwell': '22021',
                    'Cameron': '22023',
                    'Catahoula': '22025',
                    'Claiborne': '22027',
                    'Concordia': '22029',
                    'De Soto': '22031',
                    'East Baton Rouge': '22033',
                    'East Carroll': '22035', 
                    'East Feliciana': '22037',
                    'Evangeline': '22039',
                    'Franklin': '22041',
                    'Grant': '22043',
                    'Iberia': '22045', 
                    'Iberville': '22047',
                    'Jackson': '22049',
                    'Jefferson': '22051',
                    'Jefferson Davis': '22053',
                    'La Salle': '22059', 
                    'Lafayette': '22055',
                    'Lafourche': '22057',
                    'Lincoln': '22061', 
                    'Livingston': '22063',
                    'Madison': '22065', 
                    'Morehouse': '22067', 
                    'Natchitoches': '22069', 
                    'Orleans': '22071', 
                    'Ouachita': '22073', 
                    'Plaquemines': '22075', 
                    'Pointe Coupee': '22077', 
                    'Rapides': '22079', 
                    'Red River': '22081', 
                    'Richland': '22083', 
                    'Sabine': '22085',
                    'St. Bernard': '22087',
                    'St. Charles': '22089', 
                    'St. Helena': '22091',
                    'St. James': '22093',
                    'St. John the Baptist': '22095',
                    'St. Landry': '22097', 
                    'St. Martin': '22099', 
                    'St. Mary': '22101', 
                    'St. Tammany': '22103',
                    'Tangipahoa': '22105', 
                    'Tensas': '22107', 
                    'Terrebonne': '22109', 
                    'Union': '22111', 
                    'Vermilion': '22113', 
                    'Vernon': '22115',
                    'Washington': '22117', 
                    'Webster': '22119', 
                    'West Baton Rouge': '22121', 
                    'West Carroll': '22123', 
                    'West Feliciana': '22125', 
                    'Winn': '22127'}

storm_df['fips'] = storm_df.parish.map(parish_fips_dict)

# Drop unnecessary columns
#storm_df = storm_df.drop(['begin_lat', 'begin_lon', 'end_lat', 'end_lon'], axis=1)

# Reorder columns
cols = storm_df.columns.tolist()
cols = cols[:2] + [cols[-1]] + cols[2:-1]
storm_df = storm_df[cols]

In [16]:
storm_df.head()

Unnamed: 0,begin_date,end_date,fips,parish,event_type,begin_lat,begin_lon,end_lat,end_lon,injuries_direct,injuries_indirect,deaths_direct,deaths_indirect,damage_property
0,1996-03-29,1996-03-29,22001,Acadia,Hail,30.23,-92.27,30.23,-92.27,0,0,0,0,
1,1996-03-29,1996-03-29,22001,Acadia,Hail,30.22,-92.37,30.22,-92.37,0,0,0,0,
2,1996-05-01,1996-05-31,22001,Acadia,Drought,,,,,0,0,0,0,
3,1996-08-13,1996-08-13,22001,Acadia,Thunderstorm Wind,30.23,-92.27,30.23,-92.27,0,0,0,0,10K
4,1996-10-25,1996-10-26,22001,Acadia,Flash Flood,,,,,0,0,0,0,20K


## Property damage
Estimated amount of damage to property ($) incurred by weather event

In [17]:
def format_damage(damage: str):
    '''
    damage_property and damage_crops are str values appended with K, M, or B
    Formats to proper float values
    Returns damage (float)
    '''
    if damage is '0' or not damage:
        damage = 0
    elif damage is 'K': # One row with only "K" - assume 0
        damage = 0
    else:
        rep = damage[-1]
        damage = float(damage[:-1])
        if rep is 'K':
            return damage * 1e3
        elif rep is 'M':
            return damage * 1e6
        elif rep is 'B':
            return damage * 1e9
    return damage


storm_df['damage_property'] = storm_df.damage_property.apply(format_damage)
storm_df.head()

Unnamed: 0,begin_date,end_date,fips,parish,event_type,begin_lat,begin_lon,end_lat,end_lon,injuries_direct,injuries_indirect,deaths_direct,deaths_indirect,damage_property
0,1996-03-29,1996-03-29,22001,Acadia,Hail,30.23,-92.27,30.23,-92.27,0,0,0,0,0.0
1,1996-03-29,1996-03-29,22001,Acadia,Hail,30.22,-92.37,30.22,-92.37,0,0,0,0,0.0
2,1996-05-01,1996-05-31,22001,Acadia,Drought,,,,,0,0,0,0,0.0
3,1996-08-13,1996-08-13,22001,Acadia,Thunderstorm Wind,30.23,-92.27,30.23,-92.27,0,0,0,0,10000.0
4,1996-10-25,1996-10-26,22001,Acadia,Flash Flood,,,,,0,0,0,0,20000.0


# Meteorological Data

[Meteorological Data](https://data.nodc.noaa.gov/cgi-bin/iso?id=gov.noaa.ncdc:C00946)  

[Global Summary of the Month Data Documentation](https://github.com/jennyrhee/storm-events/blob/master/docs/gsom-gsoy_documentation.pdf)  
Documentation provided by NOAA

## Data Dictionary

| Abbreviation | Variable | Unit |
| ---- | --- | --- |
| AWND | average daily wind speed | miles per hour |
| PRCP | precipitation | inches |
| SNOW | snowfall | inches |
| SNWD | snow depth | inches |
| TAVG | average of hourly temp values | F | 
| TMAX | maximum temp | F | 
| TMIN | minimum temp | F | 
| WDF2 | direction of fastest 2-minute wind | degrees |
| WDF5 | direction of fastest 5-second wind | degrees |
| WSF2 | fastest 2-minute wind speed | miles per hour |
| WSF5 | fastest 5-second wind speed | miles per hour |

In [18]:
met_df = pd.read_csv('../data/raw/daily_summaries.csv', parse_dates=[5])
cols = met_df.columns.tolist()
# Lowercase column names
cols = [c.lower() for c in cols]
met_df.columns = cols

In [19]:
met_df.describe()

Unnamed: 0,latitude,longitude,elevation,awnd,prcp,snow,snwd,tavg,tmax,tmin,wdf2,wdf5,wsf2,wsf5
count,214708.0,214708.0,214708.0,128776.0,211931.0,97194.0,95553.0,44797.0,212276.0,212221.0,126871.0,126372.0,126873.0,126437.0
mean,30.873025,-91.904307,24.503474,6.198555,0.152967,0.000899,0.001346,68.467822,78.013091,58.134643,170.465016,170.805637,16.115983,21.447742
std,1.040609,1.300802,27.925133,3.439345,0.48603,0.055717,0.192237,14.419257,13.956633,14.981117,100.469662,100.818244,5.841993,8.010603
min,29.333,-93.8244,0.9,0.0,0.0,0.0,0.0,-13.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,30.0494,-93.18333,2.7,3.58,0.0,0.0,0.0,59.0,69.0,46.0,90.0,90.0,12.1,16.1
50%,30.3644,-92.0405,11.6,5.59,0.0,0.0,0.0,71.0,80.0,61.0,170.0,170.0,15.0,19.9
75%,31.9497,-91.02778,36.0,8.05,0.02,0.0,0.0,80.0,89.0,71.0,240.0,240.0,19.9,25.1
max,32.54278,-89.4075,114.0,90.82,13.54,7.5,55.0,807.0,861.0,752.0,360.0,360.0,128.0,340.0


In [20]:
met_df.head()

Unnamed: 0,station,name,latitude,longitude,elevation,date,awnd,prcp,snow,snwd,tavg,tmax,tmin,wdf2,wdf5,wsf2,wsf5
0,USC00162212,"CROWLEY 2 NE, LA US",30.2408,-92.3477,7.6,1996-01-01,,0.0,0.0,0.0,,67.0,62.0,,,,
1,USC00162212,"CROWLEY 2 NE, LA US",30.2408,-92.3477,7.6,1996-01-02,,0.3,0.0,0.0,,78.0,46.0,,,,
2,USC00162212,"CROWLEY 2 NE, LA US",30.2408,-92.3477,7.6,1996-01-03,,0.0,0.0,0.0,,,,,,,
3,USC00162212,"CROWLEY 2 NE, LA US",30.2408,-92.3477,7.6,1996-01-04,,0.0,0.0,0.0,,46.0,31.0,,,,
4,USC00162212,"CROWLEY 2 NE, LA US",30.2408,-92.3477,7.6,1996-01-05,,0.0,0.0,0.0,,62.0,36.0,,,,


## Parish/FIPS from Coordinates

In [21]:
# New DataFrame with unique parish names and coordinates
met_coords = met_df[['name', 'latitude', 'longitude']].drop_duplicates().reset_index(drop=True)

In [44]:
# Initialization for parallelization
pandarallel.initialize(progress_bar=True)

# Use find_parish function from earlier to find parishes with coordinates
met_coords['parish'] = met_coords.parallel_apply(lambda row: find_parish(row['latitude'], 
                                                                         row['longitude']), 
                                                 axis=1)

New pandarallel memory created - Size: 2000 MB
Pandarallel will run on 8 workers


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=4), Label(value='0 / 4'))), HBox(c…

In [23]:
met_coords.head()

Unnamed: 0,name,latitude,longitude,parish
0,"CROWLEY 2 NE, LA US",30.2408,-92.3477,Acadia
1,"FORT POLK ARMY AIR FIELD, LA US",31.05,-93.18333,Vernon
2,"TALLULAH VICKSBURG REGIONAL AIRPORT, LA US",32.35,-91.02778,Madison
3,"NEW IBERIA AIRPORT ACADIANA REGIONAL, LA US",30.0375,-91.8839,Iberia
4,"SALT POINT, LA US",29.56222,-91.52556,St. Mary


In [24]:
met_coords['fips'] = met_coords.parish.map(parish_fips_dict)

In [25]:
# Merge DataFrames and drop unnecessary columns
met_df = met_df.merge(met_coords, on=['name', 'latitude', 'longitude'])
met_df = met_df.drop(['station', 'elevation'], axis=1)

# Reorder columns
cols = met_df.columns.tolist()
cols = [cols[0]] + cols[-2:] + cols[1:-2]
met_df = met_df[cols]

In [26]:
met_df.head()

Unnamed: 0,name,parish,fips,latitude,longitude,date,awnd,prcp,snow,snwd,tavg,tmax,tmin,wdf2,wdf5,wsf2,wsf5
0,"CROWLEY 2 NE, LA US",Acadia,22001,30.2408,-92.3477,1996-01-01,,0.0,0.0,0.0,,67.0,62.0,,,,
1,"CROWLEY 2 NE, LA US",Acadia,22001,30.2408,-92.3477,1996-01-02,,0.3,0.0,0.0,,78.0,46.0,,,,
2,"CROWLEY 2 NE, LA US",Acadia,22001,30.2408,-92.3477,1996-01-03,,0.0,0.0,0.0,,,,,,,
3,"CROWLEY 2 NE, LA US",Acadia,22001,30.2408,-92.3477,1996-01-04,,0.0,0.0,0.0,,46.0,31.0,,,,
4,"CROWLEY 2 NE, LA US",Acadia,22001,30.2408,-92.3477,1996-01-05,,0.0,0.0,0.0,,62.0,36.0,,,,


## Aggregating Station Data to Parish

In [27]:
def means(row):
    '''
    Takes a row from DataFrame and returns mean for each variable
    Returns: Series
    '''
    means = met_df[(met_df.date == row['date']) & (met_df.parish == row['parish'])].mean()
    return means

'''
# Create a new DataFrame with unique date-parish combinations
averaged_met_df = met_df[['date', 'parish', 'fips']].drop_duplicates().reset_index(drop=True)

var_list = met_df.loc[:, 'awnd':'wsf5'].columns.tolist()

# Initialization for parallelization
pandarallel.initialize(progress_bar=True)

averaged_met_df[var_list] = averaged_met_df.parallel_apply(means, axis=1)

averaged_met_df.to_csv('../data/cleaned/parish_averaged_met.csv', index=False)
'''

"\n# Create a new DataFrame with unique date-parish combinations\naveraged_met_df = met_df[['date', 'parish', 'fips']].drop_duplicates().reset_index(drop=True)\n\nvar_list = met_df.loc[:, 'awnd':'wsf5'].columns.tolist()\n\n# Initialization for parallelization\npandarallel.initialize(progress_bar=True)\n\naveraged_met_df[var_list] = averaged_met_df.parallel_apply(means, axis=1)\n\naveraged_met_df.to_csv('../data/cleaned/parish_averaged_met.csv', index=False)\n"

In [28]:
averaged_met_df = pd.read_csv('../data/cleaned/parish_averaged_met.csv', parse_dates=[0], dtype={'fips': 'str'})
averaged_met_df.head()

Unnamed: 0,date,parish,fips,awnd,prcp,snow,snwd,tavg,tmax,tmin,wdf2,wdf5,wsf2,wsf5
0,1996-01-01,Acadia,22001,,0.0,0.0,0.0,,67.0,62.0,,,,
1,1996-01-02,Acadia,22001,,0.3,0.0,0.0,,78.0,46.0,,,,
2,1996-01-03,Acadia,22001,,0.0,0.0,0.0,,,,,,,
3,1996-01-04,Acadia,22001,,0.0,0.0,0.0,,46.0,31.0,,,,
4,1996-01-05,Acadia,22001,,0.0,0.0,0.0,,62.0,36.0,,,,


In [29]:
averaged_met_df.describe()

Unnamed: 0,awnd,prcp,snow,snwd,tavg,tmax,tmin,wdf2,wdf5,wsf2,wsf5
count,115735.0,160347.0,87155.0,85707.0,42636.0,160908.0,160941.0,114672.0,114240.0,114674.0,114296.0
mean,6.30606,0.154081,0.000967,0.001448,68.562729,78.004651,58.35307,169.648211,170.160058,16.232025,21.488811
std,3.43778,0.470034,0.058054,0.202785,14.397963,13.768088,14.863327,99.396223,99.576324,5.791028,7.750028
min,0.0,0.0,0.0,0.0,-13.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.8,0.0,0.0,0.0,59.0,69.0,47.0,90.0,90.0,12.1,16.1
50%,5.82,0.0,0.0,0.0,71.0,80.0,61.0,170.0,170.0,15.0,20.45
75%,8.28,0.04,0.0,0.0,80.0,89.0,72.0,235.0,240.0,19.9,25.1
max,90.82,13.54,7.5,55.0,807.0,861.0,752.0,360.0,360.0,128.0,299.1


# Merged Storm Events and Meteorological DataFrames

In [30]:
final_df = storm_df.merge(averaged_met_df, 
                          left_on=['begin_date', 'parish', 'fips'], 
                          right_on=['date', 'parish', 'fips'], 
                          how='outer')

In [31]:
final_df.describe()

Unnamed: 0,begin_lat,begin_lon,end_lat,end_lon,injuries_direct,injuries_indirect,deaths_direct,deaths_indirect,damage_property,awnd,prcp,snow,snwd,tavg,tmax,tmin,wdf2,wdf5,wsf2,wsf5
count,12492.0,12492.0,12492.0,12492.0,16946.0,16946.0,16946.0,16946.0,16946.0,117775.0,162804.0,88784.0,87318.0,43419.0,163371.0,163402.0,116706.0,116274.0,116708.0,116332.0
mean,31.435951,-92.235796,31.435211,-92.226821,0.061312,0.004839,0.060545,0.001416,3965077.0,6.348387,0.172352,0.000949,0.001421,68.627859,78.053445,58.418489,170.552554,170.994748,16.43571,21.769092
std,1.053692,1.140439,1.056235,1.141659,1.233574,0.480459,5.02419,0.046071,155601400.0,3.464942,0.528292,0.057519,0.200906,14.331789,13.721913,14.806187,99.69598,99.843807,6.053117,8.115087
min,29.03,-94.1,29.03,-94.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-13.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,30.45,-93.23,30.445325,-93.23,0.0,0.0,0.0,0.0,0.0,3.8,0.0,0.0,0.0,59.0,69.0,47.0,90.0,90.0,12.1,16.1
50%,31.6,-92.3,31.6029,-92.3,0.0,0.0,0.0,0.0,0.0,5.82,0.0,0.0,0.0,71.0,80.0,61.0,170.0,170.0,15.0,21.0
75%,32.47,-91.43,32.47,-91.427125,0.0,0.0,0.0,0.0,5000.0,8.28,0.05,0.0,0.0,80.0,89.0,71.5,240.0,240.0,19.9,25.1
max,33.02,-89.37,33.0302,-89.37,90.0,62.0,638.0,3.0,17900000000.0,90.82,13.54,7.5,55.0,807.0,861.0,752.0,360.0,360.0,128.0,299.1


In [32]:
# Inspecting abnormal tavg, tmax, tmin > 750
final_df[final_df.tavg > 120]

Unnamed: 0,begin_date,end_date,fips,parish,event_type,begin_lat,begin_lon,end_lat,end_lon,injuries_direct,...,prcp,snow,snwd,tavg,tmax,tmin,wdf2,wdf5,wsf2,wsf5
45531,NaT,NaT,22101,St. Mary,,,,,,,...,0.0,,,807.0,861.0,752.0,0.0,0.0,0.0,0.0


In [33]:
# Drop row - data is bad
final_df.drop(45531, inplace=True)

In [34]:
final_df['begin_date'] = np.where(final_df.begin_date.isnull(), final_df.date, final_df.begin_date)
final_df['end_date'] = np.where(final_df.end_date.isnull(), final_df.date, final_df.end_date)
final_df.drop('date', inplace=True, axis=1)

In [35]:
# No event happened on these days
final_df['event_type'] = np.where(final_df.event_type.isnull(), 'None', final_df.event_type)

In [36]:
flood = final_df[(final_df.event_type.str.find('Flood') != -1)]
flood.describe()

Unnamed: 0,begin_lat,begin_lon,end_lat,end_lon,injuries_direct,injuries_indirect,deaths_direct,deaths_indirect,damage_property,awnd,prcp,snow,snwd,tavg,tmax,tmin,wdf2,wdf5,wsf2,wsf5
count,1419.0,1419.0,1419.0,1419.0,2202.0,2202.0,2202.0,2202.0,2202.0,821.0,1004.0,621.0,616.0,287.0,999.0,999.0,819.0,819.0,819.0,819.0
mean,31.549304,-92.34233,31.526764,-92.3199,0.009083,0.0,0.012716,0.000454,4557471.0,8.273435,2.591899,0.0,0.0,70.444251,77.55689,63.755923,187.041107,195.543346,23.060155,30.826455
std,1.043736,1.163672,1.061909,1.161635,0.227403,0.0,0.140816,0.02131,60565440.0,3.857682,2.082798,0.0,0.0,12.037036,9.375478,9.23003,103.047042,101.691848,7.332549,10.022868
min,29.4,-94.0402,29.3983,-94.0485,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41.0,29.0,10.0,10.0,6.0,8.9
25%,30.45455,-93.37,30.4182,-93.36295,0.0,0.0,0.0,0.0,0.0,5.59,0.93875,0.0,0.0,66.0,72.5,58.75,110.0,120.0,17.0,23.9
50%,31.8217,-92.4042,31.8159,-92.3693,0.0,0.0,0.0,0.0,0.0,7.61,2.29,0.0,0.0,75.0,78.0,65.0,180.0,200.0,22.3,30.0
75%,32.48805,-91.5678,32.483,-91.51495,0.0,0.0,0.0,0.0,20000.0,10.51,3.83,0.0,0.0,77.5,85.0,72.0,270.0,280.0,28.0,36.0
max,33.018,-89.713,33.0185,-89.6635,10.0,0.0,4.0,1.0,1680000000.0,30.2,13.54,0.0,0.0,86.0,101.0,79.0,360.0,360.0,63.1,81.0


In [37]:
none = final_df[final_df.event_type == 'None']
# None event type with same parishes and years as flood events
none = none[(none.parish.isin(flood.parish.unique())) \
         & (none.begin_date.dt.year.isin(flood.begin_date.dt.year.unique()))]

In [38]:
# Don't need snow or snow depth variables,
# tavg has a lot of missing data
flood.drop(['snow', 'snwd', 'tavg'], axis=1, inplace=True)
none.drop(['snow', 'snwd', 'tavg'], axis=1, inplace=True)

In [39]:
parish_coords = storm_df[['parish', 'begin_lat', 'begin_lon']]\
                    [storm_df.begin_lat.notnull()].drop_duplicates(subset='parish')
parish_coords.set_index('parish', inplace=True)

In [40]:
def replace_coords(row):
    return parish_coords.loc[row.parish]


# Add coords if missing
flood.loc[:, ['begin_lat', 'begin_lon']] = flood.apply(lambda row: replace_coords(row) \
                                                       if pd.isnull(row.begin_lat) \
                                                       else row[['begin_lat', 'begin_lon']], 
                                                       axis=1)
flood['end_lat'] = np.where(flood.end_lat.isnull(), flood.begin_lat, flood.begin_lat)
flood['end_lon'] = np.where(flood.end_lon.isnull(), flood.begin_lon, flood.begin_lon)
flood = flood[(flood.awnd.notnull()) & (flood.tmax.notnull()) & (flood.wsf5.notnull())]

none_sample = none[(none.awnd.notnull()) \
                   & (none.prcp.notnull()) \
                   & (none.tmax.notnull()) \
                   & (none.wsf5.notnull()) \
                   & (none.wsf2.notnull())].sample(5000)
# Add coords
none_sample.loc[:, ['begin_lat', 'begin_lon']] = none_sample.apply(lambda row: replace_coords(row) \
                                                       if pd.isnull(row.begin_lat) \
                                                       else row[['begin_lat', 'begin_lon']], 
                                                       axis=1)
none_sample['end_lat'] = none_sample.begin_lat
none_sample['end_lon'] = none_sample.begin_lon
none_sample = none_sample.fillna(0)

In [41]:
# Combine
flood_none = pd.concat([none_sample, flood], sort=False)

In [42]:
flood_none.describe()

Unnamed: 0,begin_lat,begin_lon,end_lat,end_lon,injuries_direct,injuries_indirect,deaths_direct,deaths_indirect,damage_property,awnd,prcp,tmax,tmin,wdf2,wdf5,wsf2,wsf5
count,5812.0,5812.0,5812.0,5812.0,5812.0,5812.0,5812.0,5812.0,5812.0,5812.0,5812.0,5812.0,5812.0,5812.0,5812.0,5812.0,5812.0
mean,31.016609,-91.980547,31.016609,-91.980547,0.000344,0.0,0.002237,0.0,730785.4,6.496058,0.499157,78.091678,59.05027,171.847327,173.795022,16.993622,22.555896
std,1.086377,1.347666,1.086377,1.347666,0.018549,0.0,0.065553,0.0,26374200.0,3.506841,1.239014,12.872957,14.26582,99.761714,99.90247,6.283017,8.633529
min,29.703,-94.0402,29.703,-94.0402,0.0,0.0,0.0,0.0,0.0,0.0,0.0,33.0,13.0,10.0,0.0,0.0,0.0
25%,30.05,-93.2352,30.05,-93.2352,0.0,0.0,0.0,0.0,0.0,3.8,0.0,70.0,48.0,90.0,100.0,13.0,17.0
50%,30.47,-92.18,30.47,-92.18,0.0,0.0,0.0,0.0,0.0,5.82,0.0,80.0,62.0,170.0,170.0,16.1,21.0
75%,32.42,-91.17,32.42,-91.17,0.0,0.0,0.0,0.0,0.0,8.38625,0.22,88.5,72.0,240.0,250.0,21.0,26.5
max,33.018,-89.7144,33.018,-89.7144,1.0,0.0,4.0,0.0,1680000000.0,30.2,13.54,108.0,84.0,360.0,360.0,63.1,225.0


In [45]:
def get_7d_prcp_sum(row):
    '''
    Gets previous 7 day precipitation
    '''
    parish = row['parish']
    date = row['begin_date']
    total_7d = final_df[(final_df.parish == parish) & ((date - final_df.begin_date <= pd.Timedelta('7 days')) \
          & (date - final_df.begin_date > pd.Timedelta('0 days')))].prcp.sum()
    return total_7d


pandarallel.initialize(progress_bar=True)
flood_none['prev_7d_prcp'] = flood_none.parallel_apply(get_7d_prcp_sum, axis=1)

New pandarallel memory created - Size: 2000 MB
Pandarallel will run on 8 workers


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=727), Label(value='0 / 727'))), HB…

In [46]:
def drop_no_history(row):
    '''
    Returns 1 if there isn't complete data for the previous 7 days
    '''
    parish = row['parish']
    date = row['begin_date']
    if final_df[(final_df.parish == parish) & ((date - final_df.begin_date <= pd.Timedelta('7 days')) \
          & (date - final_df.begin_date > pd.Timedelta('0 days')))].begin_date.nunique() < 4:
        return 1
    else:
        return 0
        

pandarallel.initialize(progress_bar=True)

flood_none['drop'] = flood_none.parallel_apply(drop_no_history, axis=1)

New pandarallel memory created - Size: 2000 MB
Pandarallel will run on 8 workers


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=727), Label(value='0 / 727'))), HB…

In [47]:
# Drop rows with not enough historical 7-d prcp data
flood_none = flood_none[flood_none['drop'] == 0]
flood_none.drop('drop', axis=1, inplace=True)

## Region Data

In [48]:
northwest = ['Bienville', 'Bossier', 'Caddo', 'Claiborne', 'De Soto', 
             'Jackson', 'Lincoln', 'Natchitoches', 'Red River', 'Sabine', 
             'Webster', 'Winn']
northeast = ['Caldwell', 'Catahoula', 'Concordia', 'East Carroll', 
             'Franklin', 'Madison', 'Morehouse', 'Ouachita', 'Richland', 
             'Tensas', 'Union', 'West Carroll']
central = ['Allen', 'Avoyelles', 'Beauregard', 'Evangeline', 'Grant', 
           'La Salle', 'Pointe Coupee', 'Rapides', 'St. Landry', 'Vernon']
southwest = ['Acadia', 'Assumption', 'Calcasieu', 'Cameron', 'Iberia', 
             'Iberville', 'Jefferson Davis', 'Lafayette', 'Lafourche', 
             'St. Martin', 'St. Mary', 'Terrebonne', 'Vermilion',
             'West Baton Rouge']
southeast = ['Ascension', 'East Baton Rouge', 'East Feliciana', 'Jefferson',
             'Livingston', 'Orleans', 'Plaquemines', 'St. Bernard', 
             'St. Charles', 'St. Helena', 'St. James', 'St. John the Baptist', 
             'St. Tammany', 'Tangipahoa', 'Washington', 'West Feliciana']

In [49]:
northwest = dict.fromkeys(northwest, 'Northwest')
northeast = dict.fromkeys(northeast, 'Northeast')
central = dict.fromkeys(central, 'Central')
southwest = dict.fromkeys(southwest, 'Southwest')
southeast = dict.fromkeys(southeast, 'Southeast')
region_dict = {**northwest, **northeast, **central, **southwest, **southeast}

flood_none['region'] = flood_none.parish.map(region_dict)

In [50]:
# Rearrange columns
cols = flood_none.columns.tolist()
cols[:4] + cols[5:9] + [cols[-1]] + [cols[4]] + cols[9:-1]
flood_none = flood_none[cols]

In [51]:
flood_none.head()

Unnamed: 0,begin_date,end_date,fips,parish,event_type,begin_lat,begin_lon,end_lat,end_lon,injuries_direct,...,awnd,prcp,tmax,tmin,wdf2,wdf5,wsf2,wsf5,prev_7d_prcp,region
87542,2017-10-21,2017-10-21,22085,Sabine,,31.78,-93.7,31.78,-93.7,0.0,...,4.47,0.045,82.5,62.5,180.0,160.0,12.1,18.1,0.015,Northwest
158373,2001-04-03,2001-04-03,22055,Lafayette,,30.23,-92.18,30.23,-92.18,0.0,...,10.29,0.0,83.0,71.0,180.0,170.0,19.9,23.0,3.14,Southwest
51367,1997-12-14,1997-12-14,22017,Caddo,,32.48,-93.75,32.48,-93.75,0.0,...,3.8,0.0,54.0,27.0,320.0,330.0,14.1,16.1,1.1,Northwest
41818,2013-01-15,2013-01-15,22045,Iberia,,30.03,-91.88,30.03,-91.88,0.0,...,11.63,0.63,46.0,39.0,350.0,340.0,17.9,23.9,9.74,Southwest
77410,2002-11-22,2002-11-22,22033,East Baton Rouge,,30.47,-91.17,30.47,-91.17,0.0,...,6.04,0.0,64.5,43.5,360.0,340.0,14.1,17.9,1.16,Southeast


In [55]:
seasons = {1: 'winter',
          2: 'winter',
          3: 'spring',
          4: 'spring',
          5: 'spring',
          6: 'summer',
          7: 'summer',
          8: 'summer',
          9: 'fall',
          10: 'fall',
          11: 'fall',
          12: 'winter'}

flood_none['season'] = flood_none.begin_date.dt.month.map(seasons)

In [60]:
cols = flood_none.columns.tolist()
cols = cols[:2] + [cols[-1]] + cols[2:4] + [cols[-2]] + cols[4:-2]
flood_none = flood_none[cols]

In [63]:
# Wind direction not an important feature
flood_none.drop(['wdf2', 'wdf5'], axis=1, inplace=True)

In [64]:
flood_none.to_csv('../data/cleaned/cleaned_flood_data.csv', index=False)

# Other stuff to potentially explore later

## `event_type` 

In [None]:
storm_event_df = storm_df.copy()
storm_event_df.head()

In [None]:
storm_event_df['event_type'].unique()

In [None]:
# Add a new column with counts by event type
storm_event_df['event_count'] = storm_event_df.groupby(['date', 
                                                        'parish',
                                                        'fips',
                                                        'event_type'])['injuries_direct'] \
                                              .transform('count')

In [None]:
# Sums up storm event statistics if multiple events in a day
storm_event_df = storm_event_df.groupby(['date', 
                                         'parish',
                                         'fips', 
                                         'event_type', 
                                         'event_count']) \
                               .sum() \
                               .reset_index()

In [None]:
# Resample by month
storm_event_df = storm_event_df.groupby(['parish', 
                                         'fips', 
                                         'event_type']) \
                               .resample('M', on='date') \
                               .sum() \
                               .reset_index() \

In [None]:
# Drop rows where event_count is 0, sort by date then parish
storm_event_df = storm_event_df[storm_event_df.event_count != 0].sort_values(by=['date', 
                                                                                 'parish']) \
                                                                .reset_index(drop=True)

# Change date to year-month
storm_event_df['date'] = storm_event_df.date.dt.to_period('M')

In [None]:
storm_event_df['region'] = storm_event_df.parish.map(region_dict)

# Rearrange columns
cols = storm_event_df.columns.tolist()
cols = [cols[3]] + [cols[0]] + [cols[-1]] + cols[1:3] + cols[4:-1]
storm_event_df = storm_event_df[cols]

| include | event type | count |direct injuries | indirect injuries | direct deaths | indirect deaths | property damage |
| --- | --- | --- | --- | --- | --- | --- | --- |
| tornado | Tornado | 1059 | 654 | 0 | 30 | 1 | 370436200 |
| no | Dust Devil | 2 | 0 | 0 | 0 | 0 | 12000 |
| no | Funnel Cloud | 153 | 0 | 0 | 0 | 0 | 0 |
| no | Drought | 784 | 0 | 0 | 0 | 0 | 0 |
| flood | Flash Flood | 1764 | 19 | 0 | 20 | 0 | 3666451000 | 
| flood | Flood | 374 | 1 | 0 | 8 | 1 | 6363354000 |
| flood | Coastal Flood | 64 | 0 | 0 | 0 | 0 | 5746000 |
| heat | Heat | 354 | 3 | 0 | 45 | 0 | 20000 |
| heat | Excessive Heat | 111 | 1 | 0 | 16 | 0 | 90000 |
| storm | Heavy Rain | 109 | 0 | 0 | 1 | 0 | 75000 |
| storm | Lightning | 399 | 110 | 3 | 30 | 6 | 34170950 |
| wind | Thunderstorm Wind | 6265 | 119 | 4 | 22 | 0 | 855043750 |
| wind | High Wind | 90 | 5 | 0 | 5 | 0 | 1936500 |
| wind | Strong Wind | 84 | 9 | 0 | 4 | 0 | 2127800 |
| hurr_trop | Hurricane (Typhoon) | 257 | 3 | 0 | 821 | 1 | 22479920000 | 
| hurr_trop | Hurricane | 2 | 0 | 0 | 0 | 0 | 400000.0 | 
| hurr_trop | Tropical Storm' | 360 | 14 | 4 | 2 | 2 | 328262000 |
| hurr_trop | Tropical Depression | 30 | 0 | 0 | 0 | 0 | 0 |
| hurr_trop | Storm Surge/Tide | 195 | 0 | 0 | 3 | 0 | 32834960000 | 
| winter_prec | Winter Weather | 369 | 0 | 1 | 0 | 4 | 136000 |
| winter_prec | Winter Storm | 119 | 4 | 2 | 0 | 2 | 305000 |
| winter_prec | Heavy Snow | 104 | 0 | 0 | 0 | 0 | 2475000 |
| winter_prec | Sleet | 24 | 0 | 0 | 0 | 0 | 300000.0
| winter_prec | Hail | 3392 | 2 | 0 | 0 | 0 | 73773200 |
| winter_prec | Ice Storm | 175 | 17 | 0 | 6 | 3 | 157065000 |
| chill | Extreme Cold/Wind Chill | 3 | 1 | 0 | 3 | 0 | 
| chill | Cold/Wind Chill | 125 | 0 | 0 | 5| 1 | 12685000 |
| fog | Dense Fog | 11 | 71 | 68 | 1 | 3 | 253000 |
| no | Freezing Fog | 11 | 0 | 0 | 0 | 0 | 0 |
| no | Seiche | 5 | 0 | 0 | 0 | 0 | 75000 |
| no | Waterspout | 10 | 0 | 0 | 2 | 0 | 20000 |
| no | Rip Current | 2 | 2 | 0 | 2 | 0 | 0 |
| fire | Wildfire | 19 | 4 | 0 | 0 | 0 | 2101000 |
| no | Astronomical Low Tide | 121 | 0 | 0 | 0 | 0 | 0 |

In [None]:
drop_events = ['Dust Devil', 'Funnel Cloud', 'Drought', 'Freezing Fog', 
               'Seiche', 'Waterspout', 'Rip Current', 'Astronomical Low Tide']

storm_event_df = storm_event_df[~storm_event_df['event_type'].isin(drop_events)]

In [None]:
storm_event_df.head()

In [None]:
storm_event_df.to_csv('../data/cleaned/event_type_data.csv', index=False)