# Data Cleaning - Historic Hurricane Data

## Imports

In [3]:
import pandas as pd
import seaborn as sns
import time
import geopandas as gpd

# Never truncate columns when a wide DataFrame is displayed.
pd.options.display.max_columns = None

## Read-In Data
Source: [IBTrACS Version 4 from the National Centers for Environmental Information](https://www.ncdc.noaa.gov/ibtracs/)

Data on tropical storm systems that have occurred in the North Atlantic Ocean was retrieved from the International Best Track Archive for Climate Stewardship (IBTrACS) on August 2, 2020. The IBTrACS data serve to provide tracking information for all tropical cyclones and storms in order to aid understanding of the distribution, frequency, and intensity of tropical storm systems. 

In [5]:
# Load the raw hurricane extract; cells holding a single space are treated as missing.
atlantic = pd.read_csv(
    '../data/hurricanes_east_coast.csv',
    na_values=' ',
)

### Selection of Relevant Columns

In [6]:
# Restrict the frame to the subset of columns this notebook works with
# (column order is preserved exactly as listed).
atlantic = atlantic[[
    'SID', 'SEASON', 'NUMBER', 'NAME', 'ISO_TIME', 'NATURE',
    'LAT', 'LON',
    'WMO_WIND', 'WMO_PRES', 'WMO_AGENCY',
    'DIST2LAND', 'LANDFALL',
    'USA_LAT', 'USA_LON', 'USA_RECORD', 'USA_STATUS',
    'USA_WIND', 'USA_PRES', 'USA_SSHS', 'USA_EYE', 'USA_SEAHGT',
    'STORM_SPEED', 'STORM_DIR',
]]

#### Rename Columns

In [8]:
# Give the columns descriptive names (with units where known), lower-case every
# remaining label, then discard the row labelled 0 and renumber the index.
# NOTE(review): row 0 is presumably the units row that follows the CSV header
# in this extract -- confirm against the raw file.
atlantic = (
    atlantic
    .rename(columns={
        "SID": "storm_id",
        "SEASON": "year",
        "LAT": "latitude",
        "LON": "longitude",
        # WMO = World Meteorological Organization
        "WMO_WIND": "wmo_wind(knots)",
        "WMO_PRES": "wmo_pressure(mb)",
        "DIST2LAND": "dist2land(km)",
        "LANDFALL": "landfall(km)",
        "USA_SSHS": "storm_category",
        "USA_EYE": "eye_diameter",
        "USA_SEAHGT": "sea_height(ft)",
        "STORM_SPEED": "storm_speed(knots)",
        "STORM_DIR": "storm_direction(degrees)",
    })
    .rename(columns=str.lower)
    .drop(index=0)
    .reset_index(drop=True)
)

### Selection of Only Storms That Made Landfall

In [9]:
# Cast the landfall distance to float so it can be compared numerically,
# then keep only the track points whose landfall distance is exactly 0.
atlantic['landfall(km)'] = atlantic['landfall(km)'].astype(float)
atlantic = atlantic.loc[atlantic['landfall(km)'].eq(0)]

In [10]:
# Preview the first two rows left after the landfall filter.
atlantic.head(n=2)

Unnamed: 0,storm_id,year,number,name,iso_time,nature,latitude,longitude,wmo_wind(knots),wmo_pressure(mb),wmo_agency,dist2land(km),landfall(km),usa_lat,usa_lon,usa_record,usa_status,usa_wind,usa_pres,storm_category,eye_diameter,sea_height(ft),storm_speed(knots),storm_direction(degrees)
19,1851175N26270,1851,5.0,NOT_NAMED,1851-06-25 21:00:00,TS,28.1333,-96.5667,80,,hurdat_atl,22,0.0,28.2,-96.8,L,HU,80,,1.0,,,5,286
20,1851175N26270,1851,5.0,NOT_NAMED,1851-06-26 00:00:00,TS,28.1667,-96.8,70,,hurdat_atl,0,0.0,28.2,-97.0,,HU,70,,1.0,,,5,282


## Read-In Shape Files for State and County Labeling

### USA - States
Source: **TODO — source of the `states_21basic` shapefile not yet recorded (ASK URI!)**

This geopandas data frame will allow us to label the state of each point in our hurricanes data frame.

In [None]:
# Load the state-boundary shapefile and preview its first rows.
usa_states = gpd.read_file('../maps/states_21basic/states.shp')
usa_states.head(n=5)

### USA - Counties
Source: [United States Census Bureau](https://www.census.gov/geographies/mapping-files/time-series/geo/carto-boundary-file.html)

This geopandas data frame will allow us to label the county of each point in our hurricanes data frame.

In [11]:
# Load the county-boundary shapefile and preview its first rows.
usa_counties = gpd.read_file('../maps/cb_2018_us_county_20m/cb_2018_us_county_20m.shp')
usa_counties.head(n=5)

Unnamed: 0,STATEFP,COUNTYFP,COUNTYNS,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry
0,37,17,1026336,0500000US37017,37017,Bladen,6,2265887723,33010866,"POLYGON ((-78.90200 34.83527, -78.79960 34.850..."
1,37,167,1025844,0500000US37167,37167,Stanly,6,1023370459,25242751,"POLYGON ((-80.49737 35.20210, -80.29542 35.502..."
2,39,153,1074088,0500000US39153,39153,Summit,6,1069181981,18958267,"POLYGON ((-81.68699 41.13596, -81.68495 41.277..."
3,42,113,1213687,0500000US42113,42113,Sullivan,6,1165338428,6617028,"POLYGON ((-76.81373 41.59003, -76.22014 41.541..."
4,48,459,1384015,0500000US48459,48459,Upshur,6,1509910100,24878888,"POLYGON ((-95.15274 32.66095, -95.15211 32.902..."


## Identifying Hurricane Rows Within the USA

In this step, we will compare the coordinates of each tropical storm in the North Atlantic Hurricanes data frame with the states and counties in the shape files. We will assign a state and county to each storm that made landfall in the United States.

#### Assign Hurricanes Data Frame to a Geopandas Data Frame

In [14]:
# Wrap the landfall rows in a GeoDataFrame whose geometry is one point per row,
# built from the longitude (x) and latitude (y) columns.
geo_df = gpd.GeoDataFrame(
    atlantic,
    geometry=gpd.points_from_xy(
        x=atlantic['longitude'],
        y=atlantic['latitude'],
    ),
)

In [16]:
# Confirm the new 'geometry' column by showing the first row.
geo_df.head(n=1)

Unnamed: 0,storm_id,year,number,name,iso_time,nature,latitude,longitude,wmo_wind(knots),wmo_pressure(mb),wmo_agency,dist2land(km),landfall(km),usa_lat,usa_lon,usa_record,usa_status,usa_wind,usa_pres,storm_category,eye_diameter,sea_height(ft),storm_speed(knots),storm_direction(degrees),geometry
19,1851175N26270,1851,5.0,NOT_NAMED,1851-06-25 21:00:00,TS,28.1333,-96.5667,80,,hurdat_atl,22,0.0,28.2,-96.8,L,HU,80,,1.0,,,5,286,POINT (-96.56670 28.13330)


#### Write a Function to Check if a Point from the Hurricanes Data Frame is Within a State and County Boundary
We will store the index of each storm point, together with the state/county that contains it, in a dictionary.

In [17]:
def usa_region_column(df1, df2, col1, new_column = None):
    """Map each point in ``df1`` to the label of the region in ``df2`` containing it.

    Parameters
    ----------
    df1 : DataFrame
        Must have a 'geometry' column whose values expose ``.within(region)``
        (e.g. shapely points).
    df2 : DataFrame
        Must have a 'geometry' column of regions and a label column ``col1``.
    col1 : str
        Name of the column in ``df2`` that holds the label to assign
        (e.g. a state or county name).
    new_column : optional
        Unused; kept only so existing calls keep working.

    Returns
    -------
    dict
        ``{index label of the point in df1: label taken from df2[col1]}``.
        Points that fall inside no region are omitted. If a point lies within
        several regions, the last matching region in ``df2`` order wins.
    """
    # Pair every region with its label once, instead of re-indexing df2 on each hit.
    labelled_regions = list(zip(df2['geometry'], df2[col1]))

    location_dict = {}
    # Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
    for point_index, point in df1['geometry'].items():
        # Every region is tested for every point (no early exit), so this is
        # O(len(df1) * len(df2)).
        for region, label in labelled_regions:
            if point.within(region):
                location_dict[point_index] = label
    return location_dict

#### Assign States

In [None]:
# Find, for every landfall point, the state whose boundary contains it.
hurricane_states = usa_region_column(geo_df, usa_states, 'STATE_NAME')

# The dictionary keys are the row labels of the points that fell inside a state;
# the values are the matching state names, in the same order.
indices_to_keep = [*hurricane_states.keys()]
states_list = [*hurricane_states.values()]

# Keep only the matched rows and attach the state names as a 'state' column.
geo_df_usa = geo_df.loc[indices_to_keep, :].assign(state=states_list)

