In [None]:
import os
import re
import folium
import pandas as pd
import numpy as np
import datetime
from urllib.request import urlretrieve
from tqdm import tqdm_notebook as tqdm
from mpl_toolkits.basemap import Basemap

### 1. Load Data

In [None]:
def read_hurdat(url, local_fname, location):
    """
    Download (if necessary) and read a HURDAT2 text file into per-storm records.

    Parameters
    ----------
    url : str
        Location to fetch the file from when `local_fname` does not exist yet.
    local_fname : str
        Path of the local copy. It acts as a cache: an existing file is
        never downloaded again.
    location : str
        Prefix that identifies a storm header line (e.g. "AL").

    Returns
    -------
    list of (str, list of str)
        One `(header_line, report_lines)` tuple per header line, in file
        order. All lines are stripped of surrounding whitespace.

    Notes
    -----
    Blank lines are skipped, and so are lines that appear before the first
    header line (there is no storm to attach them to).
    """
    if not os.path.exists(local_fname):
        # urlretrieve writes straight to local_fname, which fails when the
        # parent directory is missing, so create it first.
        target_dir = os.path.dirname(local_fname)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        urlretrieve(url, local_fname)

    records = []
    reports = None  # report list of the storm currently being read
    with open(local_fname, 'r') as f:
        for line in f:
            if line.startswith(location):
                # Header line: start a new storm. The list is shared with the
                # tuple, so later appends show up in `records`.
                reports = []
                records.append((line.strip(), reports))
                continue

            stripped = line.strip()
            if not stripped or reports is None:
                # Blank line, or data that precedes any header.
                continue
            reports.append(stripped)

    return records

In [None]:
# Remote HURDAT2 file hosted on nhc.noaa.gov (file name covers 1851-2017)
url = "https://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2017-050118.txt"
# Local copy; read_hurdat only downloads when this file does not exist yet
local_fname = "../data/hurdat2.txt"

# Each record is a (header_line, [report_lines]) tuple
records = read_hurdat(url, local_fname, "AL") # AL for atlantic hurricanes

### 2. Parse data into dataframe

In [None]:
def get_all_hurricanes(records):
    """
    Return every storm record, unfiltered.

    Parameters
    ----------
    records : sequence of (str, list of str)
        `(header_line, report_lines)` tuples as produced by `read_hurdat`.

    Returns
    -------
    A shallow copy of `records` (same element objects, new container), so
    callers can add or remove storms without touching the input sequence.

    Notes
    -----
    No parsing or filtering happens here; the conversion to a dataframe is
    done by `create_hurricane_df`.
    """
    # The previous implementation collected one id per record and then took
    # the last len(records) entries, which is exactly a full-length slice.
    return records[:]

def create_hurricane_df(records):
    """
    Flatten parsed HURDAT2 records into a DataFrame with one row per report.

    Parameters
    ----------
    records : iterable
        Items whose element 0 is a comma-separated header line (storm id,
        storm name, ...) and whose element 1 is a list of comma-separated
        report lines.

    Returns
    -------
    pandas.DataFrame
        Columns, in order: id, name, date, time, dt, storm_status, lat, lon,
        max_wind, min_pressure. `dt` combines `date` and `time`
        (format %Y%m%d%H%M); `max_wind` and `min_pressure` are floats.
        The columns are present even when there are no reports.

    Notes
    -----
    Latitude/longitude conversion is delegated to `convert_lat_lon(value, kind)`,
    which is not defined in this part of the notebook and must be in scope
    before this function is called. Presumably it turns strings such as
    "28.0N" into signed floats -- TODO confirm.
    """
    columns = ['id', 'name', 'date', 'time', 'dt', 'storm_status',
               'lat', 'lon', 'max_wind', 'min_pressure']

    # Collect plain dicts and build the frame once at the end: enlarging a
    # DataFrame cell by cell with .loc re-allocates on every new row.
    rows = []
    for record in records:
        header_fields = record[0].split(',')
        hurricane_id = header_fields[0]
        hurricane_name = header_fields[1].strip()

        for datapoint in record[1]:
            data_list = [x.strip() for x in datapoint.split(',')]
            record_date = data_list[0]
            time = data_list[1]

            rows.append({
                'id': hurricane_id,
                'name': hurricane_name,
                'date': record_date,
                'time': time,
                'dt': datetime.datetime.strptime(record_date + time, '%Y%m%d%H%M'),
                # data_list[2] is not used by this notebook
                'storm_status': data_list[3],
                'lat': convert_lat_lon(data_list[4], 'lat'),
                'lon': convert_lat_lon(data_list[5], 'lon'),
                'max_wind': float(data_list[6]),
                'min_pressure': float(data_list[7]),
            })

    return pd.DataFrame(rows, columns=columns)


In [None]:
# get_all_hurricanes applies no filtering: every storm record is kept
all_records = get_all_hurricanes(records)
# One row per report line, tagged with the id and name of its storm
hurricanes_df_raw = create_hurricane_df(all_records)

### 3. Functions to identify individual storms

Feature engineering for:
* Whether the storm was over land at each observation
* Whether the storm ever made landfall
* Duration of the storm
* Whether the storm reached hurricane status

In [None]:
def get_highest_category(input_df):
    """
    Tag every row with the highest category its storm ever reached.

    Rows are observations of a storm at a point in time, each with its own
    `category`. For every storm `id`, the maximum `category` among its rows
    that are not 'TS' is written to a new `category_highest` column; storms
    whose rows are all 'TS' get 'TS'.

    The input frame is left untouched; a deep copy is returned.
    """
    tagged = input_df.copy(deep=True)

    for storm_id in tagged['id'].unique():
        storm_rows = tagged[tagged['id'] == storm_id]
        rated = storm_rows.loc[storm_rows['category'] != 'TS', 'category']

        # No rated rows means the storm never got past 'TS'.
        peak = max(rated) if len(rated) else 'TS'
        tagged.loc[storm_rows.index, 'category_highest'] = peak

    return tagged

def over_land(hurricane_df):
    """
    Flag each observation as over land (1) or not (0).

    Parameters
    ----------
    hurricane_df : pandas.DataFrame
        Must contain `lat` and `lon` columns in degrees.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with an `over_land` column added
        (or overwritten) in place. Values are stored as floats (1.0 / 0.0),
        as before.
    """
    bm = Basemap()

    # Basemap.is_land takes (x, y) in projection coordinates. For the default
    # map created by Basemap() that is (longitude, latitude), so longitude
    # has to come first.
    land_flags = [
        bool(bm.is_land(lon, lat))
        for lat, lon in zip(hurricane_df['lat'], hurricane_df['lon'])
    ]
    hurricane_df['over_land'] = np.where(land_flags, 1.0, 0.0)

    return hurricane_df

def made_landfall(storm_df):
    """
    Add a constant `landfall` column to a single storm's frame: 1 when any
    of its `over_land` values sums above zero, otherwise 0.

    The frame is modified in place and also returned.
    """
    touched_land = storm_df['over_land'].sum() > 0
    storm_df['landfall'] = 1 if touched_land else 0

    return storm_df

def storm_duration(storm_df):
    """
    Add a constant `duration` column to a single storm's frame: the span
    between its earliest and latest `dt` value.

    The frame is modified in place and also returned.
    """
    timestamps = storm_df['dt']
    storm_df['duration'] = timestamps.max() - timestamps.min()

    return storm_df

def is_hurricane(storm_df):
    """
    Add a constant `is_hurricane` column to a single storm's frame: 1 when at
    least one of its `storm_status` values equals "HU", otherwise 0.

    The frame is modified in place and also returned.
    """
    # Compare first, then reduce. Calling .any() on the raw status column and
    # comparing that result with "HU" does not test for the "HU" status.
    reached_hurricane = (storm_df['storm_status'] == "HU").any()
    storm_df['is_hurricane'] = int(reached_hurricane)

    return storm_df

In [None]:
# Determine if hurricane was over land for each data point
all_hurricanes_df = over_land(hurricanes_df_raw)

# Get storm-specific data
hurricanes_cleaned = []
# groupby(sort=False) yields storms in order of first appearance and splits
# the frame once, instead of re-filtering every row for every storm id.
for storm_id, storm_rows in all_hurricanes_df.groupby('id', sort=False):
    # The helpers below add columns in place, so give them an independent
    # copy rather than a slice of all_hurricanes_df.
    storm_df = storm_rows.copy()

    # Did it make landfall?
    made_landfall(storm_df)
    # How long was it?
    storm_duration(storm_df)
    # Did it reach hurricane status?
    is_hurricane(storm_df)

    hurricanes_cleaned.append(storm_df)

all_hurricanes_df_cleaned = pd.concat(hurricanes_cleaned)

# Add year
all_hurricanes_df_cleaned['year'] = pd.DatetimeIndex(all_hurricanes_df_cleaned['dt']).year

### 4. Save data

In [None]:
# Write the cleaned per-observation table to disk; index=False leaves the DataFrame index out of the CSV
all_hurricanes_df_cleaned.to_csv('../data/hurricanes_df_cleaned.csv', index=False)