In [1]:
import os
import re
import folium
import pandas as pd
import numpy as np
import datetime
import json
from urllib.request import urlretrieve
from tqdm import tqdm_notebook as tqdm
from mpl_toolkits.basemap import Basemap

### 1. Load Data

In [2]:
def read_hurdat(url, local_fname, location):
    """
    Download (if needed) and read a HURDAT2-style text file into storm records.

    The file is fetched from `url` only when `local_fname` does not already
    exist. Every line that starts with `location` opens a new storm record;
    every other non-blank line is attached to the most recently opened storm.

    Parameters
    ----------
    url : str
        Where to download the file from when it is not cached locally.
    local_fname : str
        Path of the local copy of the file.
    location : str
        Prefix that identifies a storm header line (e.g. "AL").

    Returns
    -------
    list of (str, list of str)
        One `(header_line, report_lines)` tuple per storm, in file order,
        with surrounding whitespace stripped from every line.

    Raises
    ------
    ValueError
        If a report line appears before any header line, since it cannot be
        attributed to a storm.
    """
    if not os.path.exists(local_fname):
        urlretrieve(url, local_fname)

    records = []
    reports = None  # report list of the storm currently being read
    with open(local_fname, 'r') as f:
        for line_number, raw_line in enumerate(f, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines carry no data and would break downstream parsing.
                continue
            if raw_line.startswith(location):
                reports = []
                records.append((line, reports))
            elif reports is None:
                raise ValueError(
                    "line %d of %s is not preceded by a header starting with %r"
                    % (line_number, local_fname, location)
                )
            else:
                reports.append(line)

    return records

In [3]:
# Remote source file and the path of its local cache; read_hurdat only
# downloads when the local file does not exist yet.
url = "https://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2017-050118.txt"
local_fname = "../data/hurdat2.txt"

# records is a list of (header_line, report_lines) tuples, one per storm.
records = read_hurdat(url, local_fname, "AL") # AL for atlantic hurricanes

In [None]:
# NOTE(review): get_all_hurricanes and create_hurricane_df are not defined in
# any cell above this one, and this cell has no execution count — confirm
# whether the helpers live elsewhere or this cell is leftover and should go.
all_records = get_all_hurricanes(records)
hurricanes_df_raw = create_hurricane_df(all_records)

### 2. Parse into Dictionary
Parsing the records into a nested dictionary runs much faster than converting them to a DataFrame.

The resulting JSON-style structure will be used for the hypothesis analysis.

In [71]:
def convert_lat_lon(value, col):
    """
    Convert a hemisphere-suffixed coordinate string into a signed float.

    Lat/lon is encoded as the numeric value plus E/W/N/S. For maps we want a
    signed decimal value: west longitudes and south latitudes are negative,
    east longitudes and north latitudes are positive.

    Parameters
    ----------
    value : str
        Coordinate text such as '28.0N' or '94.8W'.
    col : str
        Which coordinate `value` holds: 'lat' or 'lon'.

    Returns
    -------
    float
        The signed coordinate.

    Raises
    ------
    ValueError
        If `col` is neither 'lat' nor 'lon', or if the numeric part of
        `value` cannot be parsed as a float.
    """
    # col -> (hemisphere letters to strip, letter that makes the value negative)
    hemispheres = {'lat': ('NS', 'S'), 'lon': ('EW', 'W')}
    if col not in hemispheres:
        raise ValueError("col must be 'lat' or 'lon', got %r" % (col,))

    letters, negative_letter = hemispheres[col]
    magnitude = float(re.sub('[' + letters + ']', '', value))

    return -magnitude if negative_letter in value else magnitude

In [72]:
# Storm id -> {'name': <storm name>, 'datapoints': [<one dict per report line>]}
hurdat_dict = {}

for record in records:
    # record is (header_line, report_lines); the header's first two
    # comma-separated fields are the storm id and the storm name.
    header_fields = record[0].split(',')
    hurricane_id = header_fields[0]
    hurricane_name = header_fields[1].strip()

    # Create the storm entry on first sight and make sure it has a datapoint list.
    storm_entry = hurdat_dict.setdefault(hurricane_id, {'name': hurricane_name})
    storm_entry.setdefault('datapoints', [])

    for datapoint in record[1]:
        data_list = [x.strip() for x in datapoint.split(',')]

        # Field 2 of each report line is not used here.
        datapoint_dict = {
            'record_date': data_list[0],
            'time': data_list[1],
            'storm_status': data_list[3],
            'lat': convert_lat_lon(data_list[4], 'lat'),
            'lon': convert_lat_lon(data_list[5], 'lon'),
            'max_wind': data_list[6],
            'min_pressure': data_list[7],
        }

        storm_entry['datapoints'].append(datapoint_dict)

#### Save data

In [73]:
# Persist the parsed storms as JSON. This must run before the feature
# engineering cells below: they add datetime and timedelta values to
# hurdat_dict, which json.dump cannot serialize by default.
with open('../data/hurricanes_parsed.json', 'w') as fp:
    json.dump(hurdat_dict, fp)

### 3. Feature Engineering for individual storms
* Indicator if the storm reached hurricane status
* Whether the storm made landfall
* How long the storm lasted

In [88]:
def datapoint_datetime(datapoint):
    """
    Attach a combined datetime to a datapoint so that storm length
    can be calculated later.

    The 'record_date' and 'time' strings are concatenated and parsed as
    YYYYmmddHHMM. The result is stored under 'dt' unless that key is
    already present. The datapoint is modified in place and returned.
    """
    stamp = datetime.datetime.strptime(
        datapoint['record_date'] + datapoint['time'],
        "%Y%m%d%H%M",
    )

    # Keep an existing 'dt' untouched; only fill it in when missing.
    if 'dt' not in datapoint:
        datapoint['dt'] = stamp

    return datapoint

def is_hurricane(storm):
    """
    Tell whether the storm ever reached hurricane status, i.e. whether
    any of its datapoints has a 'storm_status' of 'HU'.
    """
    statuses = [point['storm_status'] for point in storm['datapoints']]

    return 'HU' in statuses

def landfall(storm, basemap=None):
    """
    Did the storm make landfall?

    A storm counts as having made landfall when at least one of its
    datapoints lies over land according to the map's `is_land` check.

    Parameters
    ----------
    storm : dict
        Storm entry with a 'datapoints' list; each datapoint needs signed
        'lat' and 'lon' values.
    basemap : optional
        Map object used for the land test. It must be callable as
        `basemap(lon, lat)` to get projection coordinates and provide
        `is_land(x, y)`. Defaults to the notebook-level `bm`
        (presumably a Basemap instance — confirm where `bm` is created).

    Returns
    -------
    bool
        True if any datapoint is over land, otherwise False.
    """
    if basemap is None:
        basemap = bm

    for record in storm['datapoints']:
        # is_land expects x, y in projection coordinates, i.e. longitude
        # first and latitude second, converted through the map object.
        x, y = basemap(record['lon'], record['lat'])
        if basemap.is_land(x, y):
            # One point over land is enough; skip the remaining lookups.
            return True

    return False

def storm_duration(storm):
    """
    Length of the storm: the span between the earliest and the latest
    'dt' value among its datapoints.
    """
    timestamps = [point['dt'] for point in storm['datapoints']]
    first_seen = min(timestamps)
    last_seen = max(timestamps)

    return last_seen - first_seen

In [105]:
# Apply functions to create features

for value in hurdat_dict.values():
    # Build every datapoint's 'dt' first: storm_duration() reads record['dt'],
    # so computing the duration before this step fails with a KeyError on a
    # freshly started kernel.
    for record in value['datapoints']:
        datapoint_datetime(record)  # adds 'dt' to the record in place

    value['is_hurricane'] = is_hurricane(value)
    value['landfall'] = landfall(value)
    value['duration'] = storm_duration(value)

In [106]:
# Pick one storm by id (AL152017 is named 'MARIA') to inspect its features.
maria = hurdat_dict['AL152017']

In [107]:
# Show the full entry for this storm, including every datapoint.
maria

{'name': 'MARIA',
 'datapoints': [{'record_date': '20170916',
   'time': '1200',
   'storm_status': 'TD',
   'lat': 12.2,
   'lon': -49.7,
   'max_wind': '30',
   'min_pressure': '1006',
   'dt': datetime.datetime(2017, 9, 16, 12, 0)},
  {'record_date': '20170916',
   'time': '1800',
   'storm_status': 'TS',
   'lat': 12.2,
   'lon': -51.7,
   'max_wind': '40',
   'min_pressure': '1004',
   'dt': datetime.datetime(2017, 9, 16, 18, 0)},
  {'record_date': '20170917',
   'time': '0000',
   'storm_status': 'TS',
   'lat': 12.4,
   'lon': -53.1,
   'max_wind': '45',
   'min_pressure': '1002',
   'dt': datetime.datetime(2017, 9, 17, 0, 0)},
  {'record_date': '20170917',
   'time': '0600',
   'storm_status': 'TS',
   'lat': 12.8,
   'lon': -54.4,
   'max_wind': '55',
   'min_pressure': '994',
   'dt': datetime.datetime(2017, 9, 17, 6, 0)},
  {'record_date': '20170917',
   'time': '1200',
   'storm_status': 'TS',
   'lat': 13.3,
   'lon': -55.7,
   'max_wind': '60',
   'min_pressure': '990',
 