In [1]:
import os
import re
import pandas as pd
import numpy as np
import datetime
import json
import pickle
from urllib.request import urlretrieve
from tqdm import tqdm_notebook as tqdm
from mpl_toolkits.basemap import Basemap

### 1. Load Data

In [2]:
def read_hurdat(url, local_fname, location):
    """
    Download (if not already cached locally) and read a HURDAT2 text file.

    Parameters
    ----------
    url : str
        Where to fetch the file from when `local_fname` does not exist.
    local_fname : str
        Path of the local copy; it is created on first use.
    location : str
        Prefix that identifies the storm header lines to keep
        (e.g. "AL" for Atlantic storms).

    Returns
    -------
    list of (str, list of str)
        One `(header_line, report_lines)` tuple per matching storm, with
        surrounding whitespace stripped from every line.
    """
    if not os.path.exists(local_fname):
        # urlretrieve cannot create missing directories, so do it up front
        target_dir = os.path.dirname(local_fname)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
        urlretrieve(url, local_fname)

    records = []
    reports = None  # report list of the storm being read; None = skip lines
    with open(local_fname, 'r') as f:
        for line in f:
            if line.startswith(location):
                reports = []
                records.append((line.strip(), reports))
                continue

            # A header of a storm we did not ask for (two-letter basin code,
            # six digits, comma): ignore its reports instead of attaching
            # them to the previous matching storm.
            if re.match(r'[A-Z]{2}\d{6},', line):
                reports = None
                continue

            report = line.strip()
            # Skip blank lines and any report seen before a matching header
            if report and reports is not None:
                reports.append(report)

    return records

In [3]:
# Source of the HURDAT2 file and the path where a local copy is kept
url = "https://www.nhc.noaa.gov/data/hurdat/hurdat2-1851-2017-050118.txt"
local_fname = "../data/hurdat2.txt"

# Keep only storms whose header line starts with "AL" (Atlantic hurricanes)
records = read_hurdat(url, local_fname, "AL") # AL for atlantic hurricanes

Let's look at what a raw record looks like (Hurricane RINA):

In [4]:
# Last parsed storm, as a (header line, list of report lines) tuple
records[-1]

('AL192017,               RINA,     21,',
 ['20171104, 1200,  , LO, 25.5N,  52.3W,  25, 1013,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,',
  '20171104, 1800,  , LO, 26.5N,  52.1W,  25, 1013,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,',
  '20171105, 0000,  , LO, 27.5N,  52.0W,  25, 1013,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,',
  '20171105, 0600,  , LO, 28.4N,  52.0W,  25, 1013,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,',
  '20171105, 1200,  , LO, 29.0N,  51.9W,  25, 1013,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,',
  '20171105, 1800,  , TD, 29.2N,  51.7W,  30, 1012,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,',
  '20171106, 0000,  , TD, 29.1N,  51.2W,  30, 1011,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,',
  '20171106, 0600,  , TD, 29.0N,  50.7W,  30, 1010,    0,    0,    0,    0, 

From above, it is evident that the data is not very structured. We will parse each record into a nicely formatted dictionary for easier analysis.

### 2. Parse into Dictionary

In [7]:
def convert_lat_lon(value, col):
    """
    Convert a HURDAT2 coordinate string into a signed decimal value.

    Lat/lon is encoded as the numeric value plus a hemisphere letter
    (N/S for latitude, E/W for longitude). South and West become negative
    so the result can be used directly on maps.

    Parameters
    ----------
    value : str
        Coordinate such as '25.5N' or '52.3W'.
    col : str
        Either 'lat' or 'lon'.

    Returns
    -------
    float
        Signed decimal degrees.

    Raises
    ------
    ValueError
        If `col` is not 'lat' or 'lon', or the numeric part cannot be parsed.
    """
    # col -> (hemisphere letters to strip, letter that makes the value negative)
    hemispheres = {'lat': ('NS', 'S'), 'lon': ('EW', 'W')}
    if col not in hemispheres:
        raise ValueError("col must be 'lat' or 'lon', got %r" % (col,))

    letters, negative_letter = hemispheres[col]
    magnitude = float(re.sub('[' + letters + ']', '', value))

    return -magnitude if negative_letter in value else magnitude

In [10]:
def hurr_category(max_wind, units='kt'):
    """
    Define the storm category (Saffir-Simpson scale) from the maximum
    observed wind speed at the time.

    HURDAT2 reports maximum sustained wind in knots, so knots are the
    default; the thresholds used are the knot limits of the scale rather
    than the mph ones.

    Parameters
    ----------
    max_wind : int or str
        Maximum sustained wind; anything `int()` accepts.
    units : {'kt', 'mph'}, optional
        Unit of `max_wind`. Defaults to 'kt'.

    Returns
    -------
    int or str
        1-5 for hurricane-force winds, otherwise 'TS'. Every value below
        hurricane force gets 'TS', including depressions.
        NOTE(review): HURDAT2 appears to use negative sentinels for missing
        wind values; those also end up as 'TS' here -- confirm whether they
        should be filtered upstream.

    Raises
    ------
    ValueError
        If `units` is not 'kt' or 'mph', or `max_wind` is not an integer.
    """
    # Lowest wind speed of categories 1..5 in each unit
    lower_bounds = {
        'kt': (64, 83, 96, 113, 137),
        'mph': (74, 96, 111, 130, 157),
    }
    if units not in lower_bounds:
        raise ValueError("units must be 'kt' or 'mph', got %r" % (units,))

    max_wind = int(max_wind)
    cat = 'TS'
    # Bounds are ascending, so the last one reached is the category
    for number, bound in enumerate(lower_bounds[units], start=1):
        if max_wind >= bound:
            cat = number
    return cat

In [11]:
# storm id -> {'name': ..., 'datapoints': [one dict per report line]}
hurdat_dict = {}

for record in records:
    # record is (header line, report lines); the header starts with id, name
    header_fields = record[0].split(',')
    hurricane_id = header_fields[0]
    hurricane_name = header_fields[1].strip()

    # Create the storm entry on first sight and grab its datapoint list
    storm_entry = hurdat_dict.setdefault(hurricane_id, {'name': hurricane_name})
    storm_points = storm_entry.setdefault('datapoints', [])

    for datapoint in record[1]:
        data_list = [field.strip() for field in datapoint.split(',')]

        # Field 2 of each report line is not used here
        datapoint_dict = {
            'record_date': data_list[0],
            'time': data_list[1],
            'storm_status': data_list[3],
            'lat': convert_lat_lon(data_list[4], 'lat'),
            'lon': convert_lat_lon(data_list[5], 'lon'),
            'max_wind': data_list[6],
            'min_pressure': data_list[7],
            'category': hurr_category(data_list[6]),
        }
        storm_points.append(datapoint_dict)

Let's see what that same record for Hurricane RINA looks like now:

In [12]:
# Look the storm up by its id (the first field of its header line)
hurdat_dict['AL192017']

{'name': 'RINA',
 'datapoints': [{'record_date': '20171104',
   'time': '1200',
   'storm_status': 'LO',
   'lat': 25.5,
   'lon': -52.3,
   'max_wind': '25',
   'min_pressure': '1013',
   'category': 'TS'},
  {'record_date': '20171104',
   'time': '1800',
   'storm_status': 'LO',
   'lat': 26.5,
   'lon': -52.1,
   'max_wind': '25',
   'min_pressure': '1013',
   'category': 'TS'},
  {'record_date': '20171105',
   'time': '0000',
   'storm_status': 'LO',
   'lat': 27.5,
   'lon': -52.0,
   'max_wind': '25',
   'min_pressure': '1013',
   'category': 'TS'},
  {'record_date': '20171105',
   'time': '0600',
   'storm_status': 'LO',
   'lat': 28.4,
   'lon': -52.0,
   'max_wind': '25',
   'min_pressure': '1013',
   'category': 'TS'},
  {'record_date': '20171105',
   'time': '1200',
   'storm_status': 'LO',
   'lat': 29.0,
   'lon': -51.9,
   'max_wind': '25',
   'min_pressure': '1013',
   'category': 'TS'},
  {'record_date': '20171105',
   'time': '1800',
   'storm_status': 'TD',
   'lat': 

Much better! We will save this raw data first before creating new features and indicators for each hurricane record.
#### Save data

In [13]:
# Write the parsed dictionary to disk as JSON before any features are added
with open('../data/hurricanes_raw.json', 'w') as fp:
    json.dump(hurdat_dict, fp)

### 3. Feature Engineering for individual storms
In order to test our hypotheses, we will need some additional data for each hurricane that isn't directly available at the moment, including:
* Indicator if the storm reached hurricane status
* Whether the storm made landfall
* How long the storm lasted

In [14]:
def datapoint_datetime(datapoint):
    """
    Attach a combined datetime under the 'dt' key of a datapoint.

    The value is parsed from the 'record_date' and 'time' strings and is
    needed to calculate the length of the storm. An existing 'dt' entry is
    left untouched. The dictionary is modified in place and also returned.
    """
    stamp = datapoint['record_date'] + datapoint['time']
    parsed = datetime.datetime.strptime(stamp, "%Y%m%d%H%M")

    # Only fill the field in when it is not there yet
    if 'dt' not in datapoint:
        datapoint['dt'] = parsed

    return datapoint

def is_hurricane(storm):
    """
    Tell whether any datapoint of the storm carries the 'HU' status,
    i.e. whether the storm ever made it to hurricane status.
    """
    statuses = [point['storm_status'] for point in storm['datapoints']]

    return 'HU' in statuses

_LANDFALL_BASEMAP = None  # lazily-built Basemap shared by all landfall() calls


def landfall(storm, bm=None):
    """
    Did the storm make landfall?

    A storm counts as having made landfall when at least one of its
    datapoints lies over land according to the map's `is_land` check.

    Parameters
    ----------
    storm : dict
        Storm entry with a 'datapoints' list; each datapoint needs 'lat'
        and 'lon' in signed decimal degrees.
    bm : object, optional
        Map object exposing `is_land(x, y)`. When omitted, a default
        `Basemap()` is created once and reused on later calls, because
        building one per storm is expensive.

    Returns
    -------
    bool
        True if any datapoint is over land, False otherwise (also for a
        storm without datapoints).
    """
    global _LANDFALL_BASEMAP
    if bm is None:
        if _LANDFALL_BASEMAP is None:
            _LANDFALL_BASEMAP = Basemap()
        bm = _LANDFALL_BASEMAP

    # is_land takes map coordinates as (x, y); for the default map that is
    # (longitude, latitude) -- not (latitude, longitude).
    return any(bool(bm.is_land(record['lon'], record['lat']))
               for record in storm['datapoints'])

def storm_duration(storm):
    """
    Return how long the storm lasted: the span between the earliest and
    the latest 'dt' among its datapoints.
    """
    timestamps = [point['dt'] for point in storm['datapoints']]
    latest = max(timestamps)
    earliest = min(timestamps)

    return latest - earliest

def get_year(storm):
    """
    Return the year of the storm, taken from its first datapoint.

    The year matters because a majority of the analysis is based on how
    hurricanes have changed over the years.
    """
    first_point = storm['datapoints'][0]

    return first_point['dt'].year

In [15]:
# Add the derived fields to every storm

for key, value in tqdm(hurdat_dict.items(), total=len(hurdat_dict)):
    # Datapoints are updated in place, so the return value is not needed
    for record in value['datapoints']:
        datapoint_datetime(record)

    # Storm-level features; duration and year rely on the 'dt' fields above
    value['is_hurricane'] = is_hurricane(value)
    value['landfall'] = landfall(value)
    value['duration'] = storm_duration(value)
    value['year'] = get_year(value)

HBox(children=(IntProgress(value=0, max=1848), HTML(value='')))




Now, let's look at Hurricane Rina again to see what we added:

In [16]:
# Same storm entry as before, now including the added features
rina = hurdat_dict['AL192017']

In [17]:
# Display the enriched entry
rina

{'name': 'RINA',
 'datapoints': [{'record_date': '20171104',
   'time': '1200',
   'storm_status': 'LO',
   'lat': 25.5,
   'lon': -52.3,
   'max_wind': '25',
   'min_pressure': '1013',
   'category': 'TS',
   'dt': datetime.datetime(2017, 11, 4, 12, 0)},
  {'record_date': '20171104',
   'time': '1800',
   'storm_status': 'LO',
   'lat': 26.5,
   'lon': -52.1,
   'max_wind': '25',
   'min_pressure': '1013',
   'category': 'TS',
   'dt': datetime.datetime(2017, 11, 4, 18, 0)},
  {'record_date': '20171105',
   'time': '0000',
   'storm_status': 'LO',
   'lat': 27.5,
   'lon': -52.0,
   'max_wind': '25',
   'min_pressure': '1013',
   'category': 'TS',
   'dt': datetime.datetime(2017, 11, 5, 0, 0)},
  {'record_date': '20171105',
   'time': '0600',
   'storm_status': 'LO',
   'lat': 28.4,
   'lon': -52.0,
   'max_wind': '25',
   'min_pressure': '1013',
   'category': 'TS',
   'dt': datetime.datetime(2017, 11, 5, 6, 0)},
  {'record_date': '20171105',
   'time': '1200',
   'storm_status': 'LO

Great, the data looks ready for some analysis to test our hypotheses! Let's save this cleaned up version before proceeding.
#### Save cleaned Hurdat data

In [18]:
# To preserve data types, save as pickle: the entries now hold datetime and
# timedelta objects, which the json module cannot serialize by default.
with open('../data/hurricanes_cleaned.pkl', 'wb') as handle:
    pickle.dump(hurdat_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)