In [20]:
#imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from global_land_mask import globe
import reverse_geocoder as rg

# Build the Dataset

In [21]:
# Load the storm records, then report the frame's shape and the number of distinct storm codes
hurdat = pd.read_csv('hurdat.csv')
print(hurdat.shape, hurdat['Code'].nunique(), sep='\n')

(52717, 22)
1924


In [22]:
# Show the column labels of the loaded frame
hurdat.columns

Index(['Code', 'Name', 'Date', 'Time', 'Record Identifier', 'Storm Status',
       'Latitude', 'Longitude', 'Max Sustained Wind Speed', 'Minumum Pressure',
       '34 KT Wind Radii Max NE', '34 KT Wind Radii Max SE',
       '34 KT Wind Radii Max SW', '34 KT Wind Radii Max NW',
       '50 KT Wind Radii Max NE', '50 KT Wind Radii Max SE',
       '50 KT Wind Radii Max SW', '50 KT Wind Radii Max NW',
       '64 KT Wind Radii Max NE', '64 KT Wind Radii Max SE',
       '64 KT Wind Radii Max SW', '64 KT Wind Radii Max NW'],
      dtype='object')

In [23]:
# Parse the YYYYMMDD values in Date into real timestamps.
# Malformed values raise here (the pandas default) instead of silently leaving the
# column unparsed: the .dt accessors used on Date afterwards require a datetime dtype,
# and errors='ignore' is deprecated in recent pandas releases.
hurdat['Date'] = pd.to_datetime(hurdat['Date'], format='%Y%m%d')

In [24]:
## Split Date into day, month and year columns for visuals
hurdat = hurdat.assign(
    Day=hurdat['Date'].dt.day,
    Month=hurdat['Date'].dt.month,
    Year=hurdat['Date'].dt.year,
)

In [25]:
### Decade start and decade label for visuals
# Decades run from a year ending in 1 through the next year ending in 0
# (e.g. 1851 - 1860). Shifting the year back by one before flooring to the
# decade puts years ending in 0 into the decade that began nine years earlier.
hurdat['Decade_Start'] = ((hurdat['Year'] - 1) // 10) * 10 + 1

# Label of the form "<start> - <start + 9>"
hurdat['Decade_Range'] = hurdat['Decade_Start'].astype('str').str.cat(
    (hurdat['Decade_Start'] + 9).astype('str'), sep=' - ')

## Create Categories

In [26]:
# Storm category derived from max sustained wind speed.
# Each band is inclusive on both ends; rows matching no band keep
# np.select's default of 0.
conditions = [
    hurdat['Max Sustained Wind Speed'].between(64, 82),
    hurdat['Max Sustained Wind Speed'].between(83, 95),
    hurdat['Max Sustained Wind Speed'].between(96, 112),
    hurdat['Max Sustained Wind Speed'].between(113, 136),
    hurdat['Max Sustained Wind Speed'].ge(137),
]
values = list(range(1, 6))
hurdat['Category'] = np.select(conditions, values)

## Create Major Storm Category

In [27]:
# Major Storm flag: 1 when Category is 3 or higher, 0 otherwise
conditions = [
    hurdat['Category'].between(0, 2),
    hurdat['Category'].ge(3),
]
values = [0, 1]
hurdat['Major Storm'] = np.select(conditions, values)

In [28]:
# Number of distinct storm codes with at least one row flagged as a major storm
hurdat.loc[hurdat['Major Storm'].eq(1), 'Code'].nunique()

325

## Convert the latitude and longitude

In [29]:
def convert_coord(lat, long):
    '''Convert hemisphere-suffixed coordinate strings to signed floats.

    Parameters
    ----------
    lat : str
        Latitude as a number followed by a hemisphere letter, e.g. '28.0N'.
        A trailing 'S' makes the result negative.
    long : str
        Longitude as a number followed by a hemisphere letter, e.g. '94.8W'.
        A trailing 'W' makes the result negative.

    Returns
    -------
    tuple of (float, float)
        (latitude, longitude) as signed decimal values.

    Raises
    ------
    ValueError
        If the part before the hemisphere letter is not a valid number.

    Surrounding whitespace is ignored and the hemisphere letter is matched
    case-insensitively, so padded fields such as ' 28.0N ' are accepted.
    '''
    # Strip first: with trailing padding the last character would be a space,
    # the hemisphere letter would end up inside the numeric part, and float() would fail.
    lat, long = lat.strip(), long.strip()

    latitude, longitude = float(lat[:-1]), float(long[:-1])

    if lat[-1].upper() == 'S':
        latitude *= -1
    if long[-1].upper() == 'W':
        longitude *= -1

    return latitude, longitude

In [30]:
def convert_coords(coordinates):
    '''Apply convert_coord to every (latitude, longitude) pair.

    coordinates is an iterable of pairs; each pair is unpacked into
    convert_coord and the results are returned as a list in input order.
    '''
    return [convert_coord(*pair) for pair in coordinates]

In [31]:
# Pair up the raw Latitude/Longitude strings, convert each pair to signed floats,
# and collect the results in a two-column frame
converted = pd.DataFrame(
    convert_coords(list(zip(hurdat['Latitude'], hurdat['Longitude']))),
    columns=['Latitude_c', 'Longitude_c'],
)

In [32]:
# Append the converted coordinate columns to the main frame (aligned on the row index)
hurdat = pd.concat(objs=[hurdat, converted], axis='columns')

## Find if a storm is on land

In [33]:
# Ask global_land_mask whether each converted coordinate is on land,
# store the answers in a one-column frame, and preview a random handful
land = pd.DataFrame(
    {'Land': list(globe.is_land(hurdat['Latitude_c'], hurdat['Longitude_c']))}
)
land.sample(5)

Unnamed: 0,Land
42681,False
14383,False
23807,False
1428,False
14945,True


In [34]:
# Append the Land column to the main frame (aligned on the row index)
hurdat = pd.concat(objs=[hurdat, land], axis='columns')

## Find the State

In [35]:
# Reverse-geocode every converted coordinate and keep the 'admin1' field
# of each result as the State column
coor = list(zip(hurdat['Latitude_c'], hurdat['Longitude_c']))
info = rg.search(coor)
state = pd.DataFrame({'State': [hit['admin1'] for hit in info]})

In [36]:
# Append the State column to the main frame (aligned on the row index)
hurdat = pd.concat(objs=[hurdat, state], axis='columns')

In [37]:
# Display the assembled frame; the rendered output elides the middle rows and columns
hurdat

Unnamed: 0,Code,Name,Date,Time,Record Identifier,Storm Status,Latitude,Longitude,Max Sustained Wind Speed,Minumum Pressure,...,Month,Year,Decade_Start,Decade_Range,Category,Major Storm,Latitude_c,Longitude_c,Land,State
0,AL011851,UNNAMED,1851-06-25,0,,HU,28.0N,94.8W,80,-999,...,6,1851,1851,1851 - 1860,1,0,28.0,-94.8,False,Texas
1,AL011851,UNNAMED,1851-06-25,600,,HU,28.0N,95.4W,80,-999,...,6,1851,1851,1851 - 1860,1,0,28.0,-95.4,False,Texas
2,AL011851,UNNAMED,1851-06-25,1200,,HU,28.0N,96.0W,80,-999,...,6,1851,1851,1851 - 1860,1,0,28.0,-96.0,False,Texas
3,AL011851,UNNAMED,1851-06-25,1800,,HU,28.1N,96.5W,80,-999,...,6,1851,1851,1851 - 1860,1,0,28.1,-96.5,False,Texas
4,AL011851,UNNAMED,1851-06-25,2100,L,HU,28.2N,96.8W,80,-999,...,6,1851,1851,1851 - 1860,1,0,28.2,-96.8,False,Texas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52712,AL312020,IOTA,2020-11-17,1200,,HU,13.7N,84.7W,75,965,...,11,2020,2011,2011 - 2020,1,0,13.7,-84.7,True,Atlantico Norte (RAAN)
52713,AL312020,IOTA,2020-11-17,1800,,TS,13.7N,85.7W,55,988,...,11,2020,2011,2011 - 2020,0,0,13.7,-85.7,True,Nueva Segovia
52714,AL312020,IOTA,2020-11-18,0,,TS,13.8N,86.7W,40,1000,...,11,2020,2011,2011 - 2020,0,0,13.8,-86.7,True,El Paraiso
52715,AL312020,IOTA,2020-11-18,600,,TS,13.8N,87.8W,35,1005,...,11,2020,2011,2011 - 2020,0,0,13.8,-87.8,True,La Union


In [87]:
# Combine the calendar Date with the Time column into one timestamp.
# Time is an integer in HHMM form (0 -> 00:00, 600 -> 06:00, 2100 -> 21:00):
# integer-dividing by 100 gives the hours and the remainder gives the minutes.
# Both parts are added so that a non-zero minute component is kept rather than
# being dropped by taking only the first two digits.
hurdat["datetime"] = (
    hurdat["Date"]
    + pd.to_timedelta(hurdat["Time"] // 100, unit="h")
    + pd.to_timedelta(hurdat["Time"] % 100, unit="min")
)

In [89]:
# Converted coordinates indexed by (Code, datetime).
# To normalize, chain:
# .agg({"Latitude_c": lambda x: x/90, "Longitude_c": lambda x: x/180})
y = hurdat.set_index(["Code", "datetime"])[["Latitude_c", "Longitude_c"]]
y

Unnamed: 0_level_0,Latitude_c,Longitude_c
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1
1851-06-25 00:00:00,28.0,-94.8
1851-06-25 06:00:00,28.0,-95.4
1851-06-25 12:00:00,28.0,-96.0
1851-06-25 18:00:00,28.1,-96.5
1851-06-25 21:00:00,28.2,-96.8
...,...,...
2020-11-17 12:00:00,13.7,-84.7
2020-11-17 18:00:00,13.7,-85.7
2020-11-18 00:00:00,13.8,-86.7
2020-11-18 06:00:00,13.8,-87.8


In [38]:
# Distinct State values, in order of first appearance
hurdat.State.unique()

array(['Texas', 'Veracruz', 'Eastern Tobago', 'Guyane', 'Saint Philip',
       'Saint Joseph', '', 'Saint Anthony', 'Saint Croix Island',
       'Arroyo', 'Cabo Rojo', 'San Pedro de Macoris', 'Barahona',
       'Artibonite', 'Grandans', 'Santiago de Cuba', 'Las Tunas',
       'Camaguey', 'Cienfuegos', 'Matanzas', 'La Habana', 'Artemisa',
       'Pinar del Rio', 'Florida', 'Georgia', 'South Carolina',
       'North Carolina', 'Virginia', 'Maryland', 'New York',
       'Massachusetts', 'Nova Scotia', 'Miquelon-Langlade',
       'Newfoundland and Labrador', 'North Abaco', 'Isabela', 'Samana',
       'Maria Trinidad Sanchez', 'Mayaguana', 'Acklins', 'Ragged Island',
       'Ciego de Avila', 'Villa Clara', 'Alabama', 'Mississippi',
       'Guayama', 'Mayaguez', 'La Altagracia', 'Duarte',
       'Santiago Rodriguez', 'Nord', 'Louisiana', 'Saint Peter',
       'Barbuda', 'Saint Thomas Island', 'Loiza', 'Arecibo', 'Aguadilla',
       'San Juan', 'Puerto Plata', 'Monte Cristi', 'Inagua',
      