In [1]:
from glob import glob
import pandas as pd
import matplotlib.pyplot as plot
from collections import Counter
import seaborn as sns
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
import numpy as np
from datetime import datetime
import urllib.request
from bs4 import BeautifulSoup
import requests

pd.set_option('display.max_columns', None)
cols =['BASIN', 'CY', 'YYYYMMDDHH', 'TECHNUM', 'TECH', 'TAU', 'LATITUDE', 'LONGITUDE', 'MAX_SUS_WIND_SPEED', 'MIN_SEA_LVL_PRESSURE', 'LVL_OF_DEVELOPMENT', 'WIND_INTENSITY_FOR_RADII_DEFINED', 'WINDCODE', 'RAD1', 'RAD2', 'RAD3', 'RAD4','POUTER', 'ROUTER', 'RADIUS_OF_MAX_WINDS', 'GUSTS', 'EYE', 'SUBREGION', 'MAXSEAS', 'INITIALS', 'DIR', 'SPEED', 'STORMNAME', 'DEPTH', 'SEAS', 'SEASCODE', 'SEAS1', 'SEAS2', 'SEAS3', 'SEAS4']


In [2]:
# Cleans up our storm data into what we actually want
def cleanStorm(df):
    # Drop Random Columns
    df = df.iloc[:,0:35]
    # Set Column Names
    df.columns = cols
    # Drop not useful columns
    df.drop(['TECHNUM', 'TECH', 'TAU', 'LVL_OF_DEVELOPMENT', 'WIND_INTENSITY_FOR_RADII_DEFINED', 'WINDCODE','RAD1', 'RAD2', 'RAD3','RAD4','POUTER', 'ROUTER', 'RADIUS_OF_MAX_WINDS', 'RADIUS_OF_MAX_WINDS', 'EYE', 'GUSTS', 'SUBREGION', 'MAXSEAS', 'INITIALS', 'DIR', 'SPEED', 'DEPTH', 'SEAS', 'SEASCODE', 'SEAS1', 'SEAS2', 'SEAS3', 'SEAS4' ], axis = 1, inplace = True)

    # Convert time stamp from int to string
    df.YYYYMMDDHH = df.YYYYMMDDHH.astype(str)
    # slice off the year and create our time code
    df['STORM_CODE'] = df.BASIN + df.CY.astype(str) + df.YYYYMMDDHH.str.slice(stop=4)
    # format time stamp to datetime fromat
    df.YYYYMMDDHH = pd.to_datetime(pd.Series(df.YYYYMMDDHH), format = '%Y%m%d%H' )

    
    # Make lattidue and longitude numeric
    for index in range(len(df['LATITUDE'])):
        holdThis = df['LATITUDE'][index]
        if holdThis[-1] == 'N':
            df['LATITUDE'][index] = int(holdThis[:(len(holdThis) -1)]) / 10
        elif holdThis[-1] == 'S':
            df['LATITUDE'][index] = int(holdThis[:(len(holdThis) -1)]) / -10
            
    for index in range(len(df['LONGITUDE'])):
        holdThat = df['LONGITUDE'][index]
        if holdThat[-1] == 'E':

            df['LONGITUDE'][index] = int(holdThat[:(len(holdThat) -1)]) / 10
        elif holdThat[-1] == 'W':
            df['LONGITUDE'][index] = int(holdThat[:(len(holdThat) -1)]) / -10
    # change the datatype to a float      
    df["LATITUDE"] = pd.to_numeric(df["LATITUDE"])
    df["LONGITUDE"] = pd.to_numeric(df["LONGITUDE"])
    
    #Change the storm name for older enteries like invest to what the strom was eventually named
    l = df.STORMNAME.iat[-1]
    df['STORMNAME'] = l

 
    
    return df


    

In [5]:
url = 'http://hurricanes.ral.ucar.edu/repository/data/bdecks_open/2018/'
html_content = requests.get(url).text
soup = BeautifulSoup(html_content, "lxml")
# Grabbing all of the links off of the webpage
li = []
for link in soup.find_all("a"):
    li.append(format(link.get("href")))

links = []
while li:
	link = li.pop()
	if link[-4:] == '.dat':
		links.append(link)
print(len(links))

163


In [4]:
w = []
tossed = 0 
for link in links:
    name = url +links.pop()
    df = pd.read_csv(name, header = None, error_bad_lines=False)
    if(len(df.columns) >= 35):
        df = cleanStorm(df)
        if df.MAX_SUS_WIND_SPEED.max() >= 35:
            w.append(df)
        else:
            tossed += 1
    else:
        tossed += 1

dataframe = pd.concat(w, axis=0,  ignore_index=True,  sort = False)



b'Skipping line 5: expected 38 fields, saw 40\nSkipping line 20: expected 38 fields, saw 40\n'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
b'Skipping line 8: expected 38 fields, saw 40\nSkipping line 15: expected 38 fields, saw 40\n'
b'Skipping line 5: expected 38 fields, saw 40\nSkipping line 6: expected 38 fields, saw 40\n'
b'Skipping line 5: expected 38 fields, saw 40\nSkipping line 6: expected 38 fields, saw 40\n'
b'Skipping line 5: expected 38 fields, saw 40\nSkipping line 21: expected 38 fields, saw 40\nSkipping line 22: expected 38 fields, saw 42\n'
b'Skipping line 8: expected 38 fields, saw 40\nSkipping line 17: expect

In [35]:
tossed

37

In [5]:
dataframe

Unnamed: 0,BASIN,CY,YYYYMMDDHH,LATITUDE,LONGITUDE,MAX_SUS_WIND_SPEED,MIN_SEA_LVL_PRESSURE,STORMNAME,STORM_CODE
0,AL,1,2018-05-20 18:00:00,17.2,-81.3,20,1010,ALBERTO,AL12018
1,AL,1,2018-05-21 00:00:00,17.3,-82.2,20,1010,ALBERTO,AL12018
2,AL,1,2018-05-21 06:00:00,17.4,-83.1,20,1010,ALBERTO,AL12018
3,AL,1,2018-05-21 12:00:00,17.5,-83.8,20,1010,ALBERTO,AL12018
4,AL,1,2018-05-22 00:00:00,17.6,-85.2,25,1009,ALBERTO,AL12018
...,...,...,...,...,...,...,...,...,...
2327,IO,8,2018-12-16 18:00:00,14.2,82.2,55,996,PHETHAI,IO82018
2328,IO,8,2018-12-17 00:00:00,15.3,82.4,45,998,PHETHAI,IO82018
2329,IO,8,2018-12-17 06:00:00,16.2,82.3,45,1002,PHETHAI,IO82018
2330,IO,8,2018-12-17 12:00:00,18.0,82.3,35,1007,PHETHAI,IO82018


In [7]:
dataframe.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2332 entries, 0 to 2331
Data columns (total 9 columns):
BASIN                   2332 non-null object
CY                      2332 non-null int64
YYYYMMDDHH              2332 non-null datetime64[ns]
LATITUDE                2332 non-null float64
LONGITUDE               2332 non-null float64
MAX_SUS_WIND_SPEED      2332 non-null int64
MIN_SEA_LVL_PRESSURE    2332 non-null int64
STORMNAME               2332 non-null object
STORM_CODE              2332 non-null object
dtypes: datetime64[ns](1), float64(2), int64(3), object(3)
memory usage: 164.1+ KB


In [8]:
dataframe.shape

(2332, 9)

In [9]:
dataframe.describe()

Unnamed: 0,CY,LATITUDE,LONGITUDE,MAX_SUS_WIND_SPEED,MIN_SEA_LVL_PRESSURE
count,2332.0,2332.0,2332.0,2332.0,2332.0
mean,13.060034,20.604631,-84.315523,63.75,975.38422
std,12.226477,8.19375,56.162919,30.912972,99.584714
min,1.0,7.0,-179.5,10.0,0.0
25%,7.0,14.5,-123.925,35.0,970.0
50%,12.0,17.7,-99.3,60.0,989.0
75%,16.0,25.6,-48.2,85.0,1003.0
max,96.0,51.2,178.6,140.0,1021.0
