# Hurricane Data Cleaning
by David Nguyen - github: @david-kishi

### Imports

In [1]:
import pandas as pd

### Declarations

In [2]:
hurricane_path = 'bq-results-20200106-205330-dpcbw0ewpjpb.csv'

### Create dataframe from hurricane data csv

In [3]:
hurricane_df = pd.read_csv(hurricane_path, low_memory=False)
hurricane_df.head()

Unnamed: 0,season,basin,subbasin,name,iso_time,nature,latitude,longitude,usa_wind,usa_pressure
0,2018,,,LESLIE,2018-09-26 12:00:00 UTC,NR,33.7468,-40.0305,57.0,986.0
1,2018,EP,MM,XAVIER,2018-11-06 06:00:00 UTC,NR,18.9371,-107.832,38.0,1005.0
2,2018,EP,CP,OLIVIA,2018-09-14 06:00:00 UTC,NR,18.9405,-164.639,29.0,1008.0
3,2018,,,GORDON,2018-09-06 06:00:00 UTC,NR,33.5131,-90.9543,28.0,1013.0
4,2018,,,GORDON,2018-09-06 18:00:00 UTC,NR,34.4372,-91.6335,32.0,1013.0


In [4]:
hurricane_df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 176211 entries, 0 to 176210
Data columns (total 10 columns):
season          176211 non-null int64
basin           57928 non-null object
subbasin        86212 non-null object
name            176211 non-null object
iso_time        176211 non-null object
nature          176211 non-null object
latitude        176211 non-null float64
longitude       176211 non-null float64
usa_wind        154825 non-null float64
usa_pressure    76971 non-null float64
dtypes: float64(4), int64(1), object(5)
memory usage: 13.4+ MB


In [5]:
hurricane_df.basin.isna().sum()

118283

In [6]:
hurricane_df.usa_wind.isna().sum()

21386

### Get rows with basin defined within North American waters

In [7]:
# BigQuery still included nulls. Let's get rid of those by just selecting the basin we need.
hurricane_clean = hurricane_df.loc[(hurricane_df.basin == 'NA') | (hurricane_df.basin == 'EP')]
hurricane_clean.count()

season          57928
basin           57928
subbasin        57928
name            57928
iso_time        57928
nature          57928
latitude        57928
longitude       57928
usa_wind        57095
usa_pressure    30937
dtype: int64

In [8]:
hurricane_clean = hurricane_clean.loc[hurricane_clean.usa_wind.notnull()].reset_index()
hurricane_clean

Unnamed: 0,index,season,basin,subbasin,name,iso_time,nature,latitude,longitude,usa_wind,usa_pressure
0,1,2018,EP,MM,XAVIER,2018-11-06 06:00:00 UTC,NR,18.9371,-107.832,38.0,1005.0
1,2,2018,EP,CP,OLIVIA,2018-09-14 06:00:00 UTC,NR,18.9405,-164.639,29.0,1008.0
2,5,2018,EP,CP,OLIVIA,2018-09-14 12:00:00 UTC,NR,18.8559,-165.699,29.0,1008.0
3,11,1966,EP,CP,KATHY,1966-10-20 06:00:00 UTC,TS,44.6143,180.000,55.0,
4,12,1992,EP,CP,DAN,1992-10-25 06:00:00 UTC,TS,11.6714,180.229,30.0,
...,...,...,...,...,...,...,...,...,...,...,...
57090,175861,2015,EP,MM,ENRIQUE,2015-07-15 12:00:00 UTC,TS,18.8000,-132.600,40.0,1003.0
57091,175862,2001,EP,MM,HENRIETTE,2001-09-05 06:00:00 UTC,TS,17.9000,-112.900,35.0,1005.0
57092,175874,2009,EP,MM,LINDA,2009-09-11 18:00:00 UTC,TS,20.5273,-131.173,35.0,1005.0
57093,175879,2014,EP,CP,ANA,2014-10-23 18:00:00 UTC,TS,26.7000,-169.500,45.0,999.0


In [9]:
hurricane_clean.count()

index           57095
season          57095
basin           57095
subbasin        57095
name            57095
iso_time        57095
nature          57095
latitude        57095
longitude       57095
usa_wind        57095
usa_pressure    30937
dtype: int64

### Output cleaned data CSV

In [10]:
hurricane_clean.to_csv('hurricane_cleaned_df.csv')