In [1]:
import pandas as pd

In [2]:
# Load the cleaned hurricane dataset and reduce it to the two fields needed
# for matching: the storm name (lowercased) and the year, under lowercase
# column labels.
hurricane_data = (
    pd.read_csv("../Data/hurricane_clean_dataset.csv")
    .loc[:, ["Name", "Year"]]
    .rename(columns={"Name": "name", "Year": "year"})
    .assign(name=lambda frame: frame["name"].str.lower())
)

In [3]:
# preview the first rows of the trimmed hurricane table (name and year only)
hurricane_data.head()

Unnamed: 0,name,year
0,san felipe,1928
1,camaguey,1932
2,new england,1938
3,janet,1955
4,inez,1966


In [4]:
# The NOAA csv has no header row, so the column labels are supplied here,
# in the same left-to-right order as the fields in the file.
col_names = [
    "name", "date", "time_UTC", "record_identifier", "status",
    "latitude", "longitude",
    "max_wind_knots", "min_pressure_millibars",
    "34kt_wind_radii_NE", "34kt_wind_radii_SE", "34kt_wind_radii_SW", "34kt_wind_radii_NW",
    "50kt_wind_radii_NE", "50kt_wind_radii_SE", "50kt_wind_radii_SW", "50kt_wind_radii_NW",
    "64kt_wind_radii_NE", "64kt_wind_radii_SE", "64kt_wind_radii_SW", "64kt_wind_radii_NW",
    "radius_max_wind",
]

# Load the NOAA file, treating every line as data and labelling the columns
# with the names listed above.
noaa_data = pd.read_csv(
    "../Data/NOAA_reformatted.csv",
    header=None,
    names=col_names,
)

In [5]:
# display the raw NOAA table as loaded (all columns, before any filtering)
noaa_data

Unnamed: 0,name,date,time_UTC,record_identifier,status,latitude,longitude,max_wind_knots,min_pressure_millibars,34kt_wind_radii_NE,...,34kt_wind_radii_NW,50kt_wind_radii_NE,50kt_wind_radii_SE,50kt_wind_radii_SW,50kt_wind_radii_NW,64kt_wind_radii_NE,64kt_wind_radii_SE,64kt_wind_radii_SW,64kt_wind_radii_NW,radius_max_wind
0,,AL011851,UNNAMED,14,,,,,,,...,,,,,,,,,,
1,UNNAMED,18510625,0,,HU,28.0N,94.8W,80.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
2,UNNAMED,18510625,600,,HU,28.0N,95.4W,80.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
3,UNNAMED,18510625,1200,,HU,28.0N,96.0W,80.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
4,UNNAMED,18510625,1800,,HU,28.1N,96.5W,80.0,-999.0,-999.0,...,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0,-999.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
55432,WANDA,20211107,0,,TS,37.4N,37.4W,35.0,1003.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,40.0
55433,WANDA,20211107,600,,TS,38.1N,36.4W,35.0,1004.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,45.0
55434,WANDA,20211107,1200,,LO,39.2N,34.9W,35.0,1006.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0
55435,WANDA,20211107,1800,,LO,40.9N,32.8W,40.0,1006.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,50.0


In [6]:
# Only these columns are needed to describe a storm's track.
wanted_cols = ["name", "date", "time_UTC", "latitude", "longitude"]

# Build the working frame in one pass:
#   1. keep the wanted columns,
#   2. discard rows containing NaN (the per-storm title rows are mostly NaN
#      in these columns, so this removes them),
#   3. lowercase the storm names and strip their leading whitespace.
noaa_df = (
    noaa_data[wanted_cols]
    .dropna()
    .assign(name=lambda frame: frame["name"].str.lower().str.lstrip())
)

In [7]:
# display the reduced NOAA frame after dropping NaN rows and cleaning names
noaa_df

Unnamed: 0,name,date,time_UTC,latitude,longitude
1,unnamed,18510625,0,28.0N,94.8W
2,unnamed,18510625,600,28.0N,95.4W
3,unnamed,18510625,1200,28.0N,96.0W
4,unnamed,18510625,1800,28.1N,96.5W
5,unnamed,18510625,2100,28.2N,96.8W
...,...,...,...,...,...
55432,wanda,20211107,0,37.4N,37.4W
55433,wanda,20211107,600,38.1N,36.4W
55434,wanda,20211107,1200,39.2N,34.9W
55435,wanda,20211107,1800,40.9N,32.8W


In [8]:
# The date column is sliced by character position (first four characters,
# next two, remainder) into separate year, month and day text columns.
noaa_df["year"] = noaa_df["date"].str[:4]
noaa_df["month"] = noaa_df["date"].str[4:6]
noaa_df["day"] = noaa_df["date"].str[6:]

noaa_df

Unnamed: 0,name,date,time_UTC,latitude,longitude,year,month,day
1,unnamed,18510625,0,28.0N,94.8W,1851,06,25
2,unnamed,18510625,600,28.0N,95.4W,1851,06,25
3,unnamed,18510625,1200,28.0N,96.0W,1851,06,25
4,unnamed,18510625,1800,28.1N,96.5W,1851,06,25
5,unnamed,18510625,2100,28.2N,96.8W,1851,06,25
...,...,...,...,...,...,...,...,...
55432,wanda,20211107,0,37.4N,37.4W,2021,11,07
55433,wanda,20211107,600,38.1N,36.4W,2021,11,07
55434,wanda,20211107,1200,39.2N,34.9W,2021,11,07
55435,wanda,20211107,1800,40.9N,32.8W,2021,11,07


In [9]:
# Give both frames a shared "<name> <year>" key so storms can be matched
# between the cleaned dataset and the NOAA records.
hurricane_data["id"] = hurricane_data["name"].str.cat(
    hurricane_data["year"].astype(str), sep=" "
)

noaa_df["id"] = noaa_df["name"].str.cat(noaa_df["year"].astype(str), sep=" ")

In [10]:
# check the new id column on the cleaned hurricane table
hurricane_data.head()

Unnamed: 0,name,year,id
0,san felipe,1928,san felipe 1928
1,camaguey,1932,camaguey 1932
2,new england,1938,new england 1938
3,janet,1955,janet 1955
4,inez,1966,inez 1966


In [11]:
# check the new id column on the NOAA frame
noaa_df.head()

Unnamed: 0,name,date,time_UTC,latitude,longitude,year,month,day,id
1,unnamed,18510625,0,28.0N,94.8W,1851,6,25,unnamed 1851
2,unnamed,18510625,600,28.0N,95.4W,1851,6,25,unnamed 1851
3,unnamed,18510625,1200,28.0N,96.0W,1851,6,25,unnamed 1851
4,unnamed,18510625,1800,28.1N,96.5W,1851,6,25,unnamed 1851
5,unnamed,18510625,2100,28.2N,96.8W,1851,6,25,unnamed 1851


In [12]:
# Distinct storm ids from the cleaned dataset, in order of first appearance;
# used below to select the matching NOAA rows.
hurr_names = hurricane_data["id"].drop_duplicates().tolist()

In [13]:
# Keep the NOAA rows whose id also appears in the cleaned dataset, then
# renumber the index from zero so it no longer carries the original row labels.
noaa_match_names = (
    noaa_df[noaa_df["id"].isin(hurr_names)]
    .reset_index(drop=True)
)
noaa_match_names

Unnamed: 0,name,date,time_UTC,latitude,longitude,year,month,day,id
0,baker,19500818,1200,12.0N,54.0W,1950,08,18,baker 1950
1,baker,19500818,1800,12.5N,54.2W,1950,08,18,baker 1950
2,baker,19500819,0,13.0N,54.4W,1950,08,19,baker 1950
3,baker,19500819,600,13.5N,54.7W,1950,08,19,baker 1950
4,baker,19500819,1200,14.0N,55.0W,1950,08,19,baker 1950
...,...,...,...,...,...,...,...,...,...
3811,larry,20210911,0,45.1N,56.6W,2021,09,11,larry 2021
3812,larry,20210911,330,47.3N,54.6W,2021,09,11,larry 2021
3813,larry,20210911,600,48.8N,53.3W,2021,09,11,larry 2021
3814,larry,20210911,1200,52.5N,49.7W,2021,09,11,larry 2021


In [14]:
# write the matched NOAA track rows to disk as a JSON file (not a csv)
noaa_match_names.to_json("../Data/hurricane_path.json")