In [18]:
#######################################
# CS 424 - Project 2
# Kevin Kowalski
# Samuel Kajah
# Vijay Vemu
#######################################
#
# This file contains the code to clean
# the original data set. It outputs
# the necessary plots and updated data
# set to be used by shiny.
#
# This file was used for testing and 
# plotting before using RStudio for a
# dashboard.
#
#######################################

In [19]:
# import libraries
library(comprehenr)
library(dplyr)
library(hashmap)
library(leaflet)
library(lubridate)
library(stringr)

In [20]:
# create appropriate column names for data in the following format
"
data rows
col 1 – date YYYYmmDD
col 2 – time HHMM
col 3 – record identifier (factor)
col 4 – storm type (factor)
col 5 – latitude (n/s)
col 6 – longitude (e/w)
col 7 – Maximum sustained wind (in knots)
col 8 – Minimum Pressure (in millibars)
col 9 – 34 kt wind radii maximum extent in northeastern quadrant (in nautical miles) 
col 10 – 34 kt wind radii maximum extent in southeastern quadrant (in nautical miles) 
col 11 – 34 kt wind radii maximum extent in southwestern quadrant (in nautical miles) 
col 12 – 34 kt wind radii maximum extent in northwestern quadrant (in nautical miles) 
col 13 – 50 kt wind radii maximum extent in northeastern quadrant (in nautical miles) 
col 14 – 50 kt wind radii maximum extent in southeastern quadrant (in nautical miles) 
col 15 – 50 kt wind radii maximum extent in southwestern quadrant (in nautical miles) 
col 16 – 50 kt wind radii maximum extent in northwestern quadrant (in nautical miles) 
col 17 – 64 kt wind radii maximum extent in northeastern quadrant (in nautical miles)
col 18 – 64 kt wind radii maximum extent in southeastern quadrant (in nautical miles)
col 19 – 64 kt wind radii maximum extent in southwestern quadrant (in nautical miles) 
col 20 – 64 kt wind radii maximum extent in northwestern quadrant (in nautical miles)
"

# Column names applied to both basin data sets. The eight leading names are
# followed by the twelve wind-radii names, generated as
# Wind_Radii_<quadrant>_<threshold> (quadrants cycle NE, SE, SW, NW inside each
# of the 34/50/64 kt thresholds), and a final 'Size' name for a 21st column
# that the layout description above does not list.
data_row_header <- c(
    "Date", "Time", "Record_ID", "Storm_Type", "Lat", "Lon", "Speed", "Pressure",
    paste0(
        "Wind_Radii_",
        rep(c("NE", "SE", "SW", "NW"), times = 3),
        "_",
        rep(c("34", "50", "64"), each = 4)
    ),
    "Size"
)

In [21]:
# Rename the original HURDAT2 text files to easier-to-work-with CSV names.
# The cell is safe to re-run: when a raw file is present it is renamed exactly
# as before; when it is already gone but the CSV exists the step is skipped;
# when neither exists the script stops here with a clear message instead of
# failing later inside read.csv(). local() keeps the loop helpers out of the
# global environment.
local({
    renames <- c(
        "hurdat2-1851-2018-120319.txt" = "atlantic_hurricanes.csv",
        "hurdat2-nepac-1949-2018-122019.txt" = "pacific_hurricanes.csv"
    )
    for (raw_file in names(renames)) {
        csv_file <- renames[[raw_file]]
        if (file.exists(raw_file)) {
            if (!file.rename(raw_file, csv_file)) {
                stop("Could not rename '", raw_file, "' to '", csv_file, "'",
                     call. = FALSE)
            }
        } else if (!file.exists(csv_file)) {
            stop("Neither '", raw_file, "' nor '", csv_file, "' was found",
                 call. = FALSE)
        }
    }
})

In [22]:
# Load both basin files. They have no header line, and strings are kept as
# plain character vectors rather than factors.
atlantic_data <- read.csv("atlantic_hurricanes.csv", header = FALSE, stringsAsFactors = FALSE)
pacific_data <- read.csv("pacific_hurricanes.csv", header = FALSE, stringsAsFactors = FALSE)

# Label the columns of both data frames with the shared header vector.
names(atlantic_data) <- data_row_header
names(pacific_data) <- data_row_header

In [23]:
# Strip leading/trailing whitespace from the first six columns of both data
# sets, one column position at a time.
for (col in seq_len(6)) {
    atlantic_data[[col]] <- str_trim(atlantic_data[[col]])
    pacific_data[[col]] <- str_trim(pacific_data[[col]])
}

In [24]:
# Create a Timestamp column from the Date (YYYYmmDD) and Time (HHMM) strings.
# HURDAT2 records its times in UTC, so they are parsed as UTC. Parsing them as
# a local zone with daylight saving (previously America/Chicago) mislabels the
# instants and can turn wall-clock times skipped by a DST change into NA.
# Rows whose Date/Time cannot be parsed (the storm header rows) become NA;
# quiet = TRUE suppresses the warning for those.
atlantic_data <- atlantic_data %>%
    mutate(Timestamp = parse_date_time(paste(Date, Time, sep = " "), "Ymd HM",
                                       tz = "UTC", quiet = TRUE))
pacific_data <- pacific_data %>%
    mutate(Timestamp = parse_date_time(paste(Date, Time, sep = " "), "Ymd HM",
                                       tz = "UTC", quiet = TRUE))

In [25]:
# Convert hemisphere-suffixed coordinates (e.g. "28.0N", "94.8W") to signed
# numbers, because plotting only accepts numeric values.
#   N / E suffix -> positive, any other suffix -> negative (S / W expected).
#
# coordinates: character vector (or factor) of coordinates whose last character
#              is the hemisphere letter.
# Returns a numeric vector of the same length. Empty strings and NA inputs give
# NA instead of raising an error; text that is not a number gives NA with the
# usual coercion warning from as.numeric().
remake_coordinates <- function(coordinates) {
    coordinates <- as.character(coordinates)
    last_char <- nchar(coordinates)
    magnitude <- as.numeric(substr(coordinates, 1, last_char - 1)) # just the digits
    bearing <- substr(coordinates, last_char, last_char)           # hemisphere letter

    # start from "negative" and flip only the north/east entries
    direction <- rep(-1, length(coordinates))
    direction[bearing %in% c("N", "E")] <- 1
    direction * magnitude
}

In [26]:
# Replace the hemisphere-suffixed Lat/Lon strings in both data sets with
# signed, plot-friendly numbers.
atlantic_data[c("Lat", "Lon")] <- lapply(atlantic_data[c("Lat", "Lon")], remake_coordinates)
pacific_data[c("Lat", "Lon")] <- lapply(pacific_data[c("Lat", "Lon")], remake_coordinates)

In [27]:
# custom functions for reading and formatting data

# Decide whether a row is the header line of a storm entry: a row counts as a
# header when at least 15 of its values are NA.
#
# row: a one-row data frame (or a vector) taken from the raw data.
# Returns TRUE for header rows and FALSE otherwise.
is_header_row <- function(row) {
    missing_count <- sum(is.na(row))
    missing_count >= 15
}

# Find the row indices of all storm header rows.
#
# data: data frame holding header rows and record rows interleaved.
# Returns an integer vector of row indices for which is_header_row() is TRUE
# (integer(0) when there are none).
# seq_len() is used so that an empty data frame yields no indices; 1:nrow(data)
# would have visited the non-existent rows 1 and 0.
header_locations <- function(data) {
    header_flags <- vapply(
        seq_len(nrow(data)),
        function(row) is_header_row(data[row, ]),
        logical(1)
    )
    which(header_flags)
}

# Pick the display name for a storm from its header row.
#
# header_df: one-row data frame for the storm header; the name is read from
#            column 2.
# prefix:    text placed in front of generated names (e.g. "ATLANTIC").
# id:        storm number used in generated names.
# Returns the header's own name, or "<prefix> STORM <id>" when the header name
# is "UNNAMED".
storm_name_from_header <- function(header_df, prefix, id) {
    header_name <- header_df[1, 2]
    if (header_name != "UNNAMED") {
        return(header_name)
    }
    paste(prefix, "STORM", id, sep = " ")
}

# Split the raw data into one data frame per storm.
#
# data:           data frame with header rows and record rows interleaved; must
#                 already contain a Timestamp column.
# header_indices: row indices of the header rows, in increasing order.
# prefix:         used when naming unnamed storms; if prefix is "ATLANTIC" then
#                 unnamed storms are named ATLANTIC STORM <storm_id>, where
#                 <storm_id> is the storm's position in header_indices.
# Returns a list with one data frame per header. Each holds the rows between
# that header and the next one (or the end of the data), with Storm_Name,
# Unique_ID (column 1 of the header row) and Timestamp moved to the front.
#
# Row ranges are built with seq_len(), so a header followed by no records
# gives an empty data frame (a descending a:b range would pick up wrong rows),
# and an empty header_indices gives an empty list.
make_huricane_data <- function(data, header_indices, prefix) {
    n_storms <- length(header_indices)
    tables <- vector("list", n_storms) # data associated with each storm

    # last record row of each storm: the row before the next header, or the
    # final row of the data for the last storm
    last_rows <- c(header_indices[-1] - 1, nrow(data))

    for (i in seq_len(n_storms)) {
        header_row <- header_indices[i]
        current_header <- data[header_row, ]
        storm_name <- storm_name_from_header(current_header, prefix, i)

        record_rows <- header_row + seq_len(last_rows[i] - header_row)
        tables[[i]] <- data[record_rows, ] %>%
            mutate(Storm_Name = storm_name) %>% # add the storm name
            mutate(Unique_ID = current_header[1, 1]) %>% # add the storm unique ID
            select(Storm_Name, Unique_ID, Timestamp, everything()) # new columns first
    }
    tables
}

In [28]:
# Locate the storm header rows in each data set.
atlantic_header_indices <- header_locations(data = atlantic_data)
pacific_header_indices <- header_locations(data = pacific_data)

In [29]:
# Replace each raw data frame with a list of per-storm data frames; this is the
# main storage for the data and makes it easy to plot.
atlantic_data <- make_huricane_data(
    data = atlantic_data,
    header_indices = atlantic_header_indices,
    prefix = "ATLANTIC"
)
pacific_data <- make_huricane_data(
    data = pacific_data,
    header_indices = pacific_header_indices,
    prefix = "PACIFIC"
)

# if needed, can merge into one list
#combined_data = c(atlantic_data, pacific_data)

In [30]:
# Persist both cleaned per-storm lists as RDS files.
saveRDS(object = atlantic_data, file = "atlantic_data.rds")
saveRDS(object = pacific_data, file = "pacific_data.rds")

In [31]:
# Show the first six records of the first storm in each data set.
print("Atlantic Data")
head(atlantic_data[[1]], n = 6L)
print("Pacific Data")
head(pacific_data[[1]], n = 6L)

[1] "Atlantic Data"


Storm_Name,Unique_ID,Timestamp,Date,Time,Record_ID,Storm_Type,Lat,Lon,Speed,...,Wind_Radii_NW_34,Wind_Radii_NE_50,Wind_Radii_SE_50,Wind_Radii_SW_50,Wind_Radii_NW_50,Wind_Radii_NE_64,Wind_Radii_SE_64,Wind_Radii_SW_64,Wind_Radii_NW_64,Size
ATLANTIC STORM 1,AL011851,1851-06-25 00:00:00,18510625,0,,HU,28.0,-94.8,80,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,
ATLANTIC STORM 1,AL011851,1851-06-25 06:00:00,18510625,600,,HU,28.0,-95.4,80,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,
ATLANTIC STORM 1,AL011851,1851-06-25 12:00:00,18510625,1200,,HU,28.0,-96.0,80,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,
ATLANTIC STORM 1,AL011851,1851-06-25 18:00:00,18510625,1800,,HU,28.1,-96.5,80,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,
ATLANTIC STORM 1,AL011851,1851-06-25 21:00:00,18510625,2100,L,HU,28.2,-96.8,80,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,
ATLANTIC STORM 1,AL011851,1851-06-26 00:00:00,18510626,0,,HU,28.2,-97.0,70,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,


[1] "Pacific Data"


Storm_Name,Unique_ID,Timestamp,Date,Time,Record_ID,Storm_Type,Lat,Lon,Speed,...,Wind_Radii_NW_34,Wind_Radii_NE_50,Wind_Radii_SE_50,Wind_Radii_SW_50,Wind_Radii_NW_50,Wind_Radii_NE_64,Wind_Radii_SE_64,Wind_Radii_SW_64,Wind_Radii_NW_64,Size
PACIFIC STORM 1,EP011949,1949-06-11 00:00:00,19490611,0,,TS,20.2,-106.3,45,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,
PACIFIC STORM 1,EP011949,1949-06-11 06:00:00,19490611,600,,TS,20.2,-106.4,45,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,
PACIFIC STORM 1,EP011949,1949-06-11 12:00:00,19490611,1200,,TS,20.2,-106.7,45,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,
PACIFIC STORM 1,EP011949,1949-06-11 18:00:00,19490611,1800,,TS,20.3,-107.7,45,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,
PACIFIC STORM 1,EP011949,1949-06-12 00:00:00,19490612,0,,TS,20.4,-108.6,45,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,
PACIFIC STORM 1,EP011949,1949-06-12 06:00:00,19490612,600,,TS,20.5,-109.4,45,...,-999,-999,-999,-999,-999,-999,-999,-999,-999,


In [32]:
# custom functions for retrieving and plotting storm data

# Draw one storm's track on a tiled leaflet map (very simple plot for now).
#
# storm_data: data frame for a single storm with Lat and Lon columns.
# color:      colour used for the marker or line.
# Returns the leaflet map: a circle marker when the storm has exactly one
# record, otherwise a polyline through all of its records.
plot_storm_path <- function(storm_data, color) {
    base_map <- addTiles(leaflet())
    if (nrow(storm_data) != 1) {
        # more than one coordinate: connect them
        return(addPolylines(base_map, data = storm_data, lat = ~Lat, lng = ~Lon, color = color))
    }
    # only 1 coordinate: a line cannot be drawn, so mark the point
    addCircleMarkers(base_map, data = storm_data, lat = ~Lat, lng = ~Lon, color = color)
}

# Plot multiple storm paths on one tiled leaflet map.
#
# storm_data_list: list of per-storm data frames with Lat and Lon columns.
# color_list:      vector of colours; storm i uses colour i. When there are
#                  more storms than colours the colours are reused from the
#                  start instead of running out (which produced NA colours).
# Returns the leaflet map. An empty storm_data_list returns the bare tiled map;
# seq_along() avoids the 1:0 iteration that used to fail on an empty list.
plot_multi_storm_path <- function(storm_data_list, color_list) {
    map_object <- leaflet() %>% addTiles()
    n_colors <- length(color_list)
    for (i in seq_along(storm_data_list)) {
        storm_data <- storm_data_list[[i]]
        # cycle through the palette; with no colours at all fall back to plain
        # indexing so the behaviour for that case is unchanged
        color <- if (n_colors > 0) color_list[(i - 1) %% n_colors + 1] else color_list[i]
        if (nrow(storm_data) == 1) { # only 1 coordinate
            map_object <- map_object %>%
                addCircleMarkers(data = storm_data, lat = ~Lat, lng = ~Lon, color = color)
        } else {
            map_object <- map_object %>%
                addPolylines(data = storm_data, lat = ~Lat, lng = ~Lon, color = color)
        }
    }
    map_object
}

# Select the storms that have at least one record in the given year. A storm
# may span the end of one year and the start of the next, so every timestamp
# of the storm is checked rather than only the first.
#
# storm_data_list: list of per-storm data frames with a Timestamp column.
# year:            the year to look for (single number).
# Returns an unnamed list of the matching storm data frames, in input order.
get_storms_by_year = function(storm_data_list, year) {
    in_year <- vapply(
        storm_data_list,
        function(storm_data) year %in% year(storm_data$Timestamp),
        logical(1)
    )
    unname(storm_data_list[in_year])
}

# Select the storms that have at least one record on the given day (a storm
# can span more than one day, so all of its timestamps are checked).
#
# storm_data_list: list of per-storm data frames with a Timestamp column.
# date_string:     a single string of the form mm/dd/YYYY (same as the UI
#                  display).
# Returns an unnamed list of the matching storm data frames, in input order;
# an empty list when date_string cannot be parsed.
#
# The calendar day of each record is taken with format(), which uses the
# timestamp's own time zone. as.Date() on a POSIXct can convert through UTC
# first, which moves late-evening records of a non-UTC timestamp to the next
# day. The requested date is also parsed once instead of once per storm.
get_storms_by_day <- function(storm_data_list, date_string) {
    target_day <- format(as.Date(date_string, "%m/%d/%Y"), "%Y-%m-%d")
    if (anyNA(target_day)) {
        # an unparseable date must not match storms that have NA timestamps
        return(list())
    }
    on_day <- vapply(
        storm_data_list,
        function(storm_data) target_day %in% format(storm_data$Timestamp, "%Y-%m-%d"),
        logical(1)
    )
    unname(storm_data_list[on_day])
}

# Look up a storm's data by its name.
#
# storm_data_list: list of per-storm data frames with a Storm_Name column.
# name:            the storm name to look for.
# Returns the first data frame whose Storm_Name column contains `name`, or
# NULL (invisibly) when no storm matches.
get_storm_by_name <- function(storm_data_list, name) {
    for (idx in seq_along(storm_data_list)) {
        candidate <- storm_data_list[[idx]]
        if (name %in% candidate$Storm_Name) {
            return(candidate)
        }
    }
    invisible(NULL)
}



In [33]:
###########################
### Example Plots Below ###
###########################

In [34]:
# Plot Atlantic storms 11 through 15 on a map, one colour per storm.
plot_multi_storm_path(
    storm_data_list = atlantic_data[11:15],
    color_list = c("red", "blue", "gray", "pink", "purple")
)

# !!! doesn't plot in notebook for some reason; needs to be run in R Studio !!!

In [35]:
# Plot Atlantic storms from 2018, one colour per storm.
# "fuchsia" replaces the misspelt "fuscia", which is not a recognised colour
# name.
colors <- c("red", "blue", "gray", "pink", "purple", "black", "aqua", "royalblue",
            "yellow", "brown", "green", "turquoise", "skyblue", "fuchsia", "white", "orange")
plot_multi_storm_path(get_storms_by_year(atlantic_data, 2018), colors)

In [36]:
# Plot the paths of the Pacific storms that have records on June 11, 1949.
plot_multi_storm_path(
    storm_data_list = get_storms_by_day(storm_data_list = pacific_data, date_string = "06/11/1949"),
    color_list = colors
)

In [37]:
# Look up the Pacific storm named HIKI and plot its path in blue.
plot_storm_path(
    storm_data = get_storm_by_name(storm_data_list = pacific_data, name = "HIKI"),
    color = "blue"
)

In [38]:
# Draw one storm's track with a circle at every record whose radius is the
# record's Speed divided by 20 (replace the divisor to rescale the circles).
#
# storm_data: data frame for a single storm with Lat, Lon, Speed and Size
#             columns.
# color:      colour used for the circles and the line.
# Returns the leaflet map: circles only when the storm has exactly one record,
# otherwise circles plus a polyline whose weight is taken from the Size column.
plot_storm_path_by_size <- function(storm_data, color) {
    base_map <- addTiles(leaflet())

    # both cases start with the same speed-scaled circle markers
    add_speed_circles <- function(map) {
        addCircleMarkers(map, data = storm_data, lat = ~Lat, lng = ~Lon,
                         color = color, radius = (storm_data$Speed / 20))
    }

    if (nrow(storm_data) == 1) { # only 1 coordinate
        add_speed_circles(base_map)
    } else {
        addPolylines(add_speed_circles(base_map), data = storm_data,
                     lat = ~Lat, lng = ~Lon, color = color, weight = storm_data$Size)
    }
}

# Plot multiple storm paths on one tiled leaflet map, with a circle at every
# record whose radius is the record's Speed divided by 20 (replace the divisor
# to rescale the circles).
#
# storm_data_list: list of per-storm data frames with Lat, Lon, Speed and Size
#                  columns.
# color_list:      vector of colours; storm i uses colour i. When there are
#                  more storms than colours the colours are reused from the
#                  start instead of running out (which produced NA colours).
# Returns the leaflet map. An empty storm_data_list returns the bare tiled map;
# seq_along() avoids the 1:0 iteration that used to fail on an empty list.
plot_multi_storm_path_by_size <- function(storm_data_list, color_list) {
    map_object <- leaflet() %>% addTiles()
    n_colors <- length(color_list)
    for (i in seq_along(storm_data_list)) {
        storm_data <- storm_data_list[[i]]
        # cycle through the palette; with no colours at all fall back to plain
        # indexing so the behaviour for that case is unchanged
        color <- if (n_colors > 0) color_list[(i - 1) %% n_colors + 1] else color_list[i]

        # every storm gets its speed-scaled circles
        map_object <- map_object %>%
            addCircleMarkers(data = storm_data, lat = ~Lat, lng = ~Lon, color = color,
                             radius = (storm_data$Speed / 20))
        if (nrow(storm_data) != 1) { # more than 1 coordinate: connect them
            map_object <- map_object %>%
                addPolylines(data = storm_data, lat = ~Lat, lng = ~Lon, color = color,
                             weight = storm_data$Size)
        }
    }
    map_object
}

# !!! all hurricanes have N/A in Size column, so using scaled down speed to plot size of hurricane at each point of its path

In [39]:
# Look up the Pacific storm named HIKI and plot its path with speed-scaled
# circles in blue.
plot_storm_path_by_size(
    storm_data = get_storm_by_name(storm_data_list = pacific_data, name = "HIKI"),
    color = "blue"
)

In [40]:
# Plot Atlantic storms from 2018 with speed-scaled circles, one colour per
# storm. "fuchsia" replaces the misspelt "fuscia", which is not a recognised
# colour name.
colors <- c("red", "blue", "gray", "pink", "purple", "black", "aqua", "royalblue",
            "yellow", "brown", "green", "turquoise", "skyblue", "fuchsia", "white", "orange")
plot_multi_storm_path_by_size(get_storms_by_year(atlantic_data, 2018), colors)

In [41]:
################################
### Ordering Functions Below ###
################################

In [42]:
# Get the storm names of a data set in list order.
#
# storm_data_list: list of per-storm data frames with a Storm_Name column.
# Returns a character vector holding the first Storm_Name of every storm, in
# the order of the list; character(0) for an empty list (the earlier
# matrix-based version raised an error there).
# The list is expected to be in chronological order already, so no sorting is
# done here.
get_storm_names_chronologically <- function(storm_data_list) {
    vapply(
        storm_data_list,
        function(storm_data) as.character(storm_data$Storm_Name[1]),
        character(1),
        USE.NAMES = FALSE
    )
}

# Get the storm names of a data set in alphabetical order.
#
# storm_data_list: list of per-storm data frames with a Storm_Name column.
# Returns a character vector holding the first Storm_Name of every storm,
# sorted with order() (ties keep their list order); character(0) for an empty
# list (the earlier matrix-based version raised an error there).
# Names are compared as text, so e.g. "STORM 10" sorts before "STORM 2".
get_storm_names_alphabetically <- function(storm_data_list) {
    storm_names <- vapply(
        storm_data_list,
        function(storm_data) as.character(storm_data$Storm_Name[1]),
        character(1),
        USE.NAMES = FALSE
    )
    storm_names[order(storm_names)]
}

# Get the storm names of a data set ordered by top speed, fastest first.
#
# storm_data_list: list of per-storm data frames with Storm_Name and Speed
#                  columns.
# Returns a character vector holding the first Storm_Name of every storm,
# sorted by the storm's maximum Speed in descending order (ties keep their
# list order); character(0) for an empty list (the earlier matrix-based version
# raised an error there).
# Speeds stay numeric throughout instead of being converted to text and back.
get_storm_names_max_speed <- function(storm_data_list) {
    storm_names <- vapply(
        storm_data_list,
        function(storm_data) as.character(storm_data$Storm_Name[1]),
        character(1),
        USE.NAMES = FALSE
    )
    top_speed <- vapply(
        storm_data_list,
        function(storm_data) as.numeric(max(storm_data$Speed)),
        numeric(1),
        USE.NAMES = FALSE
    )
    storm_names[order(-top_speed)]
}

# Get the storm names of a data set ordered by minimum pressure, lowest first.
#
# storm_data_list: list of per-storm data frames with Storm_Name and Pressure
#                  columns.
# Returns a character vector holding the first Storm_Name of every storm,
# sorted by the storm's minimum recorded Pressure in ascending order (ties keep
# their list order); character(0) for an empty list (the earlier matrix-based
# version raised an error there).
#
# HURDAT2 uses -999 as its missing-value code (it is visible in the wind-radii
# columns of this data). Non-positive and NA pressures are therefore ignored
# when taking the minimum; otherwise every storm with a single missing reading
# would rank as the lowest pressure. Storms with no usable pressure reading
# are placed at the end.
get_storm_names_min_pressure <- function(storm_data_list) {
    storm_names <- vapply(
        storm_data_list,
        function(storm_data) as.character(storm_data$Storm_Name[1]),
        character(1),
        USE.NAMES = FALSE
    )
    min_pressure <- vapply(
        storm_data_list,
        function(storm_data) {
            readings <- storm_data$Pressure
            recorded <- readings[!is.na(readings) & readings > 0]
            if (length(recorded) == 0) NA_real_ else as.numeric(min(recorded))
        },
        numeric(1),
        USE.NAMES = FALSE
    )
    # order() puts NA last by default, i.e. storms without pressure data
    storm_names[order(min_pressure)]
}

In [43]:
# Atlantic storm names in list (chronological) order: show how many there are,
# then the names themselves.
chrono_order <- get_storm_names_chronologically(storm_data_list = atlantic_data)
length(chrono_order)
chrono_order

In [44]:
# Atlantic storm names in alphabetical order: show how many there are, then
# the names themselves.
alpha_order <- get_storm_names_alphabetically(storm_data_list = atlantic_data)
length(alpha_order)
alpha_order

In [45]:
# Atlantic storm names ordered by top speed: show how many there are, then the
# names themselves.
speed_order <- get_storm_names_max_speed(storm_data_list = atlantic_data)
length(speed_order)
speed_order

In [46]:
# Atlantic storm names ordered by minimum pressure: show how many there are,
# then the names themselves.
pressure_order <- get_storm_names_min_pressure(storm_data_list = atlantic_data)
length(pressure_order)
pressure_order