In [84]:
import pandas as pd
import numpy as np

# Replacing -999 with NaN in HURDAT2

In [85]:
df_hurdat2 = pd.read_csv('datasets/hurdat2_formatted.csv')
df_hurdat2['datetime'] = pd.to_datetime(df_hurdat2['datetime'])
df_hurdat2.replace(-999, np.nan, inplace=True)
df_hurdat2.head(5)

Unnamed: 0,storm_id,storm_name,num_of_obs,datetime,category,record_identifier,status_of_system,latitude,longitude,maximum_sustained_wind_knots,...,34_kt_nw_nm,50_kt_ne_nm,50_kt_se_nm,50_kt_sw_nm,50_kt_nw_nm,64_kt_ne_nm,64_kt_se_nm,64_kt_sw_nm,64_kt_nw_nm,radius_of_max_wind_nm
0,AL011851,UNNAMED,14,1851-06-25 12:00:00,1,,HU,28.0N,96.0W,80,...,,,,,,,,,,
1,AL011851,UNNAMED,14,1851-06-25 18:00:00,1,,HU,28.1N,96.5W,80,...,,,,,,,,,,
2,AL011851,UNNAMED,14,1851-06-25 21:00:00,1,L,HU,28.2N,96.8W,80,...,,,,,,,,,,
3,AL011851,UNNAMED,14,1851-06-26 00:00:00,1,,HU,28.2N,97.0W,70,...,,,,,,,,,,
4,AL011851,UNNAMED,14,1851-06-26 06:00:00,Tropical Storm,,TS,28.3N,97.6W,60,...,,,,,,,,,,


# Grouping HURDAT2 by storm_id

In [86]:
grouped_df = df_hurdat2.groupby(['storm_id']).agg({
    'maximum_sustained_wind_knots': ['mean', 'max'],
    'central_pressure_mb': ['mean', 'min'],
    'radius_of_max_wind_nm': ['mean', 'max']
}).reset_index()
print("Number of unique storms:", len(grouped_df))

grouped_df_without_nan = grouped_df.dropna()

print("Number of unique storms without NaN:", len(grouped_df_without_nan))

Number of unique storms: 715
Number of unique storms without NaN: 206


In [87]:
grouped_df_without_nan.head(5)

Unnamed: 0_level_0,storm_id,maximum_sustained_wind_knots,maximum_sustained_wind_knots,central_pressure_mb,central_pressure_mb,radius_of_max_wind_nm,radius_of_max_wind_nm
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,max,mean,min,mean,max
1,AL011852,57.777778,100,961.0,961.0,10.0,10.0
3,AL011856,84.444444,130,934.0,934.0,10.0,10.0
24,AL011900,57.333333,120,936.0,936.0,15.0,15.0
32,AL011915,44.444444,65,996.5,990.0,15.0,15.0
34,AL011918,68.571429,105,955.0,955.0,10.0,10.0


In [88]:
grouped_df_without_nan.to_csv('datasets/hurdat2_grouped.csv', index=False)

# Classifying storms into categories

In [89]:
def  classify_category(wind_speed):
    if wind_speed <= 33:
        return "Tropical Depression"
    elif 34 <= wind_speed <= 63:
        return "Tropical Storm"
    elif 64 <= wind_speed <= 82:
        return 1
    elif 83 <= wind_speed <= 95:
        return 2
    elif 96 <= wind_speed <= 112:
        return 3
    elif 113 <= wind_speed <= 136:
        return 4
    elif wind_speed >= 137:
        return 5
    return "Unknown"

In [90]:
grouped_df_without_nan['category'] = grouped_df_without_nan['maximum_sustained_wind_knots']['max'].apply(classify_category)
grouped_df['category'] = grouped_df['maximum_sustained_wind_knots']['max'].apply(classify_category)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  grouped_df_without_nan['category'] = grouped_df_without_nan['maximum_sustained_wind_knots']['max'].apply(classify_category)


In [91]:
grouped_df_without_nan.head(5)

Unnamed: 0_level_0,storm_id,maximum_sustained_wind_knots,maximum_sustained_wind_knots,central_pressure_mb,central_pressure_mb,radius_of_max_wind_nm,radius_of_max_wind_nm,category
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,max,mean,min,mean,max,Unnamed: 8_level_1
1,AL011852,57.777778,100,961.0,961.0,10.0,10.0,3
3,AL011856,84.444444,130,934.0,934.0,10.0,10.0,4
24,AL011900,57.333333,120,936.0,936.0,15.0,15.0,4
32,AL011915,44.444444,65,996.5,990.0,15.0,15.0,1
34,AL011918,68.571429,105,955.0,955.0,10.0,10.0,3


# RESOURCE DATA: Aggregating  resources by category

In [98]:
df_resource = pd.read_csv('datasets/resource_dataset_with_category.csv')

In [99]:
# Category -1 refers to 'tropical storm'
df_resource.head(5)

Unnamed: 0,storm_name,category,year,shelters,meals_millions,water_million_gallons,fuel_million_gallons
0,CHARLEY,3,2004,250,2.0,0.3,0.1
1,FRANCES,3,2004,250,3.5,1.5,0.2
2,IVAN,3,2004,120,2.0,0.7,0.2
3,JEANNE,-1,2004,200,4.0,1.5,0.2
4,DENNIS,-1,2005,70,1.0,0.5,0.1


In [94]:
df_hurdat2['year'] = df_hurdat2['datetime'].dt.year.astype(str)
df_hurdat2

Unnamed: 0,storm_id,storm_name,num_of_obs,datetime,category,record_identifier,status_of_system,latitude,longitude,maximum_sustained_wind_knots,...,50_kt_ne_nm,50_kt_se_nm,50_kt_sw_nm,50_kt_nw_nm,64_kt_ne_nm,64_kt_se_nm,64_kt_sw_nm,64_kt_nw_nm,radius_of_max_wind_nm,year
0,AL011851,UNNAMED,14,1851-06-25 12:00:00,1,,HU,28.0N,96.0W,80,...,,,,,,,,,,1851
1,AL011851,UNNAMED,14,1851-06-25 18:00:00,1,,HU,28.1N,96.5W,80,...,,,,,,,,,,1851
2,AL011851,UNNAMED,14,1851-06-25 21:00:00,1,L,HU,28.2N,96.8W,80,...,,,,,,,,,,1851
3,AL011851,UNNAMED,14,1851-06-26 00:00:00,1,,HU,28.2N,97.0W,70,...,,,,,,,,,,1851
4,AL011851,UNNAMED,14,1851-06-26 06:00:00,Tropical Storm,,TS,28.3N,97.6W,60,...,,,,,,,,,,1851
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13302,AL212023,TWENTY-ONE,6,2023-10-23 18:00:00,Tropical Depression,,TD,11.5N,83.2W,25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,2023
13303,AL212023,TWENTY-ONE,6,2023-10-24 00:00:00,Tropical Depression,,TD,12.2N,83.4W,25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,2023
13304,AL212023,TWENTY-ONE,6,2023-10-24 01:30:00,Tropical Depression,L,TD,12.4N,83.5W,25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,2023
13305,AL212023,TWENTY-ONE,6,2023-10-24 06:00:00,Tropical Depression,,TD,13.0N,83.8W,25,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60.0,2023


In [103]:
df_resource['year'] = df_resource['year'].astype(str)
df_hurdat2_unique = df_hurdat2.drop_duplicates(subset=['storm_name', 'year', 'storm_id'])
df_resources_with_id = df_resource.merge(
    df_hurdat2_unique[['storm_id', 'year', 'storm_name']], on=['storm_name', 'year'], how='left')

In [106]:
df_resources_with_id.head(5)

Unnamed: 0,storm_name,category,year,shelters,meals_millions,water_million_gallons,fuel_million_gallons,storm_id
0,CHARLEY,3,2004,250,2.0,0.3,0.1,AL032004
1,FRANCES,3,2004,250,3.5,1.5,0.2,AL062004
2,IVAN,3,2004,120,2.0,0.7,0.2,AL092004
3,JEANNE,-1,2004,200,4.0,1.5,0.2,AL112004
4,DENNIS,-1,2005,70,1.0,0.5,0.1,AL042005


In [112]:
df_resources_with_id['exists_in_hurdat2'] = df_resources_with_id['storm_id'].isin(grouped_df_without_nan['storm_id'])
df_resources_with_id = df_resources_with_id[df_resources_with_id['exists_in_hurdat2']]
df_resources_with_id = df_resources_with_id.drop(columns=['exists_in_hurdat2'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_resources_with_id['exists_in_hurdat2'] = df_resources_with_id['storm_id'].isin(grouped_df_without_nan['storm_id'])


In [113]:
df_resources_with_id.head(5)

Unnamed: 0,storm_name,category,year,shelters,meals_millions,water_million_gallons,fuel_million_gallons,storm_id
0,CHARLEY,3,2004,250,2.0,0.3,0.1,AL032004
1,FRANCES,3,2004,250,3.5,1.5,0.2,AL062004
2,IVAN,3,2004,120,2.0,0.7,0.2,AL092004
3,JEANNE,-1,2004,200,4.0,1.5,0.2,AL112004
4,DENNIS,-1,2005,70,1.0,0.5,0.1,AL042005


In [114]:
df_resources_with_id.to_csv('datasets/resource_dataset_with_id_category.csv', index=False)