In [6]:
# Dependencies and Setup
import pandas as pd
from pathlib import Path
import warnings

# Hiding warnings
# NOTE(review): with no category/module argument this silences every warning
# raised for the rest of the session, including pandas' own diagnostics.
warnings.filterwarnings('ignore')

# File to Load (path is relative to the notebook's working directory)
data_to_load = Path('Resources/raw_data/storm_raw_data.csv')

# Read Data File and store into Pandas DataFrames
raw_data = pd.read_csv(data_to_load)

# Preview the first five rows of the loaded data
raw_data.head()

Unnamed: 0,BEGIN_YEARMONTH,BEGIN_DAY,BEGIN_TIME,END_YEARMONTH,END_DAY,END_TIME,EPISODE_ID,EVENT_ID,STATE,STATE_FIPS,...,END_RANGE,END_AZIMUTH,END_LOCATION,BEGIN_LAT,BEGIN_LON,END_LAT,END_LON,EPISODE_NARRATIVE,EVENT_NARRATIVE,DATA_SOURCE
0,202401,29,1000,202401,30,900,188640,1160769,ALASKA,2,...,10.0,NW,DOUGLAS,58.362,-134.5824,58.3625,-134.5798,A strong storm force low pressure system over ...,Rain and rising freezing levels moved over the...,CSV
1,202401,13,330,202401,13,540,188650,1159956,LAKE SUPERIOR,92,...,2.0,E,COPPER HARBOR,47.4703,-87.8403,47.4703,-87.8403,A powerful storm system moving through the Upp...,The Copper Harbor ASOS measured storm force no...,CSV
2,202401,13,330,202401,13,540,188650,1159913,LAKE SUPERIOR,92,...,2.0,E,COPPER HARBOR,47.4688,-87.8665,47.4688,-87.8665,A powerful storm system moving through the Upp...,The Copper Harbor ASOS measured storm force no...,CSV
3,202401,13,330,202401,13,540,188650,1159953,LAKE SUPERIOR,92,...,7.0,W,MANITOU ISLAND LIGHT,47.4298,-87.7144,47.4298,-87.7144,A powerful storm system moving through the Upp...,The Copper Harbor ASOS measured storm force no...,CSV
4,202402,28,700,202402,28,845,189314,1165191,LAKE SUPERIOR,92,...,0.0,N,STANNARD ROCK LIGHT,47.18,-87.23,47.18,-87.23,North to northwest storm force winds between 5...,The Stannard Rock C-MAN station measured storm...,CSV


In [7]:
# Summarise the raw frame: column names, non-null counts and dtypes
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11945 entries, 0 to 11944
Data columns (total 51 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   BEGIN_YEARMONTH     11945 non-null  int64  
 1   BEGIN_DAY           11945 non-null  int64  
 2   BEGIN_TIME          11945 non-null  int64  
 3   END_YEARMONTH       11945 non-null  int64  
 4   END_DAY             11945 non-null  int64  
 5   END_TIME            11945 non-null  int64  
 6   EPISODE_ID          11945 non-null  int64  
 7   EVENT_ID            11945 non-null  int64  
 8   STATE               11945 non-null  object 
 9   STATE_FIPS          11945 non-null  int64  
 10  YEAR                11945 non-null  int64  
 11  MONTH_NAME          11945 non-null  object 
 12  EVENT_TYPE          11945 non-null  object 
 13  CZ_TYPE             11945 non-null  object 
 14  CZ_FIPS             11945 non-null  int64  
 15  CZ_NAME             11945 non-null  object 
 16  WFO 

In [8]:
# Removing unnecessary columns
columns_to_keep = ["YEAR", "MONTH_NAME","BEGIN_DAY", "BEGIN_DATE_TIME", "DAMAGE_PROPERTY", "EVENT_TYPE", "STATE", "TOR_F_SCALE", "BEGIN_LOCATION", "BEGIN_LAT", "BEGIN_LON", "EVENT_NARRATIVE"]

# Removing rows with missing coordinates (latitude OR longitude) or missing
# property damage, then normalising "DAMAGE_PROPERTY" to stripped strings.
# .assign() returns a new frame, so nothing is written into a slice of raw_data.
filtered_data = (
    raw_data[columns_to_keep]
    .dropna(subset=["BEGIN_LAT", "BEGIN_LON", "DAMAGE_PROPERTY"])
    .assign(DAMAGE_PROPERTY=lambda frame: frame["DAMAGE_PROPERTY"].astype(str).str.strip())
)

# Removing rows whose property damage is $0.
# The numeric part is compared instead of matching literal spellings, so
# "0", "0.0", "0.00K", "0K", "0.00M", ... are all treated as zero.
# Values that cannot be parsed become NaN and are kept (NaN != 0).
damage_amount = pd.to_numeric(
    filtered_data["DAMAGE_PROPERTY"].str.rstrip("KkMmBb"),
    errors="coerce",
)
filtered_data = filtered_data[damage_amount.ne(0)]

filtered_data.head()

Unnamed: 0,YEAR,MONTH_NAME,BEGIN_DAY,BEGIN_DATE_TIME,DAMAGE_PROPERTY,EVENT_TYPE,STATE,TOR_F_SCALE,BEGIN_LOCATION,BEGIN_LAT,BEGIN_LON,EVENT_NARRATIVE
16,2024,January,16,16-JAN-24 09:42:00,100.00K,Flood,MONTANA,,(DLN)DILLON ARPT,45.25,-112.6,Reported ice jam along the Beaverhead River no...
17,2024,February,26,26-FEB-24 15:30:00,7.50K,Thunderstorm Wind,OREGON,,LACOMB,44.6201,-122.7533,"Property owner reported damage to 6 trees, alo..."
23,2024,January,10,10-JAN-24 09:00:00,200.00K,Flash Flood,MAINE,,PORTLAND,43.6712,-70.2589,Heavy rain and snowmelt impacted Cumberland Co...
24,2024,January,13,13-JAN-24 11:00:00,200.00K,Flash Flood,MAINE,,PORTLAND,43.671,-70.2586,A coastal front was established along coastal ...
25,2024,January,10,10-JAN-24 09:00:00,200.00K,Flash Flood,MAINE,,OLD ORCHARD BEACH,43.5576,-70.3884,Heavy rain and snowmelt impacted Coastal York ...


In [9]:
# Converting Property Damage to USD, removing decimal, converting to integer

def convert_damage(value):
    """Convert a damage amount such as ``"7.50K"`` to whole US dollars.

    Parameters
    ----------
    value : str or number
        Either a string holding a number with an optional trailing magnitude
        suffix (``K`` = thousand, ``M`` = million, ``B`` = billion; case and
        surrounding whitespace are ignored), or an already-numeric value.

    Returns
    -------
    int
        The dollar amount. Fractional dollars are truncated.

    Raises
    ------
    ValueError
        If a string value is empty or its numeric part cannot be parsed.
    """
    # Local import keeps the function self-contained within the notebook.
    from decimal import Decimal, InvalidOperation

    if not isinstance(value, str):
        return int(value)

    multipliers = {'K': 1_000, 'M': 1_000_000, 'B': 1_000_000_000}

    text = value.strip()
    # Only the LAST character is a suffix; a letter elsewhere is a parse error.
    suffix = text[-1:].upper()
    if suffix in multipliers:
        number, factor = text[:-1].strip(), multipliers[suffix]
    else:
        number, factor = text, 1

    try:
        # Decimal arithmetic avoids binary-float artefacts such as
        # int(1.001 * 1000) == 1000.
        return int(Decimal(number) * factor)
    except InvalidOperation:
        raise ValueError(f"could not convert damage value: {value!r}") from None

# Build the updated frame in one step:
#   - DAMAGE_PROPERTY: every entry run through convert_damage
#   - EVENT_TYPE: "Flash Flood" relabelled as "Flood"
filtered_data = filtered_data.assign(
    DAMAGE_PROPERTY=filtered_data['DAMAGE_PROPERTY'].map(convert_damage),
    EVENT_TYPE=filtered_data['EVENT_TYPE'].replace({'Flash Flood': 'Flood'}),
)

filtered_data.head()

Unnamed: 0,YEAR,MONTH_NAME,BEGIN_DAY,BEGIN_DATE_TIME,DAMAGE_PROPERTY,EVENT_TYPE,STATE,TOR_F_SCALE,BEGIN_LOCATION,BEGIN_LAT,BEGIN_LON,EVENT_NARRATIVE
16,2024,January,16,16-JAN-24 09:42:00,100000,Flood,MONTANA,,(DLN)DILLON ARPT,45.25,-112.6,Reported ice jam along the Beaverhead River no...
17,2024,February,26,26-FEB-24 15:30:00,7500,Thunderstorm Wind,OREGON,,LACOMB,44.6201,-122.7533,"Property owner reported damage to 6 trees, alo..."
23,2024,January,10,10-JAN-24 09:00:00,200000,Flood,MAINE,,PORTLAND,43.6712,-70.2589,Heavy rain and snowmelt impacted Cumberland Co...
24,2024,January,13,13-JAN-24 11:00:00,200000,Flood,MAINE,,PORTLAND,43.671,-70.2586,A coastal front was established along coastal ...
25,2024,January,10,10-JAN-24 09:00:00,200000,Flood,MAINE,,OLD ORCHARD BEACH,43.5576,-70.3884,Heavy rain and snowmelt impacted Coastal York ...


In [10]:
# Checking info: non-null counts and dtypes after the damage conversion
filtered_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 724 entries, 16 to 2672
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YEAR             724 non-null    int64  
 1   MONTH_NAME       724 non-null    object 
 2   BEGIN_DAY        724 non-null    int64  
 3   BEGIN_DATE_TIME  724 non-null    object 
 4   DAMAGE_PROPERTY  724 non-null    int64  
 5   EVENT_TYPE       724 non-null    object 
 6   STATE            724 non-null    object 
 7   TOR_F_SCALE      45 non-null     object 
 8   BEGIN_LOCATION   724 non-null    object 
 9   BEGIN_LAT        724 non-null    float64
 10  BEGIN_LON        724 non-null    float64
 11  EVENT_NARRATIVE  724 non-null    object 
dtypes: float64(2), int64(3), object(7)
memory usage: 73.5+ KB


In [11]:
# Final column layout, with TIME_OF_DAY placed right after BEGIN_DATE_TIME
columns_order = ["YEAR", "MONTH_NAME","BEGIN_DAY","BEGIN_DATE_TIME","TIME_OF_DAY", "DAMAGE_PROPERTY", "EVENT_TYPE", "STATE", "TOR_F_SCALE", "BEGIN_LOCATION", "BEGIN_LAT", "BEGIN_LON", "EVENT_NARRATIVE"]

# Parse the begin timestamp once and reuse it for both columns
begin_timestamps = pd.to_datetime(
    filtered_data["BEGIN_DATE_TIME"],
    format="%d-%b-%y %H:%M:%S",
)

# Swap in the parsed datetimes, add the clock-time column, then reorder
filtered_data = filtered_data.assign(
    BEGIN_DATE_TIME=begin_timestamps,
    TIME_OF_DAY=begin_timestamps.dt.time,
)[columns_order]

filtered_data.head()

Unnamed: 0,YEAR,MONTH_NAME,BEGIN_DAY,BEGIN_DATE_TIME,TIME_OF_DAY,DAMAGE_PROPERTY,EVENT_TYPE,STATE,TOR_F_SCALE,BEGIN_LOCATION,BEGIN_LAT,BEGIN_LON,EVENT_NARRATIVE
16,2024,January,16,2024-01-16 09:42:00,09:42:00,100000,Flood,MONTANA,,(DLN)DILLON ARPT,45.25,-112.6,Reported ice jam along the Beaverhead River no...
17,2024,February,26,2024-02-26 15:30:00,15:30:00,7500,Thunderstorm Wind,OREGON,,LACOMB,44.6201,-122.7533,"Property owner reported damage to 6 trees, alo..."
23,2024,January,10,2024-01-10 09:00:00,09:00:00,200000,Flood,MAINE,,PORTLAND,43.6712,-70.2589,Heavy rain and snowmelt impacted Cumberland Co...
24,2024,January,13,2024-01-13 11:00:00,11:00:00,200000,Flood,MAINE,,PORTLAND,43.671,-70.2586,A coastal front was established along coastal ...
25,2024,January,10,2024-01-10 09:00:00,09:00:00,200000,Flood,MAINE,,OLD ORCHARD BEACH,43.5576,-70.3884,Heavy rain and snowmelt impacted Coastal York ...


In [12]:
# Confirm BEGIN_DATE_TIME is now a datetime column and TIME_OF_DAY was added
filtered_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 724 entries, 16 to 2672
Data columns (total 13 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   YEAR             724 non-null    int64         
 1   MONTH_NAME       724 non-null    object        
 2   BEGIN_DAY        724 non-null    int64         
 3   BEGIN_DATE_TIME  724 non-null    datetime64[ns]
 4   TIME_OF_DAY      724 non-null    object        
 5   DAMAGE_PROPERTY  724 non-null    int64         
 6   EVENT_TYPE       724 non-null    object        
 7   STATE            724 non-null    object        
 8   TOR_F_SCALE      45 non-null     object        
 9   BEGIN_LOCATION   724 non-null    object        
 10  BEGIN_LAT        724 non-null    float64       
 11  BEGIN_LON        724 non-null    float64       
 12  EVENT_NARRATIVE  724 non-null    object        
dtypes: datetime64[ns](1), float64(2), int64(3), object(7)
memory usage: 79.2+ KB


In [13]:
# Export out clean_data.csv

# Destination is relative to the notebook's working directory
output_path = Path('Resources/clean_data.csv')
# index=False leaves the DataFrame index out of the written file
filtered_data.to_csv(output_path, index=False)