In [4]:
import pandas as pd
import numpy as np

Housing Data

In [None]:
def clean_housing(csv):
    # read in the csv file
    df = pd.read_csv(csv)
    
    return new_df

Storm Data

In [8]:
def clean_storm(csv):
    # NOTE: adjust for inflation
    df = pd.read_csv(csv)



    # keep columns: first 6 columns, state, state_fips, month_name, event_type,cz_fips cz_name, damage_property, begin_lat, begin_lon, end_lat, end_lon. All names are in uppper case
    new_df = df.iloc[:, :6]
    join_df = df[['STATE', 'STATE_FIPS', 'MONTH_NAME', 'EVENT_TYPE', 'CZ_FIPS', 'CZ_NAME', 'DAMAGE_PROPERTY', 'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON']]
    new_df = pd.concat([new_df, join_df], axis=1)



    # calculate the approximate area of the storm by using the haversine formula to calculate the distance between the begin and end coordinates. Add a new column called 'STORM_AREA' to the dataframe
    # drop na values in cordinates columns
    new_df = new_df.dropna(subset=['BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON'])
    def rectagular_area(begin_lat, begin_lon, end_lat, end_lon):
        # convert decimal degrees to radians
        lat1, lon1, lat2, lon2 = map(np.radians, [begin_lat, begin_lon, end_lat, end_lon])

        # use A = R² (sin lat1 − sin lat2) (lon1 − lon2).
        # from https://www.johndcook.com/blog/2023/02/21/sphere-grid-area/#:~:text=Area%20of%20latitude/longitude%20grid&text=A%20=%20π%20R²%20(sin%20φ,1%20−%20θ2)/180.
        r = 3956  # Radius of earth in miles
        area = r**2 * (np.sin(lat1) - np.sin(lat2)) * (lon1 - lon2)
        return abs(area)
    new_df['STORM_AREA_MILES'] = new_df.apply(lambda row: rectagular_area(row['BEGIN_LAT'], row['BEGIN_LON'], row['END_LAT'], row['END_LON']), axis=1)

    # drop the begin and end lat and lon columns
    new_df = new_df.drop(columns=['BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON'])



    # calculate the total damage by converting the damage property column to a numeric value. 
    # The damage property column is in the format of a string with a number followed by a letter (K, M, B) which represents the magnitude of the damage. 
    # keep missing values
    def convert_damage(damage):
        if pd.isna(damage):
            return np.nan
        elif damage.endswith('K'):
            return float(damage[:-1]) * 1e3
        elif damage.endswith('M'):
            return float(damage[:-1]) * 1e6
        elif damage.endswith('B'):
            return float(damage[:-1]) * 1e9
    new_df['DAMAGE_PROPERTY'] = new_df['DAMAGE_PROPERTY'].apply(convert_damage)


    
    # calculate the duration of the storm by using begin time and end time columns which are in military time (hhmm)
    def calculate_duration(row):
        begin_time = row['BEGIN_TIME']
        end_time = row['END_TIME']

        # pad the time strings with zeros if they are less than 4 characters long
        begin_time = str(begin_time).zfill(4)
        end_time = str(end_time).zfill(4)

        begin_hours = int(begin_time[:2]) 
        end_hours = int(end_time[:2]) 
        begin_minutes = int(begin_time[2:])
        end_minutes = int(end_time[2:])

        duration = (end_hours * 60 + end_minutes) - (begin_hours * 60 + begin_minutes)
        if duration < 0:
            duration += 24 * 60  # Adjust for storms that last past midnight
        return duration  # Return duration in minutes
    new_df['DURATION_MINUTES'] = new_df.apply(calculate_duration, axis=1)



    # keep event types: ones with flood in the name, Hail, heavy rain, high wind, lightning, strong wind, thunderstorm wind, and tornado
    new_df = new_df[new_df['EVENT_TYPE'].str.contains('FLOOD|HAIL|HEAVY RAIN|HIGH WIND|LIGHTNING|STRONG WIND|THUNDERSTORM WIND|TORNADO', case=False, na=False)]
    # drop marine event types
    new_df = new_df[~new_df['EVENT_TYPE'].str.contains('Marine', case=False, na=False)]



    # get month from yearmonth column and add it as a new column called 'MONTH'
    new_df['MONTH'] = new_df['BEGIN_YEARMONTH'].astype(str).str[4:6].astype(int)
    # drop first 6 columns
    new_df = new_df.drop(columns=new_df.columns[:6])




    # combine state cz fips and pad with zeros on left to ensure they are 5 digits long. Add a new column called 'FIPS' to the dataframe
    new_df['FIPS'] = new_df['STATE_FIPS'].astype(str) + new_df['CZ_FIPS'].astype(str).str.zfill(3)
    # drop STATE_FIPS and CZ_FIPS columns
    new_df = new_df.drop(columns=['STATE_FIPS', 'CZ_FIPS'])




    # keep only states in the continental US
    non_continental_states = ['ALASKA', 'HAWAII', 'PUERTO RICO', 'GUAM', 'VIRGIN ISLANDS', 'AMERICAN SAMOA', 'NORTHERN MARIANA ISLANDS'] # keeping the district of columbia
    new_df = new_df[~new_df['STATE'].isin(non_continental_states)]

    return new_df

In [9]:
test = clean_storm(r"rawData\2023\StormData_2023.csv")
test.head()

Unnamed: 0,STATE,MONTH_NAME,EVENT_TYPE,CZ_NAME,DAMAGE_PROPERTY,STORM_AREA_MILES,DURATION_MINUTES,MONTH,FIPS
9,ILLINOIS,August,Thunderstorm Wind,EFFINGHAM,0.0,0.0,5,8,17049
10,ILLINOIS,August,Thunderstorm Wind,CLAY,0.0,0.0,5,8,17025
11,VIRGINIA,September,Thunderstorm Wind,SOUTHAMPTON,1000.0,0.0,0,9,51175
12,VIRGINIA,September,Thunderstorm Wind,SUFFOLK (C),2000.0,0.0,0,9,51800
13,ILLINOIS,August,Thunderstorm Wind,EFFINGHAM,0.0,0.0,5,8,17049


Population Data

In [None]:
def clean_population(csv):
    df = pd.read_csv(csv)
    
    return new_df

In [None]:
test_population = clean_population(r"rawData\2023\county_population_2023.csv")
test_population.head()

Median Income

In [None]:
def clean_income(csv):
    df = pd.read_csv(csv)

    return new_df

In [None]:
test_income = clean_income(r"rawData\2023\county_median_household_income_2023.csv")
test_income.head()

Unnamed: 0,FIPS,MEDIAN_INCOME,CZ_NAME
1,1003,72915,"Baldwin County, Alabama"
2,1015,50780,"Calhoun County, Alabama"
3,1043,58923,"Cullman County, Alabama"
4,1049,43509,"DeKalb County, Alabama"
5,1051,72478,"Elmore County, Alabama"


Relative Oceanic Nino Index

In [1]:
def clean_roni(csv):
    # read in the csv file
    df = pd.read_csv(csv)

    # set the first column to Month
    df = df.rename(columns={df.columns[0]: 'Month'})

    return df

In [5]:
test_roni = clean_roni(r"rawData\2023\RONI_2023.csv")
test_roni

Unnamed: 0,Month,2023
0,DECEMBER,1.2
1,JANUARY,0.433333
2,FEBRUARY,-0.266667
3,MARCH,0.0
4,APRIL,0.3
5,MAY,0.566667
6,JUNE,0.833333
7,JULY,1.1
8,AUGUST,1.366667
9,SEPTEMBER,1.6


Temperature Anomoly