In [None]:
import pandas as pd
import numpy as np
from sklearn import haversine_distances

Housing Data

Storm Data

In [None]:
# Radius of the earth in miles; areas derived from it are in square miles.
_EARTH_RADIUS_MILES = 3956

# Multipliers for the magnitude suffix on damage strings such as "10.00K".
_DAMAGE_MULTIPLIERS = {'K': 1e3, 'M': 1e6, 'B': 1e9}


def _bounding_box_area(begin_lat, begin_lon, end_lat, end_lon):
    """Area of the lat/lon rectangle spanned by the begin and end points.

    Uses A = R^2 * |sin(lat1) - sin(lat2)| * |lon1 - lon2| with angles in
    radians (https://www.johndcook.com/blog/2023/02/21/sphere-grid-area/).
    Inputs are Series of decimal degrees; the result is a Series in square
    miles. A storm whose begin and end points share a latitude or a
    longitude gets an area of 0.
    """
    lat1, lon1, lat2, lon2 = (
        np.radians(values) for values in (begin_lat, begin_lon, end_lat, end_lon)
    )
    area = _EARTH_RADIUS_MILES ** 2 * (np.sin(lat1) - np.sin(lat2)) * (lon1 - lon2)
    return area.abs()


def _parse_damage(damage):
    """Convert one damage string ("10.00K", "1.5M", "2B", "500") to dollars.

    Missing values stay NaN. A value with no suffix is read as a plain
    number, and anything that cannot be parsed (for example a bare "K")
    becomes NaN instead of raising or silently turning into None.
    """
    if pd.isna(damage):
        return np.nan
    text = str(damage).strip().upper()
    if not text:
        return np.nan
    multiplier = _DAMAGE_MULTIPLIERS.get(text[-1])
    if multiplier is None:
        mantissa, multiplier = text, 1.0
    else:
        mantissa = text[:-1].strip()
    try:
        return float(mantissa) * multiplier
    except ValueError:
        return np.nan


def _zero_padded_code(codes, width):
    """Render numeric codes as zero-padded text; unparseable codes become NaN."""
    numeric = pd.to_numeric(codes, errors='coerce')
    # Going through int() avoids a trailing ".0" when the column was read as float.
    return numeric.map(lambda v: np.nan if pd.isna(v) else str(int(v)).zfill(width))


def _event_timestamps(frame, prefix):
    """Build timestamps from <prefix>_YEARMONTH, <prefix>_DAY and <prefix>_TIME (hhmm)."""
    yearmonth = pd.to_numeric(frame[f'{prefix}_YEARMONTH'], errors='coerce')
    day = pd.to_numeric(frame[f'{prefix}_DAY'], errors='coerce')
    hhmm = pd.to_numeric(frame[f'{prefix}_TIME'], errors='coerce')

    date_text = (yearmonth * 100 + day).map(
        lambda v: None if pd.isna(v) else f'{int(v):08d}'
    ).astype(object)
    dates = pd.to_datetime(date_text, format='%Y%m%d', errors='coerce')

    # hhmm is military time: the hundreds are hours, the remainder minutes.
    minutes_into_day = (hhmm // 100) * 60 + (hhmm % 100)
    return dates + pd.to_timedelta(minutes_into_day, unit='m')


def _duration_minutes(frame):
    """Minutes between each event's begin and end timestamps.

    The full calendar date is used, so events spanning several days are not
    truncated to less than 24 hours. If a record's end still precedes its
    begin, the clock-time difference is wrapped past midnight instead.
    """
    elapsed = _event_timestamps(frame, 'END') - _event_timestamps(frame, 'BEGIN')
    minutes = elapsed.dt.total_seconds() / 60
    minutes = minutes.where(minutes >= 0, minutes % (24 * 60)).round()
    # Keep an integer column whenever every duration could be computed.
    if minutes.notna().all():
        minutes = minutes.astype('int64')
    return minutes


def clean_storm(csv):
    """Load a storm-events details CSV and reduce it to modelling columns.

    Parameters
    ----------
    csv : str, path object or file-like
        Anything accepted by ``pandas.read_csv``. The file must contain the
        columns BEGIN_YEARMONTH, BEGIN_DAY, BEGIN_TIME, END_YEARMONTH,
        END_DAY, END_TIME, STATE, STATE_FIPS, EVENT_TYPE, CZ_FIPS, CZ_NAME,
        DAMAGE_PROPERTY, BEGIN_LAT, BEGIN_LON, END_LAT and END_LON.

    Returns
    -------
    pandas.DataFrame
        Columns STATE, EVENT_TYPE, CZ_NAME, DAMAGE_PROPERTY (dollars, NaN
        when missing), STORM_AREA_MILES (square miles), DURATION_MINUTES,
        MONTH (1-12) and FIPS (2-digit state + 3-digit county/zone code as
        text). Rows without coordinates, with an event type outside the
        kept list, with a marine event type, or from a non-continental
        state or territory are removed.

    Raises
    ------
    KeyError
        If a required column is missing from the file.
    """
    df = pd.read_csv(csv)

    # The first six columns are carried along only until the derived columns
    # are built. Assumed to be the begin/end year-month, day and time
    # fields - TODO confirm against the file layout.
    leading_cols = list(df.columns[:6])
    kept_cols = ['STATE', 'STATE_FIPS', 'EVENT_TYPE', 'CZ_FIPS', 'CZ_NAME', 'DAMAGE_PROPERTY',
                 'BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON']
    new_df = pd.concat([df.iloc[:, :6], df[kept_cols]], axis=1)

    # Approximate storm footprint; rows without a full set of coordinates are dropped.
    coord_cols = ['BEGIN_LAT', 'BEGIN_LON', 'END_LAT', 'END_LON']
    new_df = new_df.dropna(subset=coord_cols).copy()
    new_df['STORM_AREA_MILES'] = _bounding_box_area(
        new_df['BEGIN_LAT'], new_df['BEGIN_LON'], new_df['END_LAT'], new_df['END_LON']
    )
    new_df = new_df.drop(columns=coord_cols)

    # Property damage as a number of dollars; missing values are kept as NaN.
    new_df['DAMAGE_PROPERTY'] = new_df['DAMAGE_PROPERTY'].apply(_parse_damage).astype(float)

    # Storm duration in minutes, read from the raw frame so the date columns
    # are found by name regardless of their position in the file.
    new_df['DURATION_MINUTES'] = _duration_minutes(df.loc[new_df.index])

    # Keep flood-like events, hail, heavy rain, high/strong/thunderstorm wind,
    # lightning and tornadoes, but not their marine variants.
    event_type = new_df['EVENT_TYPE']
    wanted = event_type.str.contains(
        'FLOOD|HAIL|HEAVY RAIN|HIGH WIND|LIGHTNING|STRONG WIND|THUNDERSTORM WIND|TORNADO',
        case=False, na=False,
    )
    marine = event_type.str.contains('Marine', case=False, na=False)
    new_df = new_df[wanted & ~marine].copy()

    # Month number comes from the last two digits of YYYYMM.
    new_df['MONTH'] = new_df['BEGIN_YEARMONTH'].astype(str).str[4:6].astype(int)
    new_df = new_df.drop(columns=leading_cols)

    # Five-character FIPS: state padded to two digits, county/zone to three.
    new_df['FIPS'] = _zero_padded_code(new_df['STATE_FIPS'], 2) + _zero_padded_code(new_df['CZ_FIPS'], 3)
    new_df = new_df.drop(columns=['STATE_FIPS', 'CZ_FIPS'])

    # Keep only the continental US (the District of Columbia is retained).
    non_continental_states = ['ALASKA', 'HAWAII', 'PUERTO RICO', 'GUAM', 'VIRGIN ISLANDS',
                              'AMERICAN SAMOA', 'NORTHERN MARIANA ISLANDS']
    new_df = new_df[~new_df['STATE'].isin(non_continental_states)]

    return new_df

In [None]:
# Run the storm cleaner on the 2023 storm-events details file and preview the result.
# Forward slashes keep this relative path valid on Windows, macOS and Linux alike
# (the previous backslash-separated path only resolved on Windows).
test = clean_storm("rawData/2023/StormEvents_details-ftp_v1.0_d2023_c20260116.csv")
test.head()

<StringArray>
[            'ILLINOIS',             'VIRGINIA',            'WISCONSIN',
             'NEBRASKA',             'NEW YORK',             'MISSOURI',
               'OREGON',           'WASHINGTON',            'LOUISIANA',
                 'IOWA',           'CALIFORNIA',                'MAINE',
             'ARKANSAS',          'MISSISSIPPI',             'OKLAHOMA',
                 'UTAH',        'NEW HAMPSHIRE',         'PENNSYLVANIA',
         'RHODE ISLAND',              'ALABAMA',        'MASSACHUSETTS',
             'COLORADO',               'KANSAS',         'SOUTH DAKOTA',
              'WYOMING',       'NORTH CAROLINA',              'FLORIDA',
              'ARIZONA',            'MINNESOTA',                'TEXAS',
           'NEW MEXICO',              'GEORGIA',             'KENTUCKY',
                 'OHIO',             'MICHIGAN',               'NEVADA',
         'NORTH DAKOTA',              'MONTANA',            'TENNESSEE',
             'MARYLAND',        'WEST

Population Data

Median Income

Relative Oceanic Niño Index (RONI)

In [57]:
def clean_roni(csv):
    """Turn a table of overlapping three-month values into per-month averages.

    The file must have a ``Year`` column and the twelve season columns
    ``DJF``, ``JFM``, ..., ``NDJ``. Each season's value is credited to all
    three of its months, so every month accumulates three seasons; the
    totals are then divided by three.

    Parameters
    ----------
    csv : str, path object or file-like
        Anything accepted by ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``Year``. The month columns appear in order of first use
        (``DECEMBER``, ``JANUARY``, ..., ``NOVEMBER``). Any other column in
        the file is kept and is also divided by three.
    """
    # Season code -> the three months it covers, in the order they are processed.
    seasons = {
        'DJF': ('DECEMBER', 'JANUARY', 'FEBRUARY'),
        'JFM': ('JANUARY', 'FEBRUARY', 'MARCH'),
        'FMA': ('FEBRUARY', 'MARCH', 'APRIL'),
        'MAM': ('MARCH', 'APRIL', 'MAY'),
        'AMJ': ('APRIL', 'MAY', 'JUNE'),
        'MJJ': ('MAY', 'JUNE', 'JULY'),
        'JJA': ('JUNE', 'JULY', 'AUGUST'),
        'JAS': ('JULY', 'AUGUST', 'SEPTEMBER'),
        'ASO': ('AUGUST', 'SEPTEMBER', 'OCTOBER'),
        'SON': ('SEPTEMBER', 'OCTOBER', 'NOVEMBER'),
        'OND': ('OCTOBER', 'NOVEMBER', 'DECEMBER'),
        'NDJ': ('NOVEMBER', 'DECEMBER', 'JANUARY'),
    }
    spelled_out = {code: '_'.join(months) for code, months in seasons.items()}

    # Load, index by year, and expand the season codes to their long names in one pass.
    table = pd.read_csv(csv).set_index('Year').rename(columns=spelled_out)

    # NOTE(review): DJF and NDJ straddle a year boundary, yet their values are
    # credited to the same row's year for every month - confirm this is intended.
    for months in seasons.values():
        season_col = '_'.join(months)
        season_values = table[season_col]
        for month in months:
            if month not in table.columns:
                table[month] = season_values
            else:
                table[month] = table[month] + season_values
        # The season column has been folded into its months; remove it.
        table = table.drop(columns=[season_col])

    # Each month received three season values, so a third of the sum is their mean.
    return table / 3

In [58]:
# Run the RONI cleaner on the raw index file and preview the per-month averages.
# Forward slashes keep this relative path valid on Windows, macOS and Linux alike
# (the previous backslash-separated path only resolved on Windows).
test_roni = clean_roni("rawData/2023/RONI.csv")
test_roni.head()

Unnamed: 0_level_0,DECEMBER,JANUARY,FEBRUARY,MARCH,APRIL,MAY,JUNE,JULY,AUGUST,SEPTEMBER,OCTOBER,NOVEMBER
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1950,-0.966667,-1.2,-1.333333,-1.233333,-1.166667,-1.066667,-0.833333,-0.6,-0.433333,-0.4,-0.466667,-0.6
1951,0.333333,-0.166667,-0.5,-0.166667,0.133333,0.4,0.566667,0.733333,0.866667,1.033333,1.066667,1.0
1952,0.2,0.333333,0.4,0.333333,0.266667,0.166667,0.033333,-0.033333,0.033333,0.1,0.1,0.066667
1953,0.666667,0.6,0.533333,0.633333,0.7,0.766667,0.766667,0.733333,0.733333,0.766667,0.8,0.8
1954,-0.2,0.2,0.433333,0.033333,-0.3,-0.466667,-0.533333,-0.633333,-0.766667,-0.833333,-0.8,-0.733333


Temperature Anomaly