In [8]:
import pandas as pd
import numpy as np
import os

In [14]:
def merge_data(year, out_dir):
    """Merge the cleaned CSV files for one year into a single CSV.

    ``cleanedData/{year}/StormData.csv`` is the base table. Every other CSV in
    that folder is LEFT JOINed onto it on ``FIPS``, except the files whose
    names contain ``RONI`` or ``temperature_anomalies``: those are joined
    afterwards on ``['FIPS', 'MONTH']`` and ``'MONTH_NAME'`` respectively.
    The result is written to ``{out_dir}/mergedData_{year}.csv``.

    Parameters
    ----------
    year : int or str
        Name of the sub-folder of ``cleanedData`` to read; also used in the
        output file name.
    out_dir : str
        Directory that receives the merged CSV. Created if it does not exist.

    Returns
    -------
    None
        The function only writes a file. If ``cleanedData/{year}`` does not
        exist it returns immediately without writing anything.

    Raises
    ------
    FileNotFoundError
        If ``StormData.csv``, ``temperature_anomalies.csv`` or ``RONI.csv`` is
        missing from the year folder.
    KeyError
        If an expected column (``FIPS``, ``MONTH``, ``MONTH_NAME``,
        ``ANOMALY`` or the ``COASTAL_TYPE_*`` columns) is absent.
    """
    # In cleanedData directory, find the year folder
    cleaned_dir = f"cleanedData/{year}"
    if not os.path.exists(cleaned_dir):
        return

    # Files that are not joined on FIPS in the generic loop: the base table
    # itself and the two tables that are joined on other keys further down.
    excluded_tags = ('RONI', 'temperature_anomalies', 'StormData')

    # os.listdir returns entries in arbitrary order; sorting fixes the join
    # order so the output column order (and any _x/_y suffixes produced by
    # overlapping column names) is the same on every run and machine.
    files = sorted(
        f for f in os.listdir(cleaned_dir)
        if f.endswith('.csv') and not any(tag in f for tag in excluded_tags)
    )

    # Start from the StormData table and LEFT JOIN every remaining file on FIPS
    final_df = pd.read_csv(os.path.join(cleaned_dir, "StormData.csv"))
    for file in files:
        df = pd.read_csv(os.path.join(cleaned_dir, file))
        final_df = pd.merge(final_df, df, on='FIPS', how='left')

    # fill missing values in coastal type columns with 'inland'
    # (a missing value here means the FIPS code had no match in the coastal table)
    for column in ('COASTAL_TYPE_SHORELINE', 'COASTAL_TYPE_WATERSHED'):
        final_df[column] = final_df[column].fillna('inland')

    # drop rows without a month name
    # (original author's note: these probably come from non-mainland FIPS codes)
    final_df = final_df.dropna(subset=['MONTH_NAME'])

    # merge temperature anomalies by matching the month number and FIPS code
    # NOTE(review): assumes MONTH has a compatible dtype in both tables - confirm
    anomalies_path = os.path.join(cleaned_dir, "temperature_anomalies.csv")
    temp_anomalies_df = pd.read_csv(anomalies_path)
    final_df = pd.merge(final_df, temp_anomalies_df, on=['FIPS', 'MONTH'], how='left')

    # drop rows without an anomaly
    # (original author's note: all but two of them are from the District of
    # Columbia, which is not in the anomalies data; there are no zero anomalies)
    final_df = final_df.dropna(subset=['ANOMALY'])

    # merge the RONI data on month name
    # NOTE(review): assumes RONI.csv already has a 'MONTH_NAME' column - no
    # rename is performed here
    roni_path = os.path.join(cleaned_dir, "RONI.csv")
    roni_df = pd.read_csv(roni_path)
    final_df = pd.merge(final_df, roni_df, on='MONTH_NAME', how='left')

    # drop month name and month number columns; they were only join keys
    final_df = final_df.drop(columns=['MONTH', 'MONTH_NAME'])

    # Save the final dataframe, creating the output directory when needed so
    # the function does not depend on the caller having created it.
    os.makedirs(out_dir, exist_ok=True)
    final_df.to_csv(os.path.join(out_dir, f"mergedData_{year}.csv"), index=False)


In [15]:
# Year of cleaned data to merge; the merged CSV is written to a folder named
# after the year, relative to the current working directory.
year = 2023
final_dir = str(year)

# exist_ok=True creates the folder only when it is missing, without a separate
# check-then-create step.
os.makedirs(final_dir, exist_ok=True)

merge_data(year, final_dir)