In [1]:
import pandas as pd
import numpy as np

In [2]:
# Custom conversion function to convert numerical strings to floats.
# For strings starting with the '<' sign, just divide the number by 10 to 
# make it much smaller than the rest of the dataset.
def custom_convert(val):
    """Convert a raw cell value to a float, handling below-limit markers.

    String inputs are stripped of surrounding whitespace, then the rules
    below are applied in order:

    * ``"<x"`` -> ``x / 10``, so the result is much smaller than the rest
      of the dataset.
    * ``"bdl"`` (any letter case) or an empty/blank string -> ``0.01``.
    * anything else -> ``float(val)``.

    Args:
        val: A scalar taken from a DataFrame column (string, number, None, NaN).

    Returns:
        The converted float, or ``None`` when the value cannot be interpreted
        as a number (``None`` itself, non-numeric text, or a ``"<"`` marker
        that is not followed by a number).
    """
    if isinstance(val, str):
        text = val.strip()
        if text.startswith('<'):
            try:
                return float(text.split('<')[1]) / 10
            except ValueError:
                # e.g. "<" or "<n/a": no usable number after the marker.
                return None
        if text.lower() in ("bdl", ""):
            return 0.01
        val = text
    try:
        return float(val)
    except (TypeError, ValueError):
        # TypeError covers None and other non-numeric objects.
        return None  # or np.nan if you prefer


def convert_strings_to_float(df):
    """Run ``custom_convert`` over the known measurement columns of ``df``.

    Only the columns named below that actually exist in ``df`` are touched;
    each one is overwritten with its element-wise converted values. ``df`` is
    modified in place and the same object is returned.
    """
    measurement_columns = (
        'particulate_microcystin',
        'dissolved_microcystin',
        'total_dissolved_p',
        'extracted_phycocyanin',
        'soluble_reactive_p',
        'extracted_chla',
        'ammonia',
        'nitrate_nitrite',
    )

    for name in measurement_columns:
        # Input files differ in which measurements they carry; skip absent ones.
        if name not in df.columns:
            continue
        df[name] = df[name].apply(custom_convert)

    return df

In [4]:
def rename_columns_to_match_2021(df):
    """Return a new DataFrame whose headers use the common short column names.

    Several spellings of the same header (descriptive ones with units, and
    underscore-style ones) are mapped to a single short name. Columns that are
    not listed keep their original name. ``df`` itself is not modified.
    """
    # Descriptive headers, most of them carrying units in parentheses.
    descriptive_headers = {
        'Date': 'date',
        'Site': 'station_name',
        'Local Time (Eastern Time Zone)': 'time',
        'Latitude (decimal deg)': 'lat',
        'Longitude (decimal deg)': 'lon',
        'Sample Depth (m)' : 'sample_depth_m',
        'Particulate Microcystin (µg/L)': 'particulate_microcystin',
        'Dissolved Microcystin (µg/L)': 'dissolved_microcystin',
        'Extracted Phycocyanin (µg/L)': 'extracted_phycocyanin',
        'Extracted Chlorophyll a (µg/L)': 'extracted_chla',
        'Turbidity (NTU)': 'turbidity',
        'Total Suspended Solids (mg/L)': 'tss',
        'Volatile Suspended Solids (mg/L)': 'vss',
        'Total Phosphorus (µg P/L)': 'total_p',
        'Total Dissolved Phosphorus (µg P/L)': 'total_dissolved_p',
        'Soluble Reactive Phosphorus (µg P/L)': 'soluble_reactive_p',
        'Ammonia (µg N/L)': 'ammonia',
        'Nitrate + Nitrite (mg N/L)': 'nitrate_nitrite',
        'Particulate Organic Carbon (mg/L)': 'particulate_organic_c',
        'Particulate Organic Nitrogen (mg/L)': 'particulate_organic_n',
        # The trailing space inside this key is deliberate: it is part of the header text.
        'Colored Dissolved Organic Material absorbance (m-1) at 400nm ': 'cdom',
    }

    # Underscore-style headers for the same quantities.
    underscore_headers = {
        'Arrival_Time': 'time',
        'Lat_deg' : 'lat',
        'Long_deg' : 'lon',
        'Sample_Depth_m' : 'sample_depth_m',
        'Particulate_Microcystin_ugL-1' : 'particulate_microcystin',
        'Dissolved_Microcystin_ugL-1' : 'dissolved_microcystin',
        'Extracted_CHLa_ugL-1' : 'extracted_chla',
    }

    header_map = {**descriptive_headers, **underscore_headers}
    return df.rename(columns=header_map)

In [5]:
def handle_missing_lat_lon(df):
    """Fill missing ``lat``/``lon`` values from a table of fixed station positions.

    Empty strings in ``lat``/``lon`` are first normalised to NaN. A missing
    coordinate is then replaced by the tabulated position of the row's
    ``station_name`` when that station is listed below. Coordinates that are
    already present, and rows for stations not in the table, are left as-is.

    The fill is done column-wise (no per-row ``apply``), so it also works on
    a DataFrame with zero rows.

    Args:
        df: DataFrame with ``station_name``, ``lat`` and ``lon`` columns.
            It is modified in place.

    Returns:
        The same ``df`` object, with ``lat`` and ``lon`` updated.
    """
    # station name -> (lat, lon)
    station_dict = {
        "WE2": (41.762, -83.33),
        "WE4": (41.827, -83.193),
        "WE6": (41.705, -83.385),
        "WE8": (41.834, -83.364),
        "WE9": (41.718, -83.424),
        "WE12": (41.703, -83.254),
        "WE13": (41.741, -83.136),
        "WE16": (41.66, -83.143),
    }

    station = df['station_name']
    for column, position in (('lat', 0), ('lon', 1)):
        # Per-row fallback coordinate; NaN for stations that are not in the table.
        fallback = station.map(
            {name: coords[position] for name, coords in station_dict.items()}
        )
        # Replace empty strings with NaN for consistent missing handling
        current = df[column].replace('', np.nan)
        # Keep existing values; use the fallback only where the value is missing.
        df[column] = current.where(current.notna(), fallback)

    return df

In [6]:
def handle_timestamp_and_geocoordinates(df):
    """Cast the coordinates to float and add a combined ``timestamp`` column.

    ``lat`` and ``lon`` are converted with ``astype(float)``. Missing ``time``
    entries are replaced by ``'00:00'``; ``date`` and ``time`` are then joined
    with a single space and parsed by ``pd.to_datetime`` into ``timestamp``.
    The ``date`` and ``time`` columns are kept. ``df`` is modified in place
    and returned.
    """
    for coordinate in ('lat', 'lon'):
        df[coordinate] = df[coordinate].astype(float)

    # Rows without a recorded time fall back to '00:00'.
    df['time'] = df['time'].fillna('00:00')

    # Combine and convert to datetime64
    date_and_time = df['date'] + ' ' + df['time']
    df['timestamp'] = pd.to_datetime(date_and_time)
    return df


In [7]:
def extract_subset_columns(df):
    """Return only the columns kept in the combined dataset.

    Args:
        df: DataFrame containing at least the columns listed in ``subset``.

    Returns:
        A new, independent DataFrame with exactly the ``subset`` columns, in
        that order. ``df`` is not modified.

    Raises:
        KeyError: if any of the ``subset`` columns is missing from ``df``.
    """
    subset = ['station_name', 'timestamp', 'lat', 'lon',
              'particulate_microcystin', 'extracted_chla', 'dissolved_microcystin']

    # .copy() detaches the result from ``df`` so that later column assignments
    # on it are unambiguous and do not raise SettingWithCopyWarning.
    return df[subset].copy()

In [8]:

filelist = [
    './data/lake_erie_habs_field_sampling_results_2012_2018.csv',
    './data/lake_erie_habs_field_sampling_results_2019.csv',
    './data/noaa-glerl-erie-habs-field-sampling-results-2020-2021.csv',
    './data/noaa-glerl-erie-habs-field-sampling-results-2022.csv',
    './data/noaa-glerl-erie-habs-field-sampling-transects-results-2021.csv',
    './data/noaa-glerl-erie-habs-field-sampling-transects-results-2022.csv',
    './data/2024_WLE_Weekly_Datashare_CSV.csv',
    './data/2025_WLE_Weekly_Datashare_CSV.csv',
]

# Normalise every file to the common schema, collect the pieces, and
# concatenate once at the end. Growing a DataFrame with pd.concat inside the
# loop re-copies all earlier rows on every iteration, and concatenating onto
# an empty DataFrame is deprecated in recent pandas.
yearly_frames = []

for file in filelist:
    df = pd.read_csv(file, encoding='latin1')
    df = rename_columns_to_match_2021(df)
    if 'transects' in file:
        # Transect files get a fixed station label and a zero phycocyanin column.
        df['station_name'] = 'Transect'
        df['extracted_phycocyanin'] = 0
    # Keep only samples taken at less than 2 m depth. The explicit copy makes
    # the filtered frame independent, so the column assignments in the steps
    # below do not trigger SettingWithCopyWarning.
    df = df[df['sample_depth_m'] < 2].copy()
    df = handle_timestamp_and_geocoordinates(df)
    df = convert_strings_to_float(df)
    df = extract_subset_columns(df).copy()
    df = handle_missing_lat_lon(df)
    yearly_frames.append(df)

multiyear_df = pd.concat(yearly_frames, ignore_index=True)

# Missing particulate microcystin values get the same 0.01 floor that
# custom_convert uses for "bdl" entries.
multiyear_df.loc[multiyear_df['particulate_microcystin'].isna(), 'particulate_microcystin'] = 0.01


In [9]:
# Write the combined dataset to disk. With pandas' default settings the
# DataFrame index is written too, as the first (unnamed) column.
output_csv = "./glrl-hab-data.csv"
multiyear_df.to_csv(output_csv)