Author: Paul Inkenbrandt, modified by Diane Menuz for testing  
Date: October 1, 2025  
Goal: Test modifications to micromet to see about fixing problems, using Escalante as a test case. Clean up  
processing steps.

Requirements
- raw_fold: folder
- within raw_fold, subfolders with each stationid
- within station-specific folder, the following files:
    - {stationname}_Flux_AmeriFluxFormat.dat: Ameriflux file downloaded from EasyFlux; should only be one
    - {stationname}_Flux_CSFormat.dat: CSFLUX file downloaded from EasyFlux; should only be one
    - folder called AmeriFluxFormat with eddy data downloaded from the station
    - folder called Statistics_Ameriflux with met data downloaded from the station
    - folder called Statistics with met data downloaded from the station

To Do  

- Fix the compare_to_raw component! Should generalize the file naming and make it into a loop if doing all  
of the sites. Only works now b/c I am looking at a single site
- Could keep the report when running through multiple files; I did for the Met Ameriflux Stats and CSFlux, check other  
components
- Move towards a loop for both CSFlux and Ameriflux_format b/c it seems like the downloads are sometimes missing  
info; copy the Easyflux data into the same folder as other data


# Initialization

## Import Libraries

In [1]:
import pandas as pd
import numpy as np
import os
import geopandas as gpd
import sys
import pathlib
from pathlib import Path


import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px

import pandas as pd
import numpy as np
from pandas.tseries.frequencies import to_offset
import plotly.graph_objects as go

sys.path.append("../../src/")
import micromet
from micromet import validate
from micromet import validation
from micromet import gap_summary
from micromet import cleanup

%matplotlib inline

In [2]:
loggerloader_path= "C:/Users/dmenuz/Documents/scripts/loggerloader"

import sys

sys.path.append(loggerloader_path)
from loggerloader import plotlystuff

ModuleNotFoundError: No module named 'loggerloader'

## Initialize Logger

In [167]:
import logging

# Notebook-wide logger; it is passed to the micromet Reformatter / processor calls.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

# logging.getLogger returns the same object every time this cell is re-run, so
# attach the stream handler only once; otherwise each message is printed once
# per execution of the cell.
if not logger.handlers:
    ch = logging.StreamHandler()
    ch.setFormatter(
        logging.Formatter(
            fmt="%(levelname)s [%(asctime)s] %(name)s – %(message)s",
            datefmt="%Y-%m-%d %H:%M:%S",
        )
    )
    logger.addHandler(ch)

# Define Root Folder and Site Folders

In [168]:
# Root folder of the compiled downloads; it holds one sub-folder per station ID.
raw_fold = pathlib.Path('M:/Shared drives/UGS_Flux/Data_Downloads/compiled')

# AmeriFlux column/variable table. The path is built from raw_fold so the root
# location is defined in exactly one place.
amflux = pd.read_csv(raw_fold / 'flux-met_processing_variables_20250818.csv')

In [169]:
# Station ID -> station name. The keys are the sub-folder names under raw_fold;
# in the cells shown here the values are only used in progress messages.
# Only the uncommented entries are processed by the loops below.
site_folders = {#'US-UTD':'Dugout_Ranch',
                # 'US-UTB':'BSF',
                 'US-UTJ':'Bluff',
                # 'US-UTW':'Wellington',
                'US-UTE':'Escalante'
                # 'US-UTM':'Matheson',
                # 'US-UTP':'Phrag',
                # 'US-CdM':'Cedar_mesa',
                # 'US-UTV':'Desert_View_Myton',
                # 'US-UTN':'Juab',
                # 'US-UTG':'Green_River',
                # 'US-UTL':'Pelican_Lake',
                 }

# loggerids = {
#     "eddy": {
#         "US-UTD": 21314,
#         "US-UTB": 27736,
#         "US-UTJ": 21020,
#         "US-UTW": 21025,
#         "US-UTE": 21021,
#         "US-UTM": 21029,
#         "US-UTP": 8442,
#         "US-CdM": 21313,
#         "US-UTV": 21027,
#         "US-UTN": 8441,
#         "US-UTG": 25415,
#         "US-UTL": 21215,
#     },
#     "met": {
#         "US-UTD": 21031,
#         "US-UTB": 27736,
#         "US-UTJ": 21030,
#         "US-UTW": 21026,
#         "US-UTE": 21032,
#         "US-UTM": 21023,
#         "US-UTP": 8441,
#         "US-CdM": 21029,
#         "US-UTV": 21311,
#         "US-UTG": 25414,
#         "US-UTL": 21028,
#     },
# }

# Testing

Used for general testing with raw data

In [349]:
# Load one CSFormat flux file for ad-hoc testing and index it by its parsed timestamp.
raw_file = r'M:\Shared drives\UGS_Flux\Data_Downloads\compiled\US-UTJ\Bluff_Flux_CSFormat.dat'
rawcs = pd.read_csv(
    raw_file,
    header=1,
    skiprows=[2, 3],
    na_values=[-9999, "NAN", "NaN", "nan"],
)

format_str = '%Y-%m-%d %H:%M:%S'
# Values that do not match format_str become NaT instead of raising.
stamp_text = rawcs['TIMESTAMP'].str.strip()
rawcs['DATETIME_END'] = pd.to_datetime(stamp_text, format=format_str, errors='coerce')

rawcs = rawcs.set_index("DATETIME_END")
rawcs = rawcs.sort_index()

In [350]:
df = rawcs.copy()
detect_timesteps = True
cont_num = 5

# Minutes elapsed between each record and the one before it.
elapsed = df.index.to_series().diff()
df['timediff'] = elapsed.dt.total_seconds() / 60

# Keep only steps shorter than two hours for plotting.
temp = df[df['timediff'] < 120]

plotlystuff([temp], ['timediff'])



In [None]:
import pandas as pd
import numpy as np

def resample_alternating_frequency_with_other(df, min_records_threshold=24):
    """
    Resample a record whose logging interval alternates between 30 and 60 minutes.

    Rows are grouped into contiguous blocks according to the time step from the
    previous row (25-35 min -> '30MIN', 55-65 min -> '60MIN', anything else ->
    'OTHER'). 30/60-minute blocks are resampled with ``.resample(...).last()``;
    'OTHER' blocks are passed through unchanged.

    The input frame is not modified; all helper columns live on an internal copy.

    Args:
        df (pd.DataFrame): Data with a DatetimeIndex.
        min_records_threshold (int): A 30/60-minute block with fewer rows than
            this is resampled at the frequency of the last block that met the
            threshold (or at its own frequency if there has been none yet).

    Returns:
        pd.DataFrame: The original columns plus a 'timestep' column holding
        30 or 60 for resampled blocks, -1 for 'OTHER' blocks, and the string
        'TRUE_GAP' for rows that never received a block category (in practice
        this only happens when no neighbouring row can supply one, e.g. a
        single-row input).
    """
    temp_cols = ['time_diff', 'time_diff_min', 'frequency_class',
                 'block_category_temp', 'block_category', 'block_id']

    # Work on a copy so the helper columns never leak into the caller's frame.
    work = df.copy()

    # --- Step 1: Calculate time differences and classify each row ---
    work['time_diff'] = work.index.to_series().diff().dt.total_seconds() / 60
    work['time_diff_min'] = work['time_diff'].round()

    def classify_frequency(diff):
        if pd.isna(diff):
            return 'UNKNOWN'
        if 25 <= diff <= 35:
            return '30MIN'
        if 55 <= diff <= 65:
            return '60MIN'
        # All other differences are 'OTHER' frequencies or gaps
        return 'OTHER'

    work['frequency_class'] = work['time_diff_min'].apply(classify_frequency)

    # --- Step 2: Create block categories and IDs ---
    freq_class = work['frequency_class']
    # Only 'UNKNOWN' (no previous row to diff against) becomes missing; 'OTHER' is kept.
    category = freq_class.where(freq_class != 'UNKNOWN')
    # Forward fill, then back fill one row so the very first row joins the first block.
    category = category.ffill().bfill(limit=1)
    work['block_category_temp'] = category
    work['block_category'] = category

    # A new block starts wherever the category changes; still-missing rows share
    # the placeholder label. Cast to object first so the placeholder string can
    # be inserted whatever dtype the fills produced.
    labels = category.astype(object).where(category.notna(), 'GAP_BLOCK')
    work['block_id'] = (labels != labels.shift(1)).cumsum()

    # --- Step 3: Resample each categorised block ---
    categorised = work[work['block_category'].notna()]

    resampled_list = []
    previous_freq = None

    for _, block in categorised.groupby('block_id'):
        current_category = block['block_category'].iloc[0]
        block_data = block.drop(columns=temp_cols)

        if current_category in ('30MIN', '60MIN'):
            current_freq = int(current_category.replace('MIN', ''))

            # Short blocks inherit the frequency of the last long-enough block.
            if len(block) >= min_records_threshold:
                final_freq_for_resample = current_freq
                previous_freq = current_freq
            elif previous_freq is not None:
                final_freq_for_resample = previous_freq
            else:
                final_freq_for_resample = current_freq

            resampled_block = block_data.resample(f"{final_freq_for_resample}min").last()
            resampled_block['timestep'] = final_freq_for_resample
        else:
            # 'OTHER' blocks are not resampled; keep the rows and flag them with -1.
            resampled_block = block_data.copy()
            resampled_block['timestep'] = -1

        resampled_list.append(resampled_block)

    # --- Step 4: Recombine with the uncategorised rows ---
    gap_rows = work[work['block_category'].isna()].drop(columns=temp_cols)
    gap_rows['timestep'] = 'TRUE_GAP'

    frames = list(resampled_list)
    # Only add gap_rows when it has rows, or when it is the sole frame (empty or
    # single-row input) so that pd.concat always has something to concatenate.
    if not gap_rows.empty or not frames:
        frames.append(gap_rows)

    return pd.concat(frames).sort_index()

In [324]:
# Apply the block-wise resampler to the raw CSFormat frame.
out = resample_alternating_frequency_with_other(rawcs)

In [None]:
# Inline prototype of the single-switch resampling: decide from the first and
# last time steps whether the record is hourly, half-hourly, or switches from
# 30 to 60 minutes, and resample accordingly. The result is left in `df`
# (a top-level `return` is a SyntaxError, so it was removed).
df = rawcs.copy()

df['time_diff'] = df.index.to_series().diff().dt.total_seconds() / 60
# Despite the names, these are the FIRST and LAST time steps of the record,
# not the smallest and largest.
min_diff = df.time_diff.iloc[1]
max_diff = df.time_diff.iloc[-1]

# Drop sub-minute jitter before resampling.
df.index = df.index.floor('min')

if max_diff == 60 and min_diff == 60:
    df = df.resample('60min').last()
    df['timestep'] = 60
    logging.debug('All data hourly')
elif max_diff == 30 and min_diff == 30:
    df = df.resample('30min').last()
    df['timestep'] = 30
    logging.debug('All data half-hourly')
    print('All data half-hourly')
elif max_diff == 60 and min_diff == 30:
    # Rows from the last 30-minute step onwards are treated as hourly.
    max30 = df[df.time_diff == 30].index.max()
    mask = df.index >= max30
    df30 = df[~mask]
    df30 = df30.resample('30min').last()
    df30['timestep'] = 30
    df60 = df[mask]
    df60 = df60.resample('60min').last()
    df60['timestep'] = 60
    df = pd.concat([df30, df60], axis=0)
    logging.debug(f'Mixed timestamps. Hourly data starts on {max30}')
    print(f'Mixed timestamps. Hourly data starts on {max30}')
else:
    # Any other first/last combination is not handled; the data are left unresampled.
    print('Uh oh')
df.drop(columns=['time_diff'], inplace=True)

All data half-hourly


In [None]:
import pandas as pd
import logging
import numpy as np

# Set logging for demonstration purposes (can be removed in production)
# logging.basicConfig(level=logging.DEBUG, format='%(levelname)s: %(message)s')

def resample_single_frequency_switch(df, sample_size=100):
    """
    Resample a DataFrame that is hourly, half-hourly, or switches once from 30 to 60 minutes.

    The starting interval is the mode of the first ``sample_size`` rounded time
    steps (tolerating clock jitter and occasional gaps); the ending interval is
    the last rounded time step. The input frame is never modified: all work is
    done on a copy.

    Args:
        df (pd.DataFrame): DataFrame with a DatetimeIndex.
        sample_size (int): The number of initial time steps used to determine
            the starting interval.

    Returns:
        pd.DataFrame: A new DataFrame with a 'timestep' column (30 or 60).
        When the frame has fewer than two rows, or the start/end pattern is
        not one of 60/60, 30/30 or 30->60, no resampling is done and
        'timestep' is ``pd.NA`` (in the latter case the index is still floored
        to the minute).

    Raises:
        ValueError: If ``df.index`` is not a DatetimeIndex.
    """
    temp_cols = ['time_diff', 'time_diff_rounded']

    # 1. Input validation and preparation
    if not isinstance(df.index, pd.DatetimeIndex):
        logging.error("DataFrame index must be a DatetimeIndex.")
        raise ValueError("DataFrame index must be a DatetimeIndex.")

    # Copy first: the helper columns, the floored index and 'timestep' must not
    # leak into the caller's frame.
    df = df.copy()

    if len(df) < 2:
        logging.warning("DataFrame has fewer than 2 rows; cannot determine frequency.")
        df['timestep'] = pd.NA
        return df

    # Time steps in minutes, rounded to the nearest minute (handles jitter)
    df['time_diff'] = df.index.to_series().diff().dt.total_seconds() / 60
    df['time_diff_rounded'] = df['time_diff'].round()

    # Floor the index to the minute so resample bins line up
    df.index = df.index.floor('min')

    # 2. Determine initial and final routines
    # +1 because the first row has no previous row and is skipped.
    analysis_size = min(sample_size + 1, len(df))
    initial_mode = df['time_diff_rounded'].iloc[1:analysis_size].mode()
    initial_routine = initial_mode.iloc[0] if not initial_mode.empty else None

    # The final routine is simply the last time step
    final_routine = df['time_diff_rounded'].iloc[-1]

    # 3. Conditional resampling
    if final_routine == 60 and initial_routine == 60:
        # Case 1: all data hourly
        df = df.resample('60min').last()
        df['timestep'] = 60
        logging.debug('All data hourly.')

    elif final_routine == 30 and initial_routine == 30:
        # Case 2: all data half-hourly
        df = df.resample('30min').last()
        df['timestep'] = 30
        logging.debug('All data half-hourly.')

    elif final_routine == 60 and initial_routine == 30:
        # Case 3: switch from 30-minute to 60-minute.
        # Rows from the last 30-minute step onwards form the hourly section.
        max30 = df[df['time_diff_rounded'] == 30].index.max()
        mask = df.index >= max30

        df30 = df[~mask].resample('30min').last()
        df30['timestep'] = 30

        df60 = df[mask].resample('60min').last()
        df60['timestep'] = 60

        df = pd.concat([df30, df60], axis=0).sort_index()
        logging.debug(f'Mixed timestamps. Hourly data starts on {max30}.')

    else:
        # Case 4: other (e.g. 60 to 30, or initial mode is neither 30 nor 60)
        logging.warning(f"Unhandled routine pattern: Initial={initial_routine}, Final={final_routine}. No resampling performed.")
        df = df.drop(columns=temp_cols, errors='ignore')
        df['timestep'] = pd.NA
        return df

    # Final cleanup and return
    return df.drop(columns=temp_cols, errors='ignore')

In [353]:
# Count how many records carry each timestep value.
df.timestep.value_counts()

timestep
30    22318
Name: count, dtype: int64

# Met

### Compile Met Statistics Tables

For each site, loop through all files in that site's Statistics folder, process, and then  
compile. Output is a dictionary for each site key with the value as all of the site data.

In [None]:
# Per site: read every TOA5 Statistics file, strip the _Avg/_Tot suffixes, add an
# integer TIMESTAMP_END, run it through the micromet Reformatter, and stack the results.
stats = {}
reports = {}
for key, value in site_folders.items():
    print(f"Processing site: {key} - {value}")
    parent_fold = raw_fold / f"{key}" / "Statistics"
    am_df = {}
    stat_report = {}
    #raw_data = micromet.raw_file_compile(raw_fold, parent_fold, search_str = "TOA5*Statistics*.dat")
    for file_name in parent_fold.glob("TOA5*Statistics*.dat"):
        print(f"Processing file: {file_name}")
        sts = pd.read_csv(file_name, skiprows=[0, 2, 3])

        # "_Avg" and "_Tot" are both four characters long, so one slice handles either.
        suffix_renames = {
            col: col[:-4]
            for col in sts.columns
            if col.endswith(("_Avg", "_Tot"))
        }
        sts.rename(columns=suffix_renames, inplace=True)

        sts['TIMESTAMP'] = pd.to_datetime(sts['TIMESTAMP'])
        sts["TIMESTAMP_END"] = sts.TIMESTAMP.dt.strftime("%Y%m%d%H%M").astype(int)

        am_data = micromet.Reformatter(drop_soil=False, logger=logger)
        df, report, checktime = am_data.process(sts, data_type="met")
        am_df[file_name.stem] = df
        stat_report[file_name.stem] = report

    # Sites without any matching file are left out of the compiled dictionaries.
    if am_df:
        stats[key] = pd.concat(am_df)
        reports[key] = pd.concat(stat_report)

stats_met_temp = pd.concat(stats)
stats_reports = pd.concat(reports)


In [None]:
# Average the per-file reports by station/variable, save them, and list the
# variables with at least 10 percent of values flagged.
report_final = (
    stats_reports.reset_index(level=[0, 1, 2])
    .drop(columns=['level_1', 'level_2'])
    .rename(columns={'level_0': 'STATIONID'})
    .groupby(['STATIONID', 'column', 'matched_key'])
    .mean()
)

report_final.to_csv(raw_fold / 'metstat_report_diane.csv')

heavily_flagged = report_final.pct_flagged >= 10
report_final[heavily_flagged].round(1)

In [None]:
# Flatten the compiled met statistics, drop repeated station/timestamp rows,
# index by (STATIONID, DATETIME_END) and blank out values below -5000.
stats_met = (
    stats_met_temp.reset_index()
    .rename(columns={'level_0': 'STATIONID'})
    .drop(columns=['level_1'])
)

has_duplicates = stats_met.duplicated(subset=['STATIONID', 'DATETIME_END']).any()
if has_duplicates:
    print('FAIL: STATIONID AND DATETIME_END DUPLICATES PRESENT')
    print('DROPPING DUPLICATES')
    stats_met = stats_met.drop_duplicates(subset=['STATIONID', 'DATETIME_END'])
else:
    print("PASS: NO STATIONID AND DATETIME_END DUPLICATES")

stats_met = stats_met.set_index(['STATIONID', 'DATETIME_END'])
below_floor = stats_met < -5000
stats_met = stats_met.mask(below_floor)

In [None]:
# Expand the short leaf-wetness column names to the full positional suffix form,
# then compare the resulting names against the AmeriFlux variable table.
rename_dict = {
    "LWMCON_1": 'LWMCON_1_1_1',
    "LWMCON_2": 'LWMCON_1_1_2',
    "LWMDRY_1": 'LWMDRY_1_1_1',
    "LWMDRY_2": 'LWMDRY_1_1_2',
    "LWMV_1": 'LWMV_1_1_1',
    "LWMV_2": 'LWMV_1_1_2',
    "LWMWET_1": 'LWMWET_1_1_1',
    "LWMWET_2": 'LWMWET_1_1_2',
}

stats_met.rename(columns=rename_dict, inplace=True)

results = validate.compare_names_to_ameriflux(stats_met, amflux)
print('\n')
print('\n')

In [None]:
# Save the compiled met statistics table as Parquet in the root folder.
stats_met.to_parquet(raw_fold / "comp_met_stat_diane.parquet")

In [None]:
# Summarize gaps for all stations, then plot two variables for a single station.
gaps_statsmet = gap_summary.summarize_gaps(stats_met)

bal = stats_met.loc['US-UTE']
bal = bal.sort_index()
plotlystuff([bal] * 2, ['WS_1_1_2', 'NETRAD_1_1_2'])

### Compile Statistics Ameriflux .dat Tables

In [None]:
# Per site: read every Statistics_AmeriFlux file, strip the _Avg/_Tot suffixes,
# run it through the micromet Reformatter, and stack the results.
alldat = {}
allreports = {}
for key, value in site_folders.items():
    print(f"Processing site: {key} - {value}")
    parent_fold = raw_fold / f"{key}" / 'Statistics_Ameriflux'
    am_df = {}
    report_temp = {}
    #raw_data = micromet.raw_file_compile(raw_fold, parent_fold, search_str = "TOA5*Statistics*.dat")
    for file_name in parent_fold.glob("*Statistics_AmeriFlux*.dat"):
        print(file_name)
        print(f"Processing file: {file_name}")
        sts = pd.read_csv(file_name)

        # "_Avg" and "_Tot" are both four characters long, so one slice handles either.
        suffix_renames = {
            col: col[:-4]
            for col in sts.columns
            if col.endswith(("_Avg", "_Tot"))
        }
        sts.rename(columns=suffix_renames, inplace=True)

        am_data = micromet.Reformatter(drop_soil=False, logger=logger)
        df, report, checktime = am_data.process(sts, data_type="met")
        am_df[file_name.stem] = df
        report_temp[file_name.stem] = report

    # Sites without any matching file are left out of the compiled dictionaries.
    if am_df:
        alldat[key] = pd.concat(am_df)
        allreports[key] = pd.concat(report_temp)

afstats_met_temp = pd.concat(alldat)
outlier_report = pd.concat(allreports, axis=1)

In [None]:
# Move the station level of the side-by-side report into the rows, average per
# station/variable, and list the variables with at least 10 percent flagged.
report_stacked = outlier_report.stack(level=0)

report_final = (
    report_stacked.reset_index(level=1)
    .droplevel(0, axis=0)
    .drop(columns=['level_1'])
    .rename_axis('STATIONID')
    .reset_index()
    .groupby(['STATIONID', 'column', 'matched_key'])
    .mean()
)

heavily_flagged = report_final.pct_flagged >= 10
report_final[heavily_flagged]

In [None]:
# Flatten the compiled Statistics_Ameriflux data, drop repeated station/timestamp
# rows, index by (STATIONID, DATETIME_END) and blank out values below -5000.
afstats_met = (
    afstats_met_temp.reset_index()
    .rename(columns={'level_0': 'STATIONID'})
    .drop(columns=['level_1'])
)

has_duplicates = afstats_met.duplicated(subset=['STATIONID', 'DATETIME_END']).any()
if has_duplicates:
    print('FAIL: STATIONID AND DATETIME_END DUPLICATES PRESENT')
    print('DROPPING DUPLICATES')
    afstats_met = afstats_met.drop_duplicates(subset=['STATIONID', 'DATETIME_END'])
else:
    print("PASS: NO STATIONID AND DATETIME_END DUPLICATES")

afstats_met = afstats_met.set_index(['STATIONID', 'DATETIME_END'])
below_floor = afstats_met < -5000
afstats_met = afstats_met.mask(below_floor)

results = cleanup.process_and_match_columns(afstats_met, amflux)
print('\n')

In [None]:
# List the compiled column names alphabetically for review.
afstats_met.columns.sort_values()

In [None]:
# Expand the short leaf-wetness column names to the full positional suffix form.
rename_dict = {
    "LWMCON_1": 'LWMCON_1_1_1',
    "LWMCON_2": 'LWMCON_1_1_2',
    "LWMDRY_1": 'LWMDRY_1_1_1',
    "LWMDRY_2": 'LWMDRY_1_1_2',
    "LWMV_1": 'LWMV_1_1_1',
    "LWMV_2": 'LWMV_1_1_2',
    "LWMWET_1": 'LWMWET_1_1_1',
    "LWMWET_2": 'LWMWET_1_1_2',
}

# This section compiles and exports afstats_met; the cell previously renamed and
# validated stats_met (copied from the Statistics section), so the Ameriflux
# table was exported without these renames and without the name check.
afstats_met.rename(columns=rename_dict, inplace=True)

results = validate.compare_names_to_ameriflux(afstats_met, amflux)
print('\n')

In [None]:
# Save the compiled Statistics_Ameriflux table as Parquet in the root folder.
afstats_met.to_parquet(raw_fold / "comp_met_afstat_diane.parquet")

In [None]:
# Summarize gaps for all stations, then plot two variables for a single station.
gaps_afstatsmet = gap_summary.summarize_gaps(afstats_met)

bal = afstats_met.loc['US-UTJ']
bal = bal.sort_index()
plotlystuff([bal] * 2, ['NETRAD_1_1_2', 'WD_1_1_2'])

## Check datetimes on available data

This is a method to loop through individual met download folders and look at the file dates.  
Note that it seems like only the first row of data has a timestamp value, so the end dates are  
the same as the start dates.

In [None]:
# For each download folder, record the first/last TIMESTAMP_START and row count
# of every Statistics_AmeriFlux file; files that fail to load are reported and skipped.
folder_list = ['m20240821','m20241008','m20250219','m20250513','m20250715','m20250811','me20241008','met_files']

outputs = {}

for folder_name in folder_list:
    folder = Path(f'M:\\Shared drives\\UGS_Flux\\Data_Downloads\\Bluff\\{folder_name}')
    data_rows = []

    for file_name in folder.glob("*Statistics_AmeriFlux*.dat"):
        print(f"Processing: {file_name}")

        try:
            sts = pd.read_csv(file_name)
            starts = pd.to_datetime(sts['TIMESTAMP_START'], format='%Y%m%d%H%M')
            sts['TIMESTAMP_START'] = starts
            data_rows.append(
                {
                    'file_name': file_name.name,  # just the filename, not the full path
                    'min_timestamp': sts.TIMESTAMP_START.min(),
                    'max_timestamp': sts.TIMESTAMP_START.max(),
                    'num_records': len(sts),
                }
            )
        except Exception as e:
            print(f"Error processing {file_name}: {e}")

    outputs[folder_name] = pd.DataFrame(data_rows)


In [None]:
# Combine the per-folder summaries and show files starting after 1 June 2024, oldest first.
file_dates = pd.concat(outputs)
cutoff = pd.to_datetime('2024-06-01')
mask = file_dates['min_timestamp'] > cutoff
file_dates.loc[mask].sort_values(['min_timestamp'])

# Eddy

## Compile Downloaded Eddy Data from EasyFluxWeb

In [None]:
# Each station folder is expected to hold one "*_Flux_AmeriFluxFormat.dat" file;
# reformat it as eddy data and keep the result and its report per station.
easyfluxdf = {}
ef_reports = {}

for key, value in site_folders.items():
    site_dir = raw_fold / key
    print(site_dir)
    for file in site_dir.glob("*_Flux_AmeriFluxFormat.dat"):
        print(file)
        am_data = micromet.Reformatter(drop_soil=True, logger=logger)
        df = pd.read_csv(
            file,
            skiprows=[0, 2, 3],
            na_values=[-9999, "NAN", "NaN", "nan"],
        )

        am_df, report, checktime = am_data.process(df, data_type="eddy")
        # If more than one file matches, the last one read replaces the earlier ones.
        easyfluxdf[key] = am_df
        ef_reports[key] = report

ef_report = pd.concat(ef_reports, axis=1)
easyflux = pd.concat(easyfluxdf)

In [None]:
# Move the station level of the report into a STATIONID column, save it, and
# list the variables with at least 10 percent of values flagged.
report_stacked = ef_report.stack(level=0)

report_final = (
    report_stacked.reset_index(level=1)
    .rename(columns={'level_1': 'STATIONID'})
)

report_final.to_csv(raw_fold / "easyflux_report_diane.csv")

heavily_flagged = report_final[report_final.pct_flagged >= 10]
heavily_flagged.sort_values(['STATIONID', 'matched_key']).round(1)



In [None]:
# run various tests on data
#raw_file = r'M:\My Drive\projects\eddy_covariance\site_specific_data_review\Escalante_Flux_AmeriFluxFormat.dat'

# Flat copy of the compiled EasyFlux data with the outer concat level exposed as STATIONID.
easyflux_final = easyflux.reset_index().rename(columns={'level_0':'STATIONID'})

# NOTE(review): the three checks below receive `easyflux` (still indexed as produced
# by pd.concat), not `easyflux_final`; the datalogger section passes the reset frame
# to the same functions -- confirm which form they expect.
validate.validate_flags(easyflux)
print('\n')

results = cleanup.process_and_match_columns(easyflux, amflux)
print('\n')

validate.validate_timestamp_consistency(easyflux)
print('\n')

# Report and drop rows that repeat the same station/timestamp pair.
if len (easyflux_final[easyflux_final.duplicated(subset=['STATIONID','DATETIME_END'])])>0:
    print('FAIL: STATIONID AND DATETIME_END DUPLICATES PRESENT')
    print('DROPPING DUPLICATES')
    easyflux_final = easyflux_final.drop_duplicates(subset=['STATIONID','DATETIME_END'])

else:
    print("PASS: NO STATIONID AND DATETIME_END DUPLICATES")

easyflux_final = easyflux_final.set_index(['STATIONID','DATETIME_END'])
# Replace values below -5000 with NaN (presumably missing-data sentinels -- TODO confirm).
easyflux_final = easyflux_final.mask(easyflux_final < -5000)

# differences_from_raw = compare_to_raw(raw_file, easyflux, test_var='NETRAD', threshold=0.1)
# print('\n')
# print(f'Differences between raw and micromet files')
# print(differences_from_raw)

In [None]:
# Export the cleaned EasyFlux table as a Parquet file in the root folder.
easyflux_final.to_parquet(raw_fold / "easyflux_diane.parquet")

In [None]:
# Summarize gaps for all stations, then plot three variables for a single station.
gaps_easyflux = gap_summary.summarize_gaps(easyflux_final)

bal = easyflux_final.loc['US-UTJ']
plot_vars = ['LE_1_1_1', 'NETRAD_1_1_1', 'WD_1_1_1']
plotlystuff([bal] * 3, plot_vars)

In [None]:
# Plot the SG_1_1_A column of the station selected in `bal`.
plotlystuff([bal], ['SG_1_1_A'])

In [None]:
# Compare processed LW_OUT_1_1_1 for the selected station with LW_OUT from the raw CSFormat file.
raw_file = r'M:\Shared drives\UGS_Flux\Data_Downloads\compiled\US-UTJ\Bluff_Flux_CSFormat.dat'
rawcs = pd.read_csv(raw_file, header=1, skiprows=[2,3],
                    na_values=[-9999,"NAN","NaN","nan"])

format_str = '%Y-%m-%d %H:%M:%S'

rawcs['DATETIME_END'] = pd.to_datetime(
    rawcs['TIMESTAMP'].str.strip(), # Always strip just in case!
    format=format_str,
    errors='coerce'
)
# Index by the parsed datetimes rather than the raw TIMESTAMP strings: `bal` is
# indexed by DATETIME_END, and a string index sorts lexically and cannot share a
# time axis with it. This also matches the identical load in the Testing section.
rawcs = rawcs.set_index("DATETIME_END").sort_index()
# Keep only raw rows with LW_OUT below 700.
rawcs_sub = rawcs[rawcs.LW_OUT <700]


plotlystuff([bal, rawcs_sub], ['LW_OUT_1_1_1', 'LW_OUT'])


## Compile Ameriflux Format dat files from Dataloggers

The other two components just read in a single file per station. This component reads in all of the  
datalogger files for all of the stations. Since this can be time-intensive, this is run in two parts.  
Part 1 compiles all of the datalogger files into an output file in the station's folder. The second part  
pulls in all those compiled files, cleans them up, and exports them as a Parquet file.  
  
**NOTE:** 
This method is not preserving the report for each compilation, which is fine. The reports from the other  
compilations should be sufficient to provide the general picture of what is going on

In [None]:
# PART 1: for each site, compile the datalogger AmeriFluxFormat files, reformat
# them as eddy data, and write one CSV per site named after its first
# TIMESTAMP_START and last TIMESTAMP_END.

comp_edd_df = {}
outlier_report = {}

am = micromet.AmerifluxDataProcessor(logger=logger)

for key, value in site_folders.items():
    site_dir = raw_fold / f"{key}"
    parent_fold = site_dir / "AmeriFluxFormat"
    #ahp.scan(parent_fold, min_sim=0.3, backup=False)
    #pths = micromet.fix_all_in_parent(parent_fold)
    raw_data = am.raw_file_compile(raw_fold, parent_fold, search_str = "*Flux_AmeriFluxFormat*.dat")
    if raw_data is None:
        # Nothing was compiled for this site.
        continue

    am_data = micromet.Reformatter(drop_soil=False, logger=logger)
    #raw_data = raw_data.drop([0], axis=0)
    am_df, report, checktime = am_data.process(raw_data, data_type="eddy")
    comp_edd_df[key] = am_df
    outlier_report[key] = report

    timestart = am_df['TIMESTAMP_START'].values[0]
    timeend = am_df['TIMESTAMP_END'].values[-1]

    out_name = f"{key}_HH_{timestart}_{timeend}_diane.csv"
    am_df.to_csv(site_dir / out_name)

In [None]:
# PART 2: read each site's compiled "{site}_HH_*.csv" back in, sort it by its
# datetime index and drop rows that repeat a TIMESTAMP_START/TIMESTAMP_END pair.
cmp_edd_df = {}

for key, value in site_folders.items():
    station_dir = raw_fold / f"{key}"
    for file in station_dir.glob(f"{key}_HH_*.csv"):
        df = pd.read_csv(file, index_col=0)
        df.index = pd.to_datetime(df.index)
        df = df.sort_index().drop_duplicates(subset=['TIMESTAMP_START', 'TIMESTAMP_END'])
        # If several compiled files match, the last one read is kept.
        cmp_edd_df[key] = df

datalogger_dat = pd.concat(cmp_edd_df)

In [None]:
# Validate the compiled datalogger data, then de-duplicate and index it.
raw_file = r'M:\Shared drives\UGS_Flux\Data_Downloads\compiled\US-UTE\Escalante_Flux_AmeriFluxFormat.dat'

dataloggerdf = datalogger_dat.reset_index()
dataloggerdf = dataloggerdf.rename(columns={'level_0': 'STATIONID'})
dataloggerdf = dataloggerdf.rename(columns={'BATTERY_VOLTAGE': 'V_BATT'})

validate.validate_flags(dataloggerdf)
print('\n')

results = cleanup.process_and_match_columns(dataloggerdf, amflux)
print('\n')

validate.validate_timestamp_consistency(dataloggerdf)
print('\n')

# differences_from_raw = validate.compare_to_raw(raw_file, dataloggerdf, test_var='NETRAD', threshold=0.1)
# print('\n')
# print(f'Differences between micromet (left) and raw (right) files')
# print(differences_from_raw)

# The rename is repeated here as in the original workflow (an attempt to handle
# it through the reformatter variables did not work).
datalogger_final = dataloggerdf.rename(columns={'BATTERY_VOLTAGE': 'V_BATT'})

has_duplicates = datalogger_final.duplicated(subset=['STATIONID', 'DATETIME_END']).any()
if has_duplicates:
    print('FAIL: STATIONID AND DATETIME_END DUPLICATES PRESENT')
    print('DROPPING DUPLICATES')
    datalogger_final = datalogger_final.drop_duplicates(subset=['STATIONID', 'DATETIME_END'])
else:
    print("PASS: NO STATIONID AND DATETIME_END DUPLICATES")

datalogger_final = datalogger_final.set_index(['STATIONID', 'DATETIME_END'])
below_floor = datalogger_final < -5000
datalogger_final = datalogger_final.mask(below_floor)

In [None]:
datalogger_final.to_parquet(raw_fold / "datalogger_diane.parquet")

In [None]:
# summarize and view data gaps (view for just one station)

gaps_datalogger = gap_summary.summarize_gaps(datalogger_final)

bal = datalogger_final.loc['US-UTE']
plotlystuff([bal, bal, bal], ['LE_1_1_1', 'NETRAD_1_1_1', 'WD_1_1_1'])

## Compile CSFormat Files

In [None]:
# Process one .dat file per station: the first file in the station folder
# matching "*_Flux_CSFormat.dat".
csdf = {}
cs_reports = {}

for key in site_folders:
    station_dir = raw_fold / key

    # Only the lookup can legitimately come up empty, so test for that
    # directly instead of wrapping the whole processing body in a try block
    # (which would mislabel any StopIteration raised further down).
    file_to_read = next(station_dir.glob('*_Flux_CSFormat.dat'), None)
    if file_to_read is None:
        # --- File Not Found: Handle the missing file ---
        print(f"⚠️ Warning: No matching *_Flux_CSFormat.dat file found for site: {key} in folder {station_dir}")
        continue

    print(f"Found file for {key}: {file_to_read.name}")
    # skiprows drops lines 0, 2 and 3 so that line 1 is used as the header.
    df = pd.read_csv(file_to_read, skiprows=[0, 2, 3],
                     na_values=[-9999, "NAN", "NaN", "nan"])
    am_data = micromet.Reformatter(drop_soil=True,
                                   logger=logger,
                                   )
    # Build an integer YYYYMMDDHHMM TIMESTAMP_END column from TIMESTAMP before
    # handing the frame to the reformatter.
    df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'])
    df["TIMESTAMP_END"] = df.TIMESTAMP.dt.strftime("%Y%m%d%H%M").astype(int)

    csflux_temp, report, timething = am_data.process(df, data_type="eddy")
    csdf[key] = csflux_temp
    cs_reports[key] = report


if not csdf:
    # pd.concat on an empty dict fails with an unhelpful message; say why.
    raise FileNotFoundError(
        f"No *_Flux_CSFormat.dat files were found under {raw_fold} for any site."
    )

outlier_report = pd.concat(cs_reports, axis=1)
cs_dat = pd.concat(csdf)

In [None]:
# CODE BELOW SHOULD ALLOW YOU TO ITERATE THROUGH THE FLUX_CSFormat folder to compile all files in there

# cs_df = {}
# outlier_reports = {}

# am = micromet.AmerifluxDataProcessor(logger=logger)

# for key, value in site_folders.items():
#     sitedf = {}
#     sitereport = {}
#     parent_fold = raw_fold / f"{key}" / "Flux_CSFormat"
#     #ahp.scan(parent_fold, min_sim=0.3, backup=False)
#     #pths = micromet.fix_all_in_parent(parent_fold)
#     for file in parent_fold.glob("*_Flux_CSFormat*.dat"):
#         am_data = micromet.Reformatter(drop_soil=False,
#                                             logger=logger,
#                                             )
#         df = pd.read_csv(file,skiprows=[0,2,3],
#                         na_values=[-9999,"NAN","NaN","nan"])
#         # must create a timestamp_end column to feed into prepare
#         # b/c otherwise no data will be returned
#         df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'])
#         df["TIMESTAMP_END"] = df.TIMESTAMP.dt.strftime("%Y%m%d%H%M").astype(int)
        
#         csprep, report, checktime = am_data.process(df, data_type="eddy")
#         sitedf[file] = csprep
#         sitereport[file] = report
#     cs_df[key] = pd.concat(sitedf)
#     outlier_reports[key] = pd.concat(sitereport)



# outlier_report = pd.concat(outlier_reports, axis=1).droplevel(level=0, axis=0)
# cs_dat = pd.concat(cs_df).droplevel(level=1, axis=0)

In [None]:
# Review variables with a lot of dropped values based on the report.
report_stacked = outlier_report.stack(level=0)

# Move the station level out of the index, label it, and average the report
# per station / column / matched key.
report_final = (
    report_stacked.reset_index(level=1)
    .rename(columns={'level_1': 'STATIONID'})
    .groupby(['STATIONID', 'column', 'matched_key'])
    .mean()
)

report_final.to_csv(raw_fold / "csflux_report_diane.csv")

# Show only the entries where at least 10 percent of values were flagged.
report_final[report_final.pct_flagged >= 10]

In [None]:
report_final[(report_final.pct_flagged>=10)]

In [None]:
# drop and/or rename fields we don't want in the file

csflux = cs_dat.reset_index().rename(columns={'level_0':'STATIONID'})

# Every entry needs its own trailing comma: adjacent string literals are
# silently concatenated by Python, which would hide both names from the drop.
drop_fields = [
    "TS_CS65X_2_1_1",
    "WS_RSLT",
    "_229_DEL_TMPR(1)",
    "_229_DEL_TMPR(2)",
    "_229_TMPR_T0_1",
    "_229_TMPR_T0_2",
    "_229_TMPR_T1_1",
    "_229_TMPR_T1_2",
    "_229_TMPR_T30_1",
    "_229_TMPR_T30_2",
    "_PANEL_TMPR_T0",
    "_PANEL_TMPR_T1",
    "_PANEL_TMPR_T30",
    "WND_DIR_STD",
    "WND_DIR_UNIT_VEC",
    "WND_SPD_AVG",
    "U_HEATMAX",
    "U_SEN0",
    "U_SENAMP",
    "U_SENMAX",
    "SONIC_AZIMUTH",
    "CS65X_EC_2_1_1",
    "SUN_AZIMUTH",
    "SUN_DECLINATION",
    "SUN_ELEVATION",
    "HEIGHT_AGL",
    "HOUR_ANGLE",
    "CS65X_PERM_1_1_1",
    "DAYTIME",
    "E",
    "E1_Q",
    "ANONYMOUS1",
    "ANONYMOUS2",
    "TD_TP01",
    "AIR_MASS_COEFF",
    "ROCP_TP01",
    "Q",
]

# errors="ignore" skips names that are not present, so one call replaces the
# per-field membership check.
csflux = csflux.drop(columns=drop_fields, errors="ignore")

rename_fields = {
    "CS65X_EC_1_1_1":"EC_1_1_1",
    "CS65X_EC_1_1_2":"EC_1_1_2",
    "LI7700_AMB_TMPR":"TA_1_1_5",
    "T_SONIC":"T_SONIC_1_1_1",
    'CO2_SIGMA':'CO2_SIGMA_1_1_1',
    'H2O_SIGMA':'H2O_SIGMA_1_1_1',
    }


csflux = csflux.rename(columns=rename_fields)

In [None]:
# run various tests on data; file needs to be downloaded file from easyflux website
# skip flags- not in dataframe
#raw_file = r'M:\My Drive\projects\eddy_covariance\site_specific_data_review\Escalante_Flux_CSFormat.dat'

results = cleanup.process_and_match_columns(csflux, amflux)
print(results)
print('\n')

validate.validate_timestamp_consistency(csflux)
print('\n')

# differences_from_raw = validate.compare_to_raw(raw_file, csflux, test_var='NETRAD', threshold=0.1)
# print('\n')
# print(f'Differences between micromet (left) and raw (right) files')
# print(differences_from_raw)

In [None]:
# Based on Paul's merge code. Most of these columns did not exist in the data
# reviewed so far (only Escalante was checked). The source columns listed as
# values may be worth dropping once the merge is done.

# target column -> source columns used, in order, to fill its gaps
mergefields = {
    "TA_1_1_4": ["AMB_AIR_TMPR"],
    "E_AMB": ["AMB_E"],
    "E_SAT_AMB": ["AMB_E_SAT"],
    "TS_1_1_1": ["TS_CS65X_1_1_1", "TS_CS65X_1_1_2"],
    "TS110_T_AVG": ["T_CANOPY"]
}

for key, values_list in mergefields.items():

    if key not in csflux.columns:
        print(f"Skipping target '{key}': not found in DataFrame.")
        continue

    # Work with NaN while merging so combine_first only fills real gaps.
    s_target = csflux[key].replace(-9999, np.nan)

    for value in values_list:
        if value not in csflux.columns:
            print(f"Source column '{value}' not found, skipping merge into '{key}'.")
            continue

        print(f"Merging '{value}' into '{key}'...")
        s_target = s_target.combine_first(csflux[value].replace(-9999, np.nan))

    # Write the merged column back, restoring -9999 for anything still missing.
    csflux[key] = s_target.fillna(-9999)

In [None]:
# check for any duplicates and final cleanup
duplicate_columns = csflux.columns[csflux.columns.duplicated()]
print("Duplicate column names:", duplicate_columns)


if 'STATIONID' in csflux.columns:
    # csflux was already flattened when it was built from cs_dat, so its index
    # is just a row counter; drop it rather than saving it as an 'index' column.
    csflux_final = csflux.reset_index(drop=True)
else:
    # Still indexed by station: flatten and label the station level.
    csflux_final = csflux.reset_index().rename(columns={'level_0':'STATIONID'})

dup_keys = ['STATIONID', 'DATETIME_END']
if csflux_final.duplicated(subset=dup_keys).any():
    print('FAIL: STATIONID AND DATETIME_END DUPLICATES PRESENT')
    print('DROPPING DUPLICATES')
    csflux_final = csflux_final.drop_duplicates(subset=dup_keys)

else:
    print("PASS: NO STATIONID AND DATETIME_END DUPLICATES")

csflux_final = csflux_final.set_index(dup_keys)
# Blank out anything below -5000 (e.g. the -9999 missing-value code).
csflux_final = csflux_final.mask(csflux_final < -5000)

In [None]:
csflux_final.to_parquet(raw_fold / "csflux_diane.parquet")

In [None]:
# summarize and view data gaps (view for just one station)

gaps_csflux = gap_summary.summarize_gaps(csflux_final)

bal = csflux_final.loc['US-UTJ']
bal = bal.sort_index()
plotlystuff([bal, bal], ['LE_1_1_1', 'NETRAD_1_1_1'])

# Bring together the datasets - NOT REVIEWED!

In [None]:
gaps_amfluxfmt

In [None]:
site_vs_files = gap_summary.compare_gap_summaries(gaps_easyflux, gaps_amfluxfmt)

In [None]:
site_vs_files

In [None]:
import pandas as pd
from pandas.tseries.frequencies import to_offset

def fill_missing_from_other(
    df_target: pd.DataFrame,
    df_source: pd.DataFrame,
    expected_freq: str = "30min",
    add_missing_timestamps: bool = True,
    min_steps: int = 1,
    columns: list | None = None,
    station_level: str = "STATIONID",
    time_level: str = "DATETIME_END",
    return_plan: bool = False,
):
    """
    Fill missing values in `df_target` using `df_source` guided by gap/coverage analysis.

    It:
      1) runs `gap_summary.summarize_gaps` on target and source
      2) runs `gap_summary.compare_gap_summaries` to find fillable segments where
         SOURCE can fill TARGET
      3) (optionally) reindexes target to include any missing timestamps in those segments
      4) copies values from source -> target ONLY for the targeted column(s), station, and times
         where target is missing (NaN or newly added rows)

    Parameters
    ----------
    df_target : pd.DataFrame
        MultiIndex (station, datetime) with data to be filled (we call this "A" internally).
    df_source : pd.DataFrame
        MultiIndex (station, datetime) with data to copy from (we call this "B").
    expected_freq : str, default "30min"
        Grid frequency (must match both datasets). Must be a fixed-length
        frequency that can be converted to a Timedelta.
    add_missing_timestamps : bool, default True
        If True, adds missing rows in the target during fillable segments before copying.
        If False, only fills NaN cells at timestamps that already exist in target.
    min_steps : int, default 1
        Only consider fillable segments of at least this many samples.
    columns : list[str] | None
        Optional subset of columns to fill. By default uses the intersection of
        df_target.columns and df_source.columns.
    station_level : str, default "STATIONID"
        Name of station level in MultiIndex.
    time_level : str, default "DATETIME_END"
        Name of time level in MultiIndex.
    return_plan : bool, default False
        If True, also returns the computed fill plan (B→A only).

    Returns
    -------
    filled : pd.DataFrame
        A copy of `df_target` with values filled from `df_source`.
    audit : pd.DataFrame
        Row-by-row audit of realized fills with columns:
          ['STATIONID','COLUMN','FILLABLE_START','FILLABLE_END',
           'N_STEPS_PLANNED','N_STEPS_FILLED','HOURS_FILLED']
    plan (optional) : pd.DataFrame
        The B→A portion of the compare plan (only if return_plan=True).

    Raises
    ------
    TypeError
        If either frame is not MultiIndexed.
    KeyError
        If either index lacks the station or time level.
    ValueError
        If `columns` is None and the frames share no columns.

    Notes
    -----
    - Requires `gap_summary.summarize_gaps` and `gap_summary.compare_gap_summaries`.
    - Only copies the specified column indicated by each plan row (no cross-column filling).
    - Never overwrites non-missing target values.
    - The per-segment lookups index as (station, time), so the station level is
      expected to come before the time level in both indexes.
    """
    audit_columns = [
        "STATIONID", "COLUMN", "FILLABLE_START", "FILLABLE_END",
        "N_STEPS_PLANNED", "N_STEPS_FILLED", "HOURS_FILLED",
    ]

    # --- basic checks ---
    if not isinstance(df_target.index, pd.MultiIndex) or not isinstance(df_source.index, pd.MultiIndex):
        raise TypeError("Both df_target and df_source must have a MultiIndex (station, datetime).")
    if station_level not in df_target.index.names or time_level not in df_target.index.names:
        raise KeyError("df_target index must include levels: station and time.")
    if station_level not in df_source.index.names or time_level not in df_source.index.names:
        raise KeyError("df_source index must include levels: station and time.")

    # Decide which columns to work on
    if columns is None:
        columns = list(set(df_target.columns).intersection(df_source.columns))
        if not columns:
            raise ValueError("No overlapping columns between target and source to fill.")

    # Frequency helpers. Build the Timedelta from the offset directly; the
    # offset's `.delta` attribute is deprecated in recent pandas releases.
    freq_td = pd.Timedelta(to_offset(expected_freq))
    hours_per_step = freq_td / pd.Timedelta(hours=1)

    # --- Build plan: B fills A ---
    gaps_a = gap_summary.summarize_gaps(df_target, station_level=station_level, time_level=time_level,
                            expected_freq=expected_freq, columns=columns)
    gaps_b = gap_summary.summarize_gaps(df_source, station_level=station_level, time_level=time_level,
                            expected_freq=expected_freq, columns=columns)
    plan_all = gap_summary.compare_gap_summaries(gaps_a, gaps_b, expected_freq=expected_freq, min_steps=min_steps)
    plan = plan_all[plan_all["TARGET_DATASET"] == "A"].copy()

    # Optional column filter
    if not plan.empty:
        plan = plan[plan["COLUMN"].isin(columns)].copy()

    if plan.empty:
        # Nothing to do
        audit = pd.DataFrame(columns=audit_columns)
        return (df_target.copy(), audit, plan) if return_plan else (df_target.copy(), audit)

    # --- Prepare a working copy of target ---
    target = df_target.copy()

    # If we need to add missing timestamps, compute per-station union of times from plan
    if add_missing_timestamps:
        add_times_by_station = {}
        for _, r in plan.iterrows():
            stn = r["STATIONID"]
            times = pd.date_range(r["FILLABLE_START"], r["FILLABLE_END"], freq=expected_freq)
            add_times_by_station.setdefault(stn, set()).update(times.to_pydatetime().tolist())

        # Reindex per station once with the union of needed times
        rebuilt = []
        stations = target.index.get_level_values(station_level).unique()
        stations_in_plan = set(plan["STATIONID"].unique())
        for stn in stations.union(stations_in_plan):
            # Slice existing station data if present, else empty
            if stn in stations:
                sub = target.xs(stn, level=station_level)
            else:
                # Create empty subframe with all columns if station absent
                sub = pd.DataFrame(columns=target.columns, index=pd.DatetimeIndex([], name=time_level))

            need_times = pd.DatetimeIndex(sorted(add_times_by_station.get(stn, [])))
            if len(need_times) > 0:
                sub = sub.reindex(sub.index.union(need_times))

            # Return to MultiIndex
            sub = sub.copy()
            sub[station_level] = stn
            sub[time_level] = sub.index
            sub = sub.set_index([station_level, time_level]).sort_index()
            rebuilt.append(sub)

        target = pd.concat(rebuilt).sort_index()

    # --- Perform the fill per plan row ---
    audit_rows = []
    idx = pd.IndexSlice
    for _, r in plan.iterrows():
        stn = r["STATIONID"]
        col = r["COLUMN"]
        times = pd.date_range(r["FILLABLE_START"], r["FILLABLE_END"], freq=expected_freq)

        # Intersect with indices present in both frames (after optional reindex, target has them;
        # still be safe if add_missing_timestamps=False)
        try:
            t_vals = target.loc[idx[stn, times], col]
        except KeyError:
            # If none of the times exist in target and we didn't reindex them in, skip
            continue

        # Source values for those times (skip if missing in source for any reason)
        try:
            s_vals = df_source.loc[idx[stn, times], col]
        except KeyError:
            # If source lacks all those times (shouldn't happen per plan), skip
            continue

        # Only fill where target is NA and source is not NA
        to_fill_mask = t_vals.isna() & s_vals.notna()
        n_filled = int(to_fill_mask.sum())

        if n_filled:
            # fill_index already holds full (station, time) keys, so it is used
            # directly as the row indexer; the Series on the right-hand side is
            # aligned to those keys on assignment.
            fill_index = to_fill_mask.index[to_fill_mask]
            target.loc[fill_index, col] = s_vals.loc[fill_index]

        audit_rows.append({
            "STATIONID": stn,
            "COLUMN": col,
            "FILLABLE_START": r["FILLABLE_START"],
            "FILLABLE_END": r["FILLABLE_END"],
            "N_STEPS_PLANNED": int(r["N_STEPS_FILLABLE"]),
            "N_STEPS_FILLED": n_filled,
            "HOURS_FILLED": n_filled * hours_per_step,
        })

    audit = (
        pd.DataFrame(audit_rows, columns=audit_columns)
        .sort_values(["STATIONID", "COLUMN", "FILLABLE_START"])
        .reset_index(drop=True)
    )

    # Done
    target = target.sort_index()
    if return_plan:
        return target, audit, plan
    return target, audit


In [None]:
# Assuming summarize_gaps() and compare_gap_summaries() are defined (from earlier),
# and df_a (target) and df_b (source) are your MultiIndex DataFrames.

df_a = pd.read_parquet(raw_fold / "easyflux.parquet")
df_b = pd.read_parquet(raw_fold / "comp_edd.parquet")
filled_a, audit = fill_missing_from_other(
    df_target=df_a,
    df_source=df_b,
    expected_freq="30min",
    add_missing_timestamps=True,     # add structurally-missing rows before filling
    min_steps=1,                     # ignore super-short segments if you want, e.g., min_steps=2
    columns=["LE_1_1_1","H_1_1_1","NETRAD_1_1_1",
             "LW_IN_1_1_1","SW_IN_1_1_1","SW_OUT_1_1_1","LW_OUT_1_1_1"],         # or None to auto-use shared columns
    station_level="STATIONID",
    time_level="DATETIME_END",
)

print(audit.head())
# filled_a now contains values copied from df_b wherever plan said B could fill A.


## Eddy Data

In [None]:
df_edd = pd.read_parquet(raw_fold /  "comp_edd.parquet",).replace(-9999,np.nan)
df_edd.index.names = ['STATIONID','DATETIME_END']
df_edd['PRIORITY'] = 1

df = pd.read_parquet(raw_fold /  "easyflux.parquet",).replace(-9999,np.nan)
df.index.names = ['STATIONID','DATETIME_END']
df['PRIORITY'] = 2

In [None]:
# Load the CS-flux compilation the same way as the other eddy inputs: -9999 is
# converted to NaN on load so it cannot be mistaken for a real (very small)
# value when the sources are later sorted and de-duplicated.
df_merged = pd.read_parquet(raw_fold / "comp_cs_flux.parquet").replace(-9999, np.nan)
df_merged.index.names = ['STATIONID','DATETIME_END']
df_merged['PRIORITY'] = 3

In [None]:
# Load the old database eddy table and bring it onto the same index and
# column naming as the other eddy inputs.
dfdb = pd.read_parquet(raw_fold /  "old_database_eddy.parquet",).replace(-9999,np.nan)
# Upper-case the column names first so the STATIONID / DATETIME_END lookups
# below match.
dfdb.columns = dfdb.columns.str.upper()
dfdb['DATETIME_END'] = pd.to_datetime(dfdb['DATETIME_END'])
#dfdb["TIMESTAMP_START"] = dfdb['DATETIME_END'].apply(lambda x: f"{x:%Y%m%d%H%M}")
dfdb = dfdb.set_index(['STATIONID','DATETIME_END'])
#df.index.names = ['station','datetime']

# Map the unqualified variable names onto the position-qualified names.
rename_dict = {'CO2':'CO2_1_1_1',
               'CO2_SIGMA':'CO2_SIGMA_1_1_1',
               'H2O':'H2O_1_1_1',
               'H2O_SIGMA':'H2O_SIGMA_1_1_1',
               'FC':'FC_1_1_1',
               'FC_SSITC_TEST':'FC_SSITC_TEST_1_1_1',
               'LE':'LE_1_1_1',
               'LE_SSITC_TEST':'LE_SSITC_TEST_1_1_1',
               'ET':'ET_1_1_1',
               'ET_SSITC_TEST':'ET_SSITC_TEST_1_1_1',
               'H':'H_1_1_1',
               'H_SSITC_TEST':'H_SSITC_TEST_1_1_1',
               'G':'G_1_1_A',
               'G_SSITC_TEST':'G_SSITC_TEST_1_1_1',
               'SG':'SG_1_1_1',
               'WD':'WD_1_1_1',
               'WS':'WS_1_1_1',
               'WS_MAX':'WS_MAX_1_1_1',
               'PA':'PA_1_1_1',
               'VPD':'VPD_1_1_1',
               'ALB':'ALB_1_1_1',
               'NETRAD':'NETRAD_1_1_1',
               'SW_IN':'SW_IN_1_1_1',
               'SW_OUT':'SW_OUT_1_1_1',
               'LW_IN':'LW_IN_1_1_1',
               'LW_OUT':'LW_OUT_1_1_1',
               'P':'P_1_1_1',
               }

dfdb = dfdb.rename(columns=rename_dict)
# Series.where returns a new Series, so the result has to be assigned back for
# the range filter (keep ET within [0, 1.1], NaN otherwise) to take effect.
dfdb['ET_1_1_1'] = dfdb['ET_1_1_1'].where(dfdb['ET_1_1_1'].between(0, 1.1), np.nan)
dfdb['PRIORITY'] = 4


In [None]:
import numpy as np
import pandas as pd

def coalesce_by_priority_multiindex(
    df_or_dfs,
    priority_col="priority",
    ascending=True,
    invalid_values=(-9999,),
    keep_index=True,
):
    """
    Column-wise coalesce: for each MultiIndex group (all index levels except `priority_col`),
    take the first non-null value per column after sorting by priority.

    Parameters
    ----------
    df_or_dfs : DataFrame or list/tuple of DataFrames
        Concatenated DataFrame (or list to be concatenated) with a MultiIndex.
    priority_col : str
        Column name (or index level name) indicating priority. Lower/greater is better
        depending on `ascending`.
    ascending : bool
        Sort so that smaller (True) or larger (False) priority wins.
    invalid_values : tuple
        Treat these values as missing. NaN and None entries are accepted but
        skipped, since nulls are already treated as missing.
    keep_index : bool
        Keep the MultiIndex in the result. If False, returns a reset_index frame.

    Returns
    -------
    DataFrame
        One row per index key holding, for every value column, the first
        non-null value in priority order. The priority column is not included.

    Raises
    ------
    ValueError
        If the index is not a MultiIndex once the priority level is removed,
        or if `priority_col` is neither a column nor an index level.
    """
    # 0) Accept list of dfs or a single df
    if isinstance(df_or_dfs, (list, tuple)):
        df = pd.concat(df_or_dfs, axis=0)
    else:
        df = df_or_dfs.copy()

    # 1) If priority is an index level, move it to a column (so we don't group by it)
    if isinstance(df.index, pd.MultiIndex) and priority_col in df.index.names:
        df = df.reset_index(level=priority_col)

    # 2) Define group levels = all current index levels (MultiIndex) → the "keys"
    if not isinstance(df.index, pd.MultiIndex):
        raise ValueError("Expected a MultiIndex index. Set your keys as the DataFrame index first.")
    group_levels = list(df.index.names)

    # 3) Value columns = all columns except the priority column
    if priority_col not in df.columns:
        raise ValueError(f"'{priority_col}' must be a column or an index level.")
    value_cols = [c for c in df.columns if c != priority_col]

    # 4) Treat sentinels as NaN
    for v in invalid_values or ():
        # Comparing against NaN/None never matches anything, so skip them.
        if v is None or (isinstance(v, float) and np.isnan(v)):
            continue
        df[value_cols] = df[value_cols].mask(df[value_cols].eq(v))

    # 5) Sort by priority (best first). A stable sort keeps rows with equal
    #    priority in their incoming order, so the result is reproducible.
    df = df.sort_values(priority_col, ascending=ascending, kind="stable")

    # 6) Per group & per column, take the first non-null. GroupBy.first skips
    #    nulls column by column, which is what the coalesce needs.
    out = df.groupby(level=group_levels, sort=False)[value_cols].first()

    return out if keep_index else out.reset_index()



result = coalesce_by_priority_multiindex([df,df_edd,df_merged],  
                              priority_col="PRIORITY", 
                              ascending=True, 
                              invalid_values=(-9999,np.nan,"NAN",None))
result.to_parquet(raw_fold / "combined_eddy_dataset_20250905.parquet")

In [None]:
import numpy as np
import pandas as pd

def coalesce_by_priority_multiindex_fast(
    df_or_dfs,
    priority_col="PRIORITY",
    ascending=True,
    invalid_values=(-9999,),
    keep_index=True,
):
    """
    Coalesce rows that share an index key, column by column, in priority order.

    For every index key (all index levels other than `priority_col`) and every
    value column, the result holds the first non-null value found when rows
    are visited from best to worst priority. Each column is only ever filled
    from the same column of a lower-priority row.

    Parameters
    ----------
    df_or_dfs : DataFrame or list/tuple of DataFrames
        Frame (or frames to concatenate row-wise) indexed by the key levels.
    priority_col : str, default "PRIORITY"
        Column, or MultiIndex level, holding the priority of each row.
    ascending : bool, default True
        If True the smallest priority wins; if False the largest wins.
    invalid_values : tuple, default (-9999,)
        Values treated as missing. NaN and None entries are accepted but
        skipped, since nulls are already treated as missing.
    keep_index : bool, default True
        If False, return the result with the key levels as ordinary columns.

    Returns
    -------
    DataFrame
        One row per index key, sorted by key, without the priority column.

    Raises
    ------
    ValueError
        If `priority_col` is neither a column nor a MultiIndex level.
    """
    # 1) Combine frames
    if isinstance(df_or_dfs, (list, tuple)):
        df = pd.concat(df_or_dfs, axis=0)
    else:
        df = df_or_dfs.copy()

    # 2) Make sure the priority is available as a column to sort on
    if priority_col in df.columns:
        pass
    elif isinstance(df.index, pd.MultiIndex) and priority_col in df.index.names:
        df = df.reset_index(level=priority_col)
    else:
        raise ValueError(f"'{priority_col}' must be a column or an index level.")

    value_cols = [c for c in df.columns if c != priority_col]

    # 3) Normalize invalids → NaN
    for v in invalid_values or ():
        # Comparing against NaN/None never matches anything, so skip them.
        if v is None or (isinstance(v, float) and np.isnan(v)):
            continue
        df[value_cols] = df[value_cols].mask(df[value_cols].eq(v))

    # 4) Best priority first. The stable sort keeps rows with equal priority in
    #    their incoming order, so duplicates within one source resolve the same
    #    way on every run.
    df = df.sort_values(priority_col, ascending=ascending, kind="stable")

    # 5) First non-null per key and per column. Working per column keeps a gap
    #    in one variable from being filled with a value from another variable,
    #    and it collapses duplicate rows within a priority at the same time.
    key_levels = list(range(df.index.nlevels))
    out = df.groupby(level=key_levels, sort=True)[value_cols].first()

    return out if keep_index else out.reset_index()

result2 = coalesce_by_priority_multiindex_fast([df,df_edd,df_merged, dfdb],  
                              priority_col="PRIORITY", 
                              ascending=True, 
                              invalid_values=(-9999,np.nan,"NAN",None))
result2.to_parquet(raw_fold / "combined_eddy_dataset_20250905_v2.parquet")

In [None]:
result2.loc['US-UTV','NETRAD_1_1_1'].sort_index().plot()
plt.ylim(0,800)

In [None]:
def filter_static_outliers(
    df: pd.DataFrame,
    thresh: float = 4.0,
) -> pd.DataFrame:
    """
    Blank out float values lying far from their station's overall mean.

    Statistics are computed once per group of the first index level (no
    moving window). Any value in a floating-point column that is more than
    `thresh` standard deviations from its group mean is replaced by NaN.
    Columns of other dtypes are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first index level identifies the station.
    thresh : float, default 4.0
        Number of standard deviations from the mean that defines an outlier.

    Returns
    -------
    pandas.DataFrame
        Copy of `df` with outliers in float columns replaced by NaN; the
        input frame is not modified.
    """
    result = df.copy()

    # Only floating-point columns take part in the filter.
    floats = result.select_dtypes(include=[np.floating])
    if floats.columns.empty:
        return result

    # transform() broadcasts each station's statistic back onto its rows.
    by_station = floats.groupby(level=0)
    deviation = (floats - by_station.transform("mean")).abs()
    limit = thresh * by_station.transform("std")  # pandas default sample std

    result.loc[:, floats.columns] = floats.mask(deviation > limit)
    return result

In [None]:
combo = pd.concat([df,df_edd,df_merged],axis=0)
# Remove duplicate station datetime values, keeping the non-na values.
# The input frames carry upper-case STATIONID / PRIORITY names, so those are
# the names used here. Within each station/time key, rows that have LE (then
# NETRAD) come first and remaining ties go to the lowest PRIORITY number;
# ranking on "is missing" rather than on the flux value itself avoids always
# keeping the row with the smallest flux.
combo = combo.reset_index()
combo = (
    combo.assign(
        _le_missing=combo['LE_1_1_1'].isna(),
        _netrad_missing=combo['NETRAD_1_1_1'].isna(),
    )
    .sort_values(['STATIONID', 'DATETIME_END', '_le_missing', '_netrad_missing', 'PRIORITY'])
    .drop_duplicates(subset=['STATIONID', 'DATETIME_END'], keep='first')
    .drop(columns=['_le_missing', '_netrad_missing'])
    # lower-case station level, matching the met dataset's index naming
    .rename(columns={'STATIONID': 'stationid'})
    .set_index(['stationid', 'DATETIME_END'])
)

In [None]:
# can't run this- drops most precip values
# clean_df = filter_static_outliers(combo, thresh=4)  # custom

In [None]:
combo.to_parquet(raw_fold / "combined_eddy_dataset.parquet")

## Met Compile

In [None]:
df_met = pd.read_parquet(raw_fold /  "comp_met.parquet",).replace(-9999,np.nan)
df_met.index.names = ['stationid','DATETIME_END']
df_met['priority'] = 1

In [None]:
stmet = pd.read_parquet(raw_fold / "comp_met_stat.parquet")
stmet['DATETIME_END'] = pd.to_datetime(stmet['TIMESTAMP_START'],format="%Y%m%d%H%M")
stmet = stmet.reset_index()
stmet = stmet.rename(columns = {'level_0':'stationid'})
stmet = stmet.set_index(['stationid','DATETIME_END'])

In [None]:
dfdbm = pd.read_parquet(raw_fold /  "old_database_met.parquet",).replace(-9999,np.nan)
dfdbm['DATETIME_END'] = pd.to_datetime(dfdbm['DATETIME_END'])
dfdbm = dfdbm.set_index(['stationid','DATETIME_END'])
#df.index.names = ['station','datetime']

dfdbm.columns = dfdbm.columns.str.upper()
rename_dict_m = {'CO2':'CO2_1_1_2', 
               'CO2_SIGMA':'CO2_SIGMA_1_1_2', 
               'H2O':'H2O_1_1_2', 
               'H2O_SIGMA':'H2O_SIGMA_1_1_2',
               'FC':'FC_1_1_2', 
               'FC_SSITC_TEST':'FC_SSITC_TEST_1_1_2', 
               'LE':'LE_1_1_2',
               'LE_SSITC_TEST':'LE_SSITC_TEST_1_1_2', 
               'ET':'ET_1_1_2',
               'ET_SSITC_TEST':'ET_SSITC_TEST_1_1_2', 
               'H':'H_1_1_2',
               'H_SSITC_TEST':'H_SSITC_TEST_1_1_2', 
               'G':'G_1_1_A',
               'G_SSITC_TEST':'G_SSITC_TEST_1_1_2',
               'SG':'SG_1_1_2', 
               'WD':'WD_1_1_2', 
               'WS':'WS_1_1_2', 
               'WS_MAX':'WS_MAX_1_1_2',
               'PA':'PA_1_1_2', 
               'VPD':'VPD_1_1_2', 
               'ALB':'ALB_1_1_2', 
               'NETRAD':'NETRAD_1_1_2', 
               'SW_IN':'SW_IN_1_1_2',
               'SW_OUT':'SW_OUT_1_1_2', 
               'LW_IN':'LW_IN_1_1_2', 
               'LW_OUT':'LW_OUT_1_1_2', 
               'P':'P_1_1_2', 
               }

dfdbm = dfdbm.rename(columns=rename_dict_m)
#dfdb['ET_1_1_1'].where(dfdb['ET_1_1_1'].between(0,1.1),np.nan)
dfdbm['priority'] = 3


In [None]:
combo_met = pd.concat([df_met,dfdbm,stmet],axis=0)
combo_met

In [None]:
# Remove duplicate station datetime values, keeping the non-na values
# sort_values orders rows by ascending NETRAD_1_1_2 (NaN last, the pandas
# default) with ties broken by ascending 'priority'; sort_index then regroups
# the rows by their index key so keep='first' retains one row per
# (stationid, DATETIME_END).
# NOTE(review): this relies on sort_index preserving the sort_values order
# among rows that share an index key — confirm for the pandas version in use.
combo_met = combo_met.sort_values(['NETRAD_1_1_2','priority']).sort_index()
combo_met = combo_met.reset_index().drop_duplicates(subset=['stationid','DATETIME_END'],keep='first')
combo_met = combo_met.set_index(['stationid','DATETIME_END'])

In [None]:
# may want to revisit whether to run this- caused issues with precip data for the eddy stations
# clean_df_met = filter_static_outliers(combo_met, thresh=4)  # custom

In [None]:
# The outlier filter that would create clean_df_met is commented out, so write
# the de-duplicated combo_met frame (same approach as the eddy dataset).
combo_met.to_parquet(raw_fold / "combined_met_dataset.parquet")

In [None]:
met  = pd.read_parquet(raw_fold / "combined_met_dataset.parquet")
eddy = pd.read_parquet(raw_fold / "combined_eddy_dataset.parquet")
#met.to_csv(raw_fold / "combined_met_dataset.csv")
#eddy.to_csv(raw_fold / "combined_eddy_dataset.csv")

In [None]:
combined = pd.merge(met, eddy, how='outer', left_index=True, right_index=True,
         suffixes=('_met', '_eddy'))

combined

In [None]:
combined.loc['US-UTD']

In [None]:
combined.loc['US-UTD', ['WS','WS_1_1_1']].dropna().plot(kind='scatter',x='WS',y='WS_1_1_1',)

In [None]:
# Column-name prefixes to compare between the met and eddy datasets.
compare_cols = [
    "WS",
    "TA_",
    "RH_",
    "LE_",
    "H_",
    "VPD",
    "PA",
    "WD",
    "NETRAD",
    "SW_IN_",
    "SW_OUT_",
    "LW_IN_",
    "LW_OUT_",
    "ALB",
]

# prefix -> every matching column name (met columns first, then eddy columns)
matches = {}
for i in compare_cols:
    # Columns starting with the prefix, skipping MAX and SSITC columns.
    met_col = [
        col for col in met.columns
        if 'MAX' not in col and 'SSITC' not in col and col.startswith(i)
    ]
    eddy_col = [
        col for col in eddy.columns
        if 'MAX' not in col and 'SSITC' not in col and col.startswith(i)
    ]
    values = met_col + eddy_col

    matches[i] = values
    if len(values) <= 1:
        continue

    # More than one candidate: overlay them all on one axis for US-UTD.
    fig, ax = plt.subplots(figsize=(12, 6))
    plt.title(f"Comparison of {i} for US-UTD")
    for j in met_col:
        met.loc['US-UTD',j].replace(-9999,np.nan).plot(label=j,ax=ax)
    for k in eddy_col:
        eddy.loc['US-UTD',k].replace(-9999,np.nan).plot(label=k,ax=ax)
    plt.legend()





In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.ensemble import IsolationForest
from collections import defaultdict

# --------------------------------------------------
# 1. LOAD  (needs pyarrow or fastparquet installed)
# --------------------------------------------------
met = pd.read_parquet(raw_fold.joinpath("combined_met_dataset.parquet"))
eddy = pd.read_parquet(raw_fold.joinpath("combined_eddy_dataset.parquet"))

# If not already multi-indexed by (station, timestamp):
# met  = met.set_index(["station_id", "timestamp"]).sort_index()
# eddy = eddy.set_index(["station_id", "timestamp"]).sort_index()

# Restrict both frames to the index entries they have in common
common_idx = met.index.intersection(eddy.index)
met = met.loc[common_idx]
eddy = eddy.loc[common_idx]

# --------------------------------------------------
# 2. DEFINE THE PREFIXES YOU WANT TO COMPARE
#    (fill this list in with your own)
# --------------------------------------------------
prefixes = [
    "WS", "TA", "RH", "LE", "H", "VPD", "PA", "WD", "NETRAD",
    "SW_IN", "SW_OUT", "LW_IN", "LW_OUT", "ALB",
]

# --------------------------------------------------
# 3. BUILD A MATCH TABLE  {prefix -> [(met_col, eddy_col), …]}
# --------------------------------------------------
matches = defaultdict(list)

for p in prefixes:
    # every column whose name starts with the prefix
    met_cols = [c for c in met.columns if c.startswith(p)]
    eddy_cols = [c for c in eddy.columns if c.startswith(p)]

    # first choice: identical column names in both datasets
    common = set(met_cols).intersection(eddy_cols)
    if common:
        for col in common:
            matches[p].append((col, col))
        continue

    # otherwise pair columns by whatever follows the prefix
    met_suffix = {c[len(p):]: c for c in met_cols}
    eddy_suffix = {c[len(p):]: c for c in eddy_cols}
    for suf in met_suffix.keys() & eddy_suffix.keys():
        matches[p].append((met_suffix[suf], eddy_suffix[suf]))

# sanity check: stop if nothing at all could be paired
if not any(matches.values()):
    raise ValueError("No columns matched with the given prefixes!")

print(f"Found {len(matches)} prefixes with matches:")
for p, pairs in matches.items():
    print(f"  {p}: {len(pairs)} pairs")
    for mcol, ecol in pairs:
        print(f"    {mcol} ↔ {ecol}")

# --------------------------------------------------
# 4. COLLECT ALL DIFFERENCES INTO ONE DATAFRAME
#    (column names => "<prefix><suffix>_diff")
# --------------------------------------------------
# One (name, series) entry per matched pair; the name is the met column plus
# "_diff" and the series is met minus eddy.
diff_frames = [
    (f"{mcol}_diff", met[mcol] - eddy[ecol])
    for pairs in matches.values()
    for mcol, ecol in pairs
]

# assemble the differences column-wise into one DataFrame
diff = pd.concat(dict(diff_frames), axis=1)

abs_diff = diff.abs()

# --------------------------------------------------
# 5. OUTLIER METHODS
# --------------------------------------------------
# 5A. Z-score (3σ)
def _zscore_per_group(g):
    """Standardize a group's values with its own mean and population std."""
    return (g - g.mean()) / g.std(ddof=0)


# z-scores are computed separately for each value of index level 0
z_scores = abs_diff.groupby(level=0).transform(_zscore_per_group)
flags_z = z_scores.gt(3)

# 5B. MAD (3.5× MAD)
def mad_flags(s, k=3.5):
    """Flag values whose scaled deviation from the median exceeds ``k``.

    Parameters
    ----------
    s : pandas.Series
        Values to test; NaN entries are ignored when computing the median
        and the MAD and are never flagged.
    k : float, default 3.5
        Threshold on ``|s - median| / (1.4826 * MAD)``.

    Returns
    -------
    pandas.Series
        Boolean series aligned with ``s``; True marks an outlier.
    """
    deviation = (s - s.median()).abs()
    # Series.median() skips NaN. np.median() would return NaN as soon as one
    # value is missing, which silently disabled every flag for that series.
    mad = deviation.median()
    # the 1e-9 term avoids division by zero when the MAD is 0
    return deviation / (1.4826 * mad + 1e-9) > k

# Apply the MAD test to every column, separately for each index level-0 group.
flags_mad = abs_diff.groupby(level=0).transform(mad_flags)

# 5C. Isolation Forest (multivariate, per station)
# Start with "nothing flagged" for every cell; groups that are skipped below
# keep these False values.
flags_if = pd.DataFrame(False, index=abs_diff.index, columns=abs_diff.columns)

for stn, g in abs_diff.groupby(level=0):
    X   = g.values
    # rows that have at least one non-NaN difference
    # NOTE(review): such rows can still contain NaN in other columns — confirm
    # the installed scikit-learn IsolationForest accepts missing values.
    ok  = np.any(~np.isnan(X), axis=1)
    if ok.sum() < 20:                # too few usable rows to fit a model
        continue

    # fixed random_state keeps the fit reproducible between runs
    clf = IsolationForest(
        n_estimators=300,
        contamination=0.01,
        random_state=42,
    ).fit(X[ok])

    row_out = clf.predict(X[ok]) == -1   # True where predict() returned -1
    # the model scores whole rows, so copy each row's flag to all of its columns
    flags_if.loc[g.index[ok], :] = np.repeat(
        row_out[:, None], g.shape[1], axis=1
    )

# --------------------------------------------------
# 6. QUICK SUMMARY  (how many flags per variable)
# --------------------------------------------------
# Number of flagged cells per difference column, one column per method.
flag_counts = {
    "Zscore": flags_z.sum(),
    "MAD": flags_mad.sum(),
    "IsoF": flags_if.sum(),
}
summary = pd.DataFrame(flag_counts).sort_index()
print(summary.head())

# --------------------------------------------------
# 7. OPTIONAL:  EXPORT OR APPLY MASK
# --------------------------------------------------
# Example: mask out any value flagged by *any* method
combined_flags = flags_z | flags_mad | flags_if

# The flag columns are named "<met column>_diff" while `met` uses the plain
# column names. DataFrame.where aligns the condition on labels and treats
# missing entries as False, so passing the flags unchanged would replace
# every cell of `met` with NaN. Map the flags back onto the met column
# names first; columns that were never compared are left untouched.
_suffix = "_diff"
met_flags = combined_flags.rename(
    columns=lambda c: c[:-len(_suffix)] if c.endswith(_suffix) else c
).reindex(columns=met.columns, fill_value=False)
clean_met  = met.where(~met_flags)  # replaces flagged cells with NaN


In [None]:
# Inspect met_cols as left by the last iteration of the prefix loop above.
met_cols

In [None]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.ensemble import IsolationForest

# ---------- 1. LOAD ----------
met = pd.read_parquet(raw_fold.joinpath("combined_met_dataset.parquet"))   # needs pyarrow or fastparquet
eddy = pd.read_parquet(raw_fold.joinpath("combined_eddy_dataset.parquet"))

# If your indices aren't yet a MultiIndex (station, time) do this once:
# met  = met.set_index(["station_id","timestamp"]).sort_index()
# eddy = eddy.set_index(["station_id","timestamp"]).sort_index()

# Restrict both frames to the index entries they share
common_idx = met.index.intersection(eddy.index)
met, eddy = met.loc[common_idx], eddy.loc[common_idx]

# ---------- 2. IDENTIFY MATCHING VARIABLES ----------
common_cols = met.columns.intersection(eddy.columns)
if len(common_cols) == 0:
    raise ValueError("No shared measurement names between the two datasets!")

# Keep only the shared columns that are floating-point in `met`
# (integer columns are often flags / counters)
keep_float = []
for c in common_cols:
    if np.issubdtype(met[c].dtype, np.floating):
        keep_float.append(c)
met, eddy = met[keep_float], eddy[keep_float]

# ---------- 3. STACK THE TWO SOURCES FOR EZ COMPARISON ----------
# Signed difference: positive where met is higher, negative where eddy is.
diff = met.sub(eddy)
abs_diff = abs(diff)

# ---------- 4A. Z-SCORE BASED OUTLIERS ----------
def _standardize_group(g):
    """Standardize a group's values with its own mean and population std."""
    return (g - g.mean()) / g.std(ddof=0)


# mean and σ are computed separately for each value of index level 0
z_scores = abs_diff.groupby(level=0).transform(_standardize_group)
outliers_z = z_scores.gt(3)      # boolean DF same shape as diff

# ---------- 4B. MAD BASED OUTLIERS ----------
def mad_based_flags(series, k=3.5):
    """Flag values whose scaled deviation from the median exceeds ``k``.

    Parameters
    ----------
    series : pandas.Series
        Values to test; NaN entries are ignored when computing the median
        and the MAD and are never flagged.
    k : float, default 3.5
        Threshold on ``|series - median| / (1.4826 * MAD)``.

    Returns
    -------
    pandas.Series
        Boolean series aligned with ``series``; True marks an outlier.
    """
    deviation = (series - series.median()).abs()
    # Series.median() skips NaN. np.median() would return NaN as soon as one
    # value is missing, which silently disabled every flag for that series.
    mad = deviation.median()
    # 1.4826 converts MAD to σ for a normal dist.; 1e-9 guards against MAD == 0
    return deviation / (1.4826 * mad + 1e-9) > k

# Apply the MAD test to every column, separately for each index level-0 group.
outliers_mad = abs_diff.groupby(level=0).transform(mad_based_flags)

# ---------- 4C. ISOLATION FOREST (multivariate) ----------
# Fit one Isolation Forest per station (index level 0) on the absolute
# differences and record, for every cell, whether its row was scored as an
# outlier.
iso_out = {}
for stn, g in abs_diff.groupby(level=0):

    X = g.values
    mask = np.any(~np.isnan(X), axis=1)          # rows with ≥1 real number
    # default: nothing flagged (also the result when there is too little data)
    flags = pd.DataFrame(False, index=g.index, columns=g.columns)

    if mask.sum() >= 20:                         # enough samples to train
        clf = IsolationForest(
            contamination=0.01,
            n_estimators=300,
            random_state=42,                     # reproducible fit
        ).fit(X[mask])

        row_flags = clf.predict(X[mask]) == -1   # 1-D Boolean (outlier rows)

        # the model scores whole rows, so copy each row's flag to all columns
        flags.iloc[mask, :] = np.repeat(
            row_flags[:, None], g.shape[1], axis=1
        )

    iso_out[stn] = flags

# Every per-station frame still carries the full original index, so
# concatenate the frames only. Passing the dict itself would prepend the
# station key as an extra index level and the result would no longer line
# up with outliers_z / outliers_mad.
outliers_iso = pd.concat(list(iso_out.values()))

# ---------- 5. SUMMARIZE ----------
# Number of flagged cells per variable, one column per detection method.
outlier_counts = {
    "z_score": outliers_z.sum(),
    "MAD": outliers_mad.sum(),
    "iForest": outliers_iso.sum(),
}
summary = pd.DataFrame(outlier_counts).rename_axis("variable")
print(summary.head())


In [None]:
# Display the full per-variable outlier counts.
summary

Compile files from each station into a single dataframe.

In [None]:
# Stack the per-station frames; the keys of comp_edd_df become the outer
# index level.
cdf = pd.concat(comp_edd_df, axis=0)
cdf.index.names = ['stationid', 'DATETIME_END']
# lower-case every column name
cdf.columns = cdf.columns.str.lower()

Save to Parquet

In [None]:
# Write the combined eddy dataframe to the shared station_data folder.
cdf.to_parquet('../../station_data/all_eddy_data.parquet')

In [None]:

# Compile and reformat the "Statistics_AmeriFlux" met files for every site,
# collecting one processed dataframe per station key.
comp_met_df = {}
root_dir = "C:/Users/paulinkenbrandt/Documents/GitHub/MicroMet/src/micromet/data/"
config_path = f"{root_dir}reformatter_vars.yml"
var_limits_csv = f"{root_dir}extreme_values.csv"
am = micromet.AmerifluxDataProcessor(config_path, logger)

for key, value in site_folders.items():
    print(key)
    raw_fold = pathlib.Path('G:/Shared drives/UGS_Flux/Data_Downloads/')
    raw_data = am.raw_file_compile(
        raw_fold, value, search_str="*Statistics_AmeriFlux*.dat"
    )
    if raw_data is None:
        continue  # nothing compiled for this site

    am_data = micromet.Reformatter(
        config_path=config_path,
        var_limits_csv=var_limits_csv,
        drop_soil=False,
        logger=logger,
    )
    am_df = am_data.process(raw_data, data_type="met")
    comp_met_df[key] = am_df

    #am_df.to_csv(f"../../station_data/{key}_HH_{am_df['TIMESTAMP_START'].values[0]:}_{am_df['TIMESTAMP_END'].values[-1]:}.csv")

        



In [None]:
# Lower-case all met column names.
ddf.columns = ddf.columns.str.lower()

In [None]:
# Consolidate renamed soil columns. For each old -> new name in `mapping`,
# fold the old column (or its "_eddy" copy from the merge) into the new
# column name. When two candidate columns exist, the row-wise maximum is
# kept (presumably so a real reading wins over the -9999 placeholder —
# confirm).
for old_col, new_col in mapping.items():
    old = str(old_col).lower()
    new = str(new_col).lower()
    old_eddy = f"{old}_eddy"
    new_eddy = f"{new}_eddy"
    # columns change inside the loop, so look them up again each iteration
    cols = soildfs.columns.str.lower()

    if old in cols:
        if new in cols:
            soildfs[new] = soildfs[[old, new]].max(axis=1)
            soildfs = soildfs.drop(old, axis=1)
        else:
            soildfs = soildfs.rename(columns={old: new})
    elif old_eddy in cols:
        print(f"Found {old_col} eddy column")
        if new_eddy in cols:
            soildfs[new] = soildfs[[old_eddy, new_eddy]].max(axis=1)
            soildfs = soildfs.drop(old_eddy, axis=1)
        else:
            soildfs = soildfs.rename(columns={old_eddy: new})
    elif new_eddy in cols:
        if new in cols:
            # Combine the met column with its eddy copy. The previous code
            # took the max of the eddy column with itself, so the met values
            # were overwritten by the eddy values.
            soildfs[new] = soildfs[[new, new_eddy]].max(axis=1)
            soildfs = soildfs.drop(new_eddy, axis=1)
            print(f"Found {new_col} eddy column")
        else:
            print(f"Found {new_col} eddy column")
            soildfs = soildfs.rename(columns={new_eddy: new})
        


In [None]:
# Stack the per-station met frames; the keys of comp_met_df become the outer
# index level.
ddf = pd.concat(comp_met_df, axis=0)
ddf.index.names = ['stationid', 'DATETIME_END']
# lower-case every column name
ddf.columns = ddf.columns.str.lower()

In [None]:
# Rows where vwc_2_7_1 has a value
ddf[ddf['vwc_2_7_1'].notna()]

In [None]:
# Copy the first row (all columns) to the clipboard.
ddf.iloc[0:1,:].to_clipboard()

In [None]:
import re

# Split the lower-cased MATH_SOILS_V2 names: those containing one of the
# position codes below go to matching_cols, the rest stay in soilcols.
pattern = re.compile(r"2_1_1|1_2_1|1_1_2")
soilcols = []
matching_cols = []
for raw_name in am_data.MATH_SOILS_V2:
    lowered = raw_name.lower()
    if pattern.search(lowered):
        matching_cols.append(lowered)
    else:
        soilcols.append(lowered)

# Attach the eddy soil columns to the met data; eddy columns that clash
# with a met column name get the '_eddy' suffix.
soildfs = ddf.merge(
    cdf[soilcols],
    how='left',
    on=['stationid', 'DATETIME_END'],
    suffixes=(None, '_eddy'),
)
soildfs

# drop the soil columns from the main dataframe
cdf.drop(columns=[col for col in cdf.columns if col in soilcols], inplace=True)

cdf.to_parquet('../../station_data/all_eddy_data.parquet')

soildfs.to_parquet('../../station_data/all_soil_data.parquet')

ddf.to_parquet('../../station_data/all_met_data.parquet')

In [None]:
# Reload the saved eddy dataset from disk.
cdf = pd.read_parquet('../../station_data/all_eddy_data.parquet')


In [None]:
# List the eddy dataset's column names.
cdf.columns

In [None]:
# Plot three US-UTD soil-temperature columns from 2024-07-01 onward.
soildfs = pd.read_parquet('../../station_data/all_soil_data.parquet')
# -9999 is replaced with NaN so it is not drawn
utd_soilt = soildfs.loc['US-UTD'][['ts_3_1_1','ts_3_2_1','ts_3_3_1']].replace(-9999,np.nan)
utd_soilt = utd_soilt[utd_soilt.index >= '2024-07-01']#.resample('30T').mean()
utd_soilt['ts_3_1_1'].plot()
# The second and third series are shifted earlier by 1 and 5 rows before
# plotting (presumably to line the curves up visually — confirm).
utd_soilt['ts_3_2_1'].shift(-1).plot()
utd_soilt['ts_3_3_1'].shift(-5).plot()
# red vertical reference line at 2024-07-04 15:00
plt.axvline('2024-07-04 15:00',color='r')
#plt.xlim('2024-07-01','2024-07-08')
#plt.ylim(10,35)
plt.grid(True, which='minor')

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from scipy.signal import correlate

# Function to decompose the seasonal component
def extract_seasonal(ts, period):
    """Return the seasonal component of an additive decomposition of ``ts``.

    ``period`` is passed straight through to ``seasonal_decompose``.
    """
    return seasonal_decompose(ts, model='additive', period=period).seasonal

# Function to calculate lag between two seasonal series using cross-correlation
def calculate_lag(seasonal1, seasonal2):
    """Find the lag (in samples) that maximizes the cross-correlation.

    Both inputs are mean-centred and cross-correlated in 'full' mode.

    Parameters
    ----------
    seasonal1, seasonal2 : array-like
        1-D numeric series. They may have different lengths.

    Returns
    -------
    lag : int
        Lag, in samples, at which the correlation peaks.
    correlation : numpy.ndarray
        Full cross-correlation, of length ``len(seasonal1) + len(seasonal2) - 1``.
    lags : numpy.ndarray
        Lag value for every entry of ``correlation``.

    Raises
    ------
    ValueError
        If either input is empty.
    """
    from scipy.signal import correlate, correlation_lags

    first = np.asarray(seasonal1, dtype=float)
    second = np.asarray(seasonal2, dtype=float)
    if first.size == 0 or second.size == 0:
        raise ValueError("calculate_lag needs two non-empty series")

    correlation = correlate(first - first.mean(), second - second.mean(), mode='full')
    # Take the lag axis from both lengths. Building it from len(seasonal1)
    # alone mislabels the peak whenever the two inputs differ in length
    # (e.g. after dropna()).
    lags = correlation_lags(first.size, second.size, mode='full')
    lag = lags[np.argmax(correlation)]
    return lag, correlation, lags

# Estimate the time lag between two US-UTD soil-temperature series from
# their seasonal components, then plot the result.
ts1 = utd_soilt['ts_3_2_1']
ts2 = utd_soilt['ts_3_3_1']
#utd_soilt['ts_3_3_1'].shift(-5).plot()


# Extract seasonal components
# (period=48 presumably means one day of 30-minute records — confirm)
seasonal1 = extract_seasonal(ts1, period=48)
seasonal2 = extract_seasonal(ts2, period=48)

# Calculate lag (returned in samples)
lag, correlation, lags = calculate_lag(seasonal1.dropna(), seasonal2.dropna())

# Output: two samples per hour, hence lag / 2
print(f"Calculated lag: {lag/2} hours")

# Plot seasonal components and correlation
fig, ax = plt.subplots(3, 1, figsize=(10, 8))

seasonal1.plot(ax=ax[0], label='Seasonal Component 1')
seasonal2.plot(ax=ax[0], label='Seasonal Component 2')
ax[0].legend()
ax[0].set_title('Seasonal Components')
ax[0].set_xlim(pd.to_datetime('2024-07-01'),pd.to_datetime('2024-07-08'))
ax[0].grid(True)

# The axis is labelled in hours, so convert the sample lags with the same
# factor of 2 used for the printed lag; previously raw sample counts were
# plotted under an "hours" label.
ax[1].plot(lags / 2, correlation)
ax[1].set_title('Cross-Correlation')
ax[1].set_xlabel('Lag (hours)')
ax[1].set_ylabel('Correlation')
ax[1].set_xlim(-10, 10)
ax[1].grid(True)

ax[2].plot(seasonal1.index, seasonal1, label='Series 1')
ax[2].plot(seasonal2.index + pd.Timedelta(hours=lag/2), seasonal2, label='Series 2 (Shifted)')
ax[2].legend()
ax[2].set_title(f'Series alignment (Lag: {lag/2} hours)')
ax[2].set_xlim(pd.to_datetime('2024-07-01'),pd.to_datetime('2024-07-08'))
ax[2].grid(True)
plt.tight_layout()
plt.show()



In [None]:
cdf = pd.read_parquet('../../station_data/all_eddy_data.parquet')
ddf = pd.read_parquet('../../station_data/all_met_data.parquet')

# Print every eddy column name that also exists in the met dataset
shared_cols = [col for col in cdf.columns if col in ddf.columns]
for col in shared_cols:
    print(col)


In [None]:
# Copy the first ten met rows to the clipboard.
ddf.head(10).to_clipboard()

In [None]:
# Plot t_si111_body for US-UTD, its step-to-step change, and a rebuilt series.
series = ddf.loc['US-UTD','t_si111_body'].replace(-9999,np.nan)
series.plot()
series.diff().plot()
# Keep only points whose change from the previous point is below 2, then
# re-difference and accumulate the kept points (presumably to remove sudden
# upward jumps — confirm; large negative steps are not excluded).
new_series = series[series.diff()<2].diff().cumsum()
new_series.plot()

In [None]:
# Read the database connection settings from the local secrets file.
import configparser
import urllib.parse

from sqlalchemy import create_engine

config = configparser.ConfigParser()

# ConfigParser.read() skips unreadable files and returns the list of files it
# actually parsed, so fail here rather than with a confusing KeyError below.
if not config.read('../../secrets/config.ini'):
    raise FileNotFoundError("Could not read '../../secrets/config.ini'")

host = config['DEFAULT']['ip']
pw = config['DEFAULT']['pw']
user = config['DEFAULT']['login']

# URL-encode the password so special characters survive in the connection URL
encoded_password = urllib.parse.quote_plus(pw)

def postconn_et(encoded_password, host='localhost',user='postgres',port='5432',db='groundwater', schema = 'groundwater'):
    """Create a SQLAlchemy engine for a PostgreSQL database via psycopg2.

    ``encoded_password`` is inserted into the URL as given, so it must
    already be URL-encoded. ``schema`` is set as the connection's
    ``search_path``.
    """
    connection_text = (
        f"postgresql+psycopg2://{user}:{encoded_password}"
        f"@{host}:{port}/{db}?gssencmode=disable"
    )
    search_path_option = f"-csearch_path={schema}"
    return create_engine(connection_text, connect_args={'options': search_path_option})


# Build the engine with the host and login read from the config file; port,
# database and schema keep the function defaults.
engine = postconn_et(encoded_password, host=host, user=user)

In [None]:
# Upload the eddy data to groundwater.amfluxeddy in batches of 2000 rows.
# if_exists='replace' drops and recreates the table if it already exists.
cdf.to_sql(name = 'amfluxeddy',
           schema='groundwater',
           con=engine,
           if_exists='replace',
           chunksize=2000)

In [None]:
# Print each column as "amfluxmet.<col>," (one per line), e.g. for pasting
# into a SQL column list.
for col in soildfs.columns:
    print(f"amfluxmet.{col},")

In [None]:
# Upload the merged met + soil data to groundwater.amfluxmet in batches of
# 2000 rows. if_exists='replace' drops and recreates the table if it exists.
soildfs.to_sql(name = 'amfluxmet',
           schema='groundwater',
           con=engine,
           if_exists='replace',
           chunksize=2000)