In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import pathlib
from configparser import ConfigParser

import polars as pl
import json
import re

from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Extend the import search path so the `micromet` package can be imported
# (presumably from a local checkout two directories up -- confirm). The
# relative entry is resolved against the kernel's working directory, and it
# must be appended before the micromet imports below run.
import sys
sys.path.append("../../micromet")
import micromet
from micromet import AmerifluxDataProcessor

In [3]:
def filter_polars_df_iqr(df, 
                         column_name, 
                         lower_quant = 0.05, 
                         upper_quant= 0.95, 
                         multiplier=1., 
                         filter_rows=True,
                         replace_value=None 
                         ):
    """
    Remove or mask outliers in one column using a quantile-range fence.

    The accepted range is
    ``[q_low - multiplier * (q_high - q_low), q_high + multiplier * (q_high - q_low)]``
    where ``q_low`` and ``q_high`` are the ``lower_quant`` and ``upper_quant``
    quantiles of the column. Note that with the default quantiles (0.05 and
    0.95) this is wider than the classic 25th-75th percentile IQR fence.

    Parameters:
    df (pl.DataFrame): The Polars DataFrame to filter.
    column_name (str): The name of the column to apply the filter on.
    lower_quant (float): Quantile used as the lower anchor of the fence.
    upper_quant (float): Quantile used as the upper anchor of the fence.
    multiplier (float): How many quantile ranges to extend beyond each anchor.
    filter_rows (bool): If True, drop rows whose value lies outside the fence
        (rows with a null value are dropped too, because the comparison is
        null). If False, keep every row and overwrite out-of-range values
        with ``replace_value``.
    replace_value: Literal written over out-of-range values when
        ``filter_rows`` is False. ``None`` (the default) writes nulls.
        A ``pl.Expr`` is passed through unchanged.

    Returns:
    pl.DataFrame: The filtered (or masked) DataFrame. If the quantiles cannot
    be computed (empty or all-null column) the input is returned unchanged.
    """
    column = pl.col(column_name)
    q_low = df.select(column.quantile(lower_quant)).item()
    q_high = df.select(column.quantile(upper_quant)).item()

    # An empty or all-null column has no quantiles; without this guard the
    # subtraction below fails with a TypeError on None.
    if q_low is None or q_high is None:
        return df

    spread = q_high - q_low
    lower_bound = q_low - multiplier * spread
    upper_bound = q_high + multiplier * spread

    if filter_rows:
        return df.filter((column >= lower_bound) & (column <= upper_bound))

    # Wrap plain values in pl.lit(): a bare string handed to .then() would be
    # interpreted by polars as a column name rather than a replacement value.
    if isinstance(replace_value, pl.Expr):
        replacement = replace_value
    else:
        replacement = pl.lit(replace_value)

    out_of_range = (column < lower_bound) | (column > upper_bound)
    return df.with_columns(
        pl.when(out_of_range)
        .then(replacement)
        .otherwise(column)
        .alias(column_name)
    )



In [4]:
import pandas as pd
import re

def replace_negative_values(df, column_name, filter_column=None, threshold=0.85, replace_value=0):
    """
    Overwrite values of one column wherever a control column is below a threshold.

    Despite its name, this function does not test for negative values by
    default: the default ``threshold`` is 0.85. Pass ``threshold=0`` to
    replace only negative values.

    Parameters:
        df (pl.DataFrame): The Polars DataFrame to process.
        column_name (str): The column whose values are overwritten.
        filter_column (str, optional): The column compared against
            ``threshold``. Defaults to ``column_name`` itself.
        threshold (float): Rows where ``filter_column < threshold`` are
            overwritten. Rows where the comparison is null are left as is.
        replace_value: Literal written into ``column_name`` on matching rows.
            A ``pl.Expr`` is passed through unchanged.

    Returns:
        pl.DataFrame: A new DataFrame with ``column_name`` updated.
    """
    control_column = filter_column or column_name

    # Wrap plain values in pl.lit(): a bare string handed to .then() would be
    # interpreted by polars as a column name rather than a replacement value.
    if isinstance(replace_value, pl.Expr):
        replacement = replace_value
    else:
        replacement = pl.lit(replace_value)

    return df.with_columns(
        pl.when(pl.col(control_column) < threshold)
        .then(replacement)
        .otherwise(pl.col(column_name))
        .alias(column_name)
    )

def parse_datetimes_with_decimal_seconds(datetime_series):
    """
    Parse datetime strings that may have optional or missing fractional seconds,
    and normalize them so all include a decimal part.

    Parameters:
        datetime_series (pd.Series or list-like): Datetime strings in format
        'YYYY-MM-DD HH:MM:SS', 'YYYY-MM-DD HH:MM:SS.', or 'YYYY-MM-DD HH:MM:SS.ssssss'.
        Missing entries (None/NaN) are allowed and become NaT. A Series that
        already has a datetime dtype is returned unchanged.

    Returns:
        pd.Series: Parsed datetime values with consistent fractional seconds.
    """
    # Accept plain lists/tuples as documented; only a Series has .map().
    if isinstance(datetime_series, pd.Series):
        series = datetime_series
    else:
        series = pd.Series(datetime_series)

    # Nothing to normalize if an upstream reader already parsed the column.
    if pd.api.types.is_datetime64_any_dtype(series):
        return series

    def normalize_timestamp(ts):
        # Leave missing values (None/NaN) alone so to_datetime yields NaT
        # instead of .strip() raising on a non-string.
        if not isinstance(ts, str):
            return ts
        ts = ts.strip()
        # Remove trailing dot if no digits follow
        ts = re.sub(r'(?<=\d{2}:\d{2}:\d{2})\.(?!\d)', '', ts)
        # If it ends right after the seconds, append ".0"
        if re.match(r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$', ts):
            ts += '.0'
        return ts

    cleaned = series.map(normalize_timestamp)
    return pd.to_datetime(cleaned, format="%Y-%m-%d %H:%M:%S.%f")



# Demonstration: whole-second, trailing-dot and microsecond timestamps are
# all parsed into one datetime column.
example = [
    "2023-10-27 15:59:55",
    "2023-10-27 15:59:55.",
    "2023-10-27 15:59:55.123456",
    "2023-10-27 00:00:00"
]

df = pd.Series(example, name='TIMESTAMP').to_frame()
df = df.assign(parsed=parse_datetimes_with_decimal_seconds(df['TIMESTAMP']))
print(df)




                    TIMESTAMP                     parsed
0         2023-10-27 15:59:55 2023-10-27 15:59:55.000000
1        2023-10-27 15:59:55. 2023-10-27 15:59:55.000000
2  2023-10-27 15:59:55.123456 2023-10-27 15:59:55.123456
3         2023-10-27 00:00:00 2023-10-27 00:00:00.000000


In [None]:
from micromet import convert_file
import pathlib
import os

# Convert every "*Time_Series*.dat" file found under the download folder to a
# "<serial>_<file number>.asc" file beside it, skipping files already converted.
input_dir = pathlib.Path("E:/UGS_Flux/Data_Downloads/")
output_format, format_options = "toa5", "1"

for file in input_dir.rglob("*Time_Series*.dat"):
    # The first and last underscore-separated tokens of the stem form the
    # output file name.
    stem_parts = file.stem.split("_")
    cr_sn, file_no = stem_parts[0], stem_parts[-1]
    file_dir = file.parent
    output_file = file_dir.joinpath(f"{cr_sn}_{file_no}.asc")

    if not output_file.exists():
        print(f"Converting {file} to {output_file}")
        convert_file(file, output_file, output_format, format_options)
    else:
        print(f"File {output_file} already exists. Skipping conversion.")

In [None]:
# Same conversion pass as the cell above: walk the download folder and turn
# each "*Time_Series*.dat" file into "<serial>_<file number>.asc" unless that
# output is already present.
input_dir = pathlib.Path("E:/UGS_Flux/Data_Downloads/")
output_format = "toa5"
format_options = "1"

for file in input_dir.rglob("*Time_Series*.dat"):
    # Text before the first underscore and text after the last underscore.
    cr_sn = file.stem.partition("_")[0]
    file_no = file.stem.rpartition("_")[2]
    file_dir = file.parent
    output_file = file_dir / (cr_sn + "_" + file_no + ".asc")

    already_converted = output_file.exists()
    if already_converted:
        print(f"File {output_file} already exists. Skipping conversion.")
        continue

    print(f"Converting {file} to {output_file}")
    convert_file(file, output_file, output_format, format_options)

File E:\UGS_Flux\Data_Downloads\Bluff\21020_0.asc already exists. Skipping conversion.
File E:\UGS_Flux\Data_Downloads\Bluff\21020_1.asc already exists. Skipping conversion.
File E:\UGS_Flux\Data_Downloads\Bluff\21020_10.asc already exists. Skipping conversion.
File E:\UGS_Flux\Data_Downloads\Bluff\21020_100.asc already exists. Skipping conversion.
File E:\UGS_Flux\Data_Downloads\Bluff\21020_101.asc already exists. Skipping conversion.
File E:\UGS_Flux\Data_Downloads\Bluff\21020_102.asc already exists. Skipping conversion.
File E:\UGS_Flux\Data_Downloads\Bluff\21020_103.asc already exists. Skipping conversion.
File E:\UGS_Flux\Data_Downloads\Bluff\21020_104.asc already exists. Skipping conversion.
File E:\UGS_Flux\Data_Downloads\Bluff\21020_105.asc already exists. Skipping conversion.
File E:\UGS_Flux\Data_Downloads\Bluff\21020_106.asc already exists. Skipping conversion.
File E:\UGS_Flux\Data_Downloads\Bluff\21020_107.asc already exists. Skipping conversion.
File E:\UGS_Flux\Data_Down

In [12]:



# Despike, gap-fill and export the converted file referenced by `output_file`.
# NOTE: `output_file` is whatever the last iteration of the conversion loop
# above left behind, so that cell must have been run first.

# Header: row 1 holds the column names and row 2 the units. Read only the
# first three lines instead of loading the entire data file for its header.
with open(output_file, 'r') as f:
    header_lines = [f.readline() for _ in range(3)]
column_names = [col.strip().strip('"') for col in header_lines[1].split(",")]
units = [unit.strip().strip('"') for unit in header_lines[2].split(",")]

# Column name -> unit lookup, saved to a JSON file for later reference.
unit_map = dict(zip(column_names, units))
with open("units.json", "w") as f:
    json.dump(unit_map, f, indent=2)

# Read the data rows with polars, skipping the 4 header lines and applying
# the column names parsed above. Date inference is disabled on purpose:
# TIMESTAMP must stay a string so parse_datetimes_with_decimal_seconds() can
# normalize timestamps with and without fractional seconds.
df = pl.read_csv(
    output_file,
    skip_rows=4,
    has_header=False,  # because we are setting our own column names
    new_columns=column_names,
    quote_char='"',
    try_parse_dates=False,
)

# Mask outliers (set to null) in every measurement column; rows are kept so
# the time series stays complete for the resampling step.
for col in df.columns:
    if col != "TIMESTAMP":
        print(col, df[col].dtype)
        df = filter_polars_df_iqr(df, col, filter_rows=False)

# Zero the H2O density wherever the H2O signal strength is below 0.85.
df = replace_negative_values(df, 'H2O_density', filter_column='H2O_sig_strgth', threshold=0.85, replace_value=0)

# Polars -> pandas for timestamp parsing, resampling and imputation.
df_pd = df.to_pandas()
df_pd["TIMESTAMP"] = parse_datetimes_with_decimal_seconds(df_pd["TIMESTAMP"])
df_pd = df_pd.set_index(["TIMESTAMP"])

# Put the records on a regular 100 ms grid; missing records become NaN rows.
# "100ms" replaces the deprecated "100L" alias.
df_resamp = df_pd.resample("100ms").asfreq()

# Fill the gaps (masked outliers and missing records) with IterativeImputer.
imputer = IterativeImputer(random_state=0)
df_imputed_pd = pd.DataFrame(imputer.fit_transform(df_resamp), 
                             columns=df_resamp.columns, 
                             index=df_resamp.index)

# NOTE(review): pl.from_pandas() does not carry over the TIMESTAMP index by
# default, so the exported CSV has no time column and the start time lives
# only in the file name -- confirm that this is intended.
df_imputed = pl.from_pandas(df_imputed_pd)

# Imputation can produce negative densities; clamp only those to zero.
# threshold=0 is passed explicitly because the function's default (0.85)
# would also zero every valid density below 0.85.
df_imputed = replace_negative_values(df_imputed, 'H2O_density', threshold=0)

# File-name parts come from the converted file's stem ("<serial>_<file number>").
# The original referenced `input_file`, which is not defined in this notebook.
source_stem = pathlib.Path(output_file).stem
datalogger = source_stem.split("_")[0]
fileno = source_stem.split("_")[-1]

first_date = df_imputed_pd.first_valid_index()
# total_seconds() covers the whole span; .seconds would silently drop full days.
minutes = round((df_imputed_pd.last_valid_index() - first_date).total_seconds() / 60, 0)
meas_freq = len(df_imputed_pd)/(minutes*60) # Hz

df_imputed.write_csv(f"{datalogger}_imputed_{fileno}_{first_date:%Y_%m_%d_%H%M}.csv")

RECORD Int64
Ux Float64
Uy Float64
Uz Float64
T_SONIC Float64
diag_sonic Int64
CO2_density Float64
CO2_density_fast_tmpr Float64
H2O_density Float64
diag_irga Int64
T_SONIC_corr Float64
TA_1_1_1 Float64
PA Float64
CO2_sig_strgth Float64
H2O_sig_strgth Float64


In [19]:
# Display the column-name -> unit mapping parsed from the file header.
unit_map

{'TIMESTAMP': 'TS',
 'RECORD': 'RN',
 'Ux': 'm s-1',
 'Uy': 'm s-1',
 'Uz': 'm s-1',
 'T_SONIC': 'deg C',
 'diag_sonic': 'adimensional',
 'CO2_density': 'mg m-3',
 'CO2_density_fast_tmpr': '',
 'H2O_density': 'g m-3',
 'diag_irga': 'adimensional',
 'T_SONIC_corr': 'deg C',
 'TA_1_1_1': 'deg C',
 'PA': 'kPa',
 'CO2_sig_strgth': 'fraction',
 'H2O_sig_strgth': 'fraction'}

In [None]:
import matplotlib.pyplot as plt


# Quick visual check of the gap-filled H2O density series. An explicit
# figure/axes pair is used so the plot carries its own title and unit label.
fig, ax = plt.subplots()
df_imputed_pd['H2O_density'].plot(ax=ax)
ax.set(title="Gap-filled H2O_density", ylabel=f"H2O_density ({unit_map['H2O_density']})");


In [16]:
def prepend_line(filename, line_to_prepend):
    """
    Insert a single line at the very top of an existing UTF-8 text file.

    Parameters:
        filename (str or path-like): File to modify in place; it must exist.
        line_to_prepend (str): Text of the new first line. Any trailing
            carriage returns / newlines are stripped and exactly one newline
            is added after it.
    """
    header = line_to_prepend.rstrip('\r\n') + '\n'
    with open(filename, 'r+', encoding='utf-8') as handle:
        existing = handle.read()
        # Rewind and overwrite from the start; the new text is always at
        # least as long as the old, so nothing stale is left behind.
        handle.seek(0)
        handle.write(header + existing)

In [18]:
# Write one `edpro_<id>.metadata` file per station: start from the template
# metadata file, overwrite the station/site/timing/project fields with values
# from the site table, and put a ";GHG_METADATA" line at the top of each file.
# `minutes` and `meas_freq` come from the processing cell above.
config_path = '../station_config/datafile.metadata'
config = ConfigParser()

# ConfigParser.read() silently skips files it cannot open; fail here with a
# clear message instead of a NoSectionError on the first config.set() below.
if not config.read(config_path):
    raise FileNotFoundError(f"Template metadata file not found: {config_path}")

fpath = pathlib.Path(r"C:\Users\paulinkenbrandt\Downloads\UGS Flux SDI12 Addresses Pakbus Metadata and Connection - Site_metadata.csv")
df = pd.read_csv(fpath)
# Rows without a station id cannot be turned into a metadata file.
df.dropna(subset=['stationid'], inplace=True)

for stat in df.index:
    site_id = df.loc[stat, 'stationid']
    name = df.loc[stat, 'parameter']
    latitude = df.loc[stat, 'latitude']
    longitude = df.loc[stat, 'longitude']
    altitude = df.loc[stat, 'altitude']
    canopy_height = df.loc[stat, 'height_canopy']

    # Part of the station id after the last "-"; used as the project id and
    # in the output file name. Computed once so the f-strings below stay
    # valid on Python versions older than 3.12 (no nested same-type quotes).
    site_code = str(site_id).split("-")[-1]
    metadata_path = f'../station_config/edpro_{site_code}.metadata'

    config.set('Station', 'station_name', f"{name}")
    config.set('Station', 'station_id', f"{site_id}")
    config.set('Site','site_name', f"{name}")
    config.set('Site','site_id', f"{site_id}")
    config.set('Site','altitude', f"{altitude}")
    config.set('Site','latitude', f"{latitude}")
    config.set('Site','longitude', f"{longitude}")    
    config.set('Site','canopy_height', f"{canopy_height}")
    # Round instead of truncating: a computed 9.9 Hz must be written as 10, not 9.
    config.set('Timing','acquisition_frequency', f"{int(round(meas_freq)):0d}")
    config.set('Timing','file_duration', f"{int(minutes):0d}")

    config.set('Project','title', f"{name}_{int(minutes):2d}min_process")
    config.set('Project','id', f"{site_code}")

    # Write as UTF-8 so prepend_line(), which re-reads the file as UTF-8,
    # sees the same encoding regardless of the platform default.
    with open(metadata_path, 'w', encoding='utf-8') as configfile:
        config.write(configfile)
    prepend_line(metadata_path, ";GHG_METADATA")