In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import datetime


In [5]:
# Setup basedir
BASE_DIR = '/Volumes/SANDISK_SSD_G40/GoogleDrive/GitHub/sagehen_meadows'
os.chdir(BASE_DIR)
os.getcwd()

'/Volumes/SANDISK_SSD_G40/GoogleDrive/GitHub/sagehen_meadows'

In [6]:
# Import DENDRA Weather Data
dendra = pd.read_csv('data/station_instrumentation/climate/Dendra/Dendra_Sagehen_2010-2025.csv')
dendra["time"] = pd.DatetimeIndex(dendra.time)
dendra = dendra.set_index("time")
dendra.index.min(), dendra.index.max()


(Timestamp('2010-01-01 00:00:00'), Timestamp('2026-01-06 14:40:00'))

In [7]:
dendra.columns

Index(['air-temp-min-degc', 'air-temp-max-degc', 'air-temp-avg-degc',
       'barometric-pressure-avg-mb', 'battery-voltage-avg-v',
       'battery-voltage-max-v', 'battery-voltage-min-v',
       'precipitation-geonor-freq-s-1', 'precipitation-geonor-cm-cm',
       'precipitation-geonor-freq-std-s-1', 'precipitation-geonor-mm-mm',
       'rainfall-mm', 'relative-humidity-max-pct', 'relative-humidity-avg-pct',
       'relative-humidity-min-pct', 'snow-depth-max-in', 'snow-depth-min-in',
       'snow-depth-std-in', 'snow-depth-in', 'snow-temp-avg-degc',
       'snow-temp-min-degc', 'snow-temp-max-degc', 'soil-temp-max-degc',
       'soil-temp-avg-degc', 'soil-temp-min-degc', 'tdr-freq-1-usec',
       'total-solar-radiation-avg-wm-2', 'wind-direction-100-ft-std-deg',
       'wind-direction-100-ft-avg-deg', 'wind-direction-25-ft-std-deg',
       'wind-direction-25-ft-avg-deg', 'wind-speed-100-ft-max-ms',
       'wind-speed-25-ft-max-ms', 'wind-speed-100-ft-avg-ms',
       'wind-speed-25-ft

In [22]:
## COMPLETENESS VALIDATION
# Set start and end time for reporting on completeness
start = "2017-10-01"
end = "2025-11-10"
dendra_time = dendra.loc[start:end]

dendra_time.index.min(), dendra_time.index.max()


(Timestamp('2017-10-01 00:00:00'), Timestamp('2025-11-10 23:50:00'))

In [32]:
expected_len_inferred = len(dendra_time)
expected_len_inferred

426261

In [33]:
dendra_time.index.is_monotonic_increasing


True

In [34]:
dendra_time.index.to_series().diff().value_counts().head()


time
0 days 00:10:00    426243
0 days 00:50:00         3
0 days 00:20:00         3
0 days 01:10:00         2
0 days 00:40:00         2
Name: count, dtype: int64

In [35]:
freq = pd.infer_freq(dendra_time.index)
freq
# if nothing returns, pandas cannot infer the frequency of the data

In [36]:
step = pd.to_timedelta(freq)
step
# if nothing returns, pandas cannot infer the timestep of the data

In [38]:
step = pd.Timedelta("10min")
expected_len = int((dendra_time.index.max() - dendra_time.index.min()) / step) + 1
expected_len - expected_len_inferred # diff reflects some irregularity in the time series

411

In [48]:
expected_len

426672

In [49]:
completeness = (
    dendra_time.notna()
        .sum()
        .div(expected_len)
        .mul(100)
        .round(1)
        .rename("percent_complete")
)

completeness

air-temp-min-degc                        99.9
air-temp-max-degc                        99.9
air-temp-avg-degc                        99.9
barometric-pressure-avg-mb               99.9
battery-voltage-avg-v                    99.9
battery-voltage-max-v                    99.9
battery-voltage-min-v                    99.9
precipitation-geonor-freq-s-1            99.9
precipitation-geonor-cm-cm                6.9
precipitation-geonor-freq-std-s-1        34.3
precipitation-geonor-mm-mm               34.3
rainfall-mm                               6.9
relative-humidity-max-pct                99.9
relative-humidity-avg-pct                99.9
relative-humidity-min-pct                99.9
snow-depth-max-in                         0.0
snow-depth-min-in                         0.0
snow-depth-std-in                         0.0
snow-depth-in                            97.4
snow-temp-avg-degc                        0.0
snow-temp-min-degc                        0.0
snow-temp-max-degc                

In [121]:
# Report on GOOD-to-GO variables
complete = completeness[completeness > 95]
complete

air-temp-min-degc                        99.9
air-temp-max-degc                        99.9
air-temp-avg-degc                        99.9
barometric-pressure-avg-mb               99.9
battery-voltage-avg-v                    99.9
battery-voltage-max-v                    99.9
battery-voltage-min-v                    99.9
precipitation-geonor-freq-s-1            99.9
relative-humidity-max-pct                99.9
relative-humidity-avg-pct                99.9
relative-humidity-min-pct                99.9
snow-depth-in                            97.4
total-solar-radiation-avg-wm-2           99.9
wind-direction-100-ft-std-deg            99.9
wind-direction-100-ft-avg-deg            99.9
wind-direction-25-ft-std-deg             99.9
wind-direction-25-ft-avg-deg             99.9
wind-speed-100-ft-max-ms                 99.9
wind-speed-25-ft-max-ms                  99.9
wind-speed-100-ft-avg-ms                 99.9
wind-speed-25-ft-avg-ms                  99.9
wind-speed-vector-magnitude-25-ft-

AttributeError: 'Index' object has no attribute 'index'

In [104]:
def missing_runs(series):
    is_missing = series.isna()

    # label consecutive True runs
    groups = is_missing.ne(is_missing.shift()).cumsum()

    runs = (
        series[is_missing]
        .groupby(groups)
        .apply(lambda x: pd.DataFrame({
            "start": [x.index[0]],
            "end": [x.index[-1]],
            "n_points": [len(x)]
        }))
        .reset_index(drop=True)
    )

    return runs


In [105]:
missing_runs(dendra_time["precipitation-geonor-mm-mm"])


Unnamed: 0,start,end,n_points
0,2017-12-18 11:10:00,2017-12-18 11:10:00,1
1,2020-07-14 03:10:00,2025-11-10 23:50:00,279794


In [106]:
precip_runs = missing_runs(dendra_time["precipitation-geonor-cm-cm"])
precip_runs.dtypes

start       datetime64[ns]
end         datetime64[ns]
n_points             int64
dtype: object

In [107]:
runs = precip_runs
runs["duration"] = (runs["n_points"] - 1) * step
MIN_GAP = pd.Timedelta("1D")
major = runs[runs["duration"] >= MIN_GAP]
major

Unnamed: 0,start,end,n_points,duration
1,2018-04-24 16:00:00,2025-11-10 23:50:00,396645,2754 days 11:20:00


In [108]:

incomplete = completeness[completeness < 95]
incomplete


precipitation-geonor-cm-cm                6.9
precipitation-geonor-freq-std-s-1        34.3
precipitation-geonor-mm-mm               34.3
rainfall-mm                               6.9
snow-depth-max-in                         0.0
snow-depth-min-in                         0.0
snow-depth-std-in                         0.0
snow-temp-avg-degc                        0.0
snow-temp-min-degc                        0.0
snow-temp-max-degc                        0.0
soil-temp-max-degc                        0.0
soil-temp-avg-degc                        0.0
soil-temp-min-degc                        0.0
tdr-freq-1-usec                          34.3
rainfall-cumulative-mm                    6.9
total-solar-radiation-daily-tot-mjm-2    93.0
air-temp-25-ft-avg-degc                  92.9
air-temp-100-ft-avg-degc                 92.9
relative-humidity-25-ft-avg-pct          92.9
relative-humidity-100-ft-avg-pct         92.9
leaf-wetness-max-mv                      13.9
leaf-wetness-avg-mv               

In [109]:
## Identify dates of major gaps for variables with incomplete data
#  Major gap defined as ? days
MIN_GAP = pd.Timedelta("3D")

# Create a dict that reports on gaps
gap_report = []

# Summarize the gaps for columns with major incompleteness issues
incomplete = completeness[completeness < 95]

for col in incomplete.index:
    
    runs = missing_runs(dendra_time[col])
    if runs.empty: 
        continue
    
    #runs["duration"] = (runs["n_points"] - 1) * step
    runs["duration"] = runs["end"] - runs["start"]
    
    major = runs[runs["duration"] >= MIN_GAP]

    # add rows to gaps_report
    for _, row in major.iterrows():
        gap_report.append({
            "variable": col,
            "percent_complete": incomplete[col],
            "gap_start": row["start"],
            "gap_duration": row["duration"]
        })

summary = pd.DataFrame(gap_report)

summary = summary.sort_values(["variable", "gap_start"]).reset_index(drop=True)
summary

Unnamed: 0,variable,percent_complete,gap_start,gap_duration
0,air-temp-100-ft-avg-degc,92.9,2017-10-01 00:00:00,205 days 15:50:00
1,air-temp-25-ft-avg-degc,92.9,2017-10-01 00:00:00,205 days 15:50:00
2,leaf-wetness-avg-mv,13.9,2017-10-01 00:00:00,2552 days 11:50:00
3,leaf-wetness-max-mv,13.9,2017-10-01 00:00:00,2552 days 11:50:00
4,leaf-wetness-min-mv,13.9,2017-10-01 00:00:00,2552 days 11:50:00
5,leaf-wetness-minutes-contaminated-min,13.9,2017-10-01 00:00:00,2552 days 11:50:00
6,leaf-wetness-minutes-dry-min,13.9,2017-10-01 00:00:00,2552 days 11:50:00
7,leaf-wetness-minutes-wet-min,13.9,2017-10-01 00:00:00,2552 days 11:50:00
8,leaf-wetness-mv,13.9,2017-10-01 00:00:00,2552 days 11:50:00
9,precipitation-geonor-cm-cm,6.9,2018-04-24 16:00:00,2757 days 07:50:00


In [110]:
# Calculate min, max and mean for a column (or weather variable)
tdr_stats = dendra.loc['2017-10-01':'2020-07-01', 'tdr-freq-1-usec'].agg(["min", "max", "mean"])
tdr_stats

min    -21.40000
max     31.11000
mean     5.13563
Name: tdr-freq-1-usec, dtype: float64

In [123]:
complete_vars = completeness[completeness > 95].index
stats = (dendra[complete_vars].agg(["min", "max", "mean", "std"]).T)
summary_stats = stats.join(completeness)
summary_stats


Unnamed: 0,min,max,mean,std,percent_complete
air-temp-min-degc,-100.0,66.33963,5.108864,10.059436,99.9
air-temp-max-degc,-47.45,33.73,5.537239,10.169941,99.9
air-temp-avg-degc,-49.8,33.44,5.322992,10.110485,99.9
barometric-pressure-avg-mb,0.0,869.2219,833.679587,32.242398,99.9
battery-voltage-avg-v,-20.07604,15.13,13.170705,0.286775,99.9
battery-voltage-max-v,8.93,20.01986,13.205573,0.294279,99.9
battery-voltage-min-v,-1.72668,15.13,13.157758,0.28684,99.9
precipitation-geonor-freq-s-1,0.0,5165.0,1950.746894,572.013537,99.9
relative-humidity-max-pct,0.0,100.0,66.329945,24.972351,99.9
relative-humidity-avg-pct,0.0,100.0,64.631735,26.021684,99.9
