In [2]:
import os
import glob
from tensorboard.backend.event_processing import event_accumulator
from collections import defaultdict
import pandas as pd
import numpy as np

In [6]:
def load_bias_metrics_from_tensorboard(root_dir):
    """
    Load per-run scalar metrics from the TensorBoard event files under root_dir.

    The tree is walked recursively. For every directory that directly contains
    files whose name starts with "events.out.tfevents", only the most recently
    modified one is read. All scalar tags are collected, pivoted to one column
    per tag (indexed by step), the tags listed in ``drop_tags`` below (training
    loss and the wet/dry-day tags) are removed, and any step with a missing
    value in a remaining column is dropped.

    Args:
        root_dir: directory to scan.

    Returns:
        grouped_dfs: dict of {run_id: pd.DataFrame}, pivoted by step with cleaned tags.
        Runs that fail to load, or that contain no scalars, are left out.

    Notes:
        * run_id is the basename of the directory holding the event file, so two
          directories with the same name in different branches of the tree
          overwrite each other (the one visited last wins).
        * If a tag is logged more than once for the same step, the value that
          was read last is kept.
    """
    # Step 1: Find latest event file for each run
    latest_event_files = {}
    for root, _dirs, files in os.walk(root_dir):
        event_paths = [
            os.path.join(root, name)
            for name in files
            if name.startswith("events.out.tfevents")
        ]
        if not event_paths:
            continue
        latest_event_files[os.path.basename(root)] = max(event_paths, key=os.path.getmtime)

    # Step 2: Load scalars from each file
    run_data = {}
    for run_id, event_path in latest_event_files.items():
        try:
            ea = event_accumulator.EventAccumulator(event_path)
            ea.Reload()
            records = [
                (tag, s.step, s.value)
                for tag in ea.Tags().get('scalars', [])
                for s in ea.Scalars(tag)
            ]
        except Exception as e:
            # Best effort: report the broken file and move on. The records are
            # only stored after a complete read, so a failure half-way through
            # never leaves a truncated run in the result.
            print(f"⚠️ Failed to load {event_path}: {e}")
            continue
        if records:
            run_data[run_id] = records

    # Step 3: Convert to cleaned pivoted DataFrames
    drop_tags = {
        'Loss/train',
        'median_adjusted/Wet Days >1mm',
        'median_adjusted/Very Wet Days >10mm',
        'median_adjusted/Very Very Wet Days >20mm',
        'median_adjusted/Dry Days'
    }

    grouped_dfs = {}
    for run_id, records in run_data.items():
        df = pd.DataFrame(records, columns=["tag", "step", "value"])
        # DataFrame.pivot raises on duplicate (step, tag) pairs; keep the entry
        # that was read last for each pair.
        df = df.drop_duplicates(subset=["step", "tag"], keep="last")
        pivoted = df.pivot(index='step', columns='tag', values='value').sort_index()
        pivoted = pivoted.drop(columns=list(drop_tags), errors='ignore')
        grouped_dfs[run_id] = pivoted.dropna()

    return grouped_dfs




In [7]:
# Root of the TensorBoard run directories to analyse. The path is relative, so
# it is resolved against the notebook's current working directory.
root_dir = "runs_revised/conus_gridmet_cnn/access_cm2-gridmet"
# One pivoted metrics DataFrame per run directory found under root_dir.
grouped_dfs = load_bias_metrics_from_tensorboard(root_dir)

In [None]:
# import os

# base_dir = "/pscratch/sd/k/kas7897/diffDownscale/jobs_revised_pca/access_cm2-gridmet"
# second_level_dirs = []

# for root, dirs, files in os.walk(base_dir):
#     # Only consider first-level subdirectories
#     if os.path.abspath(root) == os.path.abspath(base_dir):
#         for d in dirs:
#             subdir = os.path.join(root, d)
#             # List subdirectories inside each first-level subdirectory
#             for sub_root, sub_dirs, sub_files in os.walk(subdir):
#                 if os.path.abspath(sub_root) == os.path.abspath(subdir):
#                     for sd in sub_dirs:
#                         sd = sd[:8]
#                         second_level_dirs.append(sd)
#         break  # Only need to process the top level

# print(second_level_dirs)

# grouped_dfs = {k: v for k, v in grouped_dfs.items() if any(sub in k for sub in second_level_dirs)}


['a0b8fd4c', 'ab2538cf', '08fb4524', '43705867', 'bfcdd469', 'd6a01914', '18e94be5', '2eba82c4', '4a89eced', '6aab0ccc', '759cef29', 'e91a39c1', '2bac29a2', '51cfede1', '92c02791', 'b3bdb62f', '73a5cbfa', '15950e27', '4f247c2c', 'fe95099e', 'b9905a36', '6dc6b33f', 'b2b3ea84', 'fe7cb7a4', 'e2ce595b']


In [8]:
len(grouped_dfs)

6

In [42]:
# root_dir1 = "runs_revised/conus_pca/access_cm2-gridmet"
# grouped_dfs_pca = load_bias_metrics_from_tensorboard(root_dir1)

# grouped_dfs = grouped_dfs | grouped_dfs_pca

In [9]:
import pandas as pd

def find_best_experiment_and_epoch(exp_dict, agg_method='median', exclude_cols=()):
    """
    Rank every (experiment, step) pair by its aggregated absolute metric value.

    Args:
        exp_dict: dict of {exp_name: pd.DataFrame} with index=step, columns=indices (bias %)
        agg_method: 'median', 'mean', or 'sum' to aggregate bias across indices
        exclude_cols: column name, or iterable of column names, to leave out of
            both the aggregated score and the per-index search (for example a
            validation-loss column that is not a bias percentage). Excluded
            columns are still present in score_df. Defaults to excluding nothing.

    Returns:
        best_overall: (exp, step, score) of the row with the lowest score
        best_per_index: {index: (exp, step, bias)} with the value closest to 0
            for each index; indices with no non-NaN value are omitted
        score_df: dataframe with one row per (exp, step) holding 'exp', 'step',
            'score' and every metric column

    Raises:
        ValueError: if agg_method is not supported, if exp_dict provides no
            rows, or if no row yields a non-NaN score.
    """
    # Validate before looping so a bad method is reported even for empty input.
    if agg_method not in ('median', 'mean', 'sum'):
        raise ValueError("agg_method must be 'median', 'mean', or 'sum'")

    if isinstance(exclude_cols, str):
        exclude_cols = (exclude_cols,)
    excluded = set(exclude_cols)
    reserved = ('exp', 'step', 'score')

    rows = []
    for exp, df in exp_dict.items():
        score_cols = [col for col in df.columns if col not in excluded]
        for step, row in df.iterrows():
            bias_vals = row[score_cols].dropna()
            score = getattr(bias_vals.abs(), agg_method)()
            rows.append({
                'exp': exp,
                'step': step,
                'score': score,
                **row.to_dict()
            })

    if not rows:
        raise ValueError("exp_dict contains no rows to score")

    score_df = pd.DataFrame(rows)
    if score_df['score'].isna().all():
        raise ValueError("no row has a non-NaN score")

    # Best overall (lowest aggregated score); idxmin skips NaN scores.
    best_overall_row = score_df.loc[score_df['score'].idxmin()]
    best_overall = (best_overall_row['exp'], best_overall_row['step'], best_overall_row['score'])

    # Best for each index (closest to 0 bias)
    indices = [
        col for col in score_df.columns
        if col not in reserved and col not in excluded
    ]
    best_per_index = {}
    for ind in indices:
        magnitude = score_df[ind].abs()
        if magnitude.isna().all():
            # No experiment reported this index; there is no best row to pick.
            continue
        best_row = score_df.loc[magnitude.idxmin()]
        best_per_index[ind] = (best_row['exp'], best_row['step'], best_row[ind])

    return best_overall, best_per_index, score_df


In [10]:
# Rank every (experiment, step) pair by the median absolute value across its metric columns.
best_overall, best_per_index, scores = find_best_experiment_and_epoch(grouped_dfs, agg_method='median')


In [11]:
best_overall

('4de68857_1979_2000_2001_2014', 220, 7.9664692878723145)

In [12]:
best_per_index

{'Loss/validation': ('40970740_1979_2000_2001_2014', 380, 2.3028457164764404),
 'median_adjusted/CDD (Yearly)': ('74cc5d77_1979_2000_2001_2014',
  30,
  0.18543754518032074),
 'median_adjusted/CWD (Yearly)': ('cd2c368c_1979_2000_2001_2014',
  80,
  -0.0371057502925396),
 'median_adjusted/R10mm': ('cd2c368c_1979_2000_2001_2014',
  70,
  -0.5277726054191589),
 'median_adjusted/R20mm': ('6816184f_1979_2000_2001_2014', 130, 0.0),
 'median_adjusted/R95pTOT': ('74cc5d77_1979_2000_2001_2014',
  250,
  -0.13641822338104248),
 'median_adjusted/R99pTOT': ('74cc5d77_1979_2000_2001_2014',
  250,
  -0.13641822338104248),
 'median_adjusted/Rx1day': ('cd2c368c_1979_2000_2001_2014',
  70,
  5.9045257568359375),
 'median_adjusted/Rx5day': ('40970740_1979_2000_2001_2014',
  0,
  3.2955329418182373),
 'median_adjusted/SDII (Monthly)': ('4de68857_1979_2000_2001_2014',
  60,
  0.8129074573516846)}

In [13]:
from collections import Counter

def count_best_indices(best_per_index):
    """
    Tally how many indices each experiment wins.

    Args:
        best_per_index: {index: (exp, step, bias)} mapping.

    Returns:
        dict of {exp: number of indices for which exp is the best entry},
        ordered by first appearance of each experiment.
    """
    winners = (exp for exp, _step, _bias in best_per_index.values())
    return dict(Counter(winners))

# Summarise how many indices each experiment is best at.
counts = count_best_indices(best_per_index)
print("🏆 Best Index Counts Per Experiment:")
for exp, count in counts.items():
    print(f"{exp}: {count} indices")

🏆 Best Index Counts Per Experiment:
40970740_1979_2000_2001_2014: 2 indices
74cc5d77_1979_2000_2001_2014: 3 indices
cd2c368c_1979_2000_2001_2014: 3 indices
6816184f_1979_2000_2001_2014: 1 indices
4de68857_1979_2000_2001_2014: 1 indices


In [None]:
def find_best_experiment_with_stability(exp_dict, agg_method='median',
                                       stability_window=10, min_epochs=50,
                                       loss_weight=0.3, bias_weight=0.7):
    """
    Find best experiment considering both bias performance AND training stability.

    For every experiment with at least ``min_epochs`` rows, the rows after the
    first ``min_epochs`` (by position) are scored. Each row gets a bias score
    (aggregated absolute metric values) and stability metrics from
    ``calculate_stability``; the two are blended into ``combined_score``. The
    row with the lowest combined score among the stable rows is returned. If
    no row is stable, all rows are considered instead.

    Args:
        exp_dict: dict of {exp_name: pd.DataFrame} with index=step, columns=indices
        agg_method: 'median', 'mean', or 'sum' to aggregate bias across indices
        stability_window: number of recent epochs to check for stability
        min_epochs: minimum training epochs before considering a model
        loss_weight, bias_weight: relative importance of loss vs bias (should sum to 1)

    Returns:
        best_stable: pd.Series, the row with the lowest combined_score
        stable_results: pd.DataFrame of the rows that were searched

    Raises:
        ValueError: if agg_method is not supported, if load_loss_data returns
            None for an experiment, or if no row qualifies for scoring.
    """
    # Same contract as find_best_experiment_and_epoch: reject unknown methods
    # rather than silently falling back to 'sum'.
    if agg_method not in ('median', 'mean', 'sum'):
        raise ValueError("agg_method must be 'median', 'mean', or 'sum'")

    results = []

    for exp, df in exp_dict.items():
        if len(df) < min_epochs:
            continue

        # Get loss data (loaded separately from the bias metrics)
        loss_data = load_loss_data(exp)
        if loss_data is None:
            raise ValueError(f"load_loss_data returned no loss series for experiment {exp!r}")

        # Only consider rows after the first min_epochs entries
        for step, row in df.iloc[min_epochs:].iterrows():
            bias_vals = row.dropna()

            if len(bias_vals) == 0:
                continue

            # 1. Calculate bias score
            bias_score = getattr(bias_vals.abs(), agg_method)()

            # 2. Check training stability
            stability_metrics = calculate_stability(loss_data, step, stability_window)

            # 3. Combined score
            combined_score = (loss_weight * stability_metrics['loss_score'] +
                              bias_weight * bias_score)

            results.append({
                'exp': exp,
                'step': step,
                'bias_score': bias_score,
                'loss_trend': stability_metrics['loss_trend'],
                'loss_variance': stability_metrics['loss_variance'],
                'is_stable': stability_metrics['is_stable'],
                'epochs_since_improvement': stability_metrics['epochs_since_improvement'],
                'combined_score': combined_score,
                **row.to_dict()
            })

    if not results:
        raise ValueError("no experiment has rows beyond min_epochs to evaluate")

    results_df = pd.DataFrame(results)

    # Filter to only stable models
    stable_results = results_df[results_df['is_stable'].astype(bool)].copy()

    if len(stable_results) == 0:
        print("⚠️ No stable models found! Relaxing stability criteria...")
        stable_results = results_df.copy()

    # Best stable model
    best_stable = stable_results.loc[stable_results['combined_score'].idxmin()]

    return best_stable, stable_results

def calculate_stability(loss_data, current_step, window=10):
    """
    Calculate training stability metrics for the window ending at current_step.

    Args:
        loss_data: sequence of loss values; it is indexed positionally with
            current_step, so position i is taken to be the loss at step i.
        current_step: step at which stability is assessed.
        window: number of steps before current_step included in the window.

    Returns:
        dict with keys 'is_stable', 'loss_trend' (slope of a straight-line fit
        over the window), 'loss_variance', 'loss_score' (lowest loss in the
        window divided by the highest loss in loss_data; lower is better) and
        'epochs_since_improvement' (steps since the lowest loss up to
        current_step). When current_step < window, or fewer than two loss
        values fall inside the window, a fixed "not stable" result is returned.
    """
    not_stable = {'is_stable': False, 'loss_trend': float('inf'),
                  'loss_variance': float('inf'), 'loss_score': 1.0,
                  'epochs_since_improvement': 0}

    if current_step < window:
        return not_stable

    losses = np.asarray(loss_data, dtype=float)
    recent_loss = losses[current_step - window:current_step + 1]

    # A straight-line fit needs at least two points; this also covers a loss
    # series that ends before the requested window.
    if recent_loss.size < 2:
        return not_stable

    # 1. Loss trend (should be flat or slightly decreasing)
    loss_trend = float(np.polyfit(np.arange(recent_loss.size), recent_loss, 1)[0])

    # 2. Loss variance (should be low)
    loss_variance = float(np.var(recent_loss))

    # 3. Epochs since the lowest loss seen so far
    best_loss_idx = int(np.argmin(losses[:current_step + 1]))
    epochs_since_improvement = current_step - best_loss_idx

    # 4. Stability criteria
    is_stable = bool(
        abs(loss_trend) < 0.001 and  # Trend is nearly flat
        loss_variance < 0.01 and     # Low variance
        epochs_since_improvement < window * 2  # Recent improvement
    )

    # 5. Loss score (normalized, lower is better), relative to the worst loss.
    # A worst loss of exactly 0 would divide by zero, so report 0.0 instead.
    worst_loss = float(losses.max())
    loss_score = float(recent_loss.min()) / worst_loss if worst_loss != 0 else 0.0

    return {
        'is_stable': is_stable,
        'loss_trend': loss_trend,
        'loss_variance': loss_variance,
        'loss_score': loss_score,
        'epochs_since_improvement': epochs_since_improvement
    }

def load_loss_data(exp_name):
    """
    Load training loss for a specific experiment.

    Placeholder: this has to be implemented against the project's TensorBoard
    data and should return an array of loss values by epoch.

    Raises:
        NotImplementedError: always, until an implementation is provided. This
            replaces a silent ``None`` return that only failed later, inside
            the stability calculation.
    """
    raise NotImplementedError(
        f"load_loss_data is not implemented; cannot load loss for {exp_name!r}"
    )