In [3]:
"""
Simple Baseline Calculator

Quick calculation of baseline MAE and loss from your data.
Checks if your model plateaus are just baseline performance.
"""

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from pathlib import Path


def calculate_baseline(csv_path: str, val_split: float = 0.1):
    """
    Calculate baseline metrics quickly.
    
    Parameters:
    -----------
    csv_path : str
        Path to your CSV database
    val_split : float
        Validation split (same as training)
    """
    print("="*70)
    print("BASELINE CALCULATOR")
    print("="*70)
    
    # Load data
    df = pd.read_csv(csv_path)
    barriers = df['backward_barrier_eV'].values
    
    print(f"\nData loaded: {len(barriers)} samples")
    print(f"Range: [{barriers.min():.3f}, {barriers.max():.3f}] eV")
    print(f"Mean: {barriers.mean():.3f} eV")
    print(f"Std: {barriers.std():.3f} eV")
    
    # Split train/val
    np.random.seed(42)
    n_val = int(len(barriers) * val_split)
    indices = np.random.permutation(len(barriers))
    
    train_barriers = barriers[indices[n_val:]]
    val_barriers = barriers[indices[:n_val]]
    
    print(f"\nTrain: {len(train_barriers)} samples")
    print(f"Val: {len(val_barriers)} samples")
    
    print("\n" + "="*70)
    print("BASELINE: Always predict MEAN")
    print("="*70)
    
    # Strategy: Always predict mean (simplest baseline)
    mean_prediction = train_barriers.mean()
    
    # Train metrics
    train_pred = np.full(len(train_barriers), mean_prediction)
    train_mae = np.mean(np.abs(train_pred - train_barriers))
    train_mse = np.mean((train_pred - train_barriers) ** 2)
    
    # Val metrics
    val_pred = np.full(len(val_barriers), mean_prediction)
    val_mae = np.mean(np.abs(val_pred - val_barriers))
    val_mse = np.mean((val_pred - val_barriers) ** 2)
    
    print(f"\nPredicting: {mean_prediction:.3f} eV (always)")
    print(f"\nTrain:")
    print(f"  MAE:  {train_mae:.4f} eV")
    print(f"  Loss (MSE): {train_mse:.4f}")
    
    print(f"\nVal:")
    print(f"  MAE:  {val_mae:.4f} eV")
    print(f"  Loss (MSE): {val_mse:.4f}")
    
    print("\n" + "="*70)
    print("BASELINE: Always predict MEDIAN")
    print("="*70)
    
    # Strategy: Always predict median
    median_prediction = np.median(train_barriers)
    
    # Train metrics
    train_pred = np.full(len(train_barriers), median_prediction)
    train_mae = np.mean(np.abs(train_pred - train_barriers))
    train_mse = np.mean((train_pred - train_barriers) ** 2)
    
    # Val metrics
    val_pred = np.full(len(val_barriers), median_prediction)
    val_mae = np.mean(np.abs(val_pred - val_barriers))
    val_mse = np.mean((val_pred - val_barriers) ** 2)
    
    print(f"\nPredicting: {median_prediction:.3f} eV (always)")
    print(f"\nTrain:")
    print(f"  MAE:  {train_mae:.4f} eV")
    print(f"  Loss (MSE): {train_mse:.4f}")
    
    print(f"\nVal:")
    print(f"  MAE:  {val_mae:.4f} eV")
    print(f"  Loss (MSE): {val_mse:.4f}")
    
    print("\n" + "="*70)
    print("COMPARISON WITH YOUR PLOT")
    print("="*70)
    
    print(f"\nYour plot shows:")
    print(f"  Train MAE plateau:  ~0.15 eV")
    print(f"  Train Loss plateau: ~0.04")
    
    print(f"\nBaseline (mean):")
    print(f"  Train MAE:  {train_mae:.4f} eV")
    print(f"  Train Loss: {train_mse:.4f}")
    
    print("\n" + "="*70)


if __name__ == "__main__":
    # Simple usage
    csv_path = "MoNbTaW.csv"
    
    if not Path(csv_path).exists():
        print(f"Error: {csv_path} not found")
        print("Please update csv_path variable in this script")
        exit(1)
    
    calculate_baseline(csv_path, val_split=0.1)

BASELINE CALCULATOR

Data loaded: 1000 samples
Range: [0.071, 39.198] eV
Mean: 0.614 eV
Std: 1.488 eV

Train: 900 samples
Val: 100 samples

BASELINE: Always predict MEAN

Predicting: 0.620 eV (always)

Train:
  MAE:  0.2323 eV
  Loss (MSE): 2.4574

Val:
  MAE:  0.1625 eV
  Loss (MSE): 0.0394

BASELINE: Always predict MEDIAN

Predicting: 0.550 eV (always)

Train:
  MAE:  0.2209 eV
  Loss (MSE): 2.4623

Val:
  MAE:  0.1487 eV
  Loss (MSE): 0.0353

COMPARISON WITH YOUR PLOT

Your plot shows:
  Train MAE plateau:  ~0.15 eV
  Train Loss plateau: ~0.04

Baseline (mean):
  Train MAE:  0.2209 eV
  Train Loss: 2.4623

