In [11]:
# 1. SETUP AND IMPORTS
# ============================================================================

import os
import json
from datetime import datetime

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import warnings
warnings.filterwarnings('ignore')


print("="*70)
print("SOH ESTIMATION - STARTER SCRIPT")
print("="*70)
print(f"PyTorch version: {torch.__version__}")
print(f"Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}")
print(f"Start time: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*70)
print()

# Set random seeds for reproducibility
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)

# MODEL DEFINITION
# ============================================================================

class SimpleLSTM(nn.Module):
    """
    Simple LSTM model for SOH estimation
    
    Architecture:
        Input → LSTM → Fully Connected → Output
    """
    def __init__(self, input_size, hidden_size, num_layers=1, output_size=1, dropout=0.2):
        super(SimpleLSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout if num_layers > 1 else 0
        )
        
        self.fc = nn.Linear(hidden_size, output_size)
    
    def forward(self, x):
        """Forward pass"""
        lstm_out, (h_n, c_n) = self.lstm(x)
        last_output = lstm_out[:, -1, :]
        output = self.fc(last_output)
        return output
    
    def get_model_info(self):
        """Return model configuration"""
        return {
            "model_type": "SimpleLSTM",
            "input_size": self.input_size,
            "hidden_size": self.hidden_size,
            "num_layers": self.num_layers,
            "output_size": self.output_size,
            "total_parameters": sum(p.numel() for p in self.parameters())
        }

print("Model class defined successfully\n")


SOH ESTIMATION - STARTER SCRIPT
PyTorch version: 1.10.2+cu113
Device: CUDA
Start time: 2025-11-20 08:17:09

Model class defined successfully



In [4]:
# 2. DATA LOADING AND PREPARATION
# ============================================================================

# Configuration
DATA_PATH = '../data/cycles_example.csv'
FEATURE_COLS = ['cycle_number', 'voltage_v', 'current_A', 'temperature', 'capacity_ah']

# Check if file exists
if not os.path.exists(DATA_PATH):
    raise FileNotFoundError(f"Data file not found: {DATA_PATH}")

# Load the dataset
df = pd.read_csv(DATA_PATH)
print(f"Data loaded successfully from: {DATA_PATH}")
print(f"  • Data shape: {df.shape}")
print(f"  • Columns: {df.columns.tolist()}")
print()

# Generate synthetic SOH labels
def generate_synthetic_soh(cycle_numbers, initial_capacity=1.0, degradation_rate=0.0003):
    """
    Generate synthetic SOH based on cycle number
    Simulates linear capacity fade with noise
    """
    soh_base = initial_capacity - degradation_rate * cycle_numbers
    noise = np.random.normal(0, 0.01, size=len(cycle_numbers))
    soh = np.clip(soh_base + noise, 0.7, 1.0)
    return soh

cycle_numbers = df['cycle_number'].values
soh_labels = generate_synthetic_soh(cycle_numbers)

print(f"Synthetic SOH labels generated")
print(f"  • SOH range: {soh_labels.min():.3f} - {soh_labels.max():.3f}")
print(f"  • Mean SOH: {soh_labels.mean():.3f}")
print()

# Extract features
features = df[FEATURE_COLS].values
targets = soh_labels

print(f"Features extracted")
print(f"  • Feature shape: {features.shape}")
print(f"  • Target shape: {targets.shape}")
print()

Data loaded successfully from: ../data/cycles_example.csv
  • Data shape: (29, 7)
  • Columns: ['cycle_number', 'time_s', 'current_A', 'voltage_v', 'capacity_ah', 'temperature', 'status']

Synthetic SOH labels generated
  • SOH range: 0.980 - 1.000
  • Mean SOH: 0.995

Features extracted
  • Feature shape: (29, 5)
  • Target shape: (29,)



In [5]:
"""
Load battery cycling data from the data directory
"""

print("="*70)
print("LOADING DATA")
print("="*70 + "\n")

# Define file path
data_path = '../data/cycles_example.csv'

# Check if file exists
if not os.path.exists(data_path):
    raise FileNotFoundError(f"Data file not found: {data_path}")

# Load dataset
df = pd.read_csv(data_path)

print(f"Data loaded successfully from: {data_path}")
print(f"  • Data shape: {df.shape}")
print(f"  • Columns: {df.columns.tolist()}")

print("\n" + "-"*70)
print("Data Preview (first 5 rows):")
print("-"*70)
print(df.head())


LOADING DATA

Data loaded successfully from: ../data/cycles_example.csv
  • Data shape: (29, 7)
  • Columns: ['cycle_number', 'time_s', 'current_A', 'voltage_v', 'capacity_ah', 'temperature', 'status']

----------------------------------------------------------------------
Data Preview (first 5 rows):
----------------------------------------------------------------------
   cycle_number  time_s  current_A  voltage_v  capacity_ah  temperature  \
0             1       0        0.0       2.04          0.0           25   
1             1     300        1.1       2.18          0.2           25   
2             1    1800        1.1       2.59          0.4           25   
3             1    3600        1.1       3.31          0.6           25   
4             1    7200        1.1       3.60          0.8           25   

      status  
0       rest  
1  cc_charge  
2  cc_charge  
3  cc_charge  
4  cc_charge  


In [6]:
# 3. DATASET SPLITTING
# ============================================================================

def create_sequences(features, targets, seq_length):
    """Create sequences for time series prediction"""
    X, y = [], []
    for i in range(len(features) - seq_length):
        X.append(features[i:i+seq_length])
        y.append(targets[i+seq_length])
    return np.array(X), np.array(y)

# Configuration
SEQ_LENGTH = 10
TRAIN_RATIO = 0.7
VAL_RATIO = 0.15
TEST_RATIO = 0.15

# Create sequences
X, y = create_sequences(features, targets, SEQ_LENGTH)

print(f"Sequences created")
print(f"  • Sequence length: {SEQ_LENGTH}")
print(f"  • X shape: {X.shape}")
print(f"  • y shape: {y.shape}")
print()

# Calculate split indices
n_samples = len(X)
train_size = int(n_samples * TRAIN_RATIO)
val_size = int(n_samples * VAL_RATIO)

# Split the data
X_train = X[:train_size]
y_train = y[:train_size]

X_val = X[train_size:train_size+val_size]
y_val = y[train_size:train_size+val_size]

X_test = X[train_size+val_size:]
y_test = y[train_size+val_size:]

print(f"Dataset split completed")
print(f"  • Training set:   {len(X_train)} samples ({TRAIN_RATIO*100:.0f}%)")
print(f"  • Validation set: {len(X_val)} samples ({VAL_RATIO*100:.0f}%)")
print(f"  • Test set:       {len(X_test)} samples ({TEST_RATIO*100:.0f}%)")
print()

# Convert to PyTorch tensors
X_train_tensor = torch.FloatTensor(X_train)
y_train_tensor = torch.FloatTensor(y_train).reshape(-1, 1)

X_val_tensor = torch.FloatTensor(X_val)
y_val_tensor = torch.FloatTensor(y_val).reshape(-1, 1)

X_test_tensor = torch.FloatTensor(X_test)
y_test_tensor = torch.FloatTensor(y_test).reshape(-1, 1)

# Create DataLoaders
BATCH_SIZE = 4

train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

print(f"DataLoaders created with batch size: {BATCH_SIZE}\n")



Sequences created
  • Sequence length: 10
  • X shape: (19, 10, 5)
  • y shape: (19,)

Dataset split completed
  • Training set:   13 samples (70%)
  • Validation set: 2 samples (15%)
  • Test set:       4 samples (15%)

DataLoaders created with batch size: 4



In [7]:
# 4. MODEL TRAINING
# ============================================================================

# Model hyperparameters
INPUT_SIZE = len(FEATURE_COLS)
HIDDEN_SIZE = 32
NUM_LAYERS = 2
OUTPUT_SIZE = 1
DROPOUT = 0.2

# Training hyperparameters
NUM_EPOCHS = 100
LEARNING_RATE = 0.001
WEIGHT_DECAY = 1e-5

# Initialize model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SimpleLSTM(
    input_size=INPUT_SIZE,
    hidden_size=HIDDEN_SIZE,
    num_layers=NUM_LAYERS,
    output_size=OUTPUT_SIZE,
    dropout=DROPOUT
).to(device)

# Loss function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

# Learning rate scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode='min', factor=0.5, patience=10, verbose=False
)

# Print model information
model_info = model.get_model_info()
print("="*70)
print("MODEL CONFIGURATION")
print("="*70)
print(f"Model type: {model_info['model_type']}")
print(f"Input size: {model_info['input_size']}")
print(f"Hidden size: {model_info['hidden_size']}")
print(f"Number of layers: {model_info['num_layers']}")
print(f"Total parameters: {model_info['total_parameters']:,}")
print(f"Device: {device}")
print()
print("TRAINING CONFIGURATION")
print("="*70)
print(f"Epochs: {NUM_EPOCHS}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Learning rate: {LEARNING_RATE}")
print(f"Weight decay: {WEIGHT_DECAY}")
print("="*70)
print()

# Training functions
def train_epoch(model, train_loader, criterion, optimizer, device):
    """Train for one epoch"""
    model.train()
    total_loss = 0
    
    for batch_X, batch_y in train_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    return total_loss / len(train_loader)

def validate(model, val_loader, criterion, device):
    """Validate the model"""
    model.eval()
    total_loss = 0
    
    with torch.no_grad():
        for batch_X, batch_y in val_loader:
            batch_X = batch_X.to(device)
            batch_y = batch_y.to(device)
            
            outputs = model(batch_X)
            loss = criterion(outputs, batch_y)
            total_loss += loss.item()
    
    return total_loss / len(val_loader)

# Training loop
training_history = {
    'train_loss': [],
    'val_loss': [],
    'epochs': []
}

best_val_loss = float('inf')
patience_counter = 0
early_stopping_patience = 20

print("="*70)
print("TRAINING PROGRESS")
print("="*70)
print(f"{'Epoch':<8} {'Train Loss':<15} {'Val Loss':<15} {'Status':<20}")
print("-"*70)

for epoch in range(NUM_EPOCHS):
    # Train
    train_loss = train_epoch(model, train_loader, criterion, optimizer, device)
    
    # Validate
    val_loss = validate(model, val_loader, criterion, device)
    
    # Update learning rate
    scheduler.step(val_loss)
    
    # Save history
    training_history['epochs'].append(epoch + 1)
    training_history['train_loss'].append(train_loss)
    training_history['val_loss'].append(val_loss)
    
    # Check for improvement
    status = ""
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        status = "✓ Best model"
        # Save best model
        os.makedirs('../models', exist_ok=True)
        torch.save(model.state_dict(), '../models/best_model.pth')
    else:
        patience_counter += 1
        if patience_counter >= early_stopping_patience:
            status = "Early stopping"
    
    # Print progress
    if (epoch + 1) % 10 == 0 or epoch == 0 or status:
        print(f"{epoch+1:<8} {train_loss:<15.6f} {val_loss:<15.6f} {status:<20}")
    
    # Early stopping
    if patience_counter >= early_stopping_patience:
        print(f"\nEarly stopping triggered after {epoch+1} epochs")
        break

print("-"*70)
print(f"Training completed")
print(f"  • Total epochs: {len(training_history['epochs'])}")
print(f"  • Best validation loss: {best_val_loss:.6f}")
print()

MODEL CONFIGURATION
Model type: SimpleLSTM
Input size: 5
Hidden size: 32
Number of layers: 2
Total parameters: 13,473
Device: cuda

TRAINING CONFIGURATION
Epochs: 100
Batch size: 4
Learning rate: 0.001
Weight decay: 1e-05

TRAINING PROGRESS
Epoch    Train Loss      Val Loss        Status              
----------------------------------------------------------------------
1        0.792569        0.653830        ✓ Best model        
2        0.582704        0.448541        ✓ Best model        
3        0.381335        0.259527        ✓ Best model        
4        0.186860        0.106529        ✓ Best model        
5        0.072082        0.016635        ✓ Best model        
6        0.009538        0.003355        ✓ Best model        
10       0.007072        0.001547        ✓ Best model        
11       0.001662        0.000040        ✓ Best model        
15       0.002889        0.000013        ✓ Best model        
20       0.001641        0.000049                            
30    

In [8]:
# 5. MODEL TESTING AND RESULTS SAVING
# ============================================================================

# Load best model
model.load_state_dict(torch.load('../models/best_model.pth'))
model.eval()

print("="*70)
print("MODEL TESTING")
print("="*70)

# Make predictions
y_true_list = []
y_pred_list = []

with torch.no_grad():
    for batch_X, batch_y in test_loader:
        batch_X = batch_X.to(device)
        batch_y = batch_y.to(device)
        
        outputs = model(batch_X)
        
        y_true_list.extend(batch_y.cpu().numpy().flatten())
        y_pred_list.extend(outputs.cpu().numpy().flatten())

y_true = np.array(y_true_list)
y_pred = np.array(y_pred_list)

print(f"Predictions completed")
print(f"  • Test samples: {len(y_true)}")
print(f"  • True SOH range: {y_true.min():.3f} - {y_true.max():.3f}")
print(f"  • Predicted SOH range: {y_pred.min():.3f} - {y_pred.max():.3f}")
print()

# Prepare test results
test_results = {
    "soh_true": (y_true * 100).tolist(),
    "soh_predicted": (y_pred * 100).tolist(),
    "sample_indices": list(range(len(y_true)))
}


MODEL TESTING
Predictions completed
  • Test samples: 4
  • True SOH range: 0.991 - 1.000
  • Predicted SOH range: 0.995 - 0.997



In [9]:
# 6. PERFORMANCE EVALUATION AND ANALYSIS
# ============================================================================

def calculate_metrics(y_true, y_pred):
    """Calculate comprehensive evaluation metrics"""
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    
    errors = y_true - y_pred
    abs_errors = np.abs(errors)
    squared_errors = errors ** 2
    
    mae = np.mean(abs_errors)
    rmse = np.sqrt(np.mean(squared_errors))
    
    ss_res = np.sum(squared_errors)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    r2 = 1 - (ss_res / ss_tot) if ss_tot > 0 else 0
    
    mape = np.mean(abs_errors / (np.abs(y_true) + 1e-10)) * 100
    max_error = np.max(abs_errors)
    
    accuracy_3_percent = np.mean(abs_errors < 0.03) * 100
    accuracy_5_percent = np.mean(abs_errors < 0.05) * 100
    
    mean_error = np.mean(errors)
    std_error = np.std(errors)
    
    return {
        "mae": float(mae),
        "rmse": float(rmse),
        "r2_score": float(r2),
        "mape": float(mape),
        "max_error": float(max_error),
        "accuracy_within_3_percent": float(accuracy_3_percent),
        "accuracy_within_5_percent": float(accuracy_5_percent),
        "mean_error": float(mean_error),
        "std_error": float(std_error)
    }

# Calculate metrics
evaluation_metrics = calculate_metrics(y_true, y_pred)

print("="*70)
print("PERFORMANCE EVALUATION")
print("="*70)
print()
print("Regression Metrics:")
print(f"  • MAE:   {evaluation_metrics['mae']:.6f}  ({evaluation_metrics['mae']*100:.4f}%)")
print(f"  • RMSE:  {evaluation_metrics['rmse']:.6f}  ({evaluation_metrics['rmse']*100:.4f}%)")
print(f"  • R²:    {evaluation_metrics['r2_score']:.6f}")
print(f"  • MAPE:  {evaluation_metrics['mape']:.4f}%")
print()
print("Error Analysis:")
print(f"  • Max Error: {evaluation_metrics['max_error']:.6f}  ({evaluation_metrics['max_error']*100:.4f}%)")
print(f"  • Mean Error: {evaluation_metrics['mean_error']:.6f}")
print(f"  • Std Error: {evaluation_metrics['std_error']:.6f}")
print()
print("Accuracy Metrics:")
print(f"  • Within ±3%: {evaluation_metrics['accuracy_within_3_percent']:.2f}%")
print(f"  • Within ±5%: {evaluation_metrics['accuracy_within_5_percent']:.2f}%")
print()

# Performance assessment
mae_percent = evaluation_metrics['mae'] * 100
if mae_percent < 2:
    assessment = "Excellent performance (MAE < 2%)"
elif mae_percent < 3:
    assessment = "Very good performance (MAE < 3%)"
elif mae_percent < 5:
    assessment = "Good performance (MAE < 5%)"
elif mae_percent < 10:
    assessment = "Acceptable performance (MAE < 10%)"
else:
    assessment = "Needs improvement (MAE ≥ 10%)"

print(f"Performance Assessment: {assessment}")
print("="*70)
print()


# ============================================================================
# SAVE COMPLETE RESULTS
# ============================================================================

complete_results = {
    "model_info": {
        "model_type": model_info['model_type'],
        "model_version": "baseline_v1.0",
        "architecture": {
            "input_size": model_info['input_size'],
            "hidden_size": model_info['hidden_size'],
            "num_layers": model_info['num_layers'],
            "output_size": model_info['output_size'],
            "dropout": DROPOUT
        },
        "total_parameters": model_info['total_parameters'],
        "input_features": FEATURE_COLS,
        "sequence_length": SEQ_LENGTH
    },
    "training_configuration": {
        "num_epochs": len(training_history['epochs']),
        "batch_size": BATCH_SIZE,
        "learning_rate": LEARNING_RATE,
        "weight_decay": WEIGHT_DECAY,
        "optimizer": "Adam",
        "loss_function": "MSE"
    },
    "training_history": training_history,
    "dataset_info": {
        "data_source": DATA_PATH,
        "total_sequences": n_samples,
        "train_samples": len(X_train),
        "val_samples": len(X_val),
        "test_samples": len(X_test)
    },
    "test_results": test_results,
    "evaluation_metrics": evaluation_metrics,
    "metadata": {
        "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
        "pytorch_version": torch.__version__,
        "device": str(device),
        "seed": SEED
    }
}

# Save results
results_dir = '../results'
os.makedirs(results_dir, exist_ok=True)
output_path = os.path.join(results_dir, 'soh_estimation_complete_results.json')

with open(output_path, 'w') as f:
    json.dump(complete_results, f, indent=4)

print("="*70)
print("RESULTS SAVED")
print("="*70)
print(f"Complete results: {os.path.abspath(output_path)}")
print(f"Model weights: ../models/best_model.pth")
print("="*70)
print()
print("WORKFLOW COMPLETED SUCCESSFULLY!")


PERFORMANCE EVALUATION

Regression Metrics:
  • MAE:   0.003134  (0.3134%)
  • RMSE:  0.003417  (0.3417%)
  • R²:    0.139523
  • MAPE:  0.3146%

Error Analysis:
  • Max Error: 0.004947  (0.4947%)
  • Mean Error: 0.000661
  • Std Error: 0.003352

Accuracy Metrics:
  • Within ±3%: 100.00%
  • Within ±5%: 100.00%

Performance Assessment: Excellent performance (MAE < 2%)

RESULTS SAVED
Complete results: C:\Users\19069\GitHub_Trustworthy Battery AI\results\soh_estimation_complete_results.json
Model weights: ../models/best_model.pth

WORKFLOW COMPLETED SUCCESSFULLY!
