# Regular Expression Handling version

In [1]:
import re
import matplotlib.pyplot as plt

# Location of the training log that this cell scans
log_file_path = '/datasets/soc-20250703225140/nemotron_sft_output.log'

# Matches a training-progress line of the form
#   "Training epoch E, iteration I/T | lr: ... | global_batch_size: ... | global_step: S | reduced_train_loss: X"
# group(1) captures I (the iteration number) and group(2) captures X (the loss value).
pattern = re.compile(
    r'Training epoch \d+,\s*iteration\s+(\d+)/\d+\s*\|\s*lr:[^|]+\|\s*global_batch_size:[^|]+\|\s*global_step:\s*\d+\s*\|\s*reduced_train_loss:\s*([0-9]*\.?[0-9]+(?:e[-+]?\d+)?)'
)

# Parallel lists: iterations[i] is the iteration at which losses[i] was logged
iterations, losses = [], []

# Stream the log line by line and keep only the lines the pattern recognises
with open(log_file_path, 'r') as log_handle:
    for hit in filter(None, map(pattern.search, log_handle)):
        iterations.append(int(hit.group(1)))
        losses.append(float(hit.group(2)))

# Draw the loss curve and display it inline
plt.figure()
plt.plot(iterations, losses, linestyle='-', marker='o')
plt.xlabel('Iteration')
plt.ylabel('Reduced Training Loss')
plt.title('Training Loss vs Iteration')
plt.grid(True)
plt.tight_layout()
plt.show()


In [10]:
import re
import matplotlib.pyplot as plt

# Location of the training log that this cell scans
log_file_path = '/datasets/soc-20250703225140/nemotron_sft_output.log'

# Training-progress line: group(1) is the iteration number, group(2) the reduced_train_loss value
train_pattern = re.compile(
    r'Training epoch \d+,\s*iteration\s+(\d+)/\d+\s*\|\s*lr:[^|]+\|\s*global_batch_size:[^|]+\|\s*global_step:\s*\d+\s*\|\s*reduced_train_loss:\s*([0-9]*\.?[0-9]+(?:e[-+]?\d+)?)'
)
# Validation line ("... global step S: 'val_loss' reached X"): group(1) is S, group(2) is X
val_pattern = re.compile(
    r"Epoch\s+\d+,\s*global step\s+(\d+):\s*'val_loss'\s*reached\s*([0-9]*\.?[0-9]+(?:e[-+]?\d+)?)"
)

# Parallel x/y lists for each metric
train_iterations, train_losses = [], []
val_steps, val_losses = [], []

# Single pass over the log; every line is tried against the training pattern
# first and the validation pattern second.
with open(log_file_path, 'r') as log_handle:
    for log_line in log_handle:
        for regex, xs, ys in (
            (train_pattern, train_iterations, train_losses),
            (val_pattern, val_steps, val_losses),
        ):
            found = regex.search(log_line)
            if found is None:
                continue
            xs.append(int(found.group(1)))
            ys.append(float(found.group(2)))

# Render one figure per metric and write each to a PNG in the working directory
for xs, ys, x_label, y_label, heading, out_name in (
    (train_iterations, train_losses, 'Iteration', 'Reduced Training Loss',
     'Training Loss vs Iteration', 'training_loss.png'),
    (val_steps, val_losses, 'Global Step', 'Validation Loss',
     'Validation Loss vs Global Step', 'validation_loss.png'),
):
    plt.figure()
    plt.plot(xs, ys, marker='o', linestyle='-')
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(heading)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(out_name)
    # Close the figure so it is neither shown inline nor kept in memory
    plt.close()


# TensorBoard Parsing Version

In [39]:
import tensorflow as tf
from tensorflow.python.framework import tensor_util
import os

def get_values_tensorflow(filepath, tag, steps, values):
    """
    Collect every summary value recorded under `tag` in one event file.

    The file at `filepath` is read with tf.compat.v1.train.summary_iterator.
    For each matching summary value, the event's step is appended to `steps`
    and the scalar is appended to `values`; both lists are mutated in place
    and nothing is returned.

    The scalar is the first element of the value's tensor field; when decoding
    that field raises TypeError, AttributeError or ValueError, the value's
    simple_value field is used instead.
    """
    for record in tf.compat.v1.train.summary_iterator(filepath):
        for entry in record.summary.value:
            if entry.tag != tag:
                continue
            try:
                # Preferred source: the tensor payload, reduced to its first element
                scalar = tensor_util.MakeNdarray(entry.tensor).item(0)
            except (TypeError, AttributeError, ValueError):
                # Tensor payload could not be decoded; use the plain scalar field
                scalar = entry.simple_value
            steps.append(record.step)
            values.append(scalar)

def _collect_sorted_series(event_files_folder_path, event_files, tag):
    """Gather (step, value) pairs for `tag` from all event files, sorted by step.

    Returns a pair (steps, values) of equal-length sequences; both are empty
    lists when no event carried the tag.
    """
    steps, vals = [], []
    for name in event_files:
        get_values_tensorflow(os.path.join(event_files_folder_path, name), tag, steps, vals)

    # Files are read in directory-listing order, so sort the merged pairs
    # to get a monotonically increasing x-axis.
    pairs = sorted(zip(steps, vals))
    if not pairs:
        return [], []
    sorted_steps, sorted_vals = zip(*pairs)
    return sorted_steps, sorted_vals


def _save_loss_plot(steps, values, title, ylabel, out_path):
    """Draw `values` against `steps` as a line chart and write it to `out_path`."""
    # Imported here (as the original function did) so matplotlib is only
    # required when a graph is actually produced.
    import matplotlib.pyplot as plt

    plt.figure()
    plt.plot(steps, values, marker='o', linestyle='-')
    plt.title(title)
    plt.xlabel("Global Step")
    plt.ylabel(ylabel)
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(out_path)
    # Close the figure so repeated calls do not accumulate open figures
    plt.close()


def create_loss_graphs(event_files_folder_path, experiment_name, saving_folder_path):
    """
    Reads all event files in the folder, extracts train & val loss, and saves PNG graphs.

    Args:
        event_files_folder_path: Folder scanned (non-recursively) for files
            whose names start with "events".
        experiment_name: Label used as the prefix of both plot titles.
        saving_folder_path: Folder that receives "train_loss.png" and
            "val_loss.png". It is created if it does not exist; an empty
            string means the current working directory.

    Returns:
        None. The two PNG files are the only output. When a tag has no data
        an empty plot is still written.
    """
    # Tags used in your NeMo logs
    train_tag = "reduced_train_loss"
    val_tag = "val_loss"

    # Collect event file names
    event_files = [f for f in os.listdir(event_files_folder_path) if f.startswith("events")]

    ts, tv = _collect_sorted_series(event_files_folder_path, event_files, train_tag)
    vs, vv = _collect_sorted_series(event_files_folder_path, event_files, val_tag)

    # savefig does not create missing directories; make sure the target exists.
    # An empty path is left alone because os.makedirs("") raises, while
    # os.path.join("", name) already resolves to the current directory.
    if saving_folder_path:
        os.makedirs(saving_folder_path, exist_ok=True)

    _save_loss_plot(
        ts, tv,
        f"{experiment_name} - Train Loss",
        "Reduced Training Loss",
        os.path.join(saving_folder_path, "train_loss.png"),
    )
    _save_loss_plot(
        vs, vv,
        f"{experiment_name} - Validation Loss",
        "Validation Loss",
        os.path.join(saving_folder_path, "val_loss.png"),
    )

# Example usage:
# create_loss_graphs("/datasets/soc-20250703225140/nemo_checkpoints/nemotron_49b_super_custom_finetune/tb_logs", 
#                    "nemotron_49b_super_custom_finetune", "/datasets/soc-20250703225140/")


In [36]:
# Names in the folder that contain 'events.out.', in sorted order.
# event_files_folder_path must already be defined by an earlier cell.
event_files = sorted(
    filename
    for filename in os.listdir(event_files_folder_path)
    if 'events.out.' in filename
)
# Inspect only the first file; get_tags is defined outside this cell and
# presumably returns the tag names stored in that file (see the printed output).
tags = get_tags(os.path.join(event_files_folder_path, event_files[0]))
print(tags)

['lr', 'global_batch_size', 'global_step', 'reduced_train_loss', 'grad_norm', 'num_zeros_in_grad', 'train_step_timing in s', 'consumed_samples', 'validation_step_timing in s', 'val_loss', 'epoch']
