## ROS2 Diagnostics Runtime Monitor
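Subscribe to `diagnostic_msgs/DiagnosticArray` messages and compute sliding-window runtime statistics (mean, standard deviation, min/max, median, and 95th/99th percentiles) for every timing value they contain.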

In [None]:
# Notebook shell magic: install the Python packages imported by the monitoring cell below.
!pip install rclpy diagnostic-msgs pandas numpy==1.25.0 matplotlib seaborn

In [None]:
import rclpy
from rclpy.node import Node
from diagnostic_msgs.msg import DiagnosticArray
import time
import threading
import pandas as pd
import numpy as np
from collections import defaultdict, deque
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

class DiagnosticsMonitor(Node):
    """ROS2 node that keeps a sliding window of timing values from diagnostics.

    Every ``KeyValue`` entry of an incoming ``DiagnosticArray`` whose value has
    the form ``"<number> <unit>"`` with a recognised time unit is converted to
    milliseconds and stored under the metric name ``"<status name>: <key>"``
    together with the wall-clock time at which it arrived.  Samples older than
    the window are discarded both when new data arrives and whenever the data
    is read, so statistics never include samples outside the window.
    """

    # Divisor that converts a value expressed in the given unit to milliseconds.
    # Publishers spell the micro prefix in different ways, so all of them are
    # accepted: MICRO SIGN (U+00B5), GREEK SMALL LETTER MU (U+03BC) and ASCII "u".
    _UNIT_TO_MS_DIVISOR = {
        'ns': 1_000_000.0,
        'µs': 1_000.0,
        'μs': 1_000.0,
        'us': 1_000.0,
        'ms': 1.0,
    }

    # Column order of the DataFrame returned by get_dataframe().
    _DATAFRAME_COLUMNS = ['metric', 'value_ms', 'timestamp', 'datetime']

    def __init__(self, window_duration_sec=60, topic_name='/diagnostics'):
        """Create the node and subscribe to the diagnostics topic.

        Args:
            window_duration_sec: Length of the sliding window in seconds.
            topic_name: Topic carrying ``DiagnosticArray`` messages.
        """
        super().__init__('diagnostics_monitor')

        # Configuration
        self.window_duration = window_duration_sec
        # Guards timing_data/timestamps: the callback runs on the executor
        # thread while the accessors are called from the notebook thread.
        self.data_lock = threading.Lock()

        # Per-metric sliding windows; the two deques of a metric always have
        # the same length (value i was received at timestamp i).
        self.timing_data = defaultdict(deque)
        self.timestamps = defaultdict(deque)

        # Statistics storage (not populated by this class itself).
        self.stats_history = []
        self.last_stats_time = time.time()

        self.subscription = self.create_subscription(
            DiagnosticArray,
            topic_name,
            self.diagnostics_callback,
            10
        )

        self.get_logger().info(f'Started diagnostics monitor with {window_duration_sec}s window')

    @staticmethod
    def _parse_timing_ms(raw_value):
        """Return ``raw_value`` ("<number> <unit>") in milliseconds, or None.

        None is returned when the string is not a two-token value/unit pair,
        when the unit is not a known time unit, or when the number does not
        parse as a float.
        """
        parts = raw_value.split()
        if len(parts) != 2:
            return None

        number_str, unit_str = parts
        divisor = DiagnosticsMonitor._UNIT_TO_MS_DIVISOR.get(unit_str)
        if divisor is None:
            return None

        try:
            return float(number_str) / divisor
        except ValueError:
            return None

    def diagnostics_callback(self, msg):
        """Store every timing value found in ``msg`` and prune expired samples."""
        current_time = time.time()

        with self.data_lock:
            for status in msg.status:
                for value in status.values:
                    timing_value_ms = self._parse_timing_ms(value.value)
                    if timing_value_ms is None:
                        continue  # Not a timing value; ignore it.

                    metric_name = f"{status.name}: {value.key}"
                    self.timing_data[metric_name].append(timing_value_ms)
                    self.timestamps[metric_name].append(current_time)
                    self._cleanup_old_data(metric_name, current_time)

    def _cleanup_old_data(self, metric_name, current_time):
        """Remove data points older than the window duration.

        Must be called with ``data_lock`` held.
        """
        cutoff_time = current_time - self.window_duration
        stamps = self.timestamps[metric_name]
        values = self.timing_data[metric_name]

        while stamps and stamps[0] < cutoff_time:
            stamps.popleft()
            values.popleft()

    def get_current_stats(self):
        """Calculate statistics over the current window for all metrics.

        Returns:
            Dict mapping metric name to a dict with the keys ``count``,
            ``mean``, ``std``, ``min``, ``max``, ``median``, ``p95``, ``p99``,
            ``latest`` (all in milliseconds except ``count``) and
            ``data_span_sec``.  Metrics with no sample inside the window are
            omitted.
        """
        stats = {}
        current_time = time.time()

        with self.data_lock:
            for metric_name in list(self.timing_data):
                # Prune here as well: a metric that stopped publishing would
                # otherwise keep reporting samples older than the window.
                self._cleanup_old_data(metric_name, current_time)

                samples = self.timing_data[metric_name]
                if not samples:
                    continue

                stamps = self.timestamps[metric_name]
                data = np.fromiter(samples, dtype=float, count=len(samples))
                p95, p99 = np.percentile(data, [95, 99])

                stats[metric_name] = {
                    'count': len(samples),
                    'mean': np.mean(data),
                    'std': np.std(data),
                    'min': np.min(data),
                    'max': np.max(data),
                    'median': np.median(data),
                    'p95': p95,
                    'p99': p99,
                    'latest': samples[-1],
                    'data_span_sec': stamps[-1] - stamps[0] if len(stamps) > 1 else 0
                }

        return stats

    def print_stats(self, precision='us'):
        """Print current statistics with configurable precision.

        Args:
            precision: 'us' for microseconds, 'ns' for nanoseconds; any other
                value prints milliseconds.
        """
        stats = self.get_current_stats()

        if not stats:
            print("No timing data received yet...")
            return

        # Values are stored in milliseconds; pick the display scale.
        if precision == 'us':
            multiplier, unit, decimals = 1000, 'μs', 1
        elif precision == 'ns':
            multiplier, unit, decimals = 1000000, 'ns', 0
        else:  # ms
            multiplier, unit, decimals = 1, 'ms', 6

        print(f"\n{'='*80}")
        print(f"Runtime Statistics (Last {self.window_duration}s) - Precision: {unit}")
        print(f"Timestamp: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        print(f"{'='*80}")

        # (label, stats key) in print order; labels carry their own padding.
        rows = (
            ("  Mean:  ", 'mean'),
            ("  Std:   ", 'std'),
            ("  Min:   ", 'min'),
            ("  Max:   ", 'max'),
            ("  Median:", 'median'),
            ("  P95:   ", 'p95'),
            ("  P99:   ", 'p99'),
            ("  Latest:", 'latest'),
        )

        for metric_name, metric_stats in stats.items():
            print(f"\n{metric_name}:")
            print(f"  Count: {metric_stats['count']} samples")
            for label, key in rows:
                print(f"{label}{metric_stats[key]*multiplier:.{decimals}f} {unit}")
            print(f"  Data span: {metric_stats['data_span_sec']:.1f}s")

    def get_dataframe(self):
        """Return the samples currently inside the window as a DataFrame.

        Returns:
            DataFrame with one row per sample and the columns ``metric``,
            ``value_ms``, ``timestamp`` (epoch seconds) and ``datetime``.
            The columns are present even when there is no data.
        """
        data_rows = []
        current_time = time.time()

        with self.data_lock:
            for metric_name in list(self.timing_data):
                self._cleanup_old_data(metric_name, current_time)

                for value, timestamp in zip(self.timing_data[metric_name],
                                            self.timestamps[metric_name]):
                    data_rows.append({
                        'metric': metric_name,
                        'value_ms': value,
                        'timestamp': timestamp,
                        'datetime': datetime.fromtimestamp(timestamp)
                    })

        return pd.DataFrame(data_rows, columns=self._DATAFRAME_COLUMNS)

# Jupyter Notebook Interface Functions
class DiagnosticsAnalyzer:
    """Notebook-friendly wrapper that runs a DiagnosticsMonitor in the background.

    ``start_monitoring`` creates the node and spins it on a daemon thread;
    the remaining methods read statistics from the running monitor and
    ``stop_monitoring`` tears everything down again.
    """

    def __init__(self):
        self.monitor = None
        self.node_thread = None
        self.executor = None
        # True when start_monitoring() initialised rclpy itself and is
        # therefore responsible for shutting it down again.
        self._owns_rclpy = False

    @staticmethod
    def _precision_config(precision):
        """Return ``(multiplier, unit, decimals)`` for displaying ms values.

        'us' and 'ns' select microseconds and nanoseconds; any other value
        falls back to milliseconds.
        """
        if precision == 'us':
            return 1000, 'μs', 1
        if precision == 'ns':
            return 1000000, 'ns', 0
        return 1, 'ms', 6

    def start_monitoring(self, window_duration_sec=60, topic_name='/diagnostics'):
        """Start the ROS2 diagnostics monitoring.

        Args:
            window_duration_sec: Sliding-window length passed to the monitor.
            topic_name: Topic to subscribe to.
        """
        if self.monitor is not None:
            print("Monitor already running. Stop it first.")
            return

        # Imported explicitly; `import rclpy` alone does not promise that the
        # `rclpy.executors` submodule is reachable as an attribute.
        from rclpy.executors import SingleThreadedExecutor

        # rclpy.init() raises when the context is already initialised (for
        # example by an earlier cell), so only initialise when needed.
        self._owns_rclpy = not rclpy.ok()
        if self._owns_rclpy:
            rclpy.init()

        try:
            monitor = DiagnosticsMonitor(window_duration_sec)

            # Re-subscribe when a non-default topic is requested.
            if topic_name != '/diagnostics':
                # Go through the node so it also forgets the old subscription.
                monitor.destroy_subscription(monitor.subscription)
                monitor.subscription = monitor.create_subscription(
                    DiagnosticArray,
                    topic_name,
                    monitor.diagnostics_callback,
                    10
                )

            executor = SingleThreadedExecutor()
            executor.add_node(monitor)
        except Exception:
            # Do not leave rclpy initialised behind a failed start.
            if self._owns_rclpy:
                rclpy.shutdown()
                self._owns_rclpy = False
            raise

        def spin_node():
            try:
                executor.spin()
            except Exception as e:
                print(f"Node spinning error: {e}")

        self.monitor = monitor
        self.executor = executor
        self.node_thread = threading.Thread(target=spin_node, daemon=True)
        self.node_thread.start()

        print(f"Started monitoring diagnostics on topic: {topic_name}")
        print(f"Window duration: {window_duration_sec} seconds")

    def stop_monitoring(self):
        """Stop the ROS2 diagnostics monitoring and release all resources."""
        if self.monitor is None:
            print("No monitor running.")
            return

        try:
            self.executor.shutdown()
            # Let the spin thread leave its loop before the node goes away.
            if self.node_thread is not None:
                self.node_thread.join(timeout=5.0)
            self.monitor.destroy_node()
        finally:
            # Always reset state so a later start_monitoring() can succeed.
            if self._owns_rclpy and rclpy.ok():
                rclpy.shutdown()
            self._owns_rclpy = False
            self.monitor = None
            self.executor = None
            self.node_thread = None

        print("Stopped monitoring.")

    def get_stats(self):
        """Return the monitor's current statistics, or None when not running."""
        if self.monitor is None:
            print("Monitor not running. Call start_monitoring() first.")
            return None

        return self.monitor.get_current_stats()

    def print_stats(self, precision='us'):
        """Print current statistics with configurable precision."""
        if self.monitor is None:
            print("Monitor not running. Call start_monitoring() first.")
            return

        self.monitor.print_stats(precision)

    def summary_stats_table(self, precision='us'):
        """Return a DataFrame with one formatted row of statistics per metric.

        Args:
            precision: 'us', 'ns' or anything else for milliseconds.

        Returns:
            DataFrame with the columns Metric, Count, Mean, Std, Min, Max,
            P95 and P99 (numeric columns are formatted strings with the unit
            in the column name); an empty DataFrame when there is no data.
        """
        stats = self.get_stats()

        if not stats:
            print("No statistics available.")
            return pd.DataFrame()

        multiplier, unit, decimals = self._precision_config(precision)

        rows = []
        for metric_name, metric_stats in stats.items():
            rows.append({
                'Metric': metric_name,
                'Count': metric_stats['count'],
                f'Mean ({unit})': f"{metric_stats['mean']*multiplier:.{decimals}f}",
                f'Std ({unit})': f"{metric_stats['std']*multiplier:.{decimals}f}",
                f'Min ({unit})': f"{metric_stats['min']*multiplier:.{decimals}f}",
                f'Max ({unit})': f"{metric_stats['max']*multiplier:.{decimals}f}",
                f'P95 ({unit})': f"{metric_stats['p95']*multiplier:.{decimals}f}",
                f'P99 ({unit})': f"{metric_stats['p99']*multiplier:.{decimals}f}"
            })

        return pd.DataFrame(rows)

    def get_dataframe(self):
        """Return the monitor's raw samples as a DataFrame (empty if not running)."""
        if self.monitor is None:
            print("Monitor not running. Call start_monitoring() first.")
            return pd.DataFrame()

        return self.monitor.get_dataframe()

    def plot_metrics(self, figsize=(12, 8), precision='us'):
        """Plot every metric over time, one subplot per metric.

        Args:
            figsize: Size of the whole figure.
            precision: 'us', 'ns' or anything else for milliseconds.
        """
        df = self.get_dataframe()

        if df.empty:
            print("No data to plot yet.")
            return

        multiplier, unit, _ = self._precision_config(precision)

        # Work on a copy so the caller-visible frame is not modified.
        df_plot = df.copy()
        df_plot['value_converted'] = df_plot['value_ms'] * multiplier

        metrics = df_plot['metric'].unique()
        n_metrics = len(metrics)

        if n_metrics == 0:
            print("No metrics found.")
            return

        # squeeze=False keeps `axes` two-dimensional even for a single metric.
        fig, axes = plt.subplots(n_metrics, 1, figsize=figsize, squeeze=False)

        for i, metric in enumerate(metrics):
            metric_data = df_plot[df_plot['metric'] == metric]

            ax = axes[i, 0]
            # Convert Series to NumPy arrays before plotting
            ax.plot(metric_data['datetime'].to_numpy(), metric_data['value_converted'].to_numpy(), 'b-', alpha=0.7)
            ax.set_title(f'{metric}')
            ax.set_ylabel(f'Time ({unit})')
            ax.grid(True, alpha=0.3)

            # Mark the mean as a horizontal reference line
            mean_val = metric_data['value_converted'].mean()
            ax.axhline(y=mean_val, color='r', linestyle='--', alpha=0.8,
                      label=f'Mean: {mean_val:.1f}{unit}')
            ax.legend()

        plt.xlabel('Time')
        plt.tight_layout()
        plt.show()

    def get_high_precision_stats(self):
        """Return unformatted statistics in several units for further analysis.

        Returns:
            Dict mapping metric name to a dict holding the count, the mean and
            std in ms/us/ns, min/max/p95/p99 in ms/us, and the coefficient of
            variation (std / mean, or 0 when the mean is not positive).
            Empty dict when no statistics are available.
        """
        stats = self.get_stats()

        if not stats:
            return {}

        detailed_stats = {}
        for metric_name, metric_stats in stats.items():
            detailed_stats[metric_name] = {
                'count': metric_stats['count'],
                'mean_ms': metric_stats['mean'],
                'mean_us': metric_stats['mean'] * 1000,
                'mean_ns': metric_stats['mean'] * 1000000,
                'std_ms': metric_stats['std'],
                'std_us': metric_stats['std'] * 1000,
                'std_ns': metric_stats['std'] * 1000000,
                'min_ms': metric_stats['min'],
                'min_us': metric_stats['min'] * 1000,
                'max_ms': metric_stats['max'],
                'max_us': metric_stats['max'] * 1000,
                'p95_ms': metric_stats['p95'],
                'p95_us': metric_stats['p95'] * 1000,
                'p99_ms': metric_stats['p99'],
                'p99_us': metric_stats['p99'] * 1000,
                'coefficient_of_variation': metric_stats['std'] / metric_stats['mean'] if metric_stats['mean'] > 0 else 0
            }

        return detailed_stats

In [None]:
# Create the analyzer
analyzer = DiagnosticsAnalyzer()

# Start monitoring (60-second window)
analyzer.start_monitoring(window_duration_sec=60, topic_name='/diagnostics')

import time

# Everything below runs with the monitor active; the finally clause makes
# sure the node is stopped even if one of the steps raises, so the cell can
# be re-run without hitting "Monitor already running".
try:
    # Let it collect data for a while, then check stats
    time.sleep(65)  # Wait for more than window duration

    # Print statistics
    analyzer.print_stats()

    # Get statistics as a nice table
    stats_df = analyzer.summary_stats_table()
    display(stats_df)

    # Plot the metrics over time
    analyzer.plot_metrics()

    # Get raw data as DataFrame for custom analysis
    df = analyzer.get_dataframe()
    print(f"Collected {len(df)} data points")
finally:
    # Stop monitoring when done
    analyzer.stop_monitoring()

### Jetson diagnostics from jetson-stats (read from a recorded bag)

In [None]:
# Notebook shell magic: install the Python packages imported by the bag-analysis cell below.
!pip install rosbag2-py matplotlib seaborn pandas numpy==1.25.0 pyyaml

In [None]:
import rosbag2_py
import rclpy
from rclpy.serialization import deserialize_message
from rosidl_runtime_py.utilities import get_message
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
import pandas as pd

def _float_or_nan(text):
    """Parse ``text`` as a float; return NaN when it is not numeric."""
    try:
        return float(text)
    except (TypeError, ValueError):
        return float('nan')


def read_bag_diagnostics(bag_path, topic_name='/diagnostics', storage_id='sqlite3'):
    """Read jetson_stats diagnostics from a ROS2 bag.

    Args:
        bag_path: Path (URI) of the bag to open.
        topic_name: Topic that carries the diagnostics messages.
        storage_id: rosbag2 storage plugin used to open the bag.

    Returns:
        Dict with the keys:
            ``timestamps``: bag time in seconds of every message that
                contained at least one of the statuses handled below,
            ``cpu_usage`` / ``cpu_freq``: dicts ``cpu_<n>`` -> list of
                usage values / frequencies (raw ``Freq`` value divided by
                1000; 0 when the status has no ``Freq`` entry),
            ``gpu_data``: GPU usage values,
            ``memory_data``: used RAM values,
            ``temp_data``: dict sensor name -> list of positive readings,
            ``power_data_curr`` / ``power_data_avg``: power readings.
        A value that cannot be parsed is stored as NaN so that the series
        stays aligned with ``timestamps`` instead of aborting the read.
    """
    # Best-effort initialisation, kept from the original behaviour: a failure
    # (for example an already initialised context) must not abort the read.
    try:
        if not rclpy.ok():
            rclpy.init()
    except RuntimeError:
        pass

    # Create reader
    storage_options = rosbag2_py.StorageOptions(uri=bag_path, storage_id=storage_id)
    converter_options = rosbag2_py.ConverterOptions(
        input_serialization_format='cdr',
        output_serialization_format='cdr'
    )

    reader = rosbag2_py.SequentialReader()
    reader.open(storage_options, converter_options)

    # Map topic name -> message type string
    type_map = {topic.name: topic.type for topic in reader.get_all_topics_and_types()}

    # Data storage.  The first eight cores are pre-created to keep the
    # returned layout stable; further cores are added on demand.
    timestamps = []
    cpu_usage = {f'cpu_{i}': [] for i in range(8)}
    cpu_freq = {f'cpu_{i}': [] for i in range(8)}
    gpu_data = []
    memory_data = []
    temp_data = {}
    power_data_curr = []
    power_data_avg = []

    msg_type = None  # resolved once, on the first matching message

    while reader.has_next():
        (topic, data, timestamp) = reader.read_next()

        if topic != topic_name:
            continue

        if msg_type is None:
            msg_type = get_message(type_map[topic])
        msg = deserialize_message(data, msg_type)

        # Set when the message holds any status handled below.  Other nodes
        # may publish on the same topic; their messages must not add
        # timestamps, or every series would drift against the time axis.
        has_jetson_status = False

        for status in msg.status:
            if 'jetson_stats cpu' in status.name:
                # Extract CPU number from name like "jetson_stats cpu 0"
                try:
                    cpu_num = int(status.name.split()[-1])
                except ValueError:
                    continue  # No core index: cannot attribute the sample.
                has_jetson_status = True
                cpu_key = f'cpu_{cpu_num}'

                # The usage is carried in the message, e.g. "12%"
                cpu_usage.setdefault(cpu_key, []).append(
                    _float_or_nan(status.message.rstrip('%')))

                # The frequency is carried in the 'Freq' value
                raw_freq = next(
                    (value.value for value in status.values if value.key == 'Freq'),
                    None)
                if raw_freq is None:
                    freq_val = 0
                else:
                    freq_val = _float_or_nan(raw_freq) / 1000.0  # Convert kHz to MHz
                cpu_freq.setdefault(cpu_key, []).append(freq_val)

            elif 'jetson_stats gpu' in status.name:
                has_jetson_status = True
                gpu_data.append(_float_or_nan(status.message.rstrip('%')))

            elif 'jetson_stats mem ram' in status.name:
                has_jetson_status = True
                # Message looks like "<used>GB/<total>GB"; keep the used part
                used_part = status.message.split('/')[0]
                memory_data.append(_float_or_nan(used_part.rstrip('GB')))

            elif 'jetson_stats temp' in status.name:
                has_jetson_status = True
                for value in status.values:
                    if value.key in ['cpu', 'gpu', 'soc0', 'soc1', 'soc2']:
                        temp_val = _float_or_nan(value.value)
                        # Filter out invalid readings (NaN fails this test too)
                        if temp_val > 0:
                            temp_data.setdefault(value.key, []).append(temp_val)

            elif 'jetson_stats power' in status.name:
                has_jetson_status = True
                # Message looks like "curr=<n>mW avg=<n>mW"
                power_info = status.message.split()
                try:
                    curr_power = int(power_info[0].split('=')[1].rstrip('mW'))
                    avg_power = int(power_info[1].split('=')[1].rstrip('mW'))
                except (IndexError, ValueError):
                    curr_power = avg_power = float('nan')
                power_data_curr.append(curr_power)
                power_data_avg.append(avg_power)

        if has_jetson_status:
            # Bag timestamps are in nanoseconds
            timestamps.append(timestamp / 1e9)

    return {
        'timestamps': timestamps,
        'cpu_usage': cpu_usage,
        'cpu_freq': cpu_freq,
        'gpu_data': gpu_data,
        'memory_data': memory_data,
        'temp_data': temp_data,
        'power_data_curr': power_data_curr,
        'power_data_avg': power_data_avg
    }

def _series_xy(timestamps, values):
    """Return ``(x, y)`` truncated to a common length for plotting.

    Each series is plotted against the first ``len(values)`` timestamps; the
    truncation also covers a series that is longer than the timestamp list,
    which would otherwise make matplotlib raise a shape-mismatch error.
    """
    n = min(len(timestamps), len(values))
    return timestamps[:n], values[:n]


def plot_diagnostics(data):
    """Plot the diagnostics returned by ``read_bag_diagnostics``.

    Draws a 2x2 overview (CPU usage with frequency, GPU usage, memory,
    temperatures), then a detailed per-core CPU figure when CPU data exists,
    and finally a power figure when power data exists.

    Args:
        data: Dict with the keys ``timestamps``, ``cpu_usage``, ``cpu_freq``,
            ``gpu_data``, ``memory_data``, ``temp_data``, ``power_data_curr``
            and ``power_data_avg``.
    """
    timestamps = np.array(data['timestamps'])
    # Convert to relative time (seconds from start)
    if len(timestamps) > 0:
        timestamps = timestamps - timestamps[0]

    # Overview figure
    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('Jetson Orin Diagnostics', fontsize=16)

    # CPU usage and frequency share one plot with two y-axes
    ax1 = axes[0, 0]
    ax1_freq = ax1.twinx()  # Secondary y-axis for frequency

    # One colour per CPU, reused cyclically when there are more cores
    colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', "#0F9E97", "#32E184"]

    for i, (cpu_name, cpu_usage_vals) in enumerate(data['cpu_usage'].items()):
        if len(cpu_usage_vals) == 0:
            continue
        color = colors[i % len(colors)]

        # Usage as solid line
        x, y = _series_xy(timestamps, cpu_usage_vals)
        ax1.plot(x, y, color=color, linestyle='-', linewidth=2,
                 label=f'{cpu_name} usage', alpha=0.8)

        # Frequency as dashed line
        cpu_freq_vals = data['cpu_freq'][cpu_name]
        if len(cpu_freq_vals) > 0:
            x, y = _series_xy(timestamps, cpu_freq_vals)
            ax1_freq.plot(x, y, color=color, linestyle='--', linewidth=1.5,
                          label=f'{cpu_name} freq', alpha=0.6)

    ax1.set_title('CPU Usage (%) and Frequency (MHz)')
    ax1.set_xlabel('Time (s)')
    ax1.set_ylabel('CPU Usage (%)', color='black')
    ax1_freq.set_ylabel('CPU Frequency (MHz)', color='gray')
    ax1.set_ylim(0, 100)
    ax1.grid(True, alpha=0.3)

    # Combine the legends of both y-axes into one
    lines1, labels1 = ax1.get_legend_handles_labels()
    lines2, labels2 = ax1_freq.get_legend_handles_labels()
    ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left', fontsize=8)

    # GPU Usage Plot
    ax2 = axes[0, 1]
    if len(data['gpu_data']) > 0:
        x, y = _series_xy(timestamps, data['gpu_data'])
        ax2.plot(x, y, 'g-', marker='o', markersize=3, label='GPU', linewidth=2)
        ax2.set_title('GPU Usage (%)')
        ax2.set_xlabel('Time (s)')
        ax2.set_ylabel('Usage (%)')
        ax2.set_ylim(0, 100)
        ax2.legend()
        ax2.grid(True, alpha=0.3)

    # Memory Usage Plot
    ax3 = axes[1, 0]
    if len(data['memory_data']) > 0:
        x, y = _series_xy(timestamps, data['memory_data'])
        ax3.plot(x, y, 'r-', marker='o', markersize=3, label='RAM Used', linewidth=2)
        ax3.set_title('Memory Usage (GB)')
        ax3.set_xlabel('Time (s)')
        ax3.set_ylabel('Memory (GB)')
        ax3.legend()
        ax3.grid(True, alpha=0.3)

    # Temperature Plot
    ax4 = axes[1, 1]
    temp_colors = {'cpu': 'red', 'gpu': 'green', 'soc0': 'blue', 'soc1': 'orange', 'soc2': 'purple'}
    for temp_name, temp_values in data['temp_data'].items():
        if len(temp_values) > 0:
            x, y = _series_xy(timestamps, temp_values)
            ax4.plot(x, y, color=temp_colors.get(temp_name, 'black'),
                     marker='o', markersize=2, label=temp_name, linewidth=2)
    ax4.set_title('Temperature (°C)')
    ax4.set_xlabel('Time (s)')
    ax4.set_ylabel('Temperature (°C)')
    # Only build a legend when something was drawn; an empty legend call
    # makes matplotlib emit a "no artists with labels" warning.
    if ax4.get_legend_handles_labels()[0]:
        ax4.legend()
    ax4.grid(True, alpha=0.3)

    plt.tight_layout()
    plt.show()

    # Additional detailed CPU figure
    if any(len(vals) > 0 for vals in data['cpu_usage'].values()):
        fig, (usage_ax, freq_ax) = plt.subplots(2, 1, figsize=(14, 8))

        # CPU Usage only
        for i, (cpu_name, cpu_usage_vals) in enumerate(data['cpu_usage'].items()):
            if len(cpu_usage_vals) > 0:
                x, y = _series_xy(timestamps, cpu_usage_vals)
                usage_ax.plot(x, y, color=colors[i % len(colors)], marker='o',
                              markersize=2, label=cpu_name, linewidth=2)

        usage_ax.set_title('CPU Usage per Core (%)')
        usage_ax.set_ylabel('Usage (%)')
        usage_ax.set_ylim(0, 100)
        usage_ax.legend()
        usage_ax.grid(True, alpha=0.3)

        # CPU Frequency only
        for i, (cpu_name, cpu_freq_vals) in enumerate(data['cpu_freq'].items()):
            if len(cpu_freq_vals) > 0:
                x, y = _series_xy(timestamps, cpu_freq_vals)
                freq_ax.plot(x, y, color=colors[i % len(colors)], marker='s',
                             markersize=2, label=cpu_name, linewidth=2)

        freq_ax.set_title('CPU Frequency per Core (MHz)')
        freq_ax.set_xlabel('Time (s)')
        freq_ax.set_ylabel('Frequency (MHz)')
        # Guard against the case where no core has frequency samples.
        if freq_ax.get_legend_handles_labels()[0]:
            freq_ax.legend()
        freq_ax.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.show()

    # Power figure, if power data is available
    if len(data['power_data_curr']) > 0:
        plt.figure(figsize=(12, 4))

        # Current power
        x, y = _series_xy(timestamps, data['power_data_curr'])
        plt.plot(x, y, 'purple', marker='o', markersize=3,
                 label='Current Power', linewidth=2)

        # Average power (if available)
        if len(data['power_data_avg']) > 0:
            x, y = _series_xy(timestamps, data['power_data_avg'])
            plt.plot(x, y, 'orange', marker='s', markersize=3,
                     label='Average Power', linewidth=2)

        plt.title('Power Consumption')
        plt.xlabel('Time (s)')
        plt.ylabel('Power (mW)')
        plt.legend()
        plt.grid(True, alpha=0.3)
        plt.show()

# Usage example: read the diagnostics recorded in a bag and plot them.
if __name__ == "__main__":
    # Path of the bag to analyse; change it to point at your own recording.
    bag_path = "../bags/test1" 
    # Parse the bag into per-series lists, then draw all figures.
    data = read_bag_diagnostics(bag_path)
    plot_diagnostics(data)