# MLCommons Earthquake GPU Analysis Notebook
- Generates GPU Events Graphs
- Generates GPU Power Usage Graphs
- Generates GPU Execution Time Comparison Graph

In [None]:
import os
import re
import glob
import pickle
import datetime
import numpy as np
import pandas as pd
import seaborn as sns
from datetime import timedelta
import matplotlib.dates as md
import matplotlib.pyplot as plt

## Collect Power Data

In [None]:
def get_power_data(data_dict):
    """ Summarize the energy use of one run as a one-row dataframe.
    Args:
        data_dict: dictionary of run data. Reads 'run_info' (keys 'gpu',
            'numGpus', 'numCpus', 'mem', 'epochs'), 'gpu_df' (gpu log with a
            datetime 'time' column and a power draw column) and 'timer_df'
            (columns 'timer', 'start', 'end').
    Returns:
        dataframe with power data: the run info columns plus 'kWh_total'
        (whole gpu log) and 'kWh_fit' (only the 'RunTFTCustomVersion train'
        timer window).
    Raises:
        IndexError: if timer_df has no 'RunTFTCustomVersion train' row.
    """
    # setup
    rename = {
        '# time': 'time',
        'id': 'id',
        'gpu_util %': 'gpu_util',
        'memory_util %': 'memory_util',
        'encoder_util %': 'encoder_util',
        'decoder_util %': 'decoder_util',
        'gpu_temp C': 'gpu_temp',
        'power_draw W': 'power_draw'
    }
    run_info_keys = ('gpu', 'numGpus', 'numCpus', 'mem', 'epochs')

    # collect run info
    run_info = data_dict['run_info']
    summary = {key: run_info[key] for key in run_info_keys}

    # build power data total notebook
    gpu_df = data_dict['gpu_df'].rename(columns=rename)
    # Pick the column before averaging so that non-numeric log columns are
    # never fed to mean().
    grouped = gpu_df.groupby('time')['power_draw'].mean().reset_index()
    # NOTE(review): the W -> kWh conversion treats every distinct timestamp as
    # one second of draw - confirm the gpu log is sampled at 1 Hz.
    summary['kWh_total'] = sum(grouped['power_draw'])*(1/3600)*(1/1000)
    data = pd.DataFrame([summary], columns=list(summary))

    # build power data model fit
    timer_df = data_dict['timer_df']
    # The timers and the gpu log are offset by a whole number of hours
    # (presumably a time zone difference - TODO confirm). Round the
    # *difference*, as plot_gpu_events does; rounding the gpu timestamp
    # instead would shift the training window by the sub-hour part of the
    # log start.
    delta = (min(timer_df['start']) - min(grouped['time'])).round('1h')
    fit_event = timer_df.loc[timer_df['timer'] == 'RunTFTCustomVersion train']
    fit_start = (fit_event['start'] - delta).values[0]
    fit_end = (fit_event['end'] - delta).values[0]
    in_fit = (grouped['time'] >= fit_start) & (grouped['time'] <= fit_end)
    data['kWh_fit'] = sum(grouped.loc[in_fit, 'power_draw'])*(1/3600)*(1/1000)

    return data

### Plotting Functions

In [None]:
# Formatting dictionary for the event plot: timer name -> keyword values
# handed to ax.axvspan ('rename' is the legend label used instead of the
# timer name). Two families exist: lightly shaded colour bands and hatched,
# black-edged bands; each entry only spells out what differs.
_SHADED_BAND = {
    'hatch': None,
    'facecolor': "none",
    'edgecolor': None,
    'rename': None,
    'color': None,
    'alpha': 0.15,
}
_HATCHED_BAND = {**_SHADED_BAND, 'edgecolor': 'black', 'alpha': 0.7}

timers_dict = {
    'EVAL': {**_SHADED_BAND, 'color': 'tab:blue'},
    'CELL_READ_DATA': {**_HATCHED_BAND, 'hatch': '//'},
    'data head setup': {**_SHADED_BAND, 'color': 'tab:green'},
    'legal sampling location': {**_HATCHED_BAND, 'hatch': '\\\\'},
    'RunTFTCustomVersion bestfit finalize TFTTestpredict': {
        **_SHADED_BAND, 'rename': 'TFTTestpredict', 'color': 'tab:cyan',
    },
    'RunTFTCustomVersion bestfit finalize VisualizeTFT TFTSaveandInterpret setFFFFmapping': {
        **_SHADED_BAND, 'rename': 'setFFFFmapping', 'color': 'tab:purple',
    },
    'RunTFTCustomVersion bestfit finalize VisualizeTFT DLprediction': {
        **_SHADED_BAND, 'rename': 'DLprediction', 'color': 'tab:orange',
    },
    # Disabled entry, kept for reference:
    # 'DLResults_Graphs': {
    #     **_SHADED_BAND, 'rename': 'DLResults_Graphs', 'color': 'tab:olive',
    # },
}

In [None]:
def _merge_epoch_timers(timer_df, num_epochs):
    """ Collapse the timers of every epoch into a single 'Epoch:<n>' event.
    Args:
        timer_df: timer dataframe; left untouched, a copy is edited.
        num_epochs: number of epochs to look for.
    Returns:
        (edited copy of timer_df, list of 'Epoch:<n>' timer names), or None
        when some epoch has no timer at all.
    """
    merged = timer_df.copy()
    epoch_timers = []
    for epoch in range(num_epochs):
        # (?!\d) stops 'Epoch:1' from also matching 'Epoch:10', 'Epoch:11', ...
        pattern = re.compile(rf'Epoch:{epoch}(?!\d)')
        in_epoch = [isinstance(timer, str) and pattern.search(timer) is not None
                    for timer in merged['timer']]
        if not any(in_epoch):
            return None
        # stretch the train timer of this epoch to the end of its last timer
        # and give it the short name used for plotting
        is_train = merged['timer'] == f'RunTFTCustomVersion train Epoch:{epoch}'
        merged.loc[is_train, 'end'] = merged.loc[in_epoch, 'end'].max()
        merged.loc[is_train, 'timer'] = f'Epoch:{epoch}'
        epoch_timers.append(f'Epoch:{epoch}')
    return merged, epoch_timers


def plot_gpu_events(timer_df, gpu_df, epochs, name, path, zoom=False):
    """ Create gpu events plot and save figure.

    Draws the gpu power draw over time and shades the span of every timer
    listed in timers_dict (plus one span per epoch in the full view).
    Args:
        timer_df: timer dataframe with 'timer', 'start' and 'end' columns.
            It is not modified; the function works on a copy.
        gpu_df: gpu log dataframe with 'time' and 'power_draw W' columns.
        epochs: number of epochs.
        name: run name.
        path: output path; figures go below <path>/event_times.
        zoom: optional arg for zooming on event. False (or any falsy value)
            plots the full run, 'validation' zooms on the validation timer of
            the second to last epoch, 'DLResults' zooms on the
            'DLResults_Graphs' timer.
    Returns:
        None. When an epoch has no timers, or the timer to zoom on is missing,
        a message is printed and nothing is plotted.
    Raises:
        ValueError: if zoom is not one of the supported values.
    """
    full_view = not zoom
    if not full_view and zoom not in ('validation', 'DLResults'):
        raise ValueError(
            f"zoom must be False, 'validation' or 'DLResults', got {zoom!r}")

    # get epoch data (before any figure exists, so skipping leaks nothing)
    num_epochs = int(epochs)
    merged = _merge_epoch_timers(timer_df, num_epochs)
    if merged is None:
        print(f'Missing epoch timers for {name}; skipping event plot')
        return
    timer_df, epoch_timers = merged
    epoch_alpha = 0.2
    alpha_inc = 0.7/num_epochs if num_epochs else 0.0

    # select columns of interest
    timers = list(timers_dict.keys()) + epoch_timers
    event_df = timer_df[timer_df['timer'].isin(timers)]

    # find time delta: timers and gpu log are offset by a whole number of
    # hours (presumably a time zone difference - TODO confirm)
    offset = (min(event_df['start']) - min(gpu_df['time'])).round('1h')

    # locate the span to zoom on
    if not full_view:
        if zoom == 'validation':
            zoom_timer = f'RunTFTCustomVersion validation bestfit Epoch:{num_epochs-2}'
        else:
            zoom_timer = 'DLResults_Graphs'
        zoom_rows = timer_df.loc[timer_df['timer'] == zoom_timer]
        if zoom_rows.empty:
            print(f"Missing '{zoom_timer}' timer for {name}; skipping {zoom} zoom plot")
            return
        # scalars, not one-row Series, so they can be used as axis limits
        zoom_start = zoom_rows['start'].iloc[0] - offset
        zoom_end = zoom_rows['end'].iloc[0] - offset

    # initialize
    event_times_dir = os.path.join(path, 'event_times')
    if full_view:
        fig = plt.figure(figsize=(10, 7), facecolor='white', dpi=360)
    else:
        name = f'{name}_zoomed_{zoom}'
        fig = plt.figure(figsize=(7, 7), facecolor='white', dpi=360)
    ax = fig.add_subplot(111)

    # create plot of each event
    ax.plot(gpu_df['time'], gpu_df['power_draw W'], color='black', linewidth=0.75)
    # Only one epoch span carries the 'Epochs' legend entry: the second to
    # last one, or the only one for single-epoch runs.
    legend_at = min(1, num_epochs - 1)
    remaining = num_epochs
    for _, row in event_df.iterrows():
        start_time = row['start'] - offset
        end_time = row['end'] - offset
        if 'Epoch:' in row['timer']:
            if not full_view:
                continue  # epoch spans are only drawn in the full view
            remaining -= 1
            ax.axvspan(start_time, end_time,
                       alpha=epoch_alpha,
                       label='Epochs' if remaining == legend_at else '_nolegend_',
                       color='tab:red')
            epoch_alpha += alpha_inc  # later epochs are drawn darker
        else:
            timer_style = timers_dict[row['timer']]
            if not full_view:
                label = '_nolegend_'
            elif timer_style['rename'] is not None:
                label = timer_style['rename']
            else:
                label = row['timer']
            ax.axvspan(start_time, end_time,
                       alpha=timer_style['alpha'], label=label,
                       hatch=timer_style['hatch'], facecolor=timer_style['facecolor'],
                       edgecolor=timer_style['edgecolor'],
                       color=timer_style['color'])

    # make title, legend and limits zoom dependent
    if full_view:
        save_dir = os.path.join(event_times_dir, 'full')
        ax.set_title(f'{name} Event Times')
        ax.legend(bbox_to_anchor=(1.01, 1), loc='upper left')
    elif zoom == 'validation':
        ax.axvspan(zoom_start, zoom_end,
                   label='Validation',
                   alpha=0.3,
                   color='tab:olive')
        ax.set_xlim([zoom_start - timedelta(minutes=3.5), zoom_end + timedelta(minutes=3.5)])
        ax.xaxis.set_major_locator(md.MinuteLocator())
        ax.set_title('Epoch Validation Fit')
        ax.legend(loc='upper right')
        save_dir = os.path.join(event_times_dir, 'validation_zoom')
    else:
        ax.set_xlim([zoom_start - timedelta(minutes=3.5), zoom_end + timedelta(minutes=0.5)])
        ax.set_title('DLResults')
        save_dir = os.path.join(event_times_dir, 'DLResults_zoom')
        ax.legend(loc='upper right')

    # plot formatting
    ax.set_ylabel('Watts')
    ax.set_xlabel('Execution Time (Hours)')
    ax.set_ylim(0, max(gpu_df['power_draw W'])*1.25)
    if full_view:
        ax.xaxis.set_major_locator(md.MinuteLocator(byminute=[0, 30]))
    ax.xaxis.set_major_formatter(md.DateFormatter('%H:%M'))
    ax.grid(False)
    plt.setp(ax.xaxis.get_majorticklabels(), rotation=90)
    fig.autofmt_xdate()

    # save figure before showing it, like the other plotting functions
    os.makedirs(save_dir, exist_ok=True)
    SAVEFIG(fig, name, save_dir)
    plt.show()
    plt.close(fig)

In [None]:
def plot_power_usage(df, path, span='total', normalize=True):
    """ Create power usage plot and save figure.
    Args:
        df: power dataframe with 'gpu', 'epochs', 'kWh_total' and 'kWh_fit'
            columns. It is not modified; the function works on a copy.
        path: output path; figures go to <path>/power_usage.
        span: 'total' or 'train' or 'avg'
        normalize: bool argument to normalize to number of epochs. 'avg' is
            only available normalized.
    Raises:
        ValueError: if the span/normalize combination is not supported.
    """
    # (span, normalize) -> (file name, y column, title, y label) of the
    # bar charts drawn per epoch count and gpu
    bar_charts = {
        ('total', False): ('total_epoch_vs_watts', 'kWh_total',
                           'Total Notebook: Epochs vs. kWh', 'kWh'),
        ('total', True): ('total_kWh_per_epoch', 'kWh/epoch_total',
                          'Total Notebook: Epochs vs. kWh/Epoch', 'kWh/epoch'),
        ('train', False): ('model_fit_epoch_vs_watts', 'kWh_fit',
                           'Model Fit: Epochs vs. kWh', 'kWh'),
        ('train', True): ('model_fit_kWh_per_epoch', 'kWh/epoch_fit',
                          'Model Fit: Epochs vs. kWh/Epoch', 'kWh/epoch'),
    }
    chart = (span, bool(normalize))
    # reject unknown requests before any folder or figure is created
    if chart not in bar_charts and chart != ('avg', True):
        raise ValueError(
            f'unsupported combination span={span!r}, normalize={normalize!r}')

    power_usage_dir = os.path.join(path, 'power_usage')
    os.makedirs(power_usage_dir, exist_ok=True)

    # augment data on a copy so repeated calls never alter the caller's frame
    df = df.copy()
    df['epochs'] = df['epochs'].astype(int)
    df['kWh/epoch_total'] = df['kWh_total']/df['epochs']
    df['kWh/epoch_fit'] = df['kWh_fit']/df['epochs']

    fig = plt.figure(figsize=(8, 6), dpi=360)
    if chart == ('avg', True):
        # plot average model fit kWh per Epoch
        save_name = 'average_kWh_per_epoch'
        # select the column before averaging so non-numeric columns are
        # never fed to mean()
        grouped = df.groupby('gpu')['kWh/epoch_fit'].mean().reset_index()
        sns.set_style("whitegrid")
        sns.barplot(x="gpu", y="kWh/epoch_fit", data=grouped)
        plt.title('Average kWh/Epoch per GPU', fontsize=14)
        plt.ylabel('kWh/epoch', fontsize=12)
        plt.xlabel('GPU', fontsize=12)
    else:
        save_name, y_column, title, y_label = bar_charts[chart]
        if chart == ('train', True):
            sns.set_style("whitegrid")
        sns.barplot(x='epochs', y=y_column, hue='gpu', data=df)
        plt.title(title)
        plt.ylabel(y_label)
        # only these charts have hue entries to list in a legend
        plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left')
    SAVEFIG(fig, save_name, power_usage_dir)
    plt.show()
    plt.close(fig)

In [None]:
def plot_train_times(data_dict, path, gpu=None, timer='RunTFTCustomVersion train'):
    """ Create plot to compare training times and save figure.
    Args:
        data_dict: dictionary of run data; every run provides 'run_info'
            and 'timer_df' (with 'timer' and 'time' columns).
        path: output path; figures go to <path>/train_times.
        gpu: None compares all runs, coloured by gpu. A gpu name keeps only
            the runs on that gpu (without the 'rivanna-rivanna' ones),
            coloured by gpu-system-filesystem.
        timer: name of the timer to compare; its 'time' value is divided by
            3600 and plotted as hours.
    Returns:
        None. When no run is left to plot, a message is printed and no
        figure is produced.
    """
    time_dir = os.path.join(path, 'train_times')
    timer_name = timer.replace(' ', '_')

    # collect one row per run that recorded the timer
    rows = []
    for experiment, run_data in data_dict.items():
        run_info = run_data['run_info']
        timer_df = run_data['timer_df']
        times = timer_df.loc[timer_df['timer'] == timer, 'time']
        if times.empty:
            if gpu is None:
                print(f"Missing train timer for {experiment}")
            continue
        row = {
            'gpu': run_info['gpu'],
            'epochs': int(run_info['epochs']),
            'time': float(times.iloc[0])/3600,
        }
        if gpu is not None:
            row['sys'] = f"{row['gpu']}-{run_info['system']}-{run_info['filesystem']}"
            if row['gpu'] != gpu or 'rivanna-rivanna' in row['sys']:
                continue
        rows.append(row)
    if not rows:
        print(f"No '{timer}' times to plot" + ('' if gpu is None else f' for {gpu}'))
        return
    data = pd.DataFrame(rows)

    if gpu is None:
        hue = 'gpu'
        # bare file name: SAVEFIG adds the folder and the extensions itself
        save_name = f'{timer_name}_times'
    else:
        hue = 'sys'
        save_name = f'{gpu}_{timer_name}_times'

    # create plot
    fig = plt.figure(figsize=(8, 6), dpi=360)
    sns.lineplot(x='epochs',
                 y='time',
                 hue=hue,
                 data=data,
                 legend=False)
    sns.scatterplot(x='epochs',
                    y='time',
                    hue=hue,
                    s=50,
                    data=data)
    plt.title(f'{timer} Time Comparison')
    plt.ylabel('Time (hours)')
    # kept after the plot calls, as before
    sns.set_style("whitegrid")
    os.makedirs(time_dir, exist_ok=True)
    SAVEFIG(fig, save_name, time_dir)
    plt.show()
    plt.close(fig)

In [None]:
def SAVEFIG(fig, filename, path=None, formats=('png', 'pdf')):
    """ Save a figure once per requested format.
    Args:
        fig: figure to save; only its savefig method is used.
        filename: file name without extension.
        path: output folder, created when missing. None (the default) or an
            empty string saves relative to the current working directory.
        formats: extensions to write, one file per entry.
    Returns:
        tuple (1, tuple of the written file names).
    """
    if path:
        os.makedirs(path, exist_ok=True)
        fileout = os.path.join(path, filename)
    else:
        # os.path.join(None, ...) raises TypeError, so the default needs
        # its own branch
        fileout = filename
    written = []
    # single pass, so formats may be any iterable
    for my_format in formats:
        target = f"{fileout}.{my_format}"
        fig.savefig(target, format=my_format, bbox_inches="tight")
        written.append(target)
    return 1, tuple(written)

### Load Pickle

In [None]:
# The experiment data is expected next to the notebook, in the current
# working directory.
# NOTE(review): pickle.load can run arbitrary code stored in the file - only
# open an experiment_data.pkl that comes from a trusted source.
cwd = os.getcwd()
pickle_file = os.path.join(cwd, 'experiment_data.pkl')
with open(pickle_file, mode='rb') as pickle_handle:
    loaded_dict = pickle.load(pickle_handle)

### Display Available Data

In [None]:
# One row per experiment, labelled gpu-system-filesystem; counting the rows
# of each label shows how many runs are available per configuration.
data = pd.DataFrame([
    {
        'count': 1,
        'sys': f"{info['gpu']}-{info['system']}-{info['filesystem']}",
    }
    for info in (entry['run_info'] for entry in loaded_dict.values())
])
data.groupby('sys').count()

### Create Analysis Outputs

In [None]:
# All figures of this section are written below ./analysis.
analysis_path = os.path.join(os.getcwd(), 'analysis')
os.makedirs(analysis_path, exist_ok=True)

# event plots and power data for every run that is not a 'mar2022' run
power_frames = []
for experiment, run_data in loaded_dict.items():
    if experiment.startswith('mar2022'):
        continue
    epochs = run_data['run_info']['epochs']
    timer_df = run_data['timer_df']
    gpu_df = run_data['gpu_df']
    # The log time is parsed in place (the plots and get_power_data need
    # datetimes). Skip it when the column is already parsed, so the cell can
    # be re-run without reloading the pickle.
    if not pd.api.types.is_datetime64_any_dtype(gpu_df['time']):
        gpu_df['time'] = pd.to_datetime(gpu_df['time'].str.split(".").str[0],
                                        format='%Y-%m-%d:%H:%M:%S')
    plot_gpu_events(timer_df, gpu_df, epochs, experiment, analysis_path)
    plot_gpu_events(timer_df, gpu_df, epochs, experiment, analysis_path, 'validation')
    # disabled: plot_gpu_events(timer_df, gpu_df, epochs, experiment, analysis_path, 'DLResults')
    power_frames.append(get_power_data(run_data))
    plt.close("all")
power_df = pd.concat(power_frames) if power_frames else pd.DataFrame()

# power usage plots
if power_frames:
    plot_power_usage(power_df, analysis_path, span='total', normalize=False)
    plot_power_usage(power_df, analysis_path, span='total', normalize=True)
    plot_power_usage(power_df, analysis_path, span='train', normalize=False)
    plot_power_usage(power_df, analysis_path, span='train', normalize=True)
    plot_power_usage(power_df, analysis_path, span='avg', normalize=True)
else:
    print('No power data collected; skipping power usage plots')

# train time plots: all runs together, then the 'total' timer per gpu
plot_train_times(loaded_dict, analysis_path)
# dict.fromkeys keeps the first occurrence of every gpu, in order
gpus = list(dict.fromkeys(run_data['run_info']['gpu'] for run_data in loaded_dict.values()))
for gpu in gpus:
    plot_train_times(loaded_dict, analysis_path, gpu=gpu, timer='total')

### Recreate rtx3090-gregor.ipynb graphs

In [None]:
# Output folder for the recreated rtx3090-gregor.ipynb graphs.
GRAPHICSDIR = os.path.join(cwd, 'analysis', 'archive')
# makedirs also creates the parent 'analysis' folder when it does not exist
# yet (os.mkdir would fail) and does nothing when the folder is already there
os.makedirs(GRAPHICSDIR, exist_ok=True)

#### Plot Bestfit

In [None]:
# One row per 'mar2022_epoch2' run: total and bestfit times in hours plus
# the label shown on the x axis of the bar plot.
bestfit_rows = []
for experiment, entry in loaded_dict.items():
    if not experiment.startswith('mar2022_epoch2'):
        continue
    info = entry['run_info']
    run = {
        'gpu': info['gpu'],
        'system': info['system'],
        'filesystem': info['filesystem'],
    }
    timer_df = entry['timer_df']
    for column, timer_name in (('total', 'total'),
                               ('bestfit', 'RunTFTCustomVersion bestfit')):
        run[column] = timer_df[timer_df['timer'] == timer_name]['time'].values[0]/3600
    # only localscratch runs carry the filesystem in their label
    label_parts = [run['gpu'], run['system']]
    if run['filesystem'] == 'localscratch':
        label_parts.append(run['filesystem'])
    run['plot_name'] = '-'.join(f'{part}' for part in label_parts)
    bestfit_rows.append(run)
data = pd.DataFrame(bestfit_rows)

In [None]:
fig = plt.figure(figsize=(8, 6), dpi=360)

sns.set_theme(style="whitegrid")
# Both bar layers are drawn on the same axes, ordered by decreasing bestfit
# time: first the total time, then the bestfit time on top of it.
bar_order = data.sort_values('bestfit', ascending=False).plot_name
for bar_column in ("total", "bestfit"):
    ax = sns.barplot(data=data, x="plot_name", y=bar_column, order=bar_order)
ax.tick_params(axis='x', rotation=90)
ax.set(xlabel='Experiment', ylabel='Time (Hours)', title='GPU BestFit Barplot')

plot, _ = SAVEFIG(fig, 'Graphics_Cards_BestFit_bar', GRAPHICSDIR)

#### Plot Execution Time

In [None]:
# One row per 'mar2022' run that is not a 'mar2022_epoch2' run: epoch count,
# total time in hours and the label used to colour the plot.
exec_rows = []
for experiment, entry in loaded_dict.items():
    if not experiment.startswith('mar2022') or experiment.startswith('mar2022_epoch2'):
        continue
    info = entry['run_info']
    run = {
        'gpu': info['gpu'],
        'system': info['system'],
        'filesystem': info['filesystem'],
        'epochs': int(info['epochs']),
    }
    timer_df = entry['timer_df']
    run['total'] = timer_df[timer_df['timer'] == 'total']['time'].values[0]/3600
    # these systems are labelled without their filesystem
    if run['system'] in ['colab', 'personal_pc_g', 'personal_pc_r']:
        run['sys'] = f"{run['gpu']}-{run['system']}"
    else:
        run['sys'] = f"{run['gpu']}-{run['system']}-{run['filesystem']}"
    exec_rows.append(run)
data = pd.DataFrame(exec_rows)

In [None]:
fig = plt.figure(figsize=(8, 6), dpi=360)
# Lines and markers show the same series (total hours per epoch count,
# coloured by system); only the markers contribute legend entries.
series = dict(x='epochs', y='total', hue='sys', data=data)
sns.lineplot(legend=False, **series)
sns.scatterplot(**series)
plt.title('Earthquake Notebook Completion Times')
plt.ylabel("Time (Hours)")
plt.legend(bbox_to_anchor=(1.01, 1), loc='upper left')
plot, _ = SAVEFIG(fig, 'Benchmark_comp_resource', GRAPHICSDIR)