# MLCommons Earthquake GPU Data Collection Notebook
- Creates a pickle file with the data for all available runs

Please follow the instructions listed in README.md before running this notebook. Run this notebook from the results folder that contains the earthquake run data structure, not from the analysis folder.

In [1]:
import os
import re
import glob
import pickle
import datetime
import numpy as np
import pandas as pd

In [2]:
# capture the current local time and display it
now = datetime.datetime.now()
print(now)

2023-09-01 12:13:17.912646


### Data Extraction Functions

In [3]:
def format_gpu_log(path):
    """ Format the gpu data log into dataframe.

    The first line of the file is skipped and the second line is used as the
    header: the first two characters of every header field are dropped and
    the remainder is stripped of surrounding whitespace. The first column is
    left untouched, the last column is converted to float and every column
    in between to int. Rows sharing the same ``time`` value are averaged.

    Args:
        path: file path for the gpu log.
    Returns:
        dataframe with gpu data, one row per distinct ``time`` value.
    """
    # read everything as data; the header row is extracted manually below
    gpu_df = pd.read_csv(path, skiprows=1, header=None, low_memory=False)

    # the first remaining row carries the column names
    header = gpu_df.loc[0].str[2:].str.strip()
    gpu_df = gpu_df.drop(index=[0]).reset_index(drop=True)
    # plain assignment instead of set_axis(..., inplace=False): the `inplace`
    # keyword of set_axis was removed in pandas 2.0
    gpu_df.columns = header

    # set types
    int_col = list(gpu_df.columns[1:-1])
    gpu_df[int_col] = gpu_df[int_col].astype('int')
    float_col = list(gpu_df.columns[-1:])
    gpu_df[float_col] = gpu_df[float_col].astype('float')

    # collapse rows that share a timestamp into their mean
    gpu_df = gpu_df.groupby('time').mean().reset_index()

    return gpu_df

In [4]:
def get_timer_data(err_path):
    """ Collect timer data from run output and create dataframe.

    The log must contain 'Execution Complete'. The timer table is taken from
    the last occurrence of the '# csv,timer,...' header line up to:

    * the second-to-last '# csv,RUN_STOP' line (inclusive) when the log
      contains 'RUN_STOP', or
    * the last '# csv,RunTFTCustomVersion bestfit finalize ...' line
      (exclusive) otherwise.

    In the 'RUN_STOP' format, lines inside the table that do not contain
    '# csv' are joined and appended to the preceding csv line. Commas inside
    ``{...}`` are replaced by ';' so they do not split a field.

    Args:
        err_path: file path for the run log.
    Returns:
        dataframe with timer data: one row per timer whose status is not
        'failed', with 'start' converted to datetime and an added 'end'
        column ('start' plus 'time' seconds). None when the run is
        incomplete or the timer table cannot be located.
    """
    header_tag = '# csv,timer,status,time,sum,start,tag,msg,uname.node,user,uname.system,platform.version'
    run_stop_tag = '# csv,RUN_STOP'
    legacy_stop_tag = '# csv,RunTFTCustomVersion bestfit finalize VisualizeTFT event_num:0,ok,0.0,0.0,'

    def matching_indices(lines, substring):
        """Return the indices of all lines containing substring."""
        return [i for i, line in enumerate(lines) if substring in line]

    directory = err_path.rsplit("/", 1)[0]

    # read in data
    with open(err_path, 'r', encoding='cp850') as f:
        content = f.readlines()

    if not any('Execution Complete' in line for line in content):
        print(f'Incomplete Run: {directory}')
        return None

    timer_update = any('RUN_STOP' in line for line in content)

    # locate the timer table; -1 marks "not found"
    header_hits = matching_indices(content, header_tag)
    start = header_hits[-1] if header_hits else -1
    if timer_update:
        stop_hits = matching_indices(content, run_stop_tag)
        stop = stop_hits[-2] + 1 if len(stop_hits) >= 2 else -1
    else:
        stop_hits = matching_indices(content, legacy_stop_tag)
        stop = stop_hits[-1] if stop_hits else -1
    if start == -1 or stop == -1 or start >= stop:
        print(f'Incomplete Run: {directory}')
        return None
    content = content[start:stop]

    # fix for dictionary in csv: fold non-csv lines into the preceding csv line
    continuation = []
    merged = ''
    if timer_update:
        continuation = [i for i, line in enumerate(content) if '# csv' not in line]
        if continuation:
            first, last = continuation[0], continuation[-1]
            # the original additionally called .replace('\s+','').replace('\t+',''),
            # which are literal (non-regex) replacements and therefore no-ops
            merged = ''.join(content[first:last + 1]).strip().replace('\n', '')
            del content[first:last + 1]

    # formatting
    rows = [line.strip('\n').replace('# csv,', '') for line in content]
    if continuation:
        rows[continuation[0] - 1] += merged
    # protect commas inside {...} from the field split below
    rows = [re.sub(r"\{[^}]*\}", lambda m: m.group(0).replace(',', ';'), row) for row in rows]

    # first row is the header, the rest are the timer records
    df = pd.DataFrame(rows)[0].str.split(',', expand=True)
    df.columns = df.iloc[0]
    df = df[1:].copy()

    # convert to datetime
    df['start'] = pd.to_datetime(df['start'], format='%Y-%m-%d %H:%M:%S')
    df = df[df['status'] != 'failed'].copy()

    # get end time
    df['end'] = df['start'] + pd.to_timedelta(df['time'].astype(float), unit='s')

    return df

In [5]:
def get_power_data(data_dict, gpu_df, timer_df):
    """ Convert gpu dataframe into data for plots.

    The gpu samples are averaged per whole second (fractional seconds are
    cut off the timestamp) and the per-second mean power is summed and scaled
    by 1/3600 and 1/1000. This yields kWh if 'power_draw' is in watts and
    there is one averaged sample per second -- TODO confirm the sampling rate.

    Args:
        data_dict: dictionary of run data; only data_dict['run_info'] is read.
        gpu_df: DataFrame with gpu usage data. Not modified.
        timer_df: DataFrame with event time data ('timer', 'start', 'end').
    Returns:
        single-row dataframe with the run info fields, 'kWh_total' (whole
        gpu log) and 'kWh_fit' (window of the
        'RunTFTCustomVersion train timer_num:0' timer; NaN when that timer
        is absent).
    """
    # setup
    rename = {
        '# time': 'time',
        'id': 'id',
        'gpu_util %': 'gpu_util',
        'memory_util %': 'memory_util',
        'encoder_util %': 'encoder_util',
        'decoder_util %': 'decoder_util',
        'gpu_temp C': 'gpu_temp',
        'power_draw W': 'power_draw'
    }

    def to_kwh(power):
        """Sum a power series and apply the 1/3600 and 1/1000 scale factors."""
        return power.sum() * (1 / 3600) * (1 / 1000)

    # collect run info
    run_info = data_dict['run_info']
    data = {key: run_info[key] for key in ('gpu', 'numGpus', 'numCpus', 'mem', 'epochs')}

    # build power data total notebook
    gpu_df = gpu_df.rename(columns=rename)  # rename returns a copy
    gpu_df['time'] = pd.to_datetime(gpu_df['time'].str.split(".").str[0], format='%Y-%m-%d:%H:%M:%S')
    # select the column before averaging so unrelated (possibly non-numeric)
    # columns never take part in the mean
    grouped = gpu_df.groupby('time')['power_draw'].mean().reset_index()
    data['kWh_total'] = to_kwh(grouped['power_draw'])

    # build power data model fit
    fit_event = timer_df.loc[timer_df['timer'] == 'RunTFTCustomVersion train timer_num:0']
    if fit_event.empty:
        print("No 'RunTFTCustomVersion train timer_num:0' timer found; kWh_fit set to NaN")
        data['kWh_fit'] = float('nan')
        return pd.DataFrame([data], columns=list(data))

    # offset between the timer clock and the gpu log clock
    # NOTE(review): only the gpu log start is rounded to the hour, not the
    # difference itself -- presumably meant to absorb a whole-hour clock
    # offset; confirm that the first timer starts on the hour.
    delta = min(timer_df['start']) - min(grouped['time']).round('1h')
    fit_start = (fit_event['start'] - delta).values[0]
    fit_end = (fit_event['end'] - delta).values[0]
    in_fit = (grouped['time'] >= fit_start) & (grouped['time'] <= fit_end)
    data['kWh_fit'] = to_kwh(grouped.loc[in_fit, 'power_draw'])

    return pd.DataFrame([data], columns=list(data))

In [6]:
def get_NNSE_data(err_path, run_info):
    """ Collect the NNSE summaries printed in a run log.

    Every line containing 'NNSE ' directly before its line break marks one
    summary. Relative to that marker line the values are read from:

    * the line before it          -> 'eval' (stripped text)
    * the first line after it     -> 'time' (stripped text)
    * the second line after it    -> 'averaged' (last space-separated token)
    * the fourth line after it    -> 'summed' (last space-separated token)

    Markers without a preceding line or without four following lines are
    skipped.

    Args:
        err_path: file path for the run log.
        run_info: dictionary with at least 'experiment', 'gpu' and 'epochs'.
    Returns:
        dataframe with one row per summary and the columns experiment, gpu,
        epochs, eval, time, averaged, summed; empty when none is found.
    """
    with open(err_path, 'r', encoding='cp850') as f:
        content = f.readlines()

    def last_number(line):
        """Return the last space-separated token of line as a float."""
        return float(line.split(' ')[-1].strip())

    records = []
    for i, line in enumerate(content):
        if 'NNSE \n' not in line:
            continue
        # need one line of context before and four after the marker
        if i == 0 or i + 4 >= len(content):
            continue
        records.append({
            'experiment': run_info['experiment'],
            'gpu': run_info['gpu'],
            'epochs': run_info['epochs'],
            'eval': content[i - 1].strip(),
            'time': content[i + 1].strip(),
            'averaged': last_number(content[i + 2]),
            'summed': last_number(content[i + 4]),
        })

    # build the frame once instead of concatenating inside the loop
    return pd.DataFrame(records)

### Data Collection

In [7]:
# find run directories
# Expected layout below the current working directory:
#   <system>/<filesystem>/<date>/<experiment>/
# An experiment directory is processed when its path contains 'card_name'
# and it holds an '_output' entry, a '*.err' run log and a 'gpu0.log'.
cwd = os.getcwd()


def _subdirectories(parent):
    """Return the full paths of the directories located directly in parent."""
    return [os.path.join(parent, name) for name in os.listdir(parent)
            if os.path.isdir(os.path.join(parent, name))]


data_dict = {}
for system_dir in _subdirectories(cwd):
    for filesystem_dir in _subdirectories(system_dir):
        for date_dir in _subdirectories(filesystem_dir):
            for experiment_path in _subdirectories(date_dir):
                if 'card_name' not in experiment_path:
                    continue
                if '_output' not in os.listdir(experiment_path):
                    continue

                log_path = glob.glob(os.path.join(experiment_path, '*.err'))
                if not log_path:
                    print(f"Incomplete Run: {experiment_path}")
                    continue

                timer_df = get_timer_data(log_path[0])
                if timer_df is None:
                    print(f"Incomplete Run: {experiment_path}")
                    continue

                # labels are derived from the directory names; the loop
                # variables keep holding the full paths
                experiment = os.path.basename(experiment_path)
                system = os.path.basename(system_dir).replace('-', '_')
                filesystem = os.path.basename(filesystem_dir)
                date = os.path.basename(date_dir)
                experiment_name = f"{experiment}.{system}.{filesystem}.{date}"

                gpu_log = os.path.join(experiment_path, 'gpu0.log')
                if not os.path.exists(gpu_log):
                    print(f'Check path for GPU log for {experiment_path}')
                    continue
                gpu_df = format_gpu_log(gpu_log)
                if gpu_df is None:
                    continue

                # run parameters are encoded in the '_'-separated part of the
                # directory name that follows its last '.'
                fields = experiment.split('.')[-1].split('_')
                gpu = 'a100' if fields[2] == '1' else fields[2]

                # add run info to dictionary
                data_dict[experiment_name] = {
                    'run_info': {
                        'system': system,
                        'filesystem': filesystem,
                        'date': date,
                        'gpu': gpu,
                        'experiment': f"{gpu}-{system}-{filesystem}-{date}",
                        'numGpus': fields[5],
                        'numCpus': fields[8],
                        'mem': fields[10],
                        'epochs': fields[-1],
                        'path': experiment_path,
                    }
                }

                # get power data
                power_df = get_power_data(data_dict[experiment_name], gpu_df, timer_df)
                # get NNSE data
                NNSE_df = get_NNSE_data(log_path[0], data_dict[experiment_name]['run_info'])

                # add DataFrames to dictionary
                data_dict[experiment_name]['gpu_df'] = gpu_df
                data_dict[experiment_name]['timer_df'] = timer_df
                data_dict[experiment_name]['power_df'] = power_df
                data_dict[experiment_name]['NNSE_df'] = NNSE_df



### Get archived data

In [8]:
def _archived_entry(system, filesystem, date, gpu, epochs, timer_df):
    """Build the data_dict entry of an archived run.

    Archived runs only provide timer data, so 'gpu_df' and 'power_df' are
    None and 'mem' / 'path' are NaN.
    """
    return {
        'run_info': {
            'system': system,
            'filesystem': filesystem,
            'date': date,
            'gpu': gpu,
            'experiment': f"{gpu}-{system}-{filesystem}-{date}",
            'numGpus': 1,
            'numCpus': 1,
            'mem': np.nan,
            'epochs': epochs,
            'path': np.nan
        },
        'timer_df': timer_df,
        # no data available for gpu
        'gpu_df': None,
        'power_df': None,
    }


def _timer_frame(timer_series):
    """Turn a series of timer values into a two-column timer/time frame."""
    timer_df = timer_series.to_frame().reset_index()
    timer_df.columns = ['timer', 'time']
    return timer_df


archive = os.path.join(cwd, 'rtx3090', 'archive')
if os.path.exists(archive):
    bestfit_key = '__RunTFTCustomVersion bestfit'

    # --- mar2022_data.csv: one row per run, run name in the unnamed first column
    mar2022_df = pd.read_csv(os.path.join(archive, 'mar2022_data.csv'))
    for _, row in mar2022_df.iterrows():
        row = row.rename({'Unnamed: 0': 'experiment'})
        experiment = f"mar2022_{row['experiment']}"
        system = 'colab' if 'colab' in experiment else 'rivanna'
        date, gpu, filesystem, epochs = experiment.split('_')
        experiment_name = f"{experiment}.{system}.{filesystem}.{date}"

        # timer df: the run name and the '__'-prefixed bestfit entry are not
        # timers; for V100 the bestfit value is kept under the plain name.
        # (The original computed this series but then built the frame from
        # the untouched row, leaking both entries into the timers.)
        timer_series = row.drop(['experiment', bestfit_key])
        if gpu == 'V100':
            timer_series['RunTFTCustomVersion bestfit'] = row[bestfit_key]

        data_dict[experiment_name] = _archived_entry(
            system, filesystem, date, gpu, epochs, _timer_frame(timer_series))

    # --- epoch2_table.csv: 'experiment' looks like '<gpu>(<location code>...'
    locations = {
        'r': ('rivanna', 'rivanna'),
        'R': ('personal_pc_r', 'personal_pc_r'),
        'G': ('personal_pc_g', 'personal_pc_g'),
        'L': ('rivanna', 'localscratch'),
        'c': ('colab', 'colab'),
    }
    epoch2_table = pd.read_csv(os.path.join(archive, 'epoch2_table.csv'))
    for _, row in epoch2_table.iterrows():
        gpu = row['experiment'].split('(')[0].lower()
        code = row['experiment'].split('(')[1][0]
        # unknown codes are used verbatim instead of silently reusing the
        # filesystem of the previous row
        system, filesystem = locations.get(code, (code, code))
        experiment_name = f'mar2022_epoch2_{gpu}_{system}_{filesystem}.{system}.{filesystem}.mar2022'

        data_dict[experiment_name] = _archived_entry(
            system, filesystem, 'mar2022', gpu, 2, _timer_frame(row.drop('experiment')))

    # --- rtx3090_data.csv: one row per epoch count
    rtx3090 = pd.read_csv(os.path.join(archive, 'rtx3090_data.csv'))
    for _, row in rtx3090.iterrows():
        row = row.drop('Unnamed: 0')
        gpu = 'rtx3090'
        system = 'personal_pc_g'
        filesystem = 'personal_pc_g'
        epochs = row['epochs']
        # literal date: the original interpolated a `date` variable left over
        # from an earlier loop
        experiment_name = f'mar2022_rtx3090_personal_{epochs}.{system}.{filesystem}.mar2022'

        data_dict[experiment_name] = _archived_entry(
            system, filesystem, 'mar2022', gpu, epochs, _timer_frame(row.drop('epochs')))

else:
    print('No archived data')

No archived data


### Create pickle file

In [9]:
# write the collected run dictionary to experiment_data.pkl in the working directory
pickle_file = os.path.join(cwd, 'experiment_data.pkl')
with open(pickle_file, mode='wb') as f:
    pickle.dump(data_dict, file=f)