In [8]:
import pandas as pd
import numpy as np

In [None]:
## Prerequisites: the minibatch_index_list file and the powermode runs must already be available (use detect_clean_mbs)

In [9]:
def populate_data_time(sampled_powermodes, path, filename, offset_dict, window=40, time_col=4):
    """Collect a fixed-size window of minibatch times for each power mode.

    For every power mode this reads ``<path>/pm_<powermode>/<filename>`` (the
    file is parsed without a header row), keeps ``window`` consecutive rows of
    the column at position ``time_col`` and tags each row with the settings
    parsed from the power-mode name.

    Parameters
    ----------
    sampled_powermodes : iterable of str
        Power-mode identifiers formatted as ``"<cores>_<cpu>_<gpu>_<mem>"``.
    path : str
        Directory containing one ``pm_<powermode>`` sub-directory per mode.
    filename : str
        Name of the CSV file inside each power-mode directory.
    offset_dict : dict
        Maps a power mode to the row position at which its window starts.
        A missing entry (or ``-1``) selects the last ``window`` rows; ``0``
        starts at row 1, i.e. the first row of the file is skipped.
    window : int, optional
        Number of rows to keep per power mode (default 40).
    time_col : int, optional
        Positional index of the column holding the minibatch time (default 4).

    Returns
    -------
    pandas.DataFrame
        Columns ``Minibatch_time``, ``Cores``, ``CPU_frequency``,
        ``GPU_frequency`` and ``Memory_frequency``, re-indexed from 0.
        A power mode contributes fewer than ``window`` rows when its file is
        too short; a diagnostic is printed in that case.

    Raises
    ------
    ValueError
        From ``pd.concat`` when ``sampled_powermodes`` is empty.
    """
    # Scaling factors applied to the parsed settings; all are currently no-ops.
    cpu_cores_multipler = 1
    cpu_frq_divider = 1
    gpu_frq_divider = 1
    mem_frq_divider = 1

    all_data = []

    for powermode in sampled_powermodes:
        offset = offset_dict.get(powermode, -1)
        file = f"{path}/pm_{powermode}/{filename}"

        parts = powermode.split("_")
        cores = int(parts[0]) * cpu_cores_multipler
        cpu = int(parts[1]) / cpu_frq_divider
        gpu = int(parts[2]) / gpu_frq_divider
        mem = int(parts[3]) / mem_frq_divider

        temp_df = pd.read_csv(file, header=None)
        n_rows = len(temp_df)

        if offset == -1:
            # Clamp at 0: a negative start would wrap around in iloc and
            # silently return only the tail of a file shorter than the window.
            start = max(n_rows - window, 0)
            end = n_rows
        elif offset == 0:
            # Row 0 is skipped (presumably a header line, since the file is
            # read with header=None -- TODO confirm).
            start = 1
            end = start + window
        else:
            start = offset
            end = start + window

        selected = temp_df.iloc[start:end]
        if len(selected) != window:
            # The file did not provide a full window; report the details.
            print("Diff :", end - start)
            print("Start :", start)
            print("End:", end)
            print("Offset: ", offset)
            print("Powermode :", powermode)
            print("Length :", n_rows)

        times = selected.iloc[:, time_col].to_frame()
        times['Cores'] = cores
        times['CPU_frequency'] = cpu
        times['GPU_frequency'] = gpu
        times['Memory_frequency'] = mem
        times.columns = ['Minibatch_time', 'Cores', 'CPU_frequency', 'GPU_frequency', 'Memory_frequency']

        all_data.append(times)

    master_df = pd.concat(all_data, ignore_index=True)
    return master_df

In [10]:
def populate_data_power(sampled_powermodes, path, tg_filename, offset_dict, start_dict, end_dict, required_length=500):
    """Collect a fixed number of power samples for each power mode.

    For every power mode this reads ``<path>/pm_<powermode>/<tg_filename>``,
    keeps the ``'power cur'`` readings whose ``'log_time'`` lies inside the
    inclusive window ``[start, end]`` and repeats or truncates them so that
    exactly ``required_length`` samples are emitted per mode.

    Parameters
    ----------
    sampled_powermodes : iterable of str
        Power-mode identifiers formatted as ``"<cores>_<cpu>_<gpu>_<mem>"``.
    path : str
        Directory containing one ``pm_<powermode>`` sub-directory per mode.
    tg_filename : str
        Name of the CSV file (with ``log_time`` and ``power cur`` columns)
        inside each power-mode directory.
    offset_dict : dict
        Accepted for interface compatibility; not used by this function.
    start_dict, end_dict : dict
        Map a power mode to the start / end of its time window. Missing
        entries default to ``-1``. When both ends are equal the window is
        widened by 2 on each side.
    required_length : int, optional
        Number of samples emitted per power mode (default 500).

    Returns
    -------
    pandas.DataFrame
        One row per sample with columns ``cores``, ``cpu``, ``gpu``, ``mem``
        (kept as the strings parsed from the power-mode name) and
        ``power_sample`` (float).

    Raises
    ------
    ValueError
        If no usable power sample falls inside a power mode's time window.
    """
    rows = []
    for powermode in sampled_powermodes:
        start_time = start_dict.get(powermode, -1)
        end_time = end_dict.get(powermode, -1)

        if end_time - start_time == 0:
            # A zero-width window would match at most one log line; widen it.
            end_time = end_time + 2
            start_time = start_time - 2
            print("End time equals start time; widening the window by 2 on each side")

        tg_file = f"{path}/pm_{powermode}/{tg_filename}"
        tg_df = pd.read_csv(tg_file)
        filtered_df = tg_df[(tg_df['log_time'] >= start_time) & (tg_df['log_time'] <= end_time)]
        power_list = filtered_df['power cur'].astype(float).dropna().tolist()

        if not power_list:
            # Without this guard the padding below fails with a bare
            # ZeroDivisionError that does not say which mode is at fault.
            raise ValueError(
                f"No power samples for powermode {powermode} between "
                f"{start_time} and {end_time} in {tg_file}"
            )

        if len(power_list) < required_length:
            # Ceiling division: repeat the samples often enough to fill up.
            repeats_required = -(-required_length // len(power_list))
            power_list = power_list * repeats_required
        power_list = power_list[:required_length]

        # Split the powermode into its components
        cores, cpu, gpu, mem = powermode.split("_")

        rows.extend(
            {
                'cores': cores,
                'cpu': cpu,
                'gpu': gpu,
                'mem': mem,
                'power_sample': sample,
            }
            for sample in power_list
        )

    minibatch_power_df = pd.DataFrame(rows)
    return minibatch_power_df


In [11]:
def extract_offsets_from_csv(csv_file):
    """Load per-power-mode offsets and time windows from a CSV file.

    The file must provide the columns ``cores``, ``cpu``, ``gpu``, ``mem``,
    ``skip_index``, ``start_time`` and ``end_time``. Each row is keyed by the
    power-mode string ``"<cores>_<cpu>_<gpu>_<mem>"`` built from the
    integer-cast setting columns; a later row with the same key overwrites
    an earlier one.

    Returns
    -------
    tuple of (dict, dict, dict)
        ``(offsets, start_times, end_times)`` mapping the power-mode key to
        ``int(skip_index)``, ``float(start_time)`` and ``float(end_time)``.
    """
    table = pd.read_csv(csv_file)
    setting_columns = ('cores', 'cpu', 'gpu', 'mem')

    powermode_offsets, start_times, end_times = {}, {}, {}
    for _, record in table.iterrows():
        # Same key format that the power-mode directory names use
        key = "_".join(str(int(record[name])) for name in setting_columns)
        powermode_offsets[key] = int(record['skip_index'])
        start_times[key] = float(record['start_time'])
        end_times[key] = float(record['end_time'])

    return powermode_offsets, start_times, end_times

In [12]:
def generate_powermodes():
    """Enumerate every power mode as a ``"<cores>_<cpu>_<gpu>_<mem>"`` string.

    The result is the full cross product of 3 core counts, 7 CPU frequencies,
    7 GPU frequencies and 3 memory frequencies, i.e. 3 * 7 * 7 * 3 = 441
    entries, for example ``"4_422400_114750000_665600000"``.

    Ordering: the CPU frequency varies slowest, then the GPU frequency, then
    the core count, and the memory frequency varies fastest.
    """
    core_counts = (4, 8, 12)
    gpu_freqs = (114750000, 318750000, 522750000, 726750000, 930750000, 1134750000, 1300500000)
    cpu_freqs = (422400, 729600, 1036800, 1344000, 1651200, 1958400, 2201600)  # in kHz
    mem_freqs = (665600000, 2133000000, 3199000000)

    return [
        f"{cores}_{cpu}_{gpu}_{mem}"
        for cpu in cpu_freqs
        for gpu in gpu_freqs
        for cores in core_counts
        for mem in mem_freqs
    ]

In [13]:
# Every power mode to load, as "<cores>_<cpu>_<gpu>_<mem>" strings
all_powermodes = generate_powermodes()

# Root directory holding one pm_<powermode> sub-directory per run
path = '/home/saisamarth/exp/training_bs16_runs/LSTM'
# Per-run CSV files read by populate_data_time / populate_data_power
time_filename = 'mn_nw4_pf2_epoch_stats.csv'
power_filename = 'mn_nw4_pf2_tegrastats.csv'
# CSV with skip_index, start_time and end_time for each power mode
offset_file = '/home/saisamarth/exp/AALSTM.csv'
offsets, start_times, end_times = extract_offsets_from_csv(offset_file)


In [14]:
# Collect one time frame and one power frame per power mode, then stack them.
val_time = []
val_power = []
for powermode in all_powermodes:
    # Each call handles a single mode, so any diagnostics printed by the two
    # loaders appear interleaved mode by mode.
    val_time.append(populate_data_time([powermode], path, time_filename, offsets))
    val_power.append(populate_data_power([powermode], path, power_filename, offsets, start_times, end_times))
# ignore_index=True gives both stacked frames a fresh 0..N-1 index
time_df = pd.concat(val_time, ignore_index=True)
power_df = pd.concat(val_power, ignore_index=True)

In [15]:
# Print a column / dtype / row-count summary of the stacked power samples
power_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 220500 entries, 0 to 220499
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   cores         220500 non-null  object 
 1   cpu           220500 non-null  object 
 2   gpu           220500 non-null  object 
 3   mem           220500 non-null  object 
 4   power_sample  220500 non-null  float64
dtypes: float64(1), object(4)
memory usage: 8.4+ MB


In [16]:
# Cast the measured times to float and the power-mode settings (parsed as
# strings from the power-mode name) to int
time_df = time_df.astype({'Minibatch_time': float})
power_df = power_df.astype({'cores': int, 'cpu': int, 'gpu': int, 'mem': int})

In [17]:
# take median of every 40 sample for time df and 500 samples for power df
# Collapse every power mode to the median of its samples.
# Grouping by the power-mode settings themselves (instead of by fixed-size
# positional chunks of 40 / 500 rows) keeps neighbouring modes from being mixed
# when a mode contributed a different number of rows than expected.
# sort=False keeps the modes in their original order, and the float cast keeps
# every column float64, matching what a per-chunk median produces.
time_df = (
    time_df
    .groupby(['Cores', 'CPU_frequency', 'GPU_frequency', 'Memory_frequency'], sort=False, as_index=False)
    .median()
    .astype(float)
)
power_df = (
    power_df
    .groupby(['cores', 'cpu', 'gpu', 'mem'], sort=False, as_index=False)
    .median()
    .astype(float)
)

In [18]:
# Put the power-mode settings first and switch to the shared lower-case names
time_df = time_df[
    ['Cores', 'CPU_frequency', 'GPU_frequency', 'Memory_frequency', 'Minibatch_time']
].rename(columns={
    'Cores': 'cores',
    'CPU_frequency': 'cpu',
    'GPU_frequency': 'gpu',
    'Memory_frequency': 'mem',
    'Minibatch_time': 'observed_time',
})

In [19]:
# Relabel the five columns by position; the last one becomes observed_power
power_df = power_df.set_axis(['cores', 'cpu', 'gpu', 'mem', 'observed_power'], axis=1)

In [20]:
# Inner-join the time and power tables on the power-mode settings
# (cores, cpu, gpu, mem); modes missing from either table are dropped
merged_df = time_df.merge(power_df, how='inner', on=['cores', 'cpu', 'gpu', 'mem'])

In [21]:
# Store the power-mode settings as integers again
merged_df = merged_df.astype({'cores': int, 'cpu': int, 'gpu': int, 'mem': int})

# Scale the observed power down by a factor of 1000
# (presumably mW -> W; TODO confirm the unit of the 'power cur' readings)
merged_df['observed_power'] = merged_df['observed_power'].div(1000.0)

In [22]:
# Write the merged per-power-mode time/power table to disk without the index column
merged_df.to_csv("lstm_train_data_final.csv", index=False)