In [1]:
%run SetUp.ipynb

Home Directory = /Users/cjparra/Work/wfey
Log Path = /Users/cjparra/Work/wfey/logs/


# Processing Logs

In [2]:
def removeOutliers(DATA, THRESHHOLD_Z=2, VERBOSE=False):
    """Drop every row whose hwmon2 or latency summary is a z-score outlier.

    The absolute z-score of each of the six summary columns
    (``hwmon2_{mean,min,max}`` and ``latency_{mean,min,max}``) is computed
    over the whole frame. A row is removed when at least one of those scores
    is strictly greater than ``THRESHHOLD_Z``.

    Parameters
    ----------
    DATA : pandas.DataFrame
        Frame holding the six summary columns. A ``KEY`` column is also
        required when ``VERBOSE`` is true. ``DATA`` itself is not modified.
    THRESHHOLD_Z : float, default 2
        Absolute z-score above which a value counts as an outlier.
    VERBOSE : bool, default False
        When true, print the z-scores, the frame shapes and the offending rows.

    Returns
    -------
    pandas.DataFrame
        A copy of ``DATA`` without the outlier rows, re-indexed from 0.
    """
    metric_columns = (
        'hwmon2_mean', 'hwmon2_min', 'hwmon2_max',
        'latency_mean', 'latency_min', 'latency_max',
    )

    z_scores = {}
    outlier_positions = {}
    for column in metric_columns:
        # NaN entries are left out of the mean/std; their own score is NaN,
        # which never compares greater than the threshold.
        z_scores[column] = np.abs(np.asarray(stats.zscore(DATA[column], nan_policy='omit')))
        outlier_positions[column] = np.where(z_scores[column] > THRESHHOLD_Z)[0]

    # np.concatenate (not np.concat) so this also runs on NumPy < 2.0.
    outlier_indices = np.unique(np.concatenate(list(outlier_positions.values())))

    # np.where yields *positions*, so rows are removed positionally. Dropping
    # by label would only be correct for a default 0..n-1 index.
    keep_mask = np.ones(len(DATA), dtype=bool)
    keep_mask[outlier_indices] = False
    no_outliers = DATA.iloc[keep_mask]

    if VERBOSE:
        print("Z scores:")
        print("Hwmon2 mean, min, max")
        print(z_scores['hwmon2_mean'], z_scores['hwmon2_min'], z_scores['hwmon2_max'])
        print("Latency mean, min, max")
        print(z_scores['latency_mean'], z_scores['latency_min'], z_scores['latency_max'])

        print("Original DataFrame Shape:", DATA.shape)
        print("DataFrame Shape after Removing Outliers:", no_outliers.shape)
        print("Removed Indexes:")
        print(outlier_indices)
        print("\t HWMON -- mean, min, max")
        for column in ('hwmon2_mean', 'hwmon2_min', 'hwmon2_max'):
            print(DATA.iloc[outlier_positions[column]][['KEY', column]])
        print("\t Latency -- mean, min, max")
        for column in ('latency_mean', 'latency_min', 'latency_max'):
            print(DATA.iloc[outlier_positions[column]][['KEY', column]])

    return no_outliers.reset_index(drop=True)

In [3]:
# Kept only so that any cell still referencing these names keeps working;
# processData collects its rows in a local list and does not touch them.
arr = []
data = []


def _latency_path_for(hwmon_path):
    """Return the latency log path that sits beside ``hwmon_path``.

    Only the file name is rewritten, so a directory whose name happens to
    contain "hwmon" is left alone.
    """
    filename = os.path.basename(hwmon_path)
    folder_prefix = hwmon_path[:len(hwmon_path) - len(filename)]
    return folder_prefix + filename.replace("hwmon", "latency")


def _load_runs(hwmon_paths, VERBOSE=False):
    """Read every usable hwmon log and its latency log; return two lists of frames."""
    hwmon_frames = []
    latency_frames = []

    for runs in hwmon_paths:
        if not os.path.exists(runs):
            continue
        if VERBOSE:
            print("processsing:" + runs)

        try:
            df = pd.read_csv(runs, sep=' ', usecols=['hwmon2', 'hwmon3'])
        except Exception:
            print("SKIPPING: " + runs + " --- Problem parsing hwmon numbers")
            continue

        # OVERFLOW_NUM is a global that is not defined in this cell.
        if len(df) > OVERFLOW_NUM:
            print("SKIPPING: " + runs + " --- Possible error occured during experiment")
            continue

        hwmon_frames.append(df)

        if list(df.columns) != ['hwmon2', 'hwmon3']:
            ## KNOWN BUG: if there is no header in the csv and hwmon gathering
            ## failed, this check does not catch it and a table of NULLs results.
            print("bad columns for:", runs)

        ## --- Grabbing Latency Numbers --- ##
        latency_file = _latency_path_for(runs)
        if not os.path.exists(latency_file):
            continue
        if VERBOSE:
            print("processsing:" + latency_file)

        try:
            latency_df = pd.read_csv(latency_file)
        except Exception:
            # The hwmon samples of this run stay in; only its latency is lost.
            print("SKIPPING: " + latency_file + " --- Problem parsing latency numbers")
            continue

        latency_frames.append(latency_df)

    return hwmon_frames, latency_frames


def _middle_window(frame, percent):
    """Return the rows of ``frame`` centred on its midpoint, about ``percent`` % of them."""
    span = round(len(frame) * (percent / 100))
    middle_point = round(len(frame) / 2)
    half_span = span // 2
    # Clamp so a percentage above 100 cannot turn into a negative
    # (wrap-around) slice start.
    low_index = max(middle_point - half_span, 0)
    high_index = middle_point + half_span
    return frame.iloc[low_index:high_index + 1]


def processData(CONFIGS, NUMEVENTS, EVENTPROCCPUS, SLEEPTIME, SOURCESCPUS, RAW=True, VERBOSE=False):
    """Summarise the hwmon and latency logs of every parameter combination.

    For each element of the cartesian product of the five parameter lists the
    directory ``PATH_TO_LOGS + KEY + "/"`` is searched for ``hwmon-*.out``
    files, where ``KEY`` is ``<config>/<events>_<eventcpu>_<sleep>_<sourcecpus>``.
    All runs found there are averaged sample by sample, and the matching
    ``latency-*.out`` files are pooled.

    Relies on the globals ``PATH_TO_LOGS``, ``OVERFLOW_NUM``, ``PERCENT`` and
    ``TIME_BTN_POWER``, which are not defined in this cell (presumably they
    come from SetUp.ipynb -- confirm).

    Parameters
    ----------
    CONFIGS, NUMEVENTS, EVENTPROCCPUS, SLEEPTIME, SOURCESCPUS : iterable
        Values to combine; each one becomes a component of ``KEY``.
    RAW : bool, default True
        True uses every averaged hwmon sample. False keeps only the centred
        ``PERCENT`` percent of them (the original note says the middle 68%).
    VERBOSE : bool, default False
        Print every file as it is read.

    Returns
    -------
    pandas.DataFrame
        One row per combination that had at least one usable hwmon log and
        one usable latency log. Columns: configs, numevents, sleeptime,
        eventprocCPUs, sourceCPUs, hwmon2, hwmon3 (sample arrays),
        hwmon{2,3}_{min,max,mean}, time, latency_{min,max,mean}, KEY.
        The ``*_mean`` and ``latency_*`` values are geometric means.
    """
    from itertools import product

    # Local accumulator: a module-level list would carry the rows of one call
    # over into the next one.
    rows = []

    # The loop iterates the SOURCESCPUS argument, not a global of a similar name.
    for C, EVENTS, EVENTCPU, SLEEP, SOURCECPU in product(
            CONFIGS, NUMEVENTS, EVENTPROCCPUS, SLEEPTIME, SOURCESCPUS):
        SCPUs = str(SOURCECPU).replace(" ", "_")

        KEY = f"{C}/{EVENTS}_{EVENTCPU}_{SLEEP}_{SCPUs}"
        FILE = PATH_TO_LOGS + KEY + "/"
        # Sorted so the processing order (and the log output) is reproducible.
        HWMONFILE = sorted(glob.glob(FILE + 'hwmon-*.out'))

        # If no files found with these args -- skip
        if not HWMONFILE:
            continue

        ### Gather every run of this set of parameters
        hwmon_frames, latency_frames = _load_runs(HWMONFILE, VERBOSE)
        if not hwmon_frames or not latency_frames:
            continue

        hwmonoutput = pd.concat(hwmon_frames)
        latencyoutput = pd.concat(latency_frames)

        ## --- Find the mean of all runs --- ##
        # Each run keeps its own 0..n-1 index, so grouping on the index
        # averages the i-th sample of every run.
        raw_result = hwmonoutput.groupby(hwmonoutput.index).mean()

        ## --- Raw Data or Middle Data --- ##
        results = raw_result if RAW else _middle_window(raw_result, PERCENT)

        if results.empty:
            print("SKIPPING: " + KEY + " --- No hwmon samples to summarize")
            continue

        ## --- Find Means of Latencies --- ##
        ### The per-source rows are pooled, so individual source data is not
        ### kept; only the aggregate over all sources is recorded.
        latency_results = latencyoutput.drop("ID", axis=1)
        latency_min = stats.gmean(latency_results.loc[:, "Min"].to_numpy())
        latency_max = stats.gmean(latency_results.loc[:, "Max"].to_numpy())
        latency_mean = stats.gmean(latency_results.loc[:, "Mean"].to_numpy())

        ## --- Getting time for every power number --- ##
        # One timestamp per sample. Scaling an integer range always gives
        # exactly len(results) entries, also for a float step.
        time_range = np.arange(len(results.index)) * TIME_BTN_POWER

        ## --- Making array out of hwmon output --- ##
        hwmon2_pwr = results.loc[:, "hwmon2"].to_numpy()
        hwmon3_pwr = results.loc[:, "hwmon3"].to_numpy()

        ## --- Processing HWMON Numbers --- ##
        hwmon2_min = hwmon2_pwr.min()
        hwmon3_min = hwmon3_pwr.min()

        hwmon2_max = hwmon2_pwr.max()
        hwmon3_max = hwmon3_pwr.max()

        hwmon2_mean = stats.gmean(hwmon2_pwr)
        hwmon3_mean = stats.gmean(hwmon3_pwr)

        ## --- Adding data to list --- ##
        rows.append((C, EVENTS, SLEEP, EVENTCPU, SCPUs, hwmon2_pwr, hwmon3_pwr,
                     hwmon2_min, hwmon2_max, hwmon2_mean,
                     hwmon3_min, hwmon3_max, hwmon3_mean,
                     time_range, latency_min, latency_max, latency_mean, KEY))

    df = pd.DataFrame(data=rows, columns=[
        "configs", "numevents", "sleeptime", "eventprocCPUs", "sourceCPUs",
        "hwmon2", "hwmon3",
        "hwmon2_min", "hwmon2_max", "hwmon2_mean",
        "hwmon3_min", "hwmon3_max", "hwmon3_mean",
        "time", "latency_min", "latency_max", "latency_mean", "KEY"])
    return df

### Creating Dataframes

##### Process the Data

In [4]:
# Build both summaries in one pass over the RAW flag: first the trimmed
# (RAW=False) frame, then the full-run (RAW=True) frame.
wfey_output, wfey_output_raw = (
    processData(CONFIGS, NUMEVENTS, EVENTPROCCPUS, SLEEPTIME, SOURCECPUS, RAW=use_raw, VERBOSE=False)
    for use_raw in (False, True)
)

SKIPPING: /Users/cjparra/Work/wfey/logs/busypoll_db_wfey/10_1_0.001_10/latency-2025-07-08-17-54-43.out --- Problem parsing latency numbers
SKIPPING: /Users/cjparra/Work/wfey/logs/busypoll_db_wfey/10_1_0.001_10/latency-2025-07-08-15-45-54.out --- Problem parsing latency numbers
SKIPPING: /Users/cjparra/Work/wfey/logs/busypoll_db_wfey/1000_1_0.001_5/latency-2025-07-08-15-47-03.out --- Problem parsing latency numbers
SKIPPING: /Users/cjparra/Work/wfey/logs/wfe_db_nomon_wfey/10000_1_0.001_50/hwmon-2025-07-15-13-20-29.out --- Possible error occured during experiment
SKIPPING: /Users/cjparra/Work/wfey/logs/wfe_db_mon_wfey/100_1_0.001_10/latency-2025-07-08-18-07-21.out --- Problem parsing latency numbers
SKIPPING: /Users/cjparra/Work/wfey/logs/busypoll_db_wfey/10_1_0.001_10/latency-2025-07-08-17-54-43.out --- Problem parsing latency numbers
SKIPPING: /Users/cjparra/Work/wfey/logs/busypoll_db_wfey/10_1_0.001_10/latency-2025-07-08-15-45-54.out --- Problem parsing latency numbers
SKIPPING: /User

##### Remove Outliers

In [7]:
## Outliers are removed in two passes: unused sources report MAXINT as their
## latency, which skews the z-scores, so a loose pass clears those first.

# Pass 1 -- loose threshold (z > 5)
wfey_no_out_first = removeOutliers(DATA=wfey_output, THRESHHOLD_Z=5, VERBOSE=False)
wfey_no_out_raw_first = removeOutliers(DATA=wfey_output_raw, THRESHHOLD_Z=5, VERBOSE=False)

# Pass 2 -- stricter threshold (z > 3) on what survived pass 1
wfey_no_out = removeOutliers(DATA=wfey_no_out_first, THRESHHOLD_Z=3, VERBOSE=False)
wfey_no_out_raw = removeOutliers(DATA=wfey_no_out_raw_first, THRESHHOLD_Z=3, VERBOSE=False)

# Debugging aid, left disabled:
#print(wfey_no_out[ ( wfey_no_out['numevents'] == 10000) & (wfey_no_out['sleeptime'] == '0.01') & (wfey_no_out['sourceCPUs'] == '10')].loc[:, ])

##### Export the Data Frames

In [8]:
# Each frame is written below HOME_DIRECTORY: unfiltered frames first,
# outlier-free ("clean") frames after.
pickle_targets = (
    (wfey_output, '/df/benchmark_output.pkl'),
    (wfey_output_raw, '/df/benchmark_output_raw.pkl'),
    (wfey_no_out, '/df/benchmark_output_clean.pkl'),
    (wfey_no_out_raw, '/df/benchmark_output_clean_raw.pkl'),
)
for frame_to_save, relative_path in pickle_targets:
    frame_to_save.to_pickle(HOME_DIRECTORY + relative_path)