In [21]:
%run SetUp.ipynb

Home Directory = /home/cjpar/Work/wfe/wfey
Log Path = /home/cjpar/Work/wfe/wfey/logs/zero_test/


# Processing Logs

In [2]:
def removeOutliers(DATA, THRESHHOLD_Z=2, VERBOSE=False):
    """Return a copy of DATA without rows that are z-score outliers.

    The absolute z-score (NaNs omitted from the mean/std) is computed separately
    for each of the columns ``hwmon2_mean``, ``hwmon2_min``, ``hwmon2_max``,
    ``latency_mean``, ``latency_min`` and ``latency_max``. A row is removed when
    its score in at least one of those columns is greater than THRESHHOLD_Z.

    Parameters
    ----------
    DATA : pandas.DataFrame
        Must contain the six columns listed above; the ``KEY`` column is only
        needed when VERBOSE is true. DATA itself is not modified.
    THRESHHOLD_Z : float, default 2
        Rows with an absolute z-score strictly above this value are dropped.
    VERBOSE : bool, default False
        When true, print the z-scores, the frame shapes and the removed rows.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, re-indexed from 0.
    """
    metric_columns = (
        'hwmon2_mean', 'hwmon2_min', 'hwmon2_max',
        'latency_mean', 'latency_min', 'latency_max',
    )

    z_scores = {}
    flagged = {}
    for column in metric_columns:
        z_scores[column] = np.abs(
            stats.zscore(DATA[column].to_numpy(dtype=float), nan_policy='omit')
        )
        # np.where yields row *positions*; translate them to index *labels* so that
        # drop()/loc[] below address the right rows even when DATA does not carry a
        # 0..n-1 index (e.g. after filtering). NaN scores compare False and are kept.
        positions = np.where(z_scores[column] > THRESHHOLD_Z)[0]
        flagged[column] = DATA.index[positions]

    # np.concatenate is available on every NumPy release (np.concat only on >= 2.0).
    outlier_indices = np.unique(
        np.concatenate([flagged[column].to_numpy() for column in metric_columns])
    )
    no_outliers = DATA.drop(outlier_indices)

    if VERBOSE:
        print("Z scores:")
        print("Hwmon2 mean, min, max")
        print(z_scores['hwmon2_mean'], z_scores['hwmon2_min'], z_scores['hwmon2_max'])
        print("Latency mean, min, max")
        print(z_scores['latency_mean'], z_scores['latency_min'], z_scores['latency_max'])

        print("Original DataFrame Shape:", DATA.shape)
        print("DataFrame Shape after Removing Outliers:", no_outliers.shape)
        print("Removed Indexes:")
        print(outlier_indices)
        print("\t HWMON -- mean, min, max")
        for column in ('hwmon2_mean', 'hwmon2_min', 'hwmon2_max'):
            print(DATA.loc[flagged[column], ['KEY', column]])
        print("\t Latency -- mean, min, max")
        for column in ('latency_mean', 'latency_min', 'latency_max'):
            print(DATA.loc[flagged[column], ['KEY', column]])

    return no_outliers.reset_index(drop=True)

In [45]:
# Notebook-level lists. processData collects its rows in a local list and does not
# read or write these; they stay defined in case another cell refers to them.
arr = []
data = []


def _readHwmonDelta(HWFILE):
    """Return the hwmon counter delta between the first two samples of HWFILE.

    The file is read as space-separated text, the last parsed column is dropped,
    the frame is differenced row-to-row and only the row at position 1 (second
    sample minus first sample) is kept, so the result has at most one row.
    """
    samples = pd.read_csv(HWFILE, sep=' ')
    samples = samples.iloc[:, :-1]  # dropping last column because there is an extra space to be dealt with later
    ## NOTE: if we want min/max numbers this needs to be done later and the start and end values should stay
    return samples.diff().iloc[1:2, :]


def _readBenchmark(BMFILE):
    """Parse a wfey benchmark log into a DataFrame with one row per line.

    Every line must have the form ``name=value, name=value, ...``; each value is
    converted with ``int(value, 0)``. A line that does not match raises ValueError.
    """
    with open(BMFILE) as wfey_file:
        records = [line.strip() for line in wfey_file]

    ### NOTE: address value of epthread converts into an int -- useless
    parsed = [
        {name: int(value, 0) for name, value in (kv.split('=') for kv in record.split(', '))}
        for record in records
    ]
    return pd.DataFrame(parsed)


def processData(CONFIGS, EVENTRATE, EVENTPROCCPUS, SOURCESCPUS, RAW=True, VERBOSE=False):
    """Collect hwmon, benchmark and latency logs into one row per parameter combination.

    For every combination of the four sweep arguments a KEY of the form
    ``<config>/<eventrate>_<eventprocCPU>_<sourceCPUs>`` is built (spaces in the
    source-CPU value become underscores) and ``PATH_TO_LOGS + KEY + "/hwmon-*.out"``
    is globbed. Each hwmon file is paired with the ``wfey`` and ``latency`` files
    obtained by replacing ``"hwmon"`` in its path. A run is skipped, with a printed
    message, when one of the three files is missing or cannot be parsed.
    Combinations without any usable run produce no row.

    Per combination the row holds: the KEY and the four sweep values, the
    geometric means of the latency ``Min``, ``Max`` and ``Mean`` columns
    (``Mean`` entries equal to -1 are left out), the first row of the per-index
    mean of the hwmon deltas, and the per-index mean of every benchmark column.

    Parameters
    ----------
    CONFIGS, EVENTRATE, EVENTPROCCPUS, SOURCESCPUS : iterable
        Values to sweep. CONFIGS and EVENTPROCCPUS entries are concatenated into
        the KEY as strings.
    RAW : bool, default True
        When false, the averaged hwmon frame is narrowed to its middle PERCENT
        percent of rows before the first row is taken.
    VERBOSE : bool, default False
        When true, print the KEY and every file as it is processed.

    Returns
    -------
    pandas.DataFrame
        Columns are the eight fixed headers followed by the hwmon and benchmark
        column names of the first combination that was processed successfully.
    """
    # Rows are kept local so that repeated calls never see each other's results.
    rows = []
    column_headers = []
    for C in CONFIGS:
        for EVENTS in EVENTRATE:
            for EVENTCPU in EVENTPROCCPUS:
                # Iterate over the argument that was passed in, not a notebook global.
                for SOURCECPU in SOURCESCPUS:
                    SCPUs = str(SOURCECPU).replace(" ", "_")

                    KEY = C + "/" + str(EVENTS) + "_" + EVENTCPU + "_" + SCPUs
                    FILE = PATH_TO_LOGS + KEY + "/"
                    HWMONFILE = glob.glob(FILE + 'hwmon-*.out')

                    # If no files found with these args -- skip
                    if not HWMONFILE:
                        continue

                    ### One frame per successfully parsed run; concatenated once below
                    hwmon_frames = []
                    bm_frames = []
                    latency_frames = []

                    if VERBOSE:
                        print("KEY: ", KEY)

                    for runs in HWMONFILE:
                        hw_file = runs
                        bm_file = runs.replace("hwmon", "wfey")
                        latency_file = runs.replace("hwmon", "latency")

                        if not (os.path.exists(hw_file) and os.path.exists(bm_file) and os.path.exists(latency_file)):
                            files = runs.replace("hwmon", "*")
                            print("SKIPPING: " + files + " --- Problem with Test Output")
                            continue

                        ## --- Grabbing Energy Numbers --- ##
                        if VERBOSE:
                            print("processsing:" + hw_file)

                        # `except Exception` keeps the best-effort skip but lets
                        # KeyboardInterrupt / SystemExit stop the sweep.
                        try:
                            df = _readHwmonDelta(hw_file)
                        except Exception:
                            print("SKIPPING: " + hw_file + " --- Problem parsing hwmon numbers")
                            continue

                        # NOTE(review): df has at most one row at this point, so this only
                        # fires when OVERFLOW_NUM < 1 -- confirm the intended check.
                        if (len(df) > OVERFLOW_NUM):
                            print("SKIPPING: " + hw_file + " --- Possible error occured during experiment")
                            continue

                        ## --- Grabbing Benchmark Numbers --- ##
                        if VERBOSE:
                            print("processsing:" + bm_file)

                        try:
                            bm_df = _readBenchmark(bm_file)
                        except Exception:
                            print("SKIPPING: " + bm_file + " --- Problem parsing benchmark numbers")
                            continue

                        ## --- Grabbing Latency Numbers --- ##
                        if VERBOSE:
                            print("processsing:" + latency_file)

                        try:
                            latency_df = pd.read_csv(latency_file)
                        except Exception:
                            print("SKIPPING: " + latency_file + " --- Problem parsing latency numbers")
                            continue

                        ## --- Adding all Values of this File to Output --- ##
                        hwmon_frames.append(df)
                        bm_frames.append(bm_df)
                        latency_frames.append(latency_df)

                    # The three lists grow together, so one check covers all of them.
                    if not hwmon_frames:
                        continue

                    hwmonoutput = pd.concat(hwmon_frames)
                    bmoutput = pd.concat(bm_frames)
                    latencyoutput = pd.concat(latency_frames)

                    ## --- Find the energy mean of all runs --- ##
                    raw_result = hwmonoutput.groupby(hwmonoutput.index).mean()

                    ## --- Find the bm mean of all runs --- ##
                    ### TODO add not raw output to the benchmark data
                    bm_raw_result = bmoutput.groupby(bmoutput.index).mean()

                    ## --- Raw Data or Middle Data --- ##
                    results = raw_result
                    bm_results = bm_raw_result
                    if (not RAW):
                        adj_index = round(len(raw_result) * (PERCENT / 100))
                        middle_point = round(len(raw_result) / 2)

                        low_index = middle_point - (math.floor(adj_index / 2))
                        high_index = middle_point + (math.floor(adj_index / 2))

                        results = raw_result[low_index: high_index + 1]

                    # No hwmon file had a second sample: there is no energy delta to
                    # report, so skip the combination instead of failing on [0] below.
                    if len(results) == 0:
                        print("SKIPPING: " + KEY + " --- No hwmon energy delta available")
                        continue

                    ## --- Find Means of Latencys --- ###
                    ### The per-source rows are collapsed here; only the aggregate over
                    ### all sources of this combination is kept.
                    latency_results = latencyoutput.drop("ID", axis=1)

                    latency_min = stats.gmean(latency_results.loc[:, "Min"].to_numpy())
                    latency_max = stats.gmean(latency_results.loc[:, "Max"].to_numpy())

                    ### NOTE: If the sources had no completed events then the mean is -1
                    ### You can use this as a landmark to skip those values when evaluating
                    ### If mean is nan you know there were 0 events total
                    latency_mean = [x for x in latency_results.loc[:, "Mean"].to_numpy() if x != -1]
                    with warnings.catch_warnings():
                        # assuming nan mean is okay because means no events at all
                        warnings.filterwarnings(action="ignore", message='One or more sample arguments is too small')
                        latency_mean = stats.gmean(latency_mean)

                    ## --- Processing Output --- ##
                    temp_headers = []

                    ## --- Making array out of core output --- ##
                    core_energy_data = []
                    for core_name in results.columns:
                        temp_headers.append(core_name)
                        core_results = results.loc[:, core_name].to_numpy()
                        core_energy_data.append(core_results[0])  # only grabbing the one result -- more are not expected?

                    ## --- Making array out of Benchmark output --- ##
                    ### TODO -- sanity check this with multiple values because this was done quick and sloppy
                    ### i.e don't want id to be mean'ed together
                    bm_data = []
                    for bm_data_name in bm_results.columns:
                        temp_headers.append(bm_data_name)
                        bm_data.append(bm_results.loc[:, bm_data_name].to_numpy())

                    ### HACK: the headers of the first combination processed are taken as the
                    ### truth for the whole frame; a malformed first file breaks every row.
                    if not column_headers:
                        column_headers = temp_headers

                    ## --- Adding data to list --- ##
                    main_values = [KEY, C, EVENTS, EVENTCPU, SCPUs, latency_min, latency_max, latency_mean]
                    var_values = core_energy_data + bm_data
                    rows.append(main_values + var_values)

    constant_headers = ["KEY", "configs", "eventrate", "eventprocCPUs", "sourceCPUs", "latency_min", "latency_max", "latency_mean"]
    all_headers = constant_headers + column_headers

    df = pd.DataFrame(data=rows, columns=all_headers)

    return df

### Creating Dataframes

##### Process the Data

In [39]:
# Same sweep twice: once narrowed to the middle samples (RAW=False) and once
# over the full run (RAW=True).
wfey_output = processData(
    CONFIGS,
    EVENTRATE,
    EVENTPROCCPUS,
    SOURCECPUS,
    RAW=False,
    VERBOSE=False,
)
wfey_output_raw = processData(
    CONFIGS,
    EVENTRATE,
    EVENTPROCCPUS,
    SOURCECPUS,
    RAW=True,
    VERBOSE=False,
)

##### Remove Outliers

In [43]:
### HACK -- VERY TEMPORARY
### Outlier filtering has not been adapted to the new data yet, so this
### placeholder hands its input back untouched.
def removeOutliers(DATA, THRESHHOLD_Z=2, VERBOSE=False):
    """Placeholder outlier filter: return DATA as-is.

    Running this cell rebinds ``removeOutliers`` in the notebook namespace, so it
    takes the place of any definition with the same name that was run earlier.
    THRESHHOLD_Z and VERBOSE are accepted for call compatibility and ignored.
    """
    untouched = DATA
    return untouched

## Outliers are removed in two passes (THRESHHOLD_Z=5, then THRESHHOLD_Z=3) --
## unused sources have MAXINT as latency values and skew the z values
wfey_no_out_first = removeOutliers(
    wfey_output,
    THRESHHOLD_Z=5,
    VERBOSE=False,
)
wfey_no_out = removeOutliers(
    wfey_no_out_first,
    THRESHHOLD_Z=3,
    VERBOSE=False,
)

## Same two passes for the raw frame
wfey_no_out_raw_first = removeOutliers(
    wfey_output_raw,
    THRESHHOLD_Z=5,
    VERBOSE=False,
)
wfey_no_out_raw = removeOutliers(
    wfey_no_out_raw_first,
    THRESHHOLD_Z=3,
    VERBOSE=False,
)

#print(wfey_no_out[ ( wfey_no_out['numevents'] == 10000) & (wfey_no_out['sleeptime'] == '0.01') & (wfey_no_out['sourceCPUs'] == '10')].loc[:, ])

##### Export the Data Frames

In [44]:
# Write the four frames as pickles under HOME_DIRECTORY/df/, each file name
# prefixed with LOGS.
_export_dir = HOME_DIRECTORY + '/df/'
os.makedirs(_export_dir, exist_ok=True)  # to_pickle does not create a missing directory

wfey_output.to_pickle(_export_dir + LOGS + '_output.pkl')
wfey_output_raw.to_pickle(_export_dir + LOGS + '_output_raw.pkl')

wfey_no_out.to_pickle(_export_dir + LOGS + '_output_clean.pkl')
wfey_no_out_raw.to_pickle(_export_dir + LOGS + '_output_clean_raw.pkl')