In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys # error msg, add the modules

import pandas as pd
import numpy as np
import operator
import copy

sys.path.append("../../pycode/")
from magus_util import read_nvprof_trace, parse_nvprof_trace, getruntime, sort_dict_by_val
from magus_contention import *

In [2]:
#
# read metrics for sim
#
app2metric_dd = np.load('../similarity/app2metric_dd.npy').item()

In [3]:
len(app2metric_dd)

79

In [4]:
# 
#read traces files in similarity folder
#

traceFolder = "../similarity/traces/"

traceFiles = os.listdir(traceFolder)

In [5]:
traceFiles[:3]

['traces_cudasdk_MCEstimatePiP.csv',
 'traces_cudasdk_simpleCUBLAS.csv',
 'traces_cudasdk_shflscan.csv']

In [6]:
print len(traceFiles)

79


### parse traces and save them into data dictionary

In [7]:
app2trace_dd = {}

for appcsv in traceFiles:
    #print appcsv[7:][:-4]
    
    appName = appcsv[7:][:-4]
    print appName
    
    file_csv = traceFolder + appcsv
    #print file_csv
    
    # read csv file to dataframe
    df_trace = read_nvprof_trace(file_csv)
    
    # convert dataframe to trace list
    appTraceList = parse_nvprof_trace(df_trace)
    
    app2trace_dd[appName] = appTraceList
    #break

cudasdk_MCEstimatePiP
cudasdk_simpleCUBLAS
cudasdk_shflscan
poly_gesummv
cudasdk_binomialOptions
poly_atax
cudasdk_lineOfSight
lonestar_mst
cudasdk_BlackScholes
cudasdk_MCSingleAsianOptionP
shoc_lev1sort
poly_3dconv
rodinia_hotspot
cudasdk_scalarProd
poly_3mm
parboil_mriq
parboil_stencil
poly_gemm
cudasdk_radixSortThrust
rodinia_gaussian
cudasdk_SobolQRNG
poly_fdtd2d
rodinia_pathfinder
poly_correlation
shoc_lev1BFS
cudasdk_convolutionTexture
lonestar_sssp
shoc_lev1reduction
cudasdk_concurrentKernels
rodinia_lud
shoc_lev1fft
cudasdk_MCEstimatePiQ
cudasdk_batchCUBLAS
shoc_lev1GEMM
poly_syrk
cudasdk_matrixMul
cudasdk_convolutionFFT2D
cudasdk_dxtc
rodinia_hybridsort
cudasdk_c++11Cuda
rodinia_needle
cudasdk_stereoDisparity
poly_mvt
cudasdk_threadFenceReduction
lonestar_bh
cudasdk_mergeSort
parboil_lbm
rodinia_dwt2d
rodinia_backprop
parboil_sgemm
cudasdk_convolutionSeparable
cudasdk_boxFilterNPP
cudasdk_scan
cudasdk_dwtHaar1D
cudasdk_quasirandomGenerator
cudasdk_interval
cudasdk_vectorAdd
cu

In [8]:
#
# save to a file
#
np.save('app2trace_dd.npy', app2trace_dd)

### run performance model

In [9]:
def getruntime(appTraceList):
    """
    Return the difference between 1st api start and last api end.
    """
    return appTraceList[-1][2] - appTraceList[0][1]


def update_trace_offset(tracelist, offset):
    """
    Adjust the starting time (add offset) to each api call in the traceList.
    """
    for eachApi in tracelist:
        eachApi[1] += offset
        eachApi[2] += offset
        
def update_trace_api(tracelist, api_index, offset):
    """
    Adjust the starting time (starting from api_index) in the traceList.
    """
    for pid in xrange(len(tracelist)):
        if pid >= api_index:
            tracelist[pid][1] += offset
            tracelist[pid][2] += offset


            

def adjust_prevTraceTable_api(traceTable, apiID, newStart, oldStart):
    offset = newStart - oldStart
    for api_id, apiCall in enumerate(traceTable):
        # for each api that start before oldStart, remain the same
        # that start after oldStart, add an offset
        myStart, myEnd = apiCall[1], apiCall[2]
        if myStart >= oldStart:
            # add offset
            traceTable[api_id][1] += offset
            traceTable[api_id][2] += offset

            

def model_contention(prevTraceList, newapi, copyEngineNum=2):
    """
    For the newapi, look for contention duing apiStart and apiEnd.
    Default configuration assumes the copy engine number is 2.
    """
    curType, curStart, curEnd = newapi[0], newapi[1], newapi[2]

    #print "\n(Current Api)"
    #print curType, curStart, curEnd

    contentionCount = 0
    adjCurrent, adjTraceTab = False, False

    # iterate all the apps in the traceTable
    for apiID, apiCall in enumerate(prevTraceList):
        preType, preStart, preEnd = apiCall[0], apiCall[1], apiCall[2]

        if (curStart < preEnd <= curEnd) or (curStart <= preStart < curEnd) or (curStart > preStart and curEnd < preEnd):
            if preType == curType:
                contentionCount = contentionCount + 1
                if preStart <= curStart:  # delay current api till the end of prevEnd
                    # print "adjust new api"
                    adjCurrent = True
                    newStart = preEnd
                    oldStart = curStart
                else:  # move the app in traceTable after current api
                    # print "adjust app in traceTable"
                    adjTraceTab = True
                    newStart = curEnd
                    oldStart = preStart
                # find out whether current api has any contention with previous application's api calls 
                return contentionCount, adjCurrent, adjTraceTab, newStart, oldStart, apiID


            if ((preType == 'h2d' and curType == 'd2h') or (preType == 'd2h' and curType == 'h2d')) and (copyEngineNum == 1):
                contentionCount = contentionCount + 1
                # Duplicate previous operations
                if preStart <= curStart:  # delay current api till the end of prevEnd
                    #print "adjust new api"
                    adjCurrent = True
                    newStart = prevEnd
                    oldStart = curStart
                else:  # move the app in traceTable after current api
                    #print "adjust app in traceTable"
                    adjTraceTab = True
                    newStart = curEnd
                    oldStart = preStart
                return contentionCount, adjCurrent, adjTraceTab, newStart, oldStart, apiID

    return contentionCount, adjCurrent, adjTraceTab, None, None, None



def predict_perf(prev_trace_org, current_trace_org):
    """
    Predict performance impact between two application traces
    """
    prev_trace = copy.deepcopy(prev_trace_org)
    current_trace = copy.deepcopy(current_trace_org)

    AvgSlowDown = 0

    #===============#
    # record the orginal runtime 
    #===============#
    orgTime = []
    prev_rt = getruntime(prev_trace)
    orgTime.append(prev_rt)
    ##print "\n=> prev app runtime : %f" % prev_rt

    current_rt = getruntime(current_trace)
    ##print "=> current app runtime : %f" % current_rt
    orgTime.append(current_rt)

    #===============#
    # figure out when to start the coming workload
    #===============#
    # get the ending time of 1st api (for prev app) : [apitype, start, end, .... ]
    prevapp_type  = prev_trace[0][0]
    prevapp_start = prev_trace[0][1]
    prevapp_end   = prev_trace[0][2]

    newapp_type = current_trace[0][0]

    simulate_startPos = None
    extra_delay_for_newapp = 0.

    if prevapp_type == newapp_type:
        # when there is contention, start after prev ends
        simulate_startPos = prevapp_end
        # [Note] count in the starting delay
        extra_delay_for_newapp = prevapp_end - prevapp_start
    else:
        # if different, assume they start at the same time
        simulate_startPos = prevapp_start

    newapp_start = current_trace[0][1] # update new app api starting point

    prev_cur_diff = simulate_startPos - newapp_start  # the amount to adjust the starting point

    newapp_trace = copy.deepcopy(current_trace)

    # sync newapp timing with traceTable
    update_trace_offset(newapp_trace, prev_cur_diff)

    #===============#
    # analyze the contention for each API 
    #===============#
    for i in xrange(len(newapp_trace)):
        api = newapp_trace[i]
        CheckContention = True

        while CheckContention:
            #
            # check contention for current api call
            #
            contentionCount, adjCurrent, adjTraceTab, newStart, oldStart, apiID = model_contention(prev_trace, api)

            if contentionCount == 0:
                CheckContention = False  # move to the next api
            else:
                # there are contention for current api
                #print contentionCount, adjCurrent, adjTraceTab, newStart, appID, apiID

                if adjCurrent:
                    #print "=>adjust current api"
                    #print "before updating api"
                    #print newapp_trace

                    api_offset = newStart - api[1]
                    update_trace_api(newapp_trace, i, api_offset)  # update new app trace list

                    #print "after updating api"
                    #print newapp_trace

                if adjTraceTab:
                    adjust_prevTraceTable_api(prev_trace, apiID, newStart, oldStart)

    #=====================================================#
    # measure slowdown ratio for each application
    #=====================================================#
    newTime = []
    myRuntime = getruntime(prev_trace)
    ##print "\n=> prev app runtime (after adjustment) : %f" % myRuntime 
    newTime.append(myRuntime)

    # add adjusted timing for new app + with extra starting delay
    newTime.append(getruntime(newapp_trace) + extra_delay_for_newapp) 
    ##print "\n=> current app runtime (after adjustment) : %f" % getruntime(newapp_trace)
    
    #=====================================================#
    # measure slowdown ratio for each application
    #=====================================================#
    slowdown_ratio = []
    for i, newT in enumerate(newTime):
        sdr = float(newT) / orgTime[i] - 1.   # compute slowdown ratio
        slowdown_ratio.append(sdr)

    AvgSlowDown = sum(slowdown_ratio) / float(len(newTime))

    return AvgSlowDown

### test cases

In [10]:
def select_top3_least_slowdown(prev_app_trace, app, app2trace_dd):
    AvgSlowDown_dd = {}
    for key, value in app2trace_dd.iteritems():
        if key <> app:
            AvgSlowDown = predict_perf(prev_app_trace, value) # select app to corun
            AvgSlowDown_dd[key] = AvgSlowDown
            #print AvgSlowDown
    avg_slowdown_sort = sorted(AvgSlowDown_dd.items(), key=operator.itemgetter(1))
    #print avg_slowdown_sort
    
    print "\nTop3 least impact (slowdown) for %s\n" % app
    print avg_slowdown_sort[0]
    print avg_slowdown_sort[1]
    print avg_slowdown_sort[2]
    
#     print "\nTop5 least impact (slowdown) for %s\n" % app
#     print avg_slowdown_sort[0]
#     print avg_slowdown_sort[1]
#     print avg_slowdown_sort[2]
#     print avg_slowdown_sort[3]
#     print avg_slowdown_sort[4]
    

In [11]:
#
# 2gpus
#
test1 = ['cudasdk_convolutionSeparable','cudasdk_fastWalshTransform']

for app in test1:
    select_top3_least_slowdown(app2trace_dd[app],app,app2trace_dd)
    


Top3 least impact (slowdown) for cudasdk_convolutionSeparable

('cudasdk_MCEstimatePiInlineP', 0.0)
('cudasdk_concurrentKernels', 0.0)
('cudasdk_MCEstimatePiP', 0.0)

Top3 least impact (slowdown) for cudasdk_fastWalshTransform

('cudasdk_MCEstimatePiInlineQ', 0.0)
('cudasdk_MCEstimatePiInlineP', 0.0)
('rodinia_gaussian', 0.0)


In [12]:
# # apply similarity to select the best


# app1_metric = app2metric_dd['cudasdk_convolutionSeparable']

# dist_dd = {}
# for app2 in ['cudasdk_MCEstimatePiInlineP', 'cudasdk_concurrentKernels', 'cudasdk_MCEstimatePiP']:
#     app2_metric = app2metric_dd[app2]
#     dist = np.linalg.norm(app1_metric - app2_metric)
#     dist_dd[app2] = dist

# sort_dist = sorted(dist_dd.items(), key=operator.itemgetter(1))
# print sort_dist[-1]



# app1_metric = app2metric_dd['cudasdk_fastWalshTransform']

# dist_dd = {}
# for app2 in ['cudasdk_MCEstimatePiInlineP', 'rodinia_gaussian', 'cudasdk_MCEstimatePiP']:
#     app2_metric = app2metric_dd[app2]
#     dist = np.linalg.norm(app1_metric - app2_metric)
#     dist_dd[app2] = dist

# sort_dist = sorted(dist_dd.items(), key=operator.itemgetter(1))
# print sort_dist[-1]



In [13]:
test2 = ['poly_correlation','poly_covariance']

for app in test2:
    select_top3_least_slowdown(app2trace_dd[app],app,app2trace_dd)


Top3 least impact (slowdown) for poly_correlation

('cudasdk_MCEstimatePiInlineP', 0.0)
('cudasdk_concurrentKernels', 0.0)
('cudasdk_MCEstimatePiP', 0.0)

Top3 least impact (slowdown) for poly_covariance

('cudasdk_interval', -1.6653345369377348e-16)
('cudasdk_MCEstimatePiInlineP', 0.0)
('cudasdk_concurrentKernels', 0.0)


In [14]:
test3 = ['cudasdk_stereoDisparity','poly_3mm']

for app in test3:
    select_top3_least_slowdown(app2trace_dd[app],app,app2trace_dd)


Top3 least impact (slowdown) for cudasdk_stereoDisparity

('cudasdk_MCEstimatePiInlineP', 0.0)
('cudasdk_MCEstimatePiP', 0.0)
('cudasdk_interval', 0.013468186886816724)

Top3 least impact (slowdown) for poly_3mm

('cudasdk_MCEstimatePiInlineP', 0.0)
('cudasdk_concurrentKernels', 0.0)
('cudasdk_MCEstimatePiP', 0.0)
