* read the single-stream trace (./profile_results_s1): extract the kernel info
* read the two-stream trace (./profile_results): from the kernel overlap, find the total kernel runtime, which serves as the ground truth
* run the avg blk model to predict the CKE (two-stream case) kernel runtime.

In [1]:
import warnings
import pandas as pd
import numpy as np
import os
import operator # sorting
from math import *

from read_trace import *
from avgblkmodel import *

warnings.filterwarnings("ignore", category=np.VisibleDeprecationWarning)

# GPU info

In [2]:
# Device description handed to the model; values are assigned in the order listed.
gtx950 = DeviceInfo()
for _field, _value in (
    ('sm_num', 6),
    ('sharedmem_per_sm', 49152),
    ('reg_per_sm', 65536),
    ('maxthreads_per_sm', 2048),
):
    setattr(gtx950, _field, _value)

# Two Stream Timing

* read the CKE trace folder and find the data sizes that have a positive overlapping rate
* record the concurrent kernel runtime for those positive-overlap cases

In [3]:
#----------------------------------------------
# ls all the trace files in the targeted folder
#----------------------------------------------
target_folder = './profile_results'
trace_list = []   # file names of the traces
trace_paths = []  # path of each entry of trace_list, in the same order
for root, dirs, files in os.walk(target_folder):
    for fname in files:
        if 'trace' in fname:
            trace_list.append(fname)
            # os.walk descends into sub-folders, so the path has to be built
            # from the folder the file was actually found in
            trace_paths.append(os.path.join(root, fname))

#----------------------------------------------------
# record the positive overlapping rate for different data size
#----------------------------------------------------
# maps data size (string) -> [overlapping rate, cke_time_ms]
ovlp_dict = {}

for item, trace_file in zip(trace_list, trace_paths):
    current_ovlp, cke_time_ms = check_kernel_ovlprate(trace_file)
    N = item.replace("trace_", "").replace(".csv", "")  # find out the data size
    if current_ovlp > 0:
        ovlp_dict[N] = [current_ovlp, cke_time_ms]

In [4]:
#sorted_ovlp_dict = sorted(ovlp_dict.items(), key=operator.itemgetter(1), reverse=True)

In [5]:
#sorted_ovlp_dict

In [6]:
# cke_time_ms stored for data size 23000 (second item of [overlapping rate, cke_time_ms])
ovlp_dict['23000'][1]

0.11270500000000538

### Function to model N kernel concurrent execution

In [7]:
#---------------------------------------------
# model cke function
#---------------------------------------------
def model_cke(Gpu, kernels, avg_blk_time_list):
    """Predict the runtime of several kernels executing concurrently.

    The blocks of every kernel are dealt to the SMs round-robin; kernel ``i``
    starts on the SM following the one that received the last block of
    kernel ``i - 1``.  A block that fits on its SM (``check_sm_resource``
    returns 1) starts at 0 on an empty SM, otherwise at the time returned by
    ``Search_block_start``.  When the SM is full (``check_sm_resource``
    returns 0), the active blocks with the earliest ``block_end`` are retired
    and the new block starts at that time.  Each block runs for the average
    block time of its kernel.

    Args:
        Gpu: device description; ``Gpu.sm_num`` is read here and the object
            is handed to ``sm_stat.init``.
        kernels: kernel descriptions in scheduling order; ``gridDim`` is read
            here and the objects are passed to ``check_sm_resource``,
            ``Allocate_block`` and ``Rm``.
        avg_blk_time_list: average block runtime of each kernel, indexed
            like ``kernels``.

    Returns:
        The largest per-SM span ``max(block_end) - min(block_start)``, in the
        unit of ``avg_blk_time_list``; 0.0 when no block was scheduled.
    """
    # pandas is still imported locally, as before.  The function-level
    # ``from avgblkmodel import *`` was dropped: it is a SyntaxError on
    # Python 3, and sm_stat / check_sm_resource / Search_block_start are
    # already provided by the module-level star imports.
    import pandas as pd

    trace_columns = ['sm_id', 'block_id', 'block_start', 'block_end',
                     'batch_id', 'kernel_id', 'active']

    # init SM
    sm_num = Gpu.sm_num
    sms = [sm_stat() for _ in range(sm_num)]
    for sm in sms:
        sm.init(Gpu)

    # one independent block-trace table per SM (not one shared object)
    sm_trace = [pd.DataFrame(columns=trace_columns) for _ in range(sm_num)]

    def add_block(sm_id, block_id, block_start, block_end, kernel_id):
        """Append one active block to the trace table of `sm_id`."""
        row = pd.DataFrame([{'sm_id': sm_id,
                             'block_id': block_id,
                             'block_start': block_start,
                             'block_end': block_end,
                             'batch_id': sms[sm_id].batch,
                             'kernel_id': kernel_id,
                             'active': 1}], columns=trace_columns)
        # DataFrame.append was removed in pandas 2.0; pd.concat works on old
        # and new versions alike.
        if sm_trace[sm_id].empty:
            sm_trace[sm_id] = row
        else:
            sm_trace[sm_id] = pd.concat([sm_trace[sm_id], row],
                                        ignore_index=True)

    #----------------
    # start modeling the trace
    #----------------
    sm2start = 0

    for kid, kern in enumerate(kernels):
        kernel_blocks = int(kern.gridDim)  # total blocks of the current kernel
        last_block_on_sm = 0

        for bid in range(kernel_blocks):
            # find out which sm to allocate
            sm_id = (bid + sm2start) % sm_num

            # check whether current sm has enough resources to host the block
            to_allocate_another_block = check_sm_resource(sms[sm_id], kern)

            if to_allocate_another_block == 0:
                #-------------------------------------------
                # The SM is full: retire the active blocks that finish first,
                # then start the current block when they end.
                #-------------------------------------------
                df_sm = sm_trace[sm_id]
                df_activeblk = df_sm.loc[df_sm['active'] == 1]

                blkend_min = df_activeblk['block_end'].min()
                df_blk2end = df_activeblk.loc[
                    df_activeblk['block_end'] == blkend_min]
                for index, row in df_blk2end.iterrows():
                    # single .loc[row, col] assignment: the chained form
                    # .loc[index]['active'] = 0 may write to a temporary copy
                    sm_trace[sm_id].loc[index, 'active'] = 0
                    # release the resources of the kernel that owns the
                    # retiring block, which need not be the current kernel
                    sms[sm_id].Rm(kernels[int(row['kernel_id'])])

                # after retiring some blocks, we have resources to allocate
                # the current block
                sms[sm_id].Allocate_block(kern)

                block_start = blkend_min  # starts when the retired blocks end
                block_end = block_start + avg_blk_time_list[kid]
                add_block(sm_id, bid, block_start, block_end, kid)

            elif to_allocate_another_block == 1:
                #----------------------------------
                # there is enough resource to host the current block
                #----------------------------------
                sms[sm_id].Allocate_block(kern)

                if sm_trace[sm_id].empty:
                    # nothing has run on this SM yet
                    block_start = 0
                else:
                    # start time is derived from the blocks already traced
                    # on this SM
                    block_start = Search_block_start(sm_trace[sm_id], kid)

                block_end = block_start + avg_blk_time_list[kid]
                add_block(sm_id, bid, block_start, block_end, kid)

            last_block_on_sm = sm_id

        # the next kernel starts on the SM after the last one used
        sm2start = (last_block_on_sm + 1) % sm_num

    #------------------------------
    # predict time
    #------------------------------
    pred_kern_time = 0.0

    for trace in sm_trace:
        # an SM without blocks yields NaN, which never passes the comparison
        sm_time_max = trace['block_end'].max() - trace['block_start'].min()
        if pred_kern_time < sm_time_max:
            pred_kern_time = sm_time_max

    return pred_kern_time
#---------------------------------------------
# end of model cke function
#---------------------------------------------    

  def model_cke(Gpu, kernels, avg_blk_time_list):


In [8]:
# output data table: one row per data size with the measured ('real')
# and the predicted ('model') kernel runtime
df_results = pd.DataFrame(columns=['datasize', 'real', 'model'])

In [9]:
# dict.items() works on both Python 2 and 3 (iteritems() is Python 2 only)
for key, value in ovlp_dict.items():
    # key is the data size
    # value is a list [overlapping rate, kernel_runtime]
    data_size = int(key)

    #------------------------------------------
    # real runtime
    #------------------------------------------
    real_kern_time = float(value[1])

    #------------------------------------------
    # read the tracefile, get the kernel info and single kernel runtime
    #------------------------------------------
    # e.g. './profile_results_s1/trace_23000.csv'
    trace_file = './profile_results_s1/trace_' + str(data_size) + '.csv'

    df_trace = trace2dataframe(trace_file)  # read the trace to the dataframe
    streaminfo = get_stream_info(df_trace)  # stream info: h2d/d2d/kernel/kernel_info

    # kernel info: first kernel of the first stream
    current_kern_info = streaminfo[0].kernel_info[0]
    grid_dim = (float(current_kern_info.grid_x)
                * float(current_kern_info.grid_y)
                * float(current_kern_info.grid_z))
    block_dim = (float(current_kern_info.blk_x)
                 * float(current_kern_info.blk_y)
                 * float(current_kern_info.blk_z))
    reg_per_thread = float(current_kern_info.regs_per_thread)
    sm_per_blk = float(current_kern_info.sm_per_block)

    # kernel runtime in ms
    current_kern = streaminfo[0].kernel[0]
    kern_runtime_ms = float(current_kern.end_time_ms) - float(current_kern.start_time_ms)

    #------------------------------------------
    # set up prediction parameters
    #------------------------------------------
    # simulate kernel number
    kernel_num = 2

    kernels = [KernelInfo() for _ in range(kernel_num)]

    # running the identical kernels
    for kern in kernels:
        kern.blockDim = block_dim
        kern.gridDim = grid_dim
        kern.reg_per_thread = reg_per_thread
        kern.sharedmem_per_blk = sm_per_blk
        kern.runtime_ms = kern_runtime_ms

    # compute the avg block runtime for each (single) kernel
    avg_blk_time_list = [compute_avgblktime(gtx950, kern) for kern in kernels]

    #------------------------------------------
    # run cke model prediction
    #------------------------------------------
    pred_kern_time = model_cke(gtx950, kernels, avg_blk_time_list)

    # DataFrame.append was removed in pandas 2.0, so the row is added with
    # pd.concat.  datasize is stored as float, as it was when appended as a dict.
    result_row = pd.DataFrame([{'datasize': float(data_size),
                                'real': real_kern_time,
                                'model': pred_kern_time}],
                              columns=['datasize', 'real', 'model'])
    if df_results.empty:
        df_results = result_row
    else:
        df_results = pd.concat([df_results, result_row], ignore_index=True)

In [11]:
# write the real-vs-model table to model_results.csv, without the row index
df_results.to_csv('model_results.csv', index=False, encoding='utf-8')

In [12]:
# display the results table (columns: datasize, real, model)
df_results

Unnamed: 0,datasize,real,model
0,286000.0,1.156777,1.136407
1,205000.0,0.833447,0.848328
2,110000.0,0.517124,0.539460
3,66000.0,0.321763,0.295799
4,178000.0,0.724230,0.703059
5,248000.0,1.005864,0.982196
6,128000.0,0.521220,0.543104
7,244000.0,0.989192,0.992008
8,230000.0,0.932231,0.934920
9,59000.0,0.290051,0.292738
