In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import os
import glob

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers
#import tensorflow_decision_forests as tfdf

print(tf.__version__)

# Make NumPy printouts easier to read.
np.set_printoptions(precision=3, suppress=True)

2.8.0


### Club all the traces together into a single dataframe

In [2]:
path = './traces/'# use your path
#all_files = glob.glob(path + "/*.h5")
all_files = glob.glob(path + "/dpotrf_T_*_N_20k-*.prof.h5")

li = []

for filename in all_files:
    trace =  pd.HDFStore(filename)
    data = trace.get('/events')
    trace.close()
    
    li.append(data)

df = pd.concat(li, axis=0, ignore_index=True)


In [3]:
print(len(li))

8


# Clean the data

In [5]:
for i in range(0, len(li)):
    li[i] = li[i][['begin', 'end', 'taskpool_id', 'task_class_id','chore_id', 'nb_data_items', 'total_data_size', 'priority']].copy()
    li[i]['exec_time'] = (li[i]['end'] - li[i]['begin']) * .001 #0.001 nano seconds to micro seconds
    li[i] = li[i].astype(float)

In [6]:
df_class_name = {0: 'dpotrf', 1: 'dtrsm', 2: 'dsyrk', 3: 'dgemm'}

In [7]:
for i in range(0, len(li)):
    filter = ( (li[i]['task_class_id'] >= 0 ) & ( li[i]['task_class_id'] <= 3 ) & ( li[i]['priority'] >= 0 ) )
    #remove all other classes other than dpotrf, dgemm, trsm and syrk
    li[i] = li[i][filter]
    li[i].head()

## integrate likelihood of the data being in cash.
we assume that all the task whose execution time is in the first quartile could have had all its data in the cache. While, the rest of the tasks would have resulted in a cache flush. While predicting we assume that the that the data of the task is not in the cache 

In [80]:
li[2][li[2]['task_class_id'] == 3].head()

Unnamed: 0,begin,end,taskpool_id,task_class_id,chore_id,nb_data_items,total_data_size,priority,exec_time
127,983020197.0,985542618.0,4.0,3.0,0.0,3.0,960000.0,732924.0,2522.421
128,985588651.0,986180220.0,4.0,3.0,0.0,3.0,960000.0,805509.0,591.569
129,986207271.0,986783456.0,4.0,3.0,0.0,3.0,960000.0,707183.0,576.185
130,986806269.0,987403878.0,4.0,3.0,0.0,3.0,960000.0,665169.0,597.609
131,987428517.0,988010563.0,4.0,3.0,0.0,3.0,960000.0,729588.0,582.046


## Load Models

In [10]:
new_LR_model = tf.keras.models.load_model('./LR_model_all_class')

2022-04-14 07:53:31.738348: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
new_DNN_model = tf.keras.models.load_model('./DNN_all_class')

In [79]:
# nb_data_items total_data_size priority cache_likelihood task_class_0.0 task_class_1.0 task_class_2.0 task_class_3.0

LR_predicted = new_LR_model.predict([1.0, 320000.0, 804357.0, 0, 1, 0, 0, 0])
DNN_predicted = new_DNN_model.predict([1.0, 320000.0, 1804357.0, 0, 1, 0, 0, 0])
Actual = '348.527'
print('Actual = ' + str(Actual) + ' LR_predicted ' + str(LR_predicted) + ' DNN_predicted ' + str(DNN_predicted))

filter = (li[2]['task_class_id'] == 0 )
df_2 = li[2][filter]
print('----------------- Class 2 -------------------------- ')
print('Min = ' + str(df_2['exec_time'].min()))
print('Max = ' + str(df_2['exec_time'].max()))
print('Avg = ' + str(sum(df_2['exec_time']) / len(df_2['exec_time'])))
print('Std = ' + str(df_2['exec_time'].std()))

Actual = 348.527 LR_predicted [[239.487]] DNN_predicted [[287.472]]
----------------- Class 2 -------------------------- 
Min = 292.163
Max = 36786.868
Avg = 1974.9610200000004
Std = 6806.272262886053


In [74]:
# nb_data_items total_data_size priority cache_likelihood task_class_0.0 task_class_1.0 task_class_2.0 task_class_3.0

LR_predicted = new_LR_model.predict([2.0, 640000.0, 128.0, 0, 0, 1, 0, 0])
DNN_predicted = new_DNN_model.predict([2.0, 640000.0, 128.0, 0, 0, 1, 0, 0])
Actual = '566.124'
print('Actual = ' + str(Actual) + ' LR_predicted ' + str(LR_predicted) + ' DNN_predicted ' + str(DNN_predicted))

filter = (li[2]['task_class_id'] == 1 )
df_2 = li[2][filter]
print('----------------- Class 2 -------------------------- ')
print('Min = ' + str(df_2['exec_time'].min()))
print('Max = ' + str(df_2['exec_time'].max()))
print('Avg = ' + str(sum(df_2['exec_time']) / len(df_2['exec_time'])))
print('Std = ' + str(df_2['exec_time'].std()))

Actual = 566.124 LR_predicted [[521.365]] DNN_predicted [[559.016]]
----------------- Class 2 -------------------------- 
Min = 242.71200000000002
Max = 46661.549
Avg = 1205.5731048979617
Std = 4379.527597086793


In [72]:
# nb_data_items total_data_size priority cache_likelihood task_class_0.0 task_class_1.0 task_class_2.0 task_class_3.0

LR_predicted = new_LR_model.predict([2.0, 640000.0, 45.0, 0, 0, 0, 1, 0])
DNN_predicted = new_DNN_model.predict([2.0, 640000.0, 45.0, 0, 0, 0, 1, 0])
Actual = '410.847'
print('Actual = ' + str(Actual) + ' LR_predicted ' + str(LR_predicted) + ' DNN_predicted ' + str(DNN_predicted))

filter = (li[2]['task_class_id'] == 2 )
df_2 = li[2][filter]
print('----------------- Class 2 -------------------------- ')
print('Min = ' + str(df_2['exec_time'].min()))
print('Max = ' + str(df_2['exec_time'].max()))
print('Avg = ' + str(sum(df_2['exec_time']) / len(df_2['exec_time'])))
print('Std = ' + str(df_2['exec_time'].std()))

Actual = 410.847 LR_predicted [[460.452]] DNN_predicted [[420.474]]
-----------------Class-------------------------- 
Min = 174.193
Max = 115778.071
Avg = 1317.3406866666628
Std = 5363.1834662625515


In [81]:
# nb_data_items total_data_size priority cache_likelihood task_class_0.0 task_class_1.0 task_class_2.0 task_class_3.0

LR_predicted = new_LR_model.predict([3.0, 960000.0, 729588.0, 0, 0, 0, 0, 1])
DNN_predicted = new_DNN_model.predict([3.0, 960000.0, 729588.0, 0, 0, 0, 0, 1])
Actual = '582.046'
print('Actual = ' + str(Actual) + ' LR_predicted ' + str(LR_predicted) + ' DNN_predicted ' + str(DNN_predicted))

filter = (li[2]['task_class_id'] == 3 )
df_2 = li[2][filter]
print('----------------- Class 3 -------------------------- ')
print('Min = ' + str(df_2['exec_time'].min()))
print('Max = ' + str(df_2['exec_time'].max()))
print('Avg = ' + str(sum(df_2['exec_time']) / len(df_2['exec_time'])))
print('Std = ' + str(df_2['exec_time'].std()))



Actual = 582.046 LR_predicted [[672.263]] DNN_predicted [[1106.801]]
----------------- Class 3 -------------------------- 
Min = 484.341
Max = 158086.646
Avg = 1769.7356544217769
Std = 6268.618821464601
