In [1]:
import ROOT
import numpy as np
import pandas as pd 
import json
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from matplotlib.ticker import FormatStrFormatter

# Set ROOT's global gErrorIgnoreLevel to kError via the interpreter.
ROOT.gROOT.ProcessLine("gErrorIgnoreLevel = kError;")

# Default canvas size (pixels) for canvases created without explicit dimensions.
ROOT.gStyle.SetCanvasDefH(900)
ROOT.gStyle.SetCanvasDefW(2400)

# Text sizes shared by legends and axis labels.
ROOT.gStyle.SetLabelSize(0.05)
ROOT.gStyle.SetLegendTextSize(0.05)

# Default marker and line appearance, plus tick length on the y axis.
ROOT.gStyle.SetMarkerStyle(8)
ROOT.gStyle.SetMarkerSize(1)
ROOT.gStyle.SetLineWidth(2)
ROOT.gStyle.SetTickLength(0.02, "y")

Welcome to JupyROOT 6.31/01


In [2]:
def convert_to_type(df, col, type):
    """Replace ``df[col]`` with the result of calling ``type`` on every value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    col : str
        Name of the column to convert.
    type : callable
        Constructor/converter applied to each element (e.g. ``np.float64``).
        Note that this parameter shadows the builtin ``type`` inside the
        function body.

    Returns
    -------
    None
        The column is overwritten on ``df`` itself.
    """
    converted = df[col].apply(type)
    df[col] = converted

def normalize_df(df, cols, norm):
    """Divide each column listed in ``cols`` by the ``norm`` column, in place.

    The denominator is copied before any column is overwritten, so the result
    does not depend on whether (or at which position) ``norm`` itself appears
    in ``cols``. Previously, once ``norm`` had been normalised to 1.0 every
    later column in ``cols`` was divided by 1.0 instead of the original values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place.
    cols : iterable of str
        Names of the columns to normalise.
    norm : str
        Name of the column holding the denominators.

    Returns
    -------
    None
    """
    # Snapshot: df[norm] may be reassigned inside the loop when norm is in cols.
    denominator = df[norm].copy()
    for col in cols:
        df[col] = df[col].div(denominator)

In [10]:
# Total-runtime measurements of two benchmark runs, one CSV per run.
# NOTE(review): this cell labels run 20231206-174647 as "Buffers" and
# 20231211-155923 as "USM", whereas the nsys cells below label
# nsys-20231211-155923 as "Buffers" and nsys-20231206-174647 as "USM".
# One of the two mappings is presumably swapped — confirm against the runs.
df = pd.read_csv("das6-gpu/20231206-174647")
df_usm = pd.read_csv("das6-gpu/20231211-155923")

for _run, _mem_type in ((df, "Buffers"), (df_usm, "USM")):
    # "input" is split on "_": field 1 is the distribution name, and the last
    # field (up to its first ".") is the number of values.
    _fields = _run["input"].apply(lambda s: s.split("_"))
    _run["nvals"] = _fields.apply(lambda parts: np.float64(parts[-1].split(".")[0]))
    _run["distribution"] = _fields.apply(lambda parts: parts[1])
    convert_to_type(_run, "edges", np.bool_)
    convert_to_type(_run, "ttotal", np.float64)
    _run["type"] = _mem_type

# Stack both runs and drop the raw "input" column that has been decomposed above.
df = pd.concat([df, df_usm])
del df["input"]
df

Unnamed: 0,iter,env,gpu,nbins,bulksize,edges,tfindbin,tfill,tstats,ttotal,nvals,distribution,type
0,0,DPC++,A4000,1,32768,False,0.0,0.0,0.0,3.158348,5.000000e+07,uniform,Buffers
1,0,DPC++,A4000,1,32768,True,0.0,0.0,0.0,3.143772,5.000000e+07,uniform,Buffers
2,0,DPC++,A4000,1,32768,False,0.0,0.0,0.0,3.593378,1.000000e+08,uniform,Buffers
3,0,DPC++,A4000,1,32768,True,0.0,0.0,0.0,4.094069,1.000000e+08,uniform,Buffers
4,0,DPC++,A4000,1,32768,False,0.0,0.0,0.0,9.966019,5.000000e+08,uniform,Buffers
...,...,...,...,...,...,...,...,...,...,...,...,...,...
315,4,DPC++,A4000,1000,32768,True,0.0,0.0,0.0,3.748883,1.000000e+08,uniform,USM
316,4,DPC++,A4000,1000,32768,False,0.0,0.0,0.0,10.399123,5.000000e+08,uniform,USM
317,4,DPC++,A4000,1000,32768,True,0.0,0.0,0.0,10.190794,5.000000e+08,uniform,USM
318,4,DPC++,A4000,1000,32768,False,0.0,0.0,0.0,18.280992,1.000000e+09,uniform,USM


In [12]:
# Group rows that share a configuration so that mean()/std() aggregate over
# the remaining columns (including "iter"). The order of the keys below is the
# order of the resulting MultiIndex levels used by later .loc lookups.
gp = df.groupby(
    [
        "env",
        "gpu",
        "type",
        "distribution",
        "nvals",
        "nbins",
        "bulksize",
        "edges",
    ]
)
gp.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,iter,tfindbin,tfill,tstats,ttotal
env,gpu,type,distribution,nvals,nbins,bulksize,edges,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,1,32768,False,2.0,0.0,0.0,0.0,3.949111
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,1,32768,True,2.0,0.0,0.0,0.0,3.837626
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,10,32768,False,2.0,0.0,0.0,0.0,3.738558
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,10,32768,True,2.0,0.0,0.0,0.0,3.144034
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,100,32768,False,2.0,0.0,0.0,0.0,4.693720
...,...,...,...,...,...,...,...,...,...,...,...,...
DPC++,A4000,USM,uniform,1.000000e+09,10,32768,True,2.0,0.0,0.0,0.0,17.607234
DPC++,A4000,USM,uniform,1.000000e+09,100,32768,False,2.0,0.0,0.0,0.0,17.707912
DPC++,A4000,USM,uniform,1.000000e+09,100,32768,True,2.0,0.0,0.0,0.0,17.653145
DPC++,A4000,USM,uniform,1.000000e+09,1000,32768,False,2.0,0.0,0.0,0.0,18.165045


In [15]:
# Distinct values of each benchmark parameter, in order of first appearance in df.
(
    unique_env,
    unique_gpus,
    unique_nbins,
    unique_bulksize,
    unique_nvals,
    unique_distributions,
) = (
    df[column].unique()
    for column in ("env", "gpu", "nbins", "bulksize", "nvals", "distribution")
)

# These two are listed by hand rather than read from df.
unique_type = ["Buffers", "USM"]
unique_edges = [True, False]

(
    unique_env,
    unique_gpus,
    unique_type,
    unique_nbins,
    unique_bulksize,
    unique_edges,
    unique_nvals,
    unique_distributions,
)

(array(['DPC++', 'AdaptiveCpp', 'CUDA_HIST'], dtype=object),
 array(['A4000'], dtype=object),
 ['Buffers', 'USM'],
 array([   1,   10,  100, 1000]),
 array([32768]),
 [True, False],
 array([5.e+07, 1.e+08, 5.e+08, 1.e+09]),
 array(['uniform'], dtype=object))

In [89]:
# nsys "api" summaries of the two runs, one CSV per run.
api_buf = pd.read_csv("das6-gpu/nsys-20231211-155923/api")
api_usm = pd.read_csv("das6-gpu/nsys-20231206-174647/api")

for _run, _mem_type in ((api_buf, "Buffers"), (api_usm, "USM")):
    # Total time in seconds instead of nanoseconds.
    _run["Time"] = _run["Total Time (ns)"] / 1e9
    # "input" is split on "_": field 1 is the distribution name, and the last
    # field (up to its first ".") is the number of values.
    _fields = _run["input"].apply(lambda s: s.split("_"))
    _run["nvals"] = _fields.apply(lambda parts: np.float64(parts[-1].split(".")[0]))
    _run["distribution"] = _fields.apply(lambda parts: parts[1])
    convert_to_type(_run, "edges", np.bool_)
    _run["type"] = _mem_type

# Stack both runs and drop the raw "input" column that has been decomposed above.
api = pd.concat([api_buf, api_usm])
del api["input"]
api

Unnamed: 0,iter,env,gpu,nbins,bulksize,edges,Time (%),Total Time (ns),Num Calls,Avg (ns),Med (ns),Min (ns),Max (ns),StdDev (ns),Name,Time,nvals,distribution,type
0,0,AdaptiveCpp,A4000,1,32768,False,31.1,226624986,10690,21199.7,7865.0,6753,132465001,1281105.6,cudaLaunchKernel,0.226625,5.000000e+07,uniform,Buffers
1,0,AdaptiveCpp,A4000,1,32768,False,23.2,168862502,4,42215625.5,4097.5,2985,168851322,84423797.7,cudaStreamCreateWithFlags,0.168863,5.000000e+07,uniform,Buffers
2,0,AdaptiveCpp,A4000,1,32768,False,22.6,164683254,3064,53747.8,55674.0,5851,119444,5370.4,cudaMemcpyAsync,0.164683,5.000000e+07,uniform,Buffers
3,0,AdaptiveCpp,A4000,1,32768,False,6.6,47877497,6118,7825.7,5570.0,4238,332824,7542.2,cudaFree,0.047877,5.000000e+07,uniform,Buffers
4,0,AdaptiveCpp,A4000,1,32768,False,6.6,47789458,6118,7811.3,5600.0,4638,537979,8192.6,cudaMalloc,0.047789,5.000000e+07,uniform,Buffers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5334,4,AdaptiveCpp,A4000,1000,32768,True,0.0,133470,1,133470.0,133470.0,133470,133470,0.0,cudaGetDeviceProperties_v2_v12000,0.000133,1.000000e+09,uniform,USM
5335,4,AdaptiveCpp,A4000,1000,32768,True,0.0,102061,3,34020.3,9047.0,4508,88506,47240.5,cudaMemsetAsync,0.000102,1.000000e+09,uniform,USM
5336,4,AdaptiveCpp,A4000,1000,32768,True,0.0,29065,4,7266.3,4894.5,4579,14697,4956.1,cudaStreamDestroy,0.000029,1.000000e+09,uniform,USM
5337,4,AdaptiveCpp,A4000,1000,32768,True,0.0,22210,18,1233.9,1262.0,852,2254,350.9,cudaEventDestroy,0.000022,1.000000e+09,uniform,USM


In [90]:
# One group per (configuration, API call name); aggregating averages over the
# remaining columns, including "iter".
gp_api = api.groupby(
    [
        "env",
        "gpu",
        "type",
        "distribution",
        "nvals",
        "nbins",
        "bulksize",
        "edges",
        "Name",
    ]
)
gp_api.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,iter,Time (%),Total Time (ns),Num Calls,Avg (ns),Med (ns),Min (ns),Max (ns),StdDev (ns),Time
env,gpu,type,distribution,nvals,nbins,bulksize,edges,Name,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,1,32768,False,cuModuleGetLoadingMode,2.0,0.00,1360.6,1.0,1360.60,1360.6,1360.6,1360.6,0.00,0.000001
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,1,32768,False,cudaEventCreate,2.0,0.00,202287.6,15.0,13485.84,1683.0,1398.6,159549.4,40510.52,0.000202
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,1,32768,False,cudaEventDestroy,2.0,0.00,25281.6,15.0,1685.44,1611.2,1250.4,2881.4,398.96,0.000025
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,1,32768,False,cudaEventRecord,2.0,6.30,46411093.6,10706.0,4335.04,4234.0,3374.2,155102.8,2086.54,0.046411
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,1,32768,False,cudaEventSynchronize,2.0,1.76,12986268.2,4593.0,2827.40,2845.0,1499.0,36334.2,1149.52,0.012986
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DPC++,A4000,USM,uniform,1.000000e+09,1000,32768,True,cuModuleUnload,2.0,1.62,104065004.4,2.0,52032502.20,52032502.2,68300.0,103996704.4,73488479.52,0.104065
DPC++,A4000,USM,uniform,1.000000e+09,1000,32768,True,cuStreamCreateWithPriority,2.0,0.10,6639014.6,192.0,34578.18,8820.4,3793.0,1282405.2,152953.18,0.006639
DPC++,A4000,USM,uniform,1.000000e+09,1000,32768,True,cuStreamDestroy_v2,2.0,0.00,680202.0,192.0,3542.72,2954.5,1970.0,29070.2,3317.78,0.000680
DPC++,A4000,USM,uniform,1.000000e+09,1000,32768,True,cuStreamSynchronize,2.0,0.00,299818.2,393.0,762.88,687.4,492.6,7764.4,470.12,0.000300


In [93]:
# nsys "kernel" summaries of the two runs, one CSV per run.
kernels_buf = pd.read_csv("das6-gpu/nsys-20231211-155923/kernel")
kernels_usm = pd.read_csv("das6-gpu/nsys-20231206-174647/kernel")

for _run, _mem_type in ((kernels_buf, "Buffers"), (kernels_usm, "USM")):
    # Total time in seconds instead of nanoseconds.
    _run["Time"] = _run["Total Time (ns)"] / 1e9
    # "input" is split on "_": field 1 is the distribution name, and the last
    # field (up to its first ".") is the number of values.
    _fields = _run["input"].apply(lambda s: s.split("_"))
    _run["nvals"] = _fields.apply(lambda parts: np.float64(parts[-1].split(".")[0]))
    _run["distribution"] = _fields.apply(lambda parts: parts[1])
    convert_to_type(_run, "edges", np.bool_)
    _run["type"] = _mem_type

# Stack both runs and drop the raw "input" column that has been decomposed above.
kernels = pd.concat([kernels_buf, kernels_usm])
del kernels["input"]
kernels

Unnamed: 0,iter,env,gpu,nbins,bulksize,edges,Time (%),Total Time (ns),Instances,Avg (ns),Med (ns),Min (ns),Max (ns),StdDev (ns),Name,Time,nvals,distribution,type
0,0,AdaptiveCpp,A4000,1,32768,False,72.6,81750572,1527,53536.7,53536.0,39136,55777,738.1,void __hipsycl_kernel<ROOT::Experimental::Hist...,0.081751,5.000000e+07,uniform,Buffers
1,0,AdaptiveCpp,A4000,1,32768,False,7.1,8036343,1527,5262.8,5248.0,5215,5472,24.7,void __hipsycl_kernel<ROOT::Experimental::RHnS...,0.008036,5.000000e+07,uniform,Buffers
2,0,AdaptiveCpp,A4000,1,32768,False,6.8,7645544,1527,5006.9,4993.0,4960,5344,21.0,void __hipsycl_kernel<ROOT::Experimental::RHnS...,0.007646,5.000000e+07,uniform,Buffers
3,0,AdaptiveCpp,A4000,1,32768,False,4.6,5125609,1527,3356.7,3360.0,3327,3393,11.8,void __hipsycl_kernel<auto void hipsycl::glue:...,0.005126,5.000000e+07,uniform,Buffers
4,0,AdaptiveCpp,A4000,1,32768,False,4.5,5121366,1527,3353.9,3360.0,3327,3392,13.3,void __hipsycl_kernel<auto void hipsycl::glue:...,0.005121,5.000000e+07,uniform,Buffers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3330,4,AdaptiveCpp,A4000,1000,32768,True,16.0,146457859,30534,4796.6,4800.0,4767,5056,17.7,void __hipsycl_kernel<ROOT::Experimental::RHnS...,0.146458,1.000000e+09,uniform,USM
3331,4,AdaptiveCpp,A4000,1000,32768,True,11.2,102243858,30534,3348.5,3360.0,3327,3393,15.5,void __hipsycl_kernel<auto void hipsycl::glue:...,0.102244,1.000000e+09,uniform,USM
3332,4,AdaptiveCpp,A4000,1000,32768,True,11.2,102141981,30534,3345.2,3359.0,3296,3393,16.0,void __hipsycl_kernel<auto void hipsycl::glue:...,0.102142,1.000000e+09,uniform,USM
3333,4,AdaptiveCpp,A4000,1000,32768,True,6.0,54795424,30534,1794.6,1792.0,1600,2176,14.2,void __hipsycl_kernel<ROOT::Experimental::Excl...,0.054795,1.000000e+09,uniform,USM


In [95]:
# One group per (configuration, kernel name); aggregating averages over the
# remaining columns, including "iter".
gp_kernels = kernels.groupby(
    [
        "env",
        "gpu",
        "type",
        "distribution",
        "nvals",
        "nbins",
        "bulksize",
        "edges",
        "Name",
    ]
)
gp_kernels.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,iter,Time (%),Total Time (ns),Instances,Avg (ns),Med (ns),Min (ns),Max (ns),StdDev (ns),Time
env,gpu,type,distribution,nvals,nbins,bulksize,edges,Name,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,1,32768,False,void __hipsycl_kernel<ROOT::Experimental::CombineStatsKernel>(),2.0,1.80,2007062.2,1527.0,1314.38,1312.0,1279.0,1357.4,12.52,0.002007
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,1,32768,False,void __hipsycl_kernel<ROOT::Experimental::ExcludeUOverflowKernel<(unsigned int)1>>(),2.0,2.60,2891006.8,1527.0,1893.26,1888.0,1772.4,2240.0,17.50,0.002891
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,1,32768,False,"void __hipsycl_kernel<ROOT::Experimental::HistogramLocal<double, (unsigned int)1>>()",2.0,72.60,81703586.4,1527.0,53505.96,53504.2,38630.6,55814.8,749.76,0.081704
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,1,32768,False,"void __hipsycl_kernel<ROOT::Experimental::RHnSYCL<double, (unsigned int)1, (unsigned int)256>::GetStats(unsigned long)::[lambda(hipsycl::sycl::handler &) (instance 2)]::operator ()(hipsycl::sycl::handler &) const::[lambda(hipsycl::sycl::id<(int)1>, T1 &, T2 &) (instance 1)]>()",2.0,6.80,7646515.6,1527.0,5007.56,4998.8,4960.0,5395.2,21.60,0.007647
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,1,32768,False,"void __hipsycl_kernel<ROOT::Experimental::RHnSYCL<double, (unsigned int)1, (unsigned int)256>::GetStats(unsigned long)::[lambda(hipsycl::sycl::handler &) (instance 3)]::operator ()(hipsycl::sycl::handler &) const::[lambda(hipsycl::sycl::id<(int)1>, T1 &, T2 &) (instance 1)]>()",2.0,7.10,8028150.0,1527.0,5257.44,5248.0,5208.8,5472.2,24.02,0.008028
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DPC++,A4000,USM,uniform,1.000000e+09,1000,32768,True,"Typeinfo name for auto void sycl::_V1::detail::reduCGFuncMulti<sycl::_V1::detail::auto_name, void sycl::_V1::detail::reduction_parallel_for<sycl::_V1::detail::auto_name, (sycl::_V1::detail::reduction::strategy)0, (int)1, sycl::_V1::ext::oneapi::experimental::properties<std::tuple<>>, sycl::_V1::detail::reduction_impl<double, std::plus<double>, (int)0, (unsigned long)1, (bool)0, double *>, sycl::_V1::detail::reduction_impl<double, std::plus<double>, (int)0, (unsigned long)1, (bool)0, double *>, ROOT::Experimental::RHnSYCL<double, (unsigned int)1, (unsigned int)256>::GetStats(unsigned long, sycl::_V1::event &)::[lambda(sycl::_V1::handler &) (instance 2)]::operator ()(sycl::_V1::handler &) const::[lambda(sycl::_V1::id<(int)1>, T1 &, T2 &) (instance 1)]>(sycl::_V1::handler &, sycl::_V1::range<T3>, T4, T5...)::[lambda(T1, T2 &...) (instance 1)], (int)1, sycl::_V1::ext::oneapi::experimental::properties<std::tuple<>>, sycl::_V1::detail::reduction_impl<double, std::plus<double>, (int)0, (unsigned long)1, (bool)0, double *>, sycl::_V1::detail::reduction_impl<double, std::plus<double>, (int)0, (unsigned long)1, (bool)0, double *>, (unsigned long)0, (unsigned long)1>(sycl::_V1::handler &, T2, const sycl::_V1::nd_range<T3> &, T4, std::tuple<T5...>&, std::integer_sequence<unsigned long, T6...>)::[lambda(T1, T2) (instance 1)]::operator ()<sycl::_V1::detail::KernelMultipleWGTag, sycl::_V1::detail::tuple<sycl::_V1::accessor<sycl::_V1::detail::ReducerElement<double, std::plus<double>, (bool)0>, (int)1, (sycl::_V1::access::mode)1026, (sycl::_V1::access::target)2014, (sycl::_V1::access::placeholder)0, sycl::_V1::ext::oneapi::accessor_property_list<>>, sycl::_V1::accessor<sycl::_V1::detail::ReducerElement<double, std::plus<double>, (bool)0>, (int)1, (sycl::_V1::access::mode)1026, (sycl::_V1::access::target)2014, (sycl::_V1::access::placeholder)0, sycl::_V1::ext::oneapi::accessor_property_list<>>>>(T1, T2) 
const::[lambda(sycl::_V1::nd_item<(int)1>) (instance 1)]",2.0,29.88,437317642.6,30533.0,14322.80,14336.0,12806.4,14822.4,84.42,0.437318
DPC++,A4000,USM,uniform,1.000000e+09,1000,32768,True,"Typeinfo name for auto void sycl::_V1::detail::reduCGFuncMulti<sycl::_V1::detail::auto_name, void sycl::_V1::detail::reduction_parallel_for<sycl::_V1::detail::auto_name, (sycl::_V1::detail::reduction::strategy)0, (int)1, sycl::_V1::ext::oneapi::experimental::properties<std::tuple<>>, sycl::_V1::detail::reduction_impl<double, std::plus<double>, (int)0, (unsigned long)1, (bool)0, double *>, sycl::_V1::detail::reduction_impl<double, std::plus<double>, (int)0, (unsigned long)1, (bool)0, double *>, ROOT::Experimental::RHnSYCL<double, (unsigned int)1, (unsigned int)256>::GetStats(unsigned long, sycl::_V1::event &)::[lambda(sycl::_V1::handler &) (instance 2)]::operator ()(sycl::_V1::handler &) const::[lambda(sycl::_V1::id<(int)1>, T1 &, T2 &) (instance 1)]>(sycl::_V1::handler &, sycl::_V1::range<T3>, T4, T5...)::[lambda(T1, T2 &...) (instance 1)], (int)1, sycl::_V1::ext::oneapi::experimental::properties<std::tuple<>>, sycl::_V1::detail::reduction_impl<double, std::plus<double>, (int)0, (unsigned long)1, (bool)0, double *>, sycl::_V1::detail::reduction_impl<double, std::plus<double>, (int)0, (unsigned long)1, (bool)0, double *>, (unsigned long)0, (unsigned long)1>(sycl::_V1::handler &, T2, const sycl::_V1::nd_range<T3> &, T4, std::tuple<T5...>&, std::integer_sequence<unsigned long, T6...>)::[lambda(T1, T2) (instance 1)]::operator ()<sycl::_V1::detail::KernelOneWGTag, sycl::_V1::detail::tuple<double *, double *>>(T1, T2) const::[lambda(sycl::_V1::nd_item<(int)1>) (instance 1)]",2.0,0.00,9952.0,1.0,9952.00,9952.0,9952.0,9952.0,0.00,0.000010
DPC++,A4000,USM,uniform,1.000000e+09,1000,32768,True,"Typeinfo name for auto void sycl::_V1::detail::reduCGFuncMulti<sycl::_V1::detail::auto_name, void sycl::_V1::detail::reduction_parallel_for<sycl::_V1::detail::auto_name, (sycl::_V1::detail::reduction::strategy)0, (int)1, sycl::_V1::ext::oneapi::experimental::properties<std::tuple<>>, sycl::_V1::detail::reduction_impl<double, std::plus<double>, (int)0, (unsigned long)1, (bool)0, double *>, sycl::_V1::detail::reduction_impl<double, std::plus<double>, (int)0, (unsigned long)1, (bool)0, double *>, ROOT::Experimental::RHnSYCL<double, (unsigned int)1, (unsigned int)256>::GetStats(unsigned long, sycl::_V1::event &)::[lambda(sycl::_V1::handler &) (instance 3)]::operator ()(sycl::_V1::handler &) const::[lambda(sycl::_V1::id<(int)1>, T1 &, T2 &) (instance 1)]>(sycl::_V1::handler &, sycl::_V1::range<T3>, T4, T5...)::[lambda(T1, T2 &...) (instance 1)], (int)1, sycl::_V1::ext::oneapi::experimental::properties<std::tuple<>>, sycl::_V1::detail::reduction_impl<double, std::plus<double>, (int)0, (unsigned long)1, (bool)0, double *>, sycl::_V1::detail::reduction_impl<double, std::plus<double>, (int)0, (unsigned long)1, (bool)0, double *>, (unsigned long)0, (unsigned long)1>(sycl::_V1::handler &, T2, const sycl::_V1::nd_range<T3> &, T4, std::tuple<T5...>&, std::integer_sequence<unsigned long, T6...>)::[lambda(T1, T2) (instance 1)]::operator ()<sycl::_V1::detail::KernelMultipleWGTag, sycl::_V1::detail::tuple<sycl::_V1::accessor<sycl::_V1::detail::ReducerElement<double, std::plus<double>, (bool)0>, (int)1, (sycl::_V1::access::mode)1026, (sycl::_V1::access::target)2014, (sycl::_V1::access::placeholder)0, sycl::_V1::ext::oneapi::accessor_property_list<>>, sycl::_V1::accessor<sycl::_V1::detail::ReducerElement<double, std::plus<double>, (bool)0>, (int)1, (sycl::_V1::access::mode)1026, (sycl::_V1::access::target)2014, (sycl::_V1::access::placeholder)0, sycl::_V1::ext::oneapi::accessor_property_list<>>>>(T1, T2) 
const::[lambda(sycl::_V1::nd_item<(int)1>) (instance 1)]",2.0,36.28,531518266.4,30533.0,17407.98,17408.0,15424.0,17536.6,109.38,0.531518
DPC++,A4000,USM,uniform,1.000000e+09,1000,32768,True,"Typeinfo name for auto void sycl::_V1::detail::reduCGFuncMulti<sycl::_V1::detail::auto_name, void sycl::_V1::detail::reduction_parallel_for<sycl::_V1::detail::auto_name, (sycl::_V1::detail::reduction::strategy)0, (int)1, sycl::_V1::ext::oneapi::experimental::properties<std::tuple<>>, sycl::_V1::detail::reduction_impl<double, std::plus<double>, (int)0, (unsigned long)1, (bool)0, double *>, sycl::_V1::detail::reduction_impl<double, std::plus<double>, (int)0, (unsigned long)1, (bool)0, double *>, ROOT::Experimental::RHnSYCL<double, (unsigned int)1, (unsigned int)256>::GetStats(unsigned long, sycl::_V1::event &)::[lambda(sycl::_V1::handler &) (instance 3)]::operator ()(sycl::_V1::handler &) const::[lambda(sycl::_V1::id<(int)1>, T1 &, T2 &) (instance 1)]>(sycl::_V1::handler &, sycl::_V1::range<T3>, T4, T5...)::[lambda(T1, T2 &...) (instance 1)], (int)1, sycl::_V1::ext::oneapi::experimental::properties<std::tuple<>>, sycl::_V1::detail::reduction_impl<double, std::plus<double>, (int)0, (unsigned long)1, (bool)0, double *>, sycl::_V1::detail::reduction_impl<double, std::plus<double>, (int)0, (unsigned long)1, (bool)0, double *>, (unsigned long)0, (unsigned long)1>(sycl::_V1::handler &, T2, const sycl::_V1::nd_range<T3> &, T4, std::tuple<T5...>&, std::integer_sequence<unsigned long, T6...>)::[lambda(T1, T2) (instance 1)]::operator ()<sycl::_V1::detail::KernelOneWGTag, sycl::_V1::detail::tuple<double *, double *>>(T1, T2) const::[lambda(sycl::_V1::nd_item<(int)1>) (instance 1)]",2.0,0.00,11788.8,1.0,11788.80,11788.8,11788.8,11788.8,0.00,0.000012


In [97]:
# nsys "memop" summaries of the two runs, one CSV per run.
# For both frames: "Time" is the total time converted from ns to seconds;
# "nvals" is the last "_"-separated field of "input" (up to its first ".");
# "distribution" is field 1 of "input".
memops_buf = pd.read_csv("das6-gpu/nsys-20231211-155923/memop")
memops_buf["Time"] = memops_buf["Total Time (ns)"].div(1e9)
memops_buf["nvals"] =  memops_buf["input"].apply(lambda s: np.float64(s.split("_")[-1].split(".")[0]))
memops_buf["distribution"] =  memops_buf["input"].apply(lambda s: s.split("_")[1])
# Fixed: this call used to target kernels_buf (copy-paste slip), which left
# memops_buf["edges"] unconverted and made this cell depend on the kernel cell.
convert_to_type(memops_buf, "edges", np.bool_)
memops_buf["type"] = "Buffers"

memops_usm = pd.read_csv("das6-gpu/nsys-20231206-174647/memop")
memops_usm["Time"] = memops_usm["Total Time (ns)"].div(1e9)
memops_usm["nvals"] =  memops_usm["input"].apply(lambda s: np.float64(s.split("_")[-1].split(".")[0]))
memops_usm["distribution"] =  memops_usm["input"].apply(lambda s: s.split("_")[1])
convert_to_type(memops_usm, "edges", np.bool_)
memops_usm["type"] = "USM"

# Stack both runs and drop the raw "input" column that has been decomposed above.
df_memops = pd.concat([memops_buf, memops_usm])
del df_memops["input"]
df_memops

Unnamed: 0,iter,env,gpu,nbins,bulksize,edges,Time (%),Total Time (ns),Count,Avg (ns),Med (ns),Min (ns),Max (ns),StdDev (ns),Operation,Time,nvals,distribution,type
0,0,AdaptiveCpp,A4000,1,32768,False,100.0,47118513,3062,15388.1,15392.0,415,16321,776.4,[CUDA memcpy Host-to-Device],4.711851e-02,5.000000e+07,uniform,Buffers
1,0,AdaptiveCpp,A4000,1,32768,False,0.0,3104,2,1552.0,1552.0,1536,1568,22.6,[CUDA memcpy Device-to-Host],3.104000e-06,5.000000e+07,uniform,Buffers
2,0,AdaptiveCpp,A4000,1,32768,False,0.0,864,2,432.0,432.0,416,448,22.6,[CUDA memset],8.640000e-07,5.000000e+07,uniform,Buffers
3,0,AdaptiveCpp,A4000,1,32768,True,100.0,46987364,3063,15340.3,15360.0,415,16033,814.6,[CUDA memcpy Host-to-Device],4.698736e-02,5.000000e+07,uniform,Buffers
4,0,AdaptiveCpp,A4000,1,32768,True,0.0,2848,2,1424.0,1424.0,1344,1504,113.1,[CUDA memcpy Device-to-Host],2.848000e-06,5.000000e+07,uniform,Buffers
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1297,4,AdaptiveCpp,A4000,1000,32768,False,0.0,2656,2,1328.0,1328.0,1184,1472,203.6,[CUDA memcpy Device-to-Host],2.656000e-06,1.000000e+09,uniform,USM
1298,4,AdaptiveCpp,A4000,1000,32768,False,0.0,2592,3,864.0,448.0,416,1728,748.4,[CUDA memset],2.592000e-06,1.000000e+09,uniform,USM
1299,4,AdaptiveCpp,A4000,1000,32768,True,100.0,938885457,61073,15373.2,15328.0,415,16288,219.3,[CUDA memcpy Host-to-Device],9.388855e-01,1.000000e+09,uniform,USM
1300,4,AdaptiveCpp,A4000,1000,32768,True,0.0,2624,2,1312.0,1312.0,1216,1408,135.8,[CUDA memcpy Device-to-Host],2.624000e-06,1.000000e+09,uniform,USM


In [98]:
# One group per (configuration, memory operation); aggregating averages over
# the remaining columns, including "iter".
gp_memops = df_memops.groupby(
    [
        "env",
        "gpu",
        "type",
        "distribution",
        "nvals",
        "nbins",
        "bulksize",
        "edges",
        "Operation",
    ]
)
gp_memops.mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,iter,Time (%),Total Time (ns),Count,Avg (ns),Med (ns),Min (ns),Max (ns),StdDev (ns),Time
env,gpu,type,distribution,nvals,nbins,bulksize,edges,Operation,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,1,32768,False,[CUDA memcpy Device-to-Host],2.0,0.0,2950.2,2.0,1475.10,1475.1,1414.4,1535.8,85.84,2.950200e-06
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,1,32768,False,[CUDA memcpy Host-to-Device],2.0,100.0,47148652.4,3062.0,15397.98,15398.4,415.6,16499.6,777.52,4.714865e-02
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,1,32768,False,[CUDA memset],2.0,0.0,864.0,2.0,432.00,432.0,422.2,441.8,13.84,8.640000e-07
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,1,32768,True,[CUDA memcpy Device-to-Host],2.0,0.0,2905.8,2.0,1452.90,1452.9,1376.0,1529.8,108.76,2.905800e-06
AdaptiveCpp,A4000,Buffers,uniform,5.000000e+07,1,32768,True,[CUDA memcpy Host-to-Device],2.0,100.0,47142131.6,3063.0,15390.84,15398.4,415.4,39744.0,1115.14,4.714213e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
DPC++,A4000,USM,uniform,1.000000e+09,1000,32768,False,[CUDA memcpy Host-to-Device],2.0,100.0,946041115.4,61072.0,15490.60,15475.0,416.0,40256.0,314.50,9.460411e-01
DPC++,A4000,USM,uniform,1.000000e+09,1000,32768,False,[CUDA memset],2.0,0.0,2138.0,2.0,1069.00,1069.0,416.2,1721.8,923.18,2.138000e-06
DPC++,A4000,USM,uniform,1.000000e+09,1000,32768,True,[CUDA memcpy Device-to-Host],2.0,0.0,2783.8,2.0,1391.90,1391.9,1235.0,1548.8,221.88,2.783800e-06
DPC++,A4000,USM,uniform,1.000000e+09,1000,32768,True,[CUDA memcpy Host-to-Device],2.0,100.0,945228744.4,61073.0,15477.02,15436.6,416.0,69209.6,502.48,9.452287e-01


## A4000

In [24]:
# Slice of the benchmark grid shown in the plots of this section.
# Swept dimensions (every available value is plotted):
selected_env = unique_env
selected_type = unique_type
selected_nvals = unique_nvals

# Fixed dimensions (a single value each):
selected_gpu = "A4000"
selected_distr = "uniform"
selected_bins = 1000
selected_bulksize = 32768
selected_edges = True

In [75]:
%jsroot on
title = f"Total Runtime of Histo1D with Different Input Sizes on NVIDIA A4000"

w = 800
h = 450
c = ROOT.TCanvas("c1", "", w, h)
c.SetRightMargin(0.39)
c.SetBottomMargin(0.3)

mg = ROOT.TMultiGraph()

l = ROOT.TLegend(0.62, 0.533, 1, 0.9)
l.SetTextSize(0.05)
markerstyles = [21, 25, 22, 26, 20]

ROOT.gStyle.SetPalette(ROOT.kRainbow)  
for ei, env in enumerate(selected_env):
    for ti, type in enumerate(selected_type):
        if type == "USM" and "CUDA" in env:
            continue 
        avg = np.array(gp.mean().loc[env, selected_gpu, type, selected_distr, :, 
                       selected_bins, selected_bulksize, selected_edges]["ttotal"], dtype=np.float64)
        std = np.array(gp.std().loc[env, selected_gpu, type, selected_distr, :,
                       selected_bins, selected_bulksize, selected_edges]["ttotal"], dtype=np.float64)
        gr = ROOT.TGraphErrors(len(selected_nvals), selected_nvals.astype(np.float64), avg, 
                               np.repeat(0., len(selected_nvals)), std)
    
        color = ROOT.TColor.GetPalette()[50+ 60 * ei]
        gr.SetMarkerColor(color)
        gr.SetLineColor(color)
        gr.SetLineWidth(2)
        gr.SetMarkerSize(2)
        gr.SetMarkerStyle(markerstyles[2*ei + ti])
        
        gr.GetHistogram().SetMinimum(0)
        gr.GetHistogram().SetLineWidth(10)
    
        mg.Add(gr, "ALP ")
        l.AddEntry(gr, env.split("_")[0] + " " + type)

# mg.SetTitle(title)

xaxis = mg.GetXaxis()
# xaxis.SetTitle("#splitline{Bulk Size}")
xaxis.SetTitle("Number of values")
# xaxis.SetRangeUser(0, 10000000)
# xaxis.SetTitleOffset(1.5)
xaxis.SetTitleSize(0.05)
xaxis.SetLabelSize(0.05)
# xaxis.LabelsOption("hM")
# xaxis.SetTickSize(0)

yaxis = mg.GetYaxis()
yaxis.SetTitle("Time (s)")
yaxis.SetTitleOffset(1)
yaxis.SetTitleSize(0.05)
yaxis.SetLabelSize(0.05)

ROOT.gStyle.SetTitleFontSize(0.1)

mg.Draw("a")
l.Draw()
c.DrawClone()
# c.SaveAs(f"nsys_histogram_{env}.png")

<cppyy.gbl.TCanvas object at 0x55c69977b700>

In [76]:
def get_cell(df, row, row_col, col):
    """Return the ``col`` value of the first row whose ``row_col`` equals ``row``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to search.
    row : scalar
        Value to look for in column ``row_col``.
    row_col : str
        Name of the column used to select the row.
    col : str
        Name of the column whose value is returned.

    Raises
    ------
    IndexError
        If no row matches (``.iloc[0]`` on an empty selection).
    """
    matches = df[row_col] == row
    return df.loc[matches, col].iloc[0]
    
# NOTE(review): this cell does not run in the notebook as shown; it stopped
# with "NameError: name 'environs' is not defined". `environs`, `args`,
# `memops`, `kernel_calls` and `fill_bar` are not defined in any visible cell —
# presumably they belong to an earlier version of this analysis. TODO: define
# them (or port the cell to gp_memops / gp_kernels) before relying on it.
#
# What the code builds: a THStack of single-bin TH1F bars (memory operations
# and kernels, per environment and per `arg`) with a hand-labelled x axis.

# Bars are placed at bin = 1 + ei and advance by len(environs) + 1 per arg,
# i.e. groups of len(environs) bars separated by one empty bin.
nbins = len(environs) * len(args) + len(args) + 1
title = f"Nsys profiling results for time spent on GPU when filling a 1D histogram with RDataFrame"
# Fill style passed to fill_bar, indexed by environment; None for all entries.
gpuFillStyle = [None, None, None, None]
ROOT.gStyle.SetErrorX(0.);    

w = 1600
h = 900
# NOTE(review): reuses the canvas name "c1" of the runtime plot cell above.
c = ROOT.TCanvas("c1", title, w, h)
c.SetRightMargin(0.36)
c.SetBottomMargin(0.4)

hs = ROOT.THStack("hs", "")

l = ROOT.TLegend(0.65, 0.5, 1, 0.9)
l.SetTextSize(0.05)

# Kernel names that already have a legend entry (avoids duplicates).
unique_kernels = []

for ei, env in enumerate(environs):
    # --- memory operations: colours taken from the kBlackBody palette ---
    ROOT.gStyle.SetPalette(ROOT.kBlackBody)
    # Operation names are taken from the first arg of this environment.
    memop_names = memops[(env, *args[0])].mean(numeric_only=True)["Operation"]
    for mi, memop in enumerate(memop_names):
        # NOTE: `bin` shadows the Python builtin of the same name.
        bin = 1 + ei
        for arg in args:
            memop_avg = memops[(env, *arg)].mean()
            memop_std = memops[(env, *arg)].std()
    
            h3 = ROOT.TH1F(f"{arg}_{memop}", "gpu", nbins, 0, nbins)
            fill_bar(
                h3,
                bin,
                get_cell(memop_avg, memop, "Operation", "Time"),
                get_cell(memop_std, memop, "Operation", "Time"),
                ROOT.TColor.GetPalette()[50 * mi],
                gpuFillStyle[ei],
            )
            h3.SetLineWidth(3)
            hs.Add(h3)
            # Jump to this environment's slot in the next arg group.
            bin += len(environs) + 1
        # Memory operations get a legend entry for the first environment only.
        if ei == 0:
            l.AddEntry(h3, memop)
            
    # --- kernels: colours taken from the kRainbow palette ---
    ROOT.gStyle.SetPalette(ROOT.kRainbow)
    kernel_names = kernel_calls[(env, *args[0])].mean()["Name"]
    # Only the first three kernel names are plotted.
    for ni, name in enumerate(kernel_names[:3]):
    # for ni, name in enumerate(["GetStats"]):
        bin = 1 + ei 
        for arg in args:
            kernels_avg = kernel_calls[(env, *arg)].mean()
            kernels_std = kernel_calls[(env, *arg)].std()
    
            h1 = ROOT.TH1F(f"{arg}_{name}", "gpu", nbins, 0, nbins)
            fill_bar(
                h1,
                bin,
                get_cell(kernels_avg, name, "Name", "Time"),
                get_cell(kernels_std, name, "Name", "Time"),
                ROOT.TColor.GetPalette()[60 * ni],
                gpuFillStyle[ei],
            )
            h1.SetLineWidth(3)
            hs.Add(h1)
            bin += len(environs) + 1
        # Each kernel name is added to the legend once, across all environments.
        if name not in unique_kernels:
            l.AddEntry(h1, name)
            unique_kernels.append(name)

hs.Draw("bar")
hs.SetTitle(title)
# hs.SetMaximum(120)

xaxis = hs.GetXaxis()
xaxis.SetTitle("#splitline{      Implementation}{Number of doubles}")
# xaxis.SetTitle("Bulk size")
xaxis.SetTitleOffset(4)
xaxis.SetLabelSize(0.05)
# Label the bars of every group with an implementation name.
# NOTE(review): the labels are hard-coded; presumably they are meant to follow
# the order of `environs` — confirm.
for i, e in enumerate(range(2, nbins, len(environs) + 1)):
    # for ei, env in enumerate(environs):
    for ei, env in enumerate(["AdaptiveCPP", "CUDA", "Intel DPC++"]):
        xaxis.SetBinLabel(e + ei, f"{env.split('_')[0]}")
        # xaxis.SetBinLabel(e + ei, f"")
xaxis.LabelsOption("vM")
xaxis.SetTickSize(0)
xaxis.SetTitleSize(0.05)
xaxis.SetLabelSize(0.05)

# Extra axis along y = 0 spanning all bins, with ticks disabled; its labels
# are rewritten below to show one input-size label per group.
ox = ROOT.TGaxis(
    0,
    0,  # xmin, ymin
    nbins,
    0,  # xmax, ymax
    0,
    nbins,  # wmin, wmax
    nbins,
        "S",  # ndiv, chopt
)
ox.SetTickSize(0)
ox.SetLabelOffset(0.2)
ox.SetLabelFont(42)
ox.SetLabelSize(0.05)

labelbins = range(2, nbins + 1, len(environs) + 1)
# NOTE(review): these labels do not match the nvals loaded in this notebook
# (5e7, 1e8, 5e8, 1e9), and vallabels[i] below raises IndexError if there are
# more than five label positions — confirm.
vallabels = ["100M", "500M", "  1B", "  5B", "  10B"]
# Labels outside labelbins get size 0.0 (assuming TGaxis::ChangeLabel takes
# labNum, labAngle, labSize, ... — TODO confirm); the rest get custom text.
for i in range(nbins + 2):
    if i not in labelbins:
        ox.ChangeLabel(i, -1, 0.0)
for i, e in enumerate(labelbins):
    ox.ChangeLabel(e, 0, -1, 12, -1, -1, vallabels[i])

ox.Draw()

yaxis = hs.GetYaxis()
yaxis.SetTitle("Time (s)")
yaxis.SetTitleSize(0.05)
yaxis.SetLabelSize(0.05)

ROOT.gStyle.SetTitleFontSize(0.1)

l.Draw()
c.Draw()
# c.SaveAs(f"nsys_histogram_{env}.png")

NameError: name 'environs' is not defined

## A6000

## A2