# CUDA Kernel Overlapping Analysis
Investigate how much GPU time is spent on compute, how much on communication, and how much is wasted as empty (idle) time. All kernels are put into one of two categories: compute or NCCL (communication).

It shows several heatmaps, which help you quickly figure out which rank has a problem:

- Computation ratio
- NCCL ratio
- Overlap ratio

- Computation in seconds (including overlapped time)
- NCCL in seconds (including overlapped time)
- Overlap in seconds

More to come: list by streams, list by devices, list by ranks


All times in the loaded data are in nanoseconds; the plots below convert them to seconds.

## Load Data

In [None]:
# General setup: notebook display helpers, data libraries and plotting.
import IPython.display
from IPython.display import display, HTML, Markdown
# Inject CSS so that elements with the "container" class use 95% of the width.
display(HTML("<style>.container { width:95% !important; }</style>"))

import pickle
import importlib
import os
import glob
import math
import re
import time
from collections import deque
import pandas as pd
import numpy as np
import sqlite3
# Uncomment the next line to install plotly from within the notebook:
#!pip3 install plotly
# Setting up plots
import plotly.express as px
import plotly.offline as pyo
from plotly.subplots import make_subplots

# Initialize plotly's offline notebook mode before any figure is shown.
pyo.init_notebook_mode()

# Use plotly as the pandas plotting backend and widen the pandas text output.
pd.options.plotting.backend = "plotly"
pd.set_option('display.max_rows', 100)
pd.set_option('display.width', 250)

import nsys_pres


# Notice
If you encounter a file-not-found error in the following steps, please restart the kernel or re-run the recipe.

The cell below loads the data and defines some helper functions for displaying it.

In [None]:
# Load the table of input files; its index is labelled "Rank".
files_df = pd.read_parquet("files.parquet")
files_df = files_df.rename_axis("Rank", axis='index')

# For every entry of the "File" column, load the matching
# "type_<name>.parquet" table and keep it paired with its name.
type_dfs = []
for name in files_df['File']:
    type_dfs.append((name, pd.read_parquet(f"type_{name}.parquet")))


# Init some helper functions
def getOutliers(df, col_name):
    """Classify the rows of one column by their z-score.

    The z-score of each value is its distance from the column mean,
    measured in units of the column's standard deviation.

    Args:
        df: DataFrame holding the data.
        col_name: Name of the column to inspect.

    Returns:
        A dict with two lists of index labels:
        'Outlier' for rows with |z| >= 3, and
        'Likely' for rows with 2 <= |z| < 3.
        Rows with |z| < 2 are not reported.
    """
    column = df[col_name]
    zscore = (column - column.mean()) / column.std()
    magnitude = abs(zscore)

    # |z| >= 3: treated as an outlier.
    outlier_mask = magnitude >= 3
    # 2 <= |z| < 3: likely an outlier.
    likely_mask = (magnitude >= 2) & (magnitude < 3)

    return {
        'Outlier': zscore[outlier_mask].index.tolist(),
        'Likely': zscore[likely_mask].index.tolist(),
    }

def showOutlierDesc(df, title):
    """Render a Markdown table of the outliers found in every column of df.

    The table has one column per DataFrame column and one row per key
    returned by getOutliers; each cell lists the index labels that
    getOutliers reported for that column and key.

    Args:
        df: DataFrame whose columns are checked one by one.
        title: Text for the Markdown heading shown above the table.
    """
    # Run the detection once per column and keep the column order.
    per_column = [(col, getOutliers(df, col)) for col in df.columns]

    # Row labels in first-seen order. dict.fromkeys de-duplicates while
    # preserving order, so the rows appear in the same order on every run
    # (iterating a set of strings would not guarantee that).
    row_keys = list(dict.fromkeys(
        key for _col, found in per_column for key in found
    ))

    header = 'Outliers | ' + ''.join(f'{col} | ' for col, _found in per_column)
    divider = '----- | ' * (len(per_column) + 1)
    rows = [
        f'{key} | ' + ''.join(f'{found[key]} |' for _col, found in per_column)
        for key in row_keys
    ]

    display(Markdown('\n'.join([f'# {title}', header, divider, *rows])))
            

## Statistics for Target Operation

First we calculate the absolute time spent in each category (Compute, Communicate, Empty, Overlap) for each rank.

In [None]:
# Build one single-row frame per rank: the summed duration of every
# OverlapType, with the rank's name attached in a "Name" column.
summary = []
for name, type_df in type_dfs:
    per_type_total = type_df.groupby(['OverlapType']).agg({"duration": "sum"})
    summary.append(per_type_total.T.assign(Name=name))


def _rank_number(names):
    """Return the integer that follows "rank" in each name, used as sort key."""
    return names.str.split("rank", expand=True)[1].astype(int)


# Stack all ranks, order them by descending rank number, index by name.
summary_df = (
    pd.concat(summary)
    .sort_values('Name', key=_rank_number, ascending=False)
    .set_index('Name')
)

Then we display one heatmap per category, in seconds.

In [None]:
# One heatmap per category, side by side, sharing the y axis.
heat_columns = summary_df.columns
fig = make_subplots(
    rows=1,
    cols=len(heat_columns),
    subplot_titles=heat_columns,
    shared_yaxes=True,
    horizontal_spacing=0.1,
)

for position, colName in enumerate(heat_columns, start=1):
    # Durations are divided by 1e9 before plotting.
    seconds = summary_df[[colName]] / 1e9
    heatmap = px.imshow(seconds, aspect="auto").data[0]
    fig.add_trace(heatmap, row=1, col=position)

fig.update_layout(height=800, title_text="Running seconds for each part.")
fig.show()

In [None]:
# Render the outlier table for the per-category durations in summary_df.
showOutlierDesc(summary_df, "Outlier detections by running seconds")

## Show percent of each step

Share of each rank's total time that is spent in each category, expressed as a fraction between 0 and 1.

In [None]:
# Divide every category by the row total so that each row holds the
# fraction of that rank's summed time spent in each category.
row_totals = summary_df.sum(axis=1)
percent_df = summary_df.div(row_totals, axis='index')

In [None]:
# One heatmap per category, side by side, sharing the y axis.
ratio_columns = percent_df.columns
fig = make_subplots(
    rows=1,
    cols=len(ratio_columns),
    subplot_titles=ratio_columns,
    shared_yaxes=True,
    horizontal_spacing=0.1,
)

for position, colName in enumerate(ratio_columns, start=1):
    heatmap = px.imshow(percent_df[[colName]], aspect="auto").data[0]
    fig.add_trace(heatmap, row=1, col=position)

fig.update_layout(height=800, title_text="Running percent for each part.")
fig.show()

In [None]:
# Render the outlier table for the per-category fractions in percent_df.
showOutlierDesc(percent_df, "Outlier detections by percentage")

You can dig deeper into each time segment. For example, the timeline can be split into several smaller intervals, which reveals the behavior pattern within each interval.

In [None]:
# Global time window covered by all ranks: earliest 'start' and latest 'end'.
if not type_dfs:
    raise ValueError("No per-rank data loaded; cannot build the timeline.")
minTime = min(int(rank_df['start'].min()) for _name, rank_df in type_dfs)
maxTime = max(int(rank_df['end'].max()) for _name, rank_df in type_dfs)
print(minTime/1e9, maxTime/1e9)

timeSplits = 30
span = maxTime - minTime
if span <= 0:
    raise ValueError("Timeline is empty: the latest 'end' is not after the earliest 'start'.")

# Never use more bins than there are distinct integer timestamps, otherwise
# two bin edges would coincide and pd.cut would reject them.
numBins = min(timeSplits, span)
# Evenly spaced integer edges computed with exact integer arithmetic; the
# first edge is minTime and the last edge is exactly maxTime.
timeStamps = [minTime + (span * i) // numBins for i in range(numBins + 1)]
row_titles = [name for name, rank_df in type_dfs]

colorMap = {
    "Compute": "#76b900", "Communicate": "purple", "Overlap": "gray", "Empty": "black"}

fig = make_subplots(rows=len(type_dfs), cols=1,
                    x_title='Timeline', y_title='Each Rank',
                    shared_xaxes=True, vertical_spacing=0, row_titles=row_titles)

for idx, (name, rank_df) in enumerate(type_dfs):
    # Assign every row to the interval containing its 'start'. Each interval
    # is labelled with its left edge divided by 1e9. include_lowest=True keeps
    # rows that start exactly at minTime, which the default half-open
    # (left, right] intervals would drop.
    rank_df['time_int'] = pd.cut(rank_df['start'], bins=timeStamps,
                                 labels=[x/1e9 for x in timeStamps[:-1]],
                                 include_lowest=True)
    result = rank_df.groupby(['time_int', 'OverlapType'], observed=False).agg({"duration": "sum"}).reset_index()
    result['duration'] /= 1e9 # convert ns to s
    # Build the stacked bars once and copy every trace it produced, however
    # many overlap types this rank actually contains.
    bars = px.bar(result, x="time_int", y="duration", color="OverlapType",
                  color_discrete_map=colorMap)
    for trace in bars.data:
        fig.add_trace(trace, row=idx+1, col=1)

fig.update_layout(height=1600, title_text="Running seconds for each part over time.",
                  showlegend=False, barmode='stack')
fig.update_xaxes(nticks=timeSplits-1) # change the ticks
fig.update_yaxes(showticklabels=False) # Hide y axis ticks


def _place_row_title(annotation):
    """Move a row-title annotation to the left of the plot, unrotated."""
    if annotation.text in row_titles:
        annotation.update(x=-0.07, textangle=0)


fig.for_each_annotation(_place_row_title)
fig.show()