In [1]:
import json
from os import listdir
from os.path import isfile, join
import altair as alt

In [2]:
import pandas as pd
import numpy as np

In [3]:
import warnings
warnings.filterwarnings('ignore')

#### Load in Data
* Open all json files that contain traces and append them to a list with all of the traces.
    - This makes the data ready to use with pandas dataframes and easy to create visualizations with Altair


In [4]:
baseDirectory = 'data/synthetic/20210302-hipster-shop'
directories = listdir(baseDirectory)

In [5]:
traces = []
for directory in directories:
    thisDirectory = baseDirectory + '/' + directory
    try:
        with open(thisDirectory) as f:
          data = json.load(f)
    except:
        continue
    traces.append(data)

In [6]:
traces

[[{'traceId': '550997223f8c4b30',
   'parentId': '550997223f8c4b30',
   'id': '1234d6244fad26ab',
   'name': '/getcart',
   'timestamp': 1614738754173000,
   'duration': 190000,
   'localEndpoint': {'serviceName': 'cartservice', 'ipv4': '192.168.1.180'},
   'tags': {'http.method': 'GET',
    'http.url': 'http://cartservice/GetCart',
    'load_generator.seq_num': '14978',
    'region': 'us-east-1',
    'version': 'v5'}},
  {'traceId': '550997223f8c4b30',
   'parentId': '550997223f8c4b30',
   'id': 'fd6c6a8dad1dac47',
   'name': '/getrecommendations',
   'timestamp': 1614738754163000,
   'duration': 226000,
   'localEndpoint': {'serviceName': 'recommendationservice',
    'ipv4': '192.168.1.180'},
   'tags': {'http.method': 'GET',
    'http.url': 'http://recommendationservice/GetRecommendations',
    'load_generator.seq_num': '14979',
    'region': 'us-east-1',
    'version': 'v234'}},
  {'traceId': '550997223f8c4b30',
   'parentId': 'fd6c6a8dad1dac47',
   'id': '3deb074f82f2e50c',
   'na

In [7]:
names = []
durations = []
traceIDs = []
parentIDs = []
ids = []
traceErrors = []
for trace in traces:
    
    for element in trace:
        traceIDs.append(element['traceId'])
        ids.append(element['id'])
        try:
            parentIDs.append(element['parentId'])
        except:
            parentIDs.append('Na')
        try:
            traceErrors.append(element['tags']['error'])
        except:
            traceErrors.append('false')
            
        names.append(element['name'])
        durations.append(element['duration'])

In [8]:
traceDf = pd.DataFrame({'Resource Name' : names, 'Duration':durations, 'Trace_ID': traceIDs, 'ID': ids, 'Parent_ID': parentIDs, 'Error?':traceErrors})
traceDf

Unnamed: 0,Resource Name,Duration,Trace_ID,ID,Parent_ID,Error?
0,/getcart,190000,550997223f8c4b30,1234d6244fad26ab,550997223f8c4b30,false
1,/getrecommendations,226000,550997223f8c4b30,fd6c6a8dad1dac47,550997223f8c4b30,false
2,/getproducts,5000,550997223f8c4b30,3deb074f82f2e50c,fd6c6a8dad1dac47,false
3,/cart,334000,550997223f8c4b30,550997223f8c4b30,Na,false
4,/product,501000,592363a229596c88,592363a229596c88,Na,false
...,...,...,...,...,...,...
19558,/cart,397000,529885a8ac3c2592,529885a8ac3c2592,Na,false
19559,/cart,485000,b9ae10ad77e3ecee,b9ae10ad77e3ecee,Na,false
19560,/getproducts,95000,b9ae10ad77e3ecee,a2f5e2941ecb468e,449069260f276449,false
19561,/getcart,94000,b9ae10ad77e3ecee,5bb0e7a04dd86297,b9ae10ad77e3ecee,false


In [9]:
traceDf

Unnamed: 0,Resource Name,Duration,Trace_ID,ID,Parent_ID,Error?
0,/getcart,190000,550997223f8c4b30,1234d6244fad26ab,550997223f8c4b30,false
1,/getrecommendations,226000,550997223f8c4b30,fd6c6a8dad1dac47,550997223f8c4b30,false
2,/getproducts,5000,550997223f8c4b30,3deb074f82f2e50c,fd6c6a8dad1dac47,false
3,/cart,334000,550997223f8c4b30,550997223f8c4b30,Na,false
4,/product,501000,592363a229596c88,592363a229596c88,Na,false
...,...,...,...,...,...,...
19558,/cart,397000,529885a8ac3c2592,529885a8ac3c2592,Na,false
19559,/cart,485000,b9ae10ad77e3ecee,b9ae10ad77e3ecee,Na,false
19560,/getproducts,95000,b9ae10ad77e3ecee,a2f5e2941ecb468e,449069260f276449,false
19561,/getcart,94000,b9ae10ad77e3ecee,5bb0e7a04dd86297,b9ae10ad77e3ecee,false


In [10]:
traceSum = 0 
currTraceID = ''
duration_starts = []
duration_ends = []
duration_start = 0
duration_end = 0
for i in range(len(traceDf)):
    val = traceDf.iloc[i]
    if val['Trace_ID'] != currTraceID:
        duration_start = 0
        currTraceID = val['Trace_ID']
    else:
        duration_start = duration_end
    duration_end = duration_start +  val['Duration']
    duration_starts.append(duration_start)
    duration_ends.append(duration_end)

In [11]:
traceDf['duration_start'] = duration_starts
traceDf['duration_end'] = duration_ends
traceDf['Data Transfered'] = [round(np.random.uniform(4,10000),1) for i in range(len(traceDf))]
traceDf.head(2)

Unnamed: 0,Resource Name,Duration,Trace_ID,ID,Parent_ID,Error?,duration_start,duration_end,Data Transfered
0,/getcart,190000,550997223f8c4b30,1234d6244fad26ab,550997223f8c4b30,False,0,190000,6217.3
1,/getrecommendations,226000,550997223f8c4b30,fd6c6a8dad1dac47,550997223f8c4b30,False,190000,416000,3230.7


In [12]:
updatedTraces = pd.DataFrame()
for traceID in set(traceDf['Trace_ID']):
#     print(traceID)
    tes = traceDf[traceDf['Trace_ID'] == traceID]
    root = list(tes['Trace_ID'])[0]
    tes['order'] = [100 for x in range(len(tes))]
    tes['loc start'] = [100 for x in range(len(tes))]
    tes['loc end'] = [100 for x in range(len(tes))]

    tes
    order = []
    root = list(tes['Trace_ID'])[0]
    startLoc = 0
    order = 0
    for i in range(len(tes)):
        t = tes.iloc[i]
        if t['ID'] == root:
            t['order'] = order
            t['loc start'] = startLoc
            t['loc end'] = t['loc start'] + t['Duration']
            tes.iloc[i] = t
    c = 0
    currentRoot = root
    startLoc += 100000
    lastRoot = root
    lastRoots = []
    while c < len(tes) - 1:
        startLoc = tes[tes['ID'] == currentRoot]['loc start'] + 100000
        order = tes[tes['ID'] == currentRoot]['order']
        for i in range(len(tes)):
            t = tes.iloc[i]

            if t['Parent_ID'] == currentRoot:

                t['order'] = order + .5
                order = order + 1
                t['loc start'] = startLoc
                t['loc end'] = t['loc start'] + t['Duration']
                tes.iloc[i] = t
                lastRoots.append(t['ID'])
                c += 1
    #     order = order + 1
        currentRoot = lastRoots.pop(0)
#     print('done')
    updatedTraces = updatedTraces.append(tes)

In [13]:
traceDf = updatedTraces

In [14]:
# def plotTrace(traceID):
#     source = traceDf[traceDf['Trace_ID'] == traceID]
#     bars = alt.Chart(source, title='Trace: ' + traceID).mark_bar().encode(
#         y=alt.Y('Resource Name', type='nominal', sort=None),
#         x = alt.X("duration_start:Q", title= "Duration"),
#         x2 = "duration_end:Q",
#         color = 'Resource Name:N',
#         tooltip = ['Duration:Q', 'Data Transfered:Q','Error?:N']
#     )
#     text = bars.mark_text(
#         align='left',
#         baseline='middle',
#         color = 'black',
#         dx=-3  # Nudges text to right so it doesn't appear on top of the bar
#     ).encode(
#         color = 'Error?:N'
#         text='Error?:N'
#     )
#     return (bars + text).properties(height=300)

In [15]:
def plotTrace(traceID):
    source = traceDf[traceDf['Trace_ID'] == traceID]
    return alt.Chart(source, title='Trace: ' + traceID).mark_bar().encode(
        y=alt.Y('Resource Name', type='nominal', sort=None),
        x = alt.X("duration_start:Q", title= "Duration"),
        x2 = "duration_end:Q",
        color = 'Resource Name:N',
        tooltip = ['Duration:Q', 'Data Transfered:Q','Error?:N']
    )

In [16]:
def plotTraceTree(traceID):
    source = traceDf[traceDf['Trace_ID'] == traceID]
    return alt.Chart(source, title='Trace: ' + traceID).mark_bar().encode(
        y=alt.Y('Resource Name', type='nominal', sort=alt.SortField('order')),
        x = alt.X("loc start:Q", title= "Duration"),
        x2= "loc end:Q",
        color = 'Resource Name:N',
        tooltip = ['ID:N', 'Parent_ID:N', 'Duration:Q','Error?:N']
    )

In [17]:
plotTrace('ffea94949d425fe9')

In [18]:
plotTraceTree('ffea94949d425fe9')

In [19]:
plotTrace('b9ae10ad77e3ecee')

In [20]:
plotTraceTree('b9ae10ad77e3ecee')

In [21]:
plotTrace('106b33d6227a4e20')

In [22]:
plotTraceTree('106b33d6227a4e20')

AGGREGATE

In [23]:
rootTraces = traceDf[traceDf['Parent_ID'] == 'Na']
rootTraces 

Unnamed: 0,Resource Name,Duration,Trace_ID,ID,Parent_ID,Error?,duration_start,duration_end,Data Transfered,order,loc start,loc end
17754,/cart,219000,05572cbb875880a0,05572cbb875880a0,Na,false,49000,268000,4864.6,0.0,0,219000
16819,/cart,219000,c2fe2262950dcfef,c2fe2262950dcfef,Na,false,12000,231000,5981.9,0.0,0,219000
18563,/cart,440000,7141eb616885e332,7141eb616885e332,Na,false,358000,798000,9898.1,0.0,0,440000
12930,/cart,416000,19e53a15d27bb0e8,19e53a15d27bb0e8,Na,false,0,416000,2244.1,0.0,0,416000
17505,/cart,249000,e23a44e1cddedd2d,e23a44e1cddedd2d,Na,false,226000,475000,9848.3,0.0,0,249000
...,...,...,...,...,...,...,...,...,...,...,...,...
5905,/cart,286000,cab3c0c5034022f3,cab3c0c5034022f3,Na,false,0,286000,5897.1,0.0,0,286000
18223,/cart,207000,432b64aea6388eda,432b64aea6388eda,Na,false,244000,451000,8238.8,0.0,0,207000
7078,/cart,297000,8aafba8c1b40fe41,8aafba8c1b40fe41,Na,false,0,297000,4772.2,0.0,0,297000
551,/cart,446000,64c1d9eec42bd573,64c1d9eec42bd573,Na,false,487000,933000,9884.4,0.0,0,446000


In [24]:
roots = list(set(rootTraces['Resource Name']))
roots 

['/product', '/checkout', '/currency', '/shipping', '/cart']

In [25]:
avgDurations = []
errorPercentage = []
aggregateTraceDf = pd.DataFrame()
for root in roots:
    print(root)
    traces = rootTraces[rootTraces['Resource Name'] == root]
    ids = list(set(traces['Trace_ID']))
    print(len(traces))
    traceDf['keep'] = [True if x in ids else False for x in traceDf['Trace_ID']]
    thisRootTraces = traceDf[traceDf['keep']]
    print(len(thisRootTraces))
    thisRootTraces['intErrors?'] = [1 if x =='true' else 0 for x in thisRootTraces['Error?']]
    grouped = thisRootTraces.groupby('Resource Name').agg({'Duration': ['mean'], 'intErrors?':['sum','count'], 'Data Transfered':['mean'], 'loc start': 'mean','loc end': 'mean', 'order':'mean' })
    grouped = grouped.reset_index()
    grouped['root'] = [root] * len(grouped)
    cols = []
    for x in grouped.columns:
        if x[1] =='mean':
            grouped[x] = round(grouped[x],2)
            cols.append('Average ' + x[0])
        elif x[1] != '':
            cols.append(x[1])
        else:
            cols.append(x[0])
    grouped.columns = cols
    grouped['Error Rate'] = round(grouped['sum']/grouped['count'],3)
    grouped = grouped.rename(columns = {'mean': 'Average Duration', 'count':'Error Count'})
    cols = ['root','Resource Name', 'Average Duration','Error Count','Error Rate','Average Data Transfered', 'Average loc start', 'Average loc end',
           'Average order']
    grouped = grouped[cols]
    grouped    
    aggregateTraceDf = aggregateTraceDf.append(grouped)



/product
728
3640
/checkout
119
831
/currency
51
153
/shipping
121
363
/cart
3644
14576


In [26]:
aggregateTraceDf

Unnamed: 0,root,Resource Name,Average Duration,Error Count,Error Rate,Average Data Transfered,Average loc start,Average loc end,Average order
0,/product,/adrequest,246023.35,728,0.0,4830.63,100000,346023.35,1.07
1,/product,/getproducts,54748.63,1456,0.0,4975.35,150000,204748.63,1.96
2,/product,/getrecommendations,254630.49,728,0.0,4973.73,100000,354630.49,1.74
3,/product,/product,527575.55,728,0.0,5212.94,0,527575.55,0.0
0,/checkout,/address,52000.01,87,0.0,5130.73,200000,252000.01,3.01
1,/checkout,/checkout,1625361.34,119,0.403,4490.52,0,1625361.34,0.0
2,/checkout,/creditcardinfo,25609.21,87,0.0,4878.87,200000,225609.21,2.3
3,/checkout,/getcart,90103.45,87,0.0,4447.78,200000,290103.45,3.17
4,/checkout,/getconversion,155471.26,87,0.0,4506.25,200000,355471.26,3.44
5,/checkout,/money,50620.69,87,0.0,4391.9,300000,350620.69,3.94


In [27]:
def plotTraceAgg(traceID):
    source = aggregateTraceDf[aggregateTraceDf['root'] == traceID]
    return alt.Chart(source, title='Trace Type: ' + traceID).mark_bar().encode(
        y=alt.Y('Resource Name', type='nominal', sort=alt.SortField('Average order')),
        x = alt.X("Average loc start:Q", title= "Duration"),
        x2 = "Average loc end:Q",
        color = 'Resource Name',
        tooltip = ['Average Duration:Q', 'Average Data Transfered:Q', 'Error Rate']
    )

In [28]:
plotTraceAgg('/checkout')