In [1]:
import pandas as pd
import numpy as np

In [2]:
def load(file):
    """Load the experiment data from a CSV file"""
    
    schema = {
        'num_threads': np.int64(),
        'num_jsons': np.int64(),
        'json_bytes': np.int64(),
        'num_ipc': np.int64(),
        'ipc_bytes': np.int64(),
        'num_parsed': np.int64(),
        't_parse': np.float64(),
        't_resize': np.float64(),
        't_serialize': np.float64(),
        't_thread': np.float64(),
        't_enqueue': np.float64(),
        'status': np.int64()
    }
    
    df = pd.read_csv(file, dtype=schema)

    # Make sure there were no errors for converters.
    assert(df['status'].sum() == len(df.index))
        
    return df

In [3]:
def analyze(df):
    """Analyze the experiment data, deriving various metrics such as throughput."""
    # Calculate time spent within the thread as 'other'.
    df['t_other'] = df['t_thread'] - df[['t_parse', 't_resize', 't_serialize', 't_enqueue']].sum(axis=1)
    
    # Calculate throughput
    df['Throughput (in)'] = df['json_bytes'] / df['t_thread']
    df['Throughput (out)'] = df['ipc_bytes'] / df['t_thread']
    
    return df

In [4]:
def summarize(df):
    """Summarize the data from all threads into one row."""
    
    row = {'Threads': df['num_threads'].sum(),
             'JSONs': df['num_jsons'].sum(), 
             'Bytes (in)': df['json_bytes'].sum(), 
             'IPC messages': df['num_ipc'].sum(), 
             'Buffers parsed': df['num_parsed'].sum(), 
             # Take the mean of the time spent in threads:
             'Parse time': df['t_parse'].mean(),
             'Resize time': df['t_resize'].mean(), 
             'Serialize time': df['t_serialize'].mean(), 
             'Enqueue time': df['t_enqueue'].mean(), 
             'Other time': df['t_other'].mean(),
             'Thread time': df['t_thread'].mean(), 
             'Throughput (in)': df['Throughput (in)'].sum(),
             'Throughput (out)': df['Throughput (out)'].sum()}
              
    return row;

In [5]:
import glob

csv_files = []
for file in glob.glob("../experiments/data/throughput/threads/*.csv"):
    csv_files.append(file)

records = []
for file in csv_files:
    records.append(summarize(analyze(load(file))))

df = pd.DataFrame.from_records(records).sort_values(by='Threads').set_index('Threads')

display(df)

KeyError: 'Threads'

In [None]:
import matplotlib.pyplot as plt

plt.rcParams.update({
    "text.usetex": True,
    "font.family": "serif",
    "font.serif": ["Palatino"],
    "font.size": 14
})

In [None]:
df['Throughput (in)'].plot.bar(title='Throughput vs. Thread count')

In [None]:
df_times = df[['Parse time', 'Resize time', 'Serialize time', 'Enqueue time', 'Other time']]
df_times.rename(columns={'Parse time': 'Parse', 
                         'Resize time': 'Resize', 
                         'Serialize time': 'Serialize', 
                         'Enqueue time': 'Enqueue', 
                         'Other time': 'Other'})
ax = df_times.plot.bar(stacked=True, 
                        width=0.75,
                        title='Time spent per stage',
                        ylabel='Time (s)')

In [None]:
from pywaffle import Waffle

wd = df_times.sum()

plt.figure(
    FigureClass=Waffle,
    rows=1,
    columns=100,
    values=wd,
    legend={'loc': 'upper left', 'bbox_to_anchor': (1, 1)}
)

plt.tight_layout()

wd