# ALI Performance Tests on Blake
Performance comparison dashboard. This must be run in a Jupyter notebook in order to interact with the plots.

In [1]:
import datetime as dt
import glob
import numpy as np
import pandas as pd
import json

import plotly.graph_objects as go

# Import scripts
from json2timeline import json2dataframe
from models import find_chgpts
from basicstats import *
from utils import *
from plotutils import *

In [2]:
# hide_code_button()

In [3]:
# Load configuration file
with open('config.json') as jf:
    config = json.load(jf)
check_config(config)
# Expose every configuration entry as a notebook-level variable (the cells below
# read names such as cases, names, threshold and metadata directly).
# Assigning through globals() avoids exec(), which would run a configuration key
# as arbitrary Python code.
for key, val in config.items():
    globals()[key] = val

# Extract file names and collect data
# glob returns matches in arbitrary order; sort so the input order is reproducible.
files = sorted(glob.glob('json/ctest-*'))
df = json2dataframe(files, cases, nproc, names, timers, metadata)

# Log-transform the data before modeling
def xform(x):
    """Forward transform applied to timings before modeling (natural log)."""
    return np.log(x)

def inv_xform(x):
    """Inverse of xform: map modeled values back to the original scale."""
    return np.exp(x)

# # Filter data by date if desired
# import datetime as dt
# df = df[df['date'] < dt.datetime.strptime('20191231', '%Y%m%d')]

In [4]:
# Print the configured test cases, the timers, and the changepoint-model threshold.
# Plain loops are used instead of list comprehensions so no throwaway lists of
# None are built just for the printing side effect.
print('Test cases:')
for case_name in cases:
    print('  '+case_name)
print('Timers:')
for timer_name in names:
    print('  '+timer_name)
print("Model threshold: %f" % threshold)

Test cases:
  ant-2-20km_ml_ls
  ant-2-20km_mu_ls
  ant-2-20km_mu_dls
  green-1-7km_fea_1ws
  green-1-7km_ml_ls_1ws
  green-1-7km_mu_ls_1ws
  green-1-7km_mu_dls_1ws
  green-1-7km_fea_mem
  green-1-7km_ml_ls_mem
  green-1-7km_mu_ls_mem
  green-1-7km_mu_dls_mem
Timers:
  Total Time
  Setup Time
  Total Fill Time
  Residual Fill
  Residual Fill Evaluate
  Residual Fill Export
  Jacobian Fill
  Jacobian Fill Evaluate
  Jacobian Fill Export
  NOX Total Preconditioner Construction
  NOX Total Linear Solve
Model threshold: 0.000100


In [5]:
# Find changepoints and format data to work nicely with plots
# seqs[case][timer] holds one DataFrame per (case, timer) pair;
# events maps a changepoint date -> {case: [timers that changed on that date]}.
seqs = {case: {} for case in cases}
most_recent = df['date'].max()
events = {}
print('Finding changepoints')
for case in cases:
    print(case, end='')
    for name in names:
        print('.', end='')
        # Select this case's rows for the timer, drop missing timings, and
        # expose the timer column under the common name 'time'.
        cols = ['date', name] + list(metadata)
        data = (
            df.loc[df['case'] == case, cols]
            .dropna(subset=[name])
            .reset_index(drop=True)
            .rename(columns={name: 'time'})
        )
        data['time'] = xform(data['time'])
        pts, _ = find_chgpts(data['time'], threshold=threshold)

        # Record which (case, timer) pairs changed on each changepoint date
        for d in data['date'].iloc[pts]:
            events.setdefault(d, {}).setdefault(case, []).append(name)

        # Per-regime summary series between changepoints, stored next to the data
        mean, upper, lower = regime_ts(data['time'], pts)
        regimes = pd.DataFrame({'mean': mean, 'upper': upper, 'lower': lower})
        seqs[case][name] = pd.concat((data, regimes), axis=1)
    print()
# NOTE(review): clear_output is not imported explicitly in this notebook;
# presumably it comes from one of the star imports - confirm.
clear_output()

# Sort and print recent events
events = dict(sorted(events.items(), key=lambda item: item[0]))
print('Events in the most recent %d days:' % recency)
recent_events = print_events(events, most_recent, recency)

Events in the most recent 10 days:
02/19/2020:
    ant-2-20km_mu_ls: NOX Total Linear Solve
    green-1-7km_ml_ls_1ws: Total Time
                           Total Fill Time
                           Residual Fill
                           Jacobian Fill
    green-1-7km_mu_ls_1ws: Total Fill Time
                           Residual Fill
                           Jacobian Fill
    green-1-7km_mu_dls_1ws: Total Time
                            Total Fill Time
                            Residual Fill
                            Jacobian Fill
    green-1-7km_fea_mem: Residual Fill
    green-1-7km_ml_ls_mem: Total Fill Time
                           Residual Fill
                           Jacobian Fill
    green-1-7km_mu_ls_mem: Total Time
                           Total Fill Time
                           Residual Fill
                           Residual Fill Export
                           Jacobian Fill
                           NOX Total Preconditioner Construction
    green-1-7

In [6]:
# Plot results
lines = ['time', 'mean'] # 'upper', 'lower']
colors = ['darkred', 'midnightblue']
modes = ['markers', 'lines', 'lines', 'lines']
dashes = ['solid', 'solid', 'dot', 'dot']

fig = go.FigureWidget()
# One trace per (line type, case); trace order is line-major: for each entry of
# `lines`, the first case and then the second case.
for line, mode, dash in zip(lines, modes, dashes):
    for c, color in zip(cases[:2], colors):
        first = c == cases[0]
        series = seqs[c][names[0]]
        # Settings shared by the raw-data trace and the summary-line traces
        trace_kwargs = dict(
            x=series['date'],
            y=inv_xform(series[line]),
            mode=mode,
            line=dict(color=color, dash=dash, width=1.5),
            legendgroup='g1' if first else 'g2',
        )
        if line == 'time':
            # Raw observations: named after the case, with hover text built
            # from the date/metadata columns passed as customdata.
            trace_kwargs.update(
                marker_symbol='square' if first else 'circle',
                name=c,
                customdata=series[['date']+list(metadata)],
                hovertemplate=(
                    "Date: %{customdata[0]}<br>"
                    # "Albany compiler: %{customdata[1]}<br>"
                    "Albany commit: %{customdata[2]}<br>"
                    "Trilinos commit: %{customdata[3]}"
                    "<extra></extra>"
                ),
            )
        else:
            # Summary lines carry no hover information
            trace_kwargs.update(name=line, hoverinfo='skip')
        fig.add_trace(go.Scatter(**trace_kwargs))

# Assigning the result keeps the cell from displaying the figure here
fig = fig.update_layout(
    title='Nightly test performance',
    xaxis_title='Simulation Date',
    yaxis_title='Wall-clock Time (s)'
)


In [7]:
# Look at paired data
def paired_data(c1, c2, name):
    """Build the paired-difference series of timer `name` between two cases.

    The per-case frames in `seqs` are inner-joined on date, so only dates
    present for both cases are kept. The returned 'time' column is the
    comparison value minus the baseline value; because `seqs` stores
    transformed (xform) timings, this is a difference on the transformed scale.

    Parameters
    ----------
    c1 : key into `seqs` for the baseline case.
    c2 : key into `seqs` for the comparison case.
    name : timer name (second-level key into `seqs`).

    Returns
    -------
    pandas.DataFrame with columns 'date', the metadata columns (taken from the
    baseline case), 'time', and the 'mean', 'upper', 'lower' series produced
    by regime_ts for the changepoints found in the difference.
    """
    joined = (
        seqs[c1][name].set_index('date')
        .join(seqs[c2][name].set_index('date'),
              lsuffix='_c1', rsuffix='_c2', how='inner')
        .reset_index()
        .rename(columns={'date': 'date_c1'})
    )
    out_cols = ['date'] + list(metadata)
    # Explicit copy: columns are renamed and added below, which would otherwise
    # raise SettingWithCopyWarning on the column selection.
    paired = joined[[c + '_c1' for c in out_cols]].copy()
    paired.columns = out_cols
    paired['time'] = joined['time_c2'] - joined['time_c1']
    pts, _ = find_chgpts(paired['time'], threshold=threshold)

    # Calculate mean/std between changepoints
    mean, upper, lower = regime_ts(paired['time'], pts, std_error=True)
    regimes = pd.DataFrame({'mean': mean, 'upper': upper, 'lower': lower})
    return pd.concat((paired, regimes), axis=1)

pair_df = paired_data(cases[0], cases[1], names[0])
lines_all = ['time', 'mean', 'upper', 'lower']
pair_color = 'rebeccapurple'

# The y axis is formatted as a percentage: every series is mapped back through
# inv_xform and has 1 subtracted, giving a relative difference to the baseline.
diff_fig = go.FigureWidget(layout_yaxis_tickformat=',.1%')
for line, mode, dash in zip(lines_all, modes, dashes):
    # Settings shared by the raw-difference trace and the summary-line traces
    trace_kwargs = dict(
        x=pair_df['date'],
        y=inv_xform(pair_df[line])-1,
        mode=mode,
        line=dict(color=pair_color, dash=dash, width=1.5),
    )
    if line == 'time':
        # Raw paired differences, with hover text built from the date/metadata
        # columns passed as customdata.
        trace_kwargs.update(
            marker_symbol='circle',
            name='Time Difference',
            customdata=pair_df[['date']+list(metadata)],
            hovertemplate=(
                "Date: %{customdata[0]}<br>"
                # "Albany compiler: %{customdata[1]}<br>"
                "Albany commit: %{customdata[2]}<br>"
                "Trilinos commit: %{customdata[3]}"
                "<extra></extra>"
            ),
        )
    else:
        # Summary lines carry no hover information
        trace_kwargs.update(name=line, hoverinfo='skip')
    diff_fig.add_trace(go.Scatter(**trace_kwargs))

# The plotted quantity is a relative (percent) difference, not seconds, so the
# axis label says so. Assigning the result keeps the cell from displaying it here.
diff_fig = diff_fig.update_layout(
    shapes=hv_line('h',0),
    title='Analysis of % performance difference (relative to baseline)',
    xaxis_title='Simulation Date',
    yaxis_title='Relative difference (%)'
)

In [8]:
def latest_data(df):
    """Return the start date and the observations of the most recent regime.

    The most recent regime is the trailing run of rows whose 'mean' value
    equals the 'mean' of the last row.

    Parameters
    ----------
    df : pandas.DataFrame with 'date', 'time' and 'mean' columns, in
        chronological row order.

    Returns
    -------
    (date, values) : the 'date' of the first row of the trailing run, and a
        numpy array of that run's 'time' values ordered newest first.

    Raises
    ------
    ValueError if `df` has no rows.
    """
    n = df.shape[0]
    if n == 0:
        raise ValueError('latest_data requires at least one observation')
    # Positional access, so the result does not depend on the index labels
    means = df['mean'].to_numpy()
    start = n - 1
    # Walk back while the previous row belongs to the same regime. Stopping at
    # row 0 covers the case where no changepoint exists at all.
    while start > 0 and means[start - 1] == means[n - 1]:
        start -= 1
    pts = df['time'].to_numpy()[start:][::-1]
    return df['date'].iloc[start], np.array(pts)

# Create histogram of recent data from two test cases
# The x values are the paired differences since the latest changepoint, mapped
# back through inv_xform with 1 subtracted, i.e. relative differences. The axis
# is therefore formatted (and labelled) as a percentage rather than seconds.
hist = go.FigureWidget(layout_xaxis_tickformat=',.1%')
hist.add_trace(go.Histogram(
    x=inv_xform(latest_data(pair_df)[1])-1,
    name='Difference in ' + names[0],
    marker_color=pair_color
))
# Assigning the result keeps the cell from displaying the figure here
hist = hist.update_layout(
    shapes=hv_line('v',0),
    barmode='overlay',
    title='Performance differences since latest changepoint',
    xaxis_title='Relative difference (%)',
    yaxis_title='Count',
    legend_orientation='h',
    legend=dict(x=0, y=1.11, bgcolor=None)
)

In [9]:
from ipywidgets import Output

# Output widget that shows the t-test results; update_textbox rewrites it
textbox = Output(layout=dict(border='1px solid black', width='40%'))


def date2str(date):
    """Format `date` with the '%b %d' pattern (abbreviated month, then day)."""
    return dt.datetime.strftime(date, '%b %d')

def print_summary(x, indent=2, pct=False):
    """Print the sample size, mean and standard deviation of `x`.

    Parameters
    ----------
    x : sized collection of observations, passed to trimmed_stats.
    indent : number of spaces placed before every printed line.
    pct : when True, the mean and std are multiplied by 100 and printed with
        a trailing percent sign.

    The mean and std are the two values returned by
    trimmed_stats(x, var=False).
    """
    N = len(x)
    mean, std = trimmed_stats(x, var=False)
    pad = ' ' * indent
    if pct:
        rows = ('N   : %d' % N,
                'mean: %.2f%%' % (100*mean),
                'std : %.2f%%' % (100*std))
    else:
        rows = ('N   : %d' % N,
                'mean: %.2f' % mean,
                'std : %.2f' % std)
    # Apply the indent to every line, not only the first one
    print('\n'.join(pad + row for row in rows))

def update_textbox(c1=cases[0], c2=cases[1], n=names[0]):
    """Rewrite the statistics text box for a baseline/comparison pair.

    Parameters
    ----------
    c1 : baseline case key into `seqs`.
    c2 : comparison case key into `seqs`.
    n : timer name.

    Prints, inside `textbox`, summary statistics of each case since its latest
    changepoint, then summary statistics, a t-test p-value and a confidence
    interval for the paired differences since their latest changepoint.
    """
    def stars(pval):
        # One asterisk for each of the levels 0.05, 0.01, 0.001 that pval is below
        return '*' * sum(int(pval < level) for level in (0.05, 0.01, 0.001))

    with textbox:
        textbox.clear_output()
        base_start, base_times = latest_data(seqs[c1][n])
        comp_start, comp_times = latest_data(seqs[c2][n])
        pair_start, pair_diffs = latest_data(paired_data(c1, c2, n))
        _, paired_pval = ttest(pair_diffs, with_pval=True)

        # Summary statistics
        print('Data since latest changepoints')
        print('Baseline (since %s):\n  %s' % (date2str(base_start), c1))
        print_summary(inv_xform(base_times))
        print('Comparison (since %s):\n  %s' % (date2str(comp_start), c2))
        print_summary(inv_xform(comp_times))

        print('Paired observations (since %s):' % date2str(pair_start))
        print_summary(inv_xform(pair_diffs)-1, pct=True)
        print('  t-test p-value: %.2g%s' % (paired_pval, stars(paired_pval)))

        n_pairs = len(pair_diffs)
        if n_pairs <= 2:
            print('  Not enough data for confidence interval')
        else:
            center, spread = trimmed_stats(pair_diffs, var=False)
            # NOTE(review): n_pairs-2 degrees of freedom is used here; confirm
            # it matches what trimmed_stats does to the sample.
            half_width = spread*tdist.isf(0.01/2, n_pairs-2)/np.sqrt(n_pairs)
            print('  99%% CI: (%.2f%%, %.2f%%)' %
                  (100*(inv_xform(center-half_width)-1),
                   100*(inv_xform(center+half_width)-1)))

# Populate the text box once with the default cases and timer
update_textbox()

In [10]:
def update_figdata(figdata, **kwargs):
    """Set each keyword argument on `figdata` through item assignment.

    `figdata` is any object supporting `figdata[key] = value`; in this
    notebook it is a figure trace. Returns None.
    """
    for prop in kwargs:
        figdata[prop] = kwargs[prop]

# Function that will update all chart elements based on dropdowns
def update(Baseline=cases[0], Comparison=cases[1], Timer=names[0]):
    """Refresh every dashboard element for the selected cases and timer.

    Parameters
    ----------
    Baseline : case key into `seqs` used as the baseline.
    Comparison : case key into `seqs` compared against the baseline.
    Timer : timer name.

    Updates the traces of `fig`, `diff_fig` and `hist` in place and rewrites
    the statistics text box.
    """
    c1, c2, n = Baseline, Comparison, Timer
    pair_df = paired_data(c1, c2, n)
    meta_cols = ['date'] + list(metadata)
    with fig.batch_update():
        # Trace order follows how fig was built: for each line type, the
        # baseline trace and then the comparison trace.
        i = 0
        for line in lines:
            for c in (c1, c2):
                series = seqs[c][n]
                # Wall-clock times are plotted as-is (no "-1"; that offset only
                # applies to the relative-difference plots). Raw-data traces
                # are named after their own case.
                update_figdata(fig.data[i], x=series['date'], y=inv_xform(series[line]),
                               name=c if line == 'time' else line,
                               customdata=series[meta_cols])
                i += 1
    with diff_fig.batch_update():
        for i, line in enumerate(lines_all):
            update_figdata(diff_fig.data[i], x=pair_df['date'], y=inv_xform(pair_df[line])-1,
                           customdata=pair_df[meta_cols])
    with hist.batch_update():
        # Reuse pair_df instead of recomputing the paired data, and keep the
        # trace name in step with the selected timer.
        update_figdata(hist.data[0], x=inv_xform(latest_data(pair_df)[1])-1,
                       name='Difference in ' + n)
    update_textbox(c1, c2, n)

In [11]:
# Create dashboard
from ipywidgets import interactive, HBox, VBox, HTML, Layout

# One dropdown per argument of update(), each offering the configured values
widget = interactive(
    update,
    Baseline=list(cases),
    Comparison=list(cases),
    Timer=list(names),
)
# Every child of the interactive container except the last goes in the control row
dropdowns = widget.children[:-1]
controls = HBox(dropdowns, layout=Layout(flex_flow='row wrap'))
stats_row = HBox([hist, textbox])
report = VBox([controls, fig, diff_fig, stats_row])
# Bring all figures in line with the default selections before displaying
update()
report

VBox(children=(HBox(children=(Dropdown(description='Baseline', options=('ant-2-20km_ml_ls', 'ant-2-20km_mu_ls'…

### Nightly test performance
Changepoints are estimated using a generalized likelihood ratio method on each timer. 
* Markers: recorded wall-clock time
* Solid line: average wall-clock time between changepoints

### Analysis of performance difference
Observations from the two cases are joined by date, and we take the difference of their log-transformed times (comparison minus baseline), which is displayed as a percent difference relative to the baseline. A generalized likelihood ratio test is used to determine changepoints in the difference.
* Markers: percent difference in recorded wall-clock time on each date
* Solid line: average difference between changepoints
* Dotted lines: upper and lower bounds of a 99% confidence interval for the average

### Histogram of differences since latest changepoint
Using data since the most recently detected changepoint, we plot a histogram of the difference in performance.

### Statistical findings
Summary statistics for the individual timers are shown since their most recent changepoint. For performance differences, we consider data since the most recent changepoint in the difference time series. We use a t-test to evaluate whether the difference in performance is statistically significant. One, two, and three asterisks indicate significance levels of 0.05, 0.01, and 0.001 respectively. We also include a 99% confidence interval for the difference in performance.