In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sn
import numpy as np
from matplotlib.patches import PathPatch

def adjust_box_widths(g, fac):
    """
    Adjust the widths of a seaborn-generated boxplot.
    """

    # iterating through Axes instances
    for ax in g.axes:

        # iterating through axes artists:
        for c in ax.get_children():

            # searching for PathPatches
            if isinstance(c, PathPatch):
                # getting current width of box:
                p = c.get_path()
                verts = p.vertices
                verts_sub = verts[:-1]
                xmin = np.min(verts_sub[:, 0])
                xmax = np.max(verts_sub[:, 0])
                xmid = 0.5*(xmin+xmax)
                xhalf = 0.5*(xmax - xmin)

                # setting new width of box
                xmin_new = xmid-fac*xhalf
                xmax_new = xmid+fac*xhalf
                verts_sub[verts_sub[:, 0] == xmin, 0] = xmin_new
                verts_sub[verts_sub[:, 0] == xmax, 0] = xmax_new

                # setting new width of median line
                for l in ax.lines:
                    if np.all(l.get_xdata() == [xmin, xmax]):
                        l.set_xdata([xmin_new, xmax_new])

In [6]:
methods = ["Postgres", "SafeBound"]
#benchmarks = ['JOBLight','JOBLightRanges', "JOBM", "Stats"]
benchmarks = ["Stats"]
runs = list(range(1,6))
inferenceResults = {method : {benchmark : dict() for benchmark in benchmarks} for method in methods}
rootFileDirectory = "/mnt/ExtraDrive1/SafeBound/Data/Results/"
for method in methods:
    for benchmark in benchmarks:
        for run in runs:
            querySize = pd.read_csv(rootFileDirectory + benchmark+"_Sizes.csv")
            resultFile = None
            if method == 'SafeBound':
                resultFile = rootFileDirectory + method + "_Inference_" + str(run) + "_" + benchmark + ".csv"
                if benchmark in []:
                    resultData = pd.DataFrame()
                    inferenceResults[method][benchmark][run] = resultData
                    continue
            elif method == 'Postgres':
                resultFile = rootFileDirectory + method + "_Inference_" +  str(run) + "_" + benchmark + ".csv"
                if benchmark in [""]:
                    resultData = pd.DataFrame()
                    inferenceResults[method][benchmark][run] = resultData
                    continue


            resultData = pd.read_csv(resultFile)
            resultData = resultData.merge(querySize, on='QueryLabel')
            resultData["JoinSize"] = resultData["JoinSize"].astype(float)
            resultData["Estimate"] =  resultData["Estimate"].astype(float)
            resultData["RelativeError"] = resultData["Estimate"]/resultData["JoinSize"]
            resultData["QError"] = np.maximum(resultData["RelativeError"], 1./resultData["RelativeError"])
            resultData["Benchmark"] = benchmark
            resultData["Method"] = method
            resultData["Run"] = run
            inferenceResults[method][benchmark][run] = resultData
            

FileNotFoundError: [Errno 2] No such file or directory: '/mnt/ExtraDrive1/SafeBound/Data/Results/Stats_Sizes.csv'

In [99]:
allData = pd.concat([inferenceResults[method][benchmark][run] for method in methods for benchmark in benchmarks for run in runs])

In [100]:
p05Error = allData.groupby(['Method', 'Benchmark', 'Run']).quantile(.05)
p50Error = allData.groupby(['Method', 'Benchmark', 'Run']).quantile(.50)
p95Error = allData.groupby(['Method', 'Benchmark', 'Run']).quantile(.95)
topLineData = p05Error.join(p50Error["RelativeError"], lsuffix="p05",rsuffix="p50").join(p95Error["RelativeError"])
topLineData["RelativeErrorp95"] = topLineData["RelativeError"]
topLineData = topLineData.reset_index()

In [101]:
topLineData[((topLineData["Method"]=="SafeBound") & (topLineData["Run"]==4)) | ((topLineData["Method"]=="Postgres")& (topLineData["Run"]==2))| ((topLineData["Method"]=="Postgres2D")& (topLineData["Run"]==2))| ((~topLineData["Method"].isin(["Postgres", "SafeBound", "Postgres2D"])))].groupby(["Method", "Benchmark"]).mean()[['RelativeErrorp05','RelativeErrorp50','RelativeErrorp95']]

Unnamed: 0_level_0,Unnamed: 1_level_0,RelativeErrorp05,RelativeErrorp50,RelativeErrorp95
Method,Benchmark,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
BayesCard,JOBLight,0.269774,1.001832,6.674691
BayesCard,Stats,0.148015,0.827293,10.31433
NeuroCard,JOBLight,0.256539,0.795068,2.05
NeuroCard,JOBLightRanges,0.015624,0.700949,1.615702
NeuroCard,JOBM,0.003453,0.333613,2.700392
PessemisticCardinality,JOBLight,2.873601,16.646252,152.5293
PessemisticCardinality,JOBLightRanges,1.331667,25.066667,576.5016
PessemisticCardinality,JOBM,25.374013,10295.569595,1977142.0
PessemisticCardinality,Stats,1.043217,43.810061,69764.58
Postgres,JOBLight,0.001261,0.106706,0.9849187


In [2]:
runData[(runData["Method"]=="SafeBound")&(runData["Run"]==4)].groupby('Benchmark')["RelativeError"].median()

NameError: name 'runData' is not defined

In [103]:
runData[(runData["Method"]=="Postgres")].groupby('Benchmark')["RelativeError"].median()

Benchmark
JOBLight             16.646252
JOBLightRanges       25.066667
JOBM              10295.569595
Stats                43.810061
Name: RelativeError, dtype: float64

In [1]:
hueOrder =['SafeBound', "TrueCardinality", 'Postgres']
palette = ["C1", "C7", "C0", "C2", "C5", "C4", "C6"]
sn.set(font_scale = 1.3)
runData = allData[((allData["Method"]=='SafeBound')&(allData["Run"]==4))|((allData["Method"].isin(['Postgres']))&(allData["Run"]==2))]
inferencePlot = sn.catplot(x='Benchmark', y='RelativeError', hue='Method', kind='box', data=runData, height=5, aspect = 6/4, width=.8,palette=palette, hue_order=hueOrder)
inferencePlot.set(yscale='log', xlabel="")
inferencePlot.map(plt.axhline, y=1, ls='--', c='grey')
adjust_box_widths(inferencePlot.figure, 0.8)

NameError: name 'sn' is not defined