In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
import sys
sys.path.append("..")

#from cardinality_estimation.featurizer import Featurizer

from query_representation.query import *
from query_representation.utils import *
from cardinality_estimation.dataset import *

from torch.utils import data
import pickle

import glob
import random
import os
import json
import time
import matplotlib.pyplot as plt
import networkx as nx

In [None]:

#TESTDIR = os.path.join(os.path.join("..", "queries"), "imdb-unique-plans")
#RESULTDIR = os.path.join("..", "results")
#make_dir(RESULTDIR)

# Directory of pickled query representations; its sub-directories are treated
# as query templates by get_query_fns.
TRAINDIR = os.path.join("/flash1/pari/MyCEB", "queries", "imdb-unique-plans")

# Directories whose sub-directories are searched for a "Runtimes.csv" file.
RTDIRS = [
    "/flash1/pari/MyCEB/runtime_plans/pg",
]

In [None]:
# Collect the runtime tables: every sub-directory of each RTDIR that contains
# a "Runtimes.csv" contributes one DataFrame.
rtdfs = []

for RTDIR in RTDIRS:
    # os.listdir returns entries in arbitrary order; sort them so the row
    # order of rtdf is reproducible across machines and reruns.
    rdirs = sorted(os.listdir(RTDIR))
    for rd in rdirs:
        rtfn = os.path.join(RTDIR, rd, "Runtimes.csv")
        if os.path.exists(rtfn):
            rtdfs.append(pd.read_csv(rtfn))

# pd.concat fails with an opaque "No objects to concatenate" on an empty
# list; report which directories were searched instead.
if not rtdfs:
    raise FileNotFoundError(
        "No Runtimes.csv found under: " + ", ".join(RTDIRS))

rtdf = pd.concat(rtdfs)
print("Num RTs: ", len(rtdf))

# Query loading helper functions

In [None]:
def load_qrep(fn):
    """Load one pickled query representation and rebuild its graphs.

    The pickle is expected to hold a dict whose "subset_graph" and
    "join_graph" entries (and, when present, "subset_graph_paths") are in the
    format accepted by `json_graph.adjacency_graph`; they are replaced in
    place by graph objects.

    Args:
        fn: path of the pickle file; must contain ".pkl".

    Returns:
        The query dict with its graph entries converted.

    Raises:
        AssertionError: if `fn` does not contain ".pkl".
        Exception: whatever `open` / `pickle.load` raised, after the failing
            file name has been printed.
    """
    assert ".pkl" in fn
    # SECURITY: pickle.load can execute arbitrary code; only load query files
    # that come from a trusted source.
    try:
        with open(fn, "rb") as f:
            query = pickle.load(f)
    except Exception:
        # Report which file is broken, then let the real error propagate.
        # Calling exit() here would hide the cause and end the interpreter /
        # kernel session.
        print(fn + " failed to load...")
        raise

    # nx.OrderedDiGraph was removed in networkx 3.0; fall back to nx.DiGraph
    # there. The ordered class is still used whenever it exists.
    ordered_digraph = getattr(nx, "OrderedDiGraph", nx.DiGraph)

    query["subset_graph"] = \
            ordered_digraph(json_graph.adjacency_graph(query["subset_graph"]))
    query["join_graph"] = json_graph.adjacency_graph(query["join_graph"])
    if "subset_graph_paths" in query:
        query["subset_graph_paths"] = \
                ordered_digraph(json_graph.adjacency_graph(query["subset_graph_paths"]))

    return query


def load_qdata(fns):
    """Load the query representations stored in the given pickle files.

    Each loaded query is tagged with "name" (the base name of its file) and
    "template_name" (the name of the directory that contains the file).

    Returns the loaded queries as a list, in the order of `fns`.
    """
    loaded = []
    for path in fns:
        query = load_qrep(path)
        # TODO: can do checks like no queries with zero cardinalities etc.
        query["name"] = os.path.basename(path)
        query["template_name"] = os.path.basename(os.path.dirname(path))
        loaded.append(query)
    return loaded

def get_query_fns(basedir, template_fraction=1.0, sel_templates=None):
    """Collect the ".pkl" query files below `basedir`, template by template.

    Every sub-directory of `basedir` is treated as one template; plain files
    directly inside `basedir` are printed and skipped.

    Args:
        basedir: directory that contains one sub-directory per template.
        template_fraction: fraction (<= 1.0) of each template's files to
            keep. At least one file is kept for every non-empty template.
        sel_templates: optional collection of template (directory) names;
            when given, all other templates are skipped.

    Returns:
        List of file paths. Templates appear in sorted order; within a
        template the files are a seeded random sample, so repeated calls
        return the same list.
    """
    fns = []
    # glob order depends on the filesystem; sort so the result is reproducible.
    tmpnames = sorted(glob.glob(os.path.join(basedir, "*")))
    print(tmpnames)
    assert template_fraction <= 1.0

    for qdir in tmpnames:
        if os.path.isfile(qdir):
            print(qdir)
            continue
        template_name = os.path.basename(qdir)
        if sel_templates is not None and template_name not in sel_templates:
            continue

        # let's first select all the qfns we are going to load
        qfns = sorted(glob.glob(os.path.join(qdir, "*.pkl")))
        if not qfns:
            # Sampling at least one file from an empty template would raise.
            continue
        num_samples = max(int(len(qfns) * template_fraction), 1)
        # A private generator draws the same sample as
        # random.seed(1234); random.sample(...) without resetting the global
        # RNG state as a side effect.
        fns += random.Random(1234).sample(qfns, num_samples)
    return fns

# Evaluation helper functions

In [None]:
def omega_approx(beta):
    """Approximate omega for the given beta (Equation (5) from Gavish 2014).

    Evaluates the cubic 0.56*beta^3 - 0.95*beta^2 + 1.82*beta + 1.43.
    """
    cubic_term = 0.56 * beta**3
    quadratic_term = 0.95 * beta**2
    linear_term = 1.82 * beta
    return cubic_term - quadratic_term + linear_term + 1.43

def lambda_star(beta):
    """Return lambda star for given beta. Equation (11) from Gavish 2014."""
    return np.sqrt(2 * (beta + 1) + (8 * beta) /
                   (beta + 1 + np.sqrt(beta**2 + 14 * beta + 1)))


def svht(X, sigma=None, sv=None):
    """Return the optimal singular value hard threshold (SVHT) value.

    `X` is any m-by-n matrix. `sigma` is the standard deviation of the
    noise, if known. Optionally supply the vector of singular values `sv`
    for the matrix (only necessary when `sigma` is unknown). If `sigma`
    is unknown and `sv` is not supplied, then the method automatically
    computes the singular values.

    Only the shape of `X` is used, unless the singular values have to be
    computed.

    Raises:
        ValueError: if `X` is not a non-empty 2-D matrix, or if `sv` is not
            a vector.
    """
    try:
        m, n = sorted(X.shape)  # ensures m <= n
    except (AttributeError, TypeError, ValueError):
        raise ValueError('invalid input matrix')
    if n == 0:
        # Zero-sized input has no aspect ratio (m / n would divide by zero).
        raise ValueError('invalid input matrix')
    beta = m / n  # ratio between 0 and 1
    if sigma is None:  # sigma unknown
        if sv is None:
            # Singular values only; numpy is already imported by this file.
            sv = np.linalg.svd(X, compute_uv=False)
        # squeeze() turns a single singular value into a 0-d array; lift it
        # back to a vector so 1-row / 1-column matrices are accepted.
        sv = np.atleast_1d(np.squeeze(sv))
        if sv.ndim != 1:
            raise ValueError('vector of singular values must be 1-dimensional')
        return np.median(sv) * omega_approx(beta)
    else:  # sigma known
        return lambda_star(beta) * np.sqrt(n) * sigma

# Example usage of svht (D is the data matrix, sv its vector of singular values):
#
# find tau star hat when sigma is unknown
# tau = svht(D, sv=sv)
#
# find tau star when sigma is known
# tau = svht(D, sigma=0.5)

# Load queries

In [None]:
# set template_fraction <= 1.0 to test quickly w/ smaller datasets
# train_qfns = get_query_fns(TRAINDIR, template_fraction = 0.001)
# val_qfns = get_query_fns(VALDIR, template_fraction = 1.0)
# test_qfns = get_query_fns(TESTDIR, template_fraction = 1.0)

#qfns = get_query_fns(TRAINDIR, template_fraction = 1.0, sel_templates=None)

# Gather the pickle paths of all templates under TRAINDIR without
# sub-sampling, report how many there are, then load them.
qfns = get_query_fns(
    TRAINDIR,
    sel_templates=None,
    template_fraction=1.0,
)
print(len(qfns))
qdata = load_qdata(qfns)

In [None]:
# Peek at the first loaded query: the last expression displays the nodes of
# its "subset_graph".
q = qdata[0]
sg = q["subset_graph"]
sg.nodes()

In [None]:
from collections import defaultdict
import numpy

subplan_data = defaultdict(list)

# Every node of any query's "subset_graph"; each one becomes a row key.
rowkeys = set()

# "exp_analyze" values of all runtime rows that belong to a loaded query.
cur_exps = []

# Number of loaded queries that have at least one runtime row; doubles as
# the id handed to the next such query.
qid = 0
# Maps the position of an entry in cur_exps to the id of its query.
newqs = {}

# Group the runtime rows by query name once, instead of re-scanning all of
# rtdf for every query. Row order inside a group is the order in rtdf.
exps_by_qname = {
    qname: rows["exp_analyze"].values
    for qname, rows in rtdf.groupby("qname")
}

for qrep in qdata:
    rowkeys.update(qrep["subset_graph"].nodes())

    explains = exps_by_qname.get(qrep["name"])
    if explains is None:
        # No runtime rows for this query: it gets no id.
        continue
    for explain in explains:
        newqs[len(cur_exps)] = qid
        cur_exps.append(explain)
    qid += 1

# Fixed, sorted row order and the reverse lookup from row key to row index.
rowkeys = sorted(rowkeys)
rowidxs = {rk: ri for ri, rk in enumerate(rowkeys)}

In [None]:
# Display the concatenated runtime table read from the Runtimes.csv files.
rtdf

In [None]:
# Runtime matrix: one row per row key (rowidxs) and one column per query id
# (qid columns). Cells that are never written stay 0.
rtmat = np.zeros((len(rowidxs), qid))
print(rtmat.shape)

# Keys noted for the plan nodes:
# ['tables', 'aliases', 'Plan Rows', 'Actual Rows', 'total_time', 'cur_time',
#  'Node Type', 'Total Cost', 'cur_cost', 'node_label', 'scan_type']

num_unparsed = 0
for ei, exp in enumerate(cur_exps):
    try:
        # SECURITY: eval executes whatever text is stored in the runtime
        # CSVs; only run this on trusted files.
        # NOTE(review): if the stored plans are plain Python literals,
        # ast.literal_eval would be a safer replacement -- confirm.
        exp = eval(exp)
    except Exception:
        # Best effort: skip entries that cannot be parsed, but count them so
        # the loss is visible (and Ctrl-C is no longer swallowed).
        num_unparsed += 1
        continue

    G = explain_to_nx(exp)
    cur_qid = newqs[ei]
    for node, ndata in G.nodes(data=True):
        # Row keys are looked up by the sorted tuple of the node's aliases.
        cal = sorted(ndata["aliases"])
        rt = ndata["cur_time"]
        rtmat[rowidxs[tuple(cal)], cur_qid] = rt

if num_unparsed:
    print("Skipped unparsable explains: ", num_unparsed)

# Drop the rows that received no runtime at all.
rtmat = rtmat[np.any(rtmat != 0, axis=1)]

In [None]:
# Shape of the runtime matrix after its all-zero rows were dropped.
rtmat.shape

In [None]:
# Share of the runtime-matrix cells that hold a non-zero value.
nz = np.count_nonzero(rtmat)
tot = rtmat.size
print("Non Zero Fraction: ", nz / tot)

In [None]:
# Reduced SVD of the runtime matrix; S holds its singular values.
P, S, Q = np.linalg.svd(rtmat, full_matrices=False)
print(S.shape)
print(np.round(S, 2))

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Singular values of the current SVD, drawn on a logarithmic y-axis.
fig, ax = plt.subplots()
ax.plot(S)
ax.set_yscale("log")
plt.show()

In [None]:
# Cumulative share of the summed singular values taken by the leading ones.
cds = np.cumsum(S) / np.sum(S)
# The first position where the share exceeds 90% is a 0-based index; the
# number of singular values needed is that index plus one.
r90 = int(np.flatnonzero(cds > 0.90)[0]) + 1
print("90% explained by: ", r90)

# Number of singular values above the hard threshold computed by svht.
tau = svht(rtmat, sv=S)
rank = np.sum(S > tau)
print("Noise cut-off: ", rank)

In [None]:
import copy
# log(1 + x) of every runtime; rtmat itself is left untouched because the
# addition creates a new array.
logrtmat = np.log(rtmat + 1)

# Reduced SVD of the log-transformed matrix (overwrites P, S and Q).
P, S, Q = np.linalg.svd(logrtmat, full_matrices=False)
print(S.shape)
print(np.round(S, 2))

In [None]:
# Cumulative share of the summed singular values taken by the leading ones
# (S now comes from the log-transformed matrix).
cds = np.cumsum(S) / np.sum(S)
# The first position where the share exceeds 90% is a 0-based index; the
# number of singular values needed is that index plus one.
r90 = int(np.flatnonzero(cds > 0.90)[0]) + 1
print("90% explained by: ", r90)

# Pass the matrix that S was computed from. svht only reads its shape here,
# which is the same as rtmat's, so the threshold value is unchanged.
tau = svht(logrtmat, sv=S)
rank = np.sum(S > tau)
print("Noise cut-off: ", rank)