In [20]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd

from query_representation.query import load_qrep
from cardinality_estimation.featurizer import Featurizer

import glob
import random
import os
import json
import time

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Setup file paths / Download query data

In [21]:
import errno
def make_dir(directory):
    """Create ``directory`` (and any missing parent directories).

    Calling this on a directory that already exists is a no-op, so the
    function is safe to call repeatedly.

    Args:
        directory: path of the directory to create.

    Raises:
        FileExistsError: if ``directory`` exists but is not a directory.
            (The previous errno-based check silently ignored this case,
            which only postponed the failure to the first write.)
        OSError: for any other failure reported by ``os.makedirs``.
    """
    # exist_ok=True tolerates an already existing directory only; a regular
    # file at the same path still raises FileExistsError (an OSError subclass).
    os.makedirs(directory, exist_ok=True)

In [22]:
# TODO
# All paths are relative to the current working directory: one directory per
# query split under "queries", and a "results" directory for model outputs.
TRAINDIR = os.path.join("queries", "mlsys1-train")
VALDIR = os.path.join("queries", "mlsys1-val")
TESTDIR = os.path.join("queries", "mlsys1-test")

RESULTDIR = "results"
make_dir(RESULTDIR)

# Query loading helper functions

In [23]:
def load_qdata(fns):
    """Load the query representations stored at the given file paths.

    Every loaded qrep is tagged with two extra keys:
    "name" (the file name of the query) and "template_name" (the name of
    the directory that contains the file).

    Args:
        fns: iterable of file paths understood by ``load_qrep``.

    Returns:
        A list with one loaded qrep per path, in the order of ``fns``.
    """
    loaded = []
    for path in fns:
        query = load_qrep(path)
        # TODO: can do checks like no queries with zero cardinalities etc.
        query["name"] = os.path.basename(path)
        query["template_name"] = os.path.basename(os.path.dirname(path))
        loaded.append(query)
    return loaded

def get_query_fns(basedir, template_fraction=1.0):
    """Collect the query file names to load from a query directory.

    ``basedir`` is expected to contain one sub-directory per query template,
    each holding ``*.pkl`` query files. Plain files directly under
    ``basedir`` and template directories without any ``*.pkl`` file are
    skipped.

    Args:
        basedir: directory containing the template sub-directories.
        template_fraction: fraction (<= 1.0) of each template's queries to
            select. At least one query is selected from every non-empty
            template.

    Returns:
        List of selected file paths, grouped by template. The selection is
        deterministic: template directories are visited in sorted order and
        every template is sampled with a generator seeded with 1234.

    Raises:
        AssertionError: if ``template_fraction`` is larger than 1.0.
    """
    assert template_fraction <= 1.0

    fns = []
    # glob returns entries in arbitrary order; sorting keeps the result
    # reproducible across machines and file systems.
    for qdir in sorted(glob.glob(os.path.join(basedir, "*"))):
        if os.path.isfile(qdir):
            continue
        # let's first select all the qfns we are going to load
        qfns = sorted(glob.glob(os.path.join(qdir, "*.pkl")))
        if not qfns:
            # Nothing to sample: asking random.sample for 1 item out of an
            # empty list would raise ValueError.
            continue
        num_samples = max(int(len(qfns) * template_fraction), 1)
        # A private generator yields the same sample as reseeding the global
        # one, without clobbering the global `random` state of the caller.
        rng = random.Random(1234)
        fns += rng.sample(qfns, num_samples)
    return fns

# Evaluation helper functions

In [24]:
def eval_alg(alg, eval_funcs, qreps, samples_type, result_dir="./results/"):
    """Run ``alg`` on ``qreps`` and print error statistics per evaluation function.

    The estimates are computed once with ``alg.test(qreps)``. Each entry of
    ``eval_funcs`` is then asked to score them through its ``eval`` method,
    and the mean, median and 99th percentile of the returned errors are
    printed on one line.

    Args:
        alg: model object providing ``__str__``, ``get_exp_name()`` and
            ``test(qreps)``.
        eval_funcs: iterable of evaluation objects providing ``__str__`` and
            ``eval(qreps, ests, samples_type=..., result_dir=...,
            num_processes=..., alg_name=...)``.
        qreps: queries to evaluate on.
        samples_type: label for the query set (e.g. "train", "val"); it is
            forwarded to ``eval`` and printed.
        result_dir: base directory for results. When not None, the
            sub-directory ``<result_dir>/<experiment name>`` is created and
            passed to ``eval``; when None, ``eval`` receives None.

    Note:
        Calls ``np.set_printoptions``, which changes how numpy prints float
        arrays for the rest of the session.
    """
    np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})

    alg_name = alg.__str__()
    exp_name = alg.get_exp_name()
    ests = alg.test(qreps)

    summary_fmt = "{}, {}, #samples: {}, {}: mean: {}, median: {}, 99p: {}"

    for efunc in eval_funcs:
        if result_dir is None:
            rdir = None
        else:
            rdir = os.path.join(result_dir, exp_name)
            make_dir(rdir)

        errors = efunc.eval(
            qreps,
            ests,
            samples_type=samples_type,
            result_dir=rdir,
            num_processes=-1,
            alg_name=alg_name,
        )

        num_errors = len(errors)
        efunc_name = efunc.__str__()
        mean_err = np.round(np.mean(errors), 3)
        median_err = np.round(np.median(errors), 3)
        p99_err = np.round(np.percentile(errors, 99), 3)
        print(summary_fmt.format(samples_type, alg, num_errors, efunc_name,
                                 mean_err, median_err, p99_err))

# Load queries

In [25]:
# Use template_fraction < 1.0 to load only a subset of every template's
# queries for quick experiments; 1.0 loads all of them.
trainqs, valqs, testqs = (
    load_qdata(get_query_fns(split_dir, template_fraction=1.0))
    for split_dir in (TRAINDIR, VALDIR, TESTDIR)
)

print("Loaded {} training queries, {} validation queries, {} test queries"
      .format(len(trainqs), len(valqs), len(testqs)))

Loaded 1256 training queries, 621 validation queries, 1260 test queries


# Explore Queries (TODO)

In [26]:
# Choose a particular query and show its properties, plus what exactly cardinality estimation is.

# Evaluation Functions

In [27]:
from evaluation.eval_fns import QError, SimplePlanCost
# Metrics applied to every model by eval_alg, in this order.
EVAL_FNS = [QError(), SimplePlanCost()]

# Evaluating baseline / heuristic estimates (TODO)
### Shows examples of true cardinalities and PostgreSQL estimates; introduces Q-Error and PlanCost

# Helper function for initializing featurizer
### The Featurizer object holds information about the database, e.g., tables, joins, columns, and how to featurize predicate filters.

In [28]:
def init_featurizer(featurization_type):
    """Create a Featurizer set up from the saved database statistics.

    Reads "dbdata.json" from the module-level TRAINDIR, feeds it to the
    featurizer, configures the featurization, and updates the target
    statistics from the module-level ``trainqs``.

    Args:
        featurization_type: forwarded to ``Featurizer.setup``; this notebook
            uses "combined" and "set".

    Returns:
        The configured Featurizer instance.
    """
    feat = Featurizer(None, None, None, None, None)

    # Load database specific data, e.g., information about columns, tables etc.
    stats_path = os.path.join(TRAINDIR, "dbdata.json")
    with open(stats_path, "r") as stats_file:
        saved_stats = json.load(stats_file)
    feat.update_using_saved_stats(saved_stats)

    # ynormalization: takes log(y) for all target values, y.
    feat.setup(ynormalization="log", featurization_type=featurization_type)
    feat.update_ystats(trainqs)
    return feat

In [29]:
# Sanity check: display the training query directory used by init_featurizer.
TRAINDIR

'queries/mlsys1-train'

# RandomForest model

In [30]:
from cardinality_estimation.algs import RandomForest
featurizer = init_featurizer("combined")

# Fixed hyper-parameters, grid search disabled.
rf = RandomForest(grid_search=False, n_estimators=10, max_depth=6)
rf.train(
    trainqs,
    valqs=None,
    testqs=None,
    featurizer=featurizer,
    result_dir=RESULTDIR,
)

Extracting features took:  93.56864786148071


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 16 concurrent workers.


building tree 1 of 10
building tree 2 of 10
building tree 3 of 10
building tree 4 of 10
building tree 5 of 10
building tree 6 of 10
building tree 7 of 10
building tree 8 of 10building tree 9 of 10
building tree 10 of 10



[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   44.7s remaining:  1.7min
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   45.1s finished


In [None]:
# Score the random forest with every metric in EVAL_FNS on the
# training and validation queries.
eval_alg(rf, EVAL_FNS, qreps=trainqs, samples_type="train")
eval_alg(rf, EVAL_FNS, qreps=valqs, samples_type="val")

# TODO: should submit these for the leaderboard?
preds = rf.test(testqs)

Experiment name will be:  RandomForest24825937
Extracting features took:  95.80298805236816


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:    0.1s remaining:    0.1s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.1s finished


train, RandomForest, #samples: 546873, QError: mean: 133.507, median: 3.925, 99p: 620.755
train, RandomForest, #samples: 1256, SimplePlanCost: mean: 3777338.513, median: 981477.66, 99p: 41279502.652
Extracting features took:  46.78105711936951


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done   3 out of  10 | elapsed:    0.0s remaining:    0.1s
[Parallel(n_jobs=10)]: Done  10 out of  10 | elapsed:    0.0s finished


val, RandomForest, #samples: 271741, QError: mean: 226.26, median: 3.97, 99p: 735.144
val, RandomForest, #samples: 621, SimplePlanCost: mean: 3149672.976, median: 899305.046, 99p: 30179417.41


# XGBoost

In [100]:
from cardinality_estimation.algs import XGBoost
featurizer = init_featurizer("combined")

# Fixed hyper-parameters, grid search disabled.
xgb = XGBoost(
    grid_search=False,
    tree_method="hist",
    subsample=1.0,
    n_estimators=100,
    max_depth=10,
    lr=0.01,
)
xgb.train(
    trainqs,
    valqs=None,
    testqs=None,
    featurizer=featurizer,
    result_dir=RESULTDIR,
)

Extracting features took:  22.899195909500122


In [None]:
# Score the XGBoost model with every metric in EVAL_FNS on the
# training and validation queries.
eval_alg(xgb, EVAL_FNS, qreps=trainqs, samples_type="train")
eval_alg(xgb, EVAL_FNS, qreps=valqs, samples_type="val")

# TODO: test set predictions; should submit these for the leaderboard?
preds = xgb.test(testqs)

# Fully Connected Neural Network

In [89]:
from cardinality_estimation.fcnn import FCNN
featurizer = init_featurizer("combined")

# Fully connected network: 4 hidden layers of 256 units, MSE loss, AdamW.
fcnn = FCNN(
    max_epochs=10,
    lr=0.0001,
    mb_size=512,
    weight_decay=0.0,
    result_dir="./results",
    num_hidden_layers=4,
    optimizer_name="adamw",
    clip_gradient=20.0,
    loss_func_name="mse",
    hidden_layer_size=256,
)

fcnn.train(
    trainqs,
    valqs=None,
    testqs=None,
    featurizer=featurizer,
    result_dir=RESULTDIR,
)

Extracting features took:  22.69448685646057
SimpleRegression(
  (layers): ModuleList(
    (0): Sequential(
      (0): Linear(in_features=515, out_features=256, bias=True)
      (1): ReLU()
    )
    (1): Sequential(
      (0): Linear(in_features=256, out_features=256, bias=True)
      (1): ReLU()
    )
    (2): Sequential(
      (0): Linear(in_features=256, out_features=256, bias=True)
      (1): ReLU()
    )
    (3): Sequential(
      (0): Linear(in_features=256, out_features=256, bias=True)
      (1): ReLU()
    )
    (4): Sequential(
      (0): Linear(in_features=256, out_features=1, bias=True)
      (1): Sigmoid()
    )
  )
)
training samples: 133784, feature length: 515, model size: 1.318916,
        hidden_layer_size: 256
Epoch 0 took 2.22, Avg Loss: 0.009701
Epoch 1 took 2.69, Avg Loss: 0.00216
Epoch 2 took 2.37, Avg Loss: 0.001335
Epoch 3 took 2.49, Avg Loss: 0.00103
Epoch 4 took 2.48, Avg Loss: 0.000851
Epoch 5 took 2.46, Avg Loss: 0.00075
Epoch 6 took 2.5, Avg Loss: 0.000655

In [90]:
# Score the fully connected network with every metric in EVAL_FNS on the
# training and validation queries.
eval_alg(fcnn, EVAL_FNS, qreps=trainqs, samples_type="train")
eval_alg(fcnn, EVAL_FNS, qreps=valqs, samples_type="val")

# TODO: test set predictions; should submit these for the leaderboard?
preds = fcnn.test(testqs)

Experiment name will be:  FCNN1017090249
Extracting features took:  22.05372905731201
train, FCNN, #samples: 133784, QError: mean: 1.834, median: 1.347, 99p: 6.04
train, FCNN, #samples: 307, SimplePlanCost: mean: 2868145.107, median: 471279.311, 99p: 48213382.76


# Multi Set Convolutional Network

## Notes

* Introduced by Kipf et al. in this [paper](https://arxiv.org/abs/1809.00677). Architecture based on [Deep Sets](https://arxiv.org/abs/1703.06114).
* Does not reserve an exact mapping for the features of a particular table / column. Instead, it treats table features, join features, and predicate features as sets of vectors. This has practical benefits over the flat 1d featurization (see discussion in README), but it requires each batch to have the same shape; thus many of the smaller query features need to be padded with zeros, which makes memory consumption much larger (this can probably be improved somehow).
* `load_padded_mscn_feats = True` (see MSCN initialization below) loads these padded sets into memory; this takes more RAM, but is faster. `load_padded_mscn_feats = False` pads the vectors as needed, which takes longer to train (TODO: the current Python implementation can be improved).

In [104]:
from cardinality_estimation.mscn import MSCN

featurizer = init_featurizer("set")

# load_padded_mscn_feats = True keeps all the padded feature sets in memory
# (more RAM, faster training); False pads the vectors as needed, which
# takes longer to train.
mscn = MSCN(
    max_epochs=10,
    load_padded_mscn_feats=False,
    lr=0.0001,
    mb_size=512,
    weight_decay=0.0,
    result_dir="./results",
    optimizer_name="adamw",
    clip_gradient=20.0,
    loss_func_name="mse",
    hidden_layer_size=256,
)

mscn.train(
    trainqs,
    valqs=None,
    testqs=None,
    featurizer=featurizer,
    result_dir=RESULTDIR,
)

Extracting features took:  27.066265106201172
SetConv(
  (sample_mlp1): Linear(in_features=15, out_features=256, bias=True)
  (sample_mlp2): Linear(in_features=256, out_features=256, bias=True)
  (predicate_mlp1): Linear(in_features=55, out_features=256, bias=True)
  (predicate_mlp2): Linear(in_features=256, out_features=256, bias=True)
  (join_mlp1): Linear(in_features=43, out_features=256, bias=True)
  (join_mlp2): Linear(in_features=256, out_features=256, bias=True)
  (out_mlp1): Linear(in_features=768, out_features=256, bias=True)
  (out_mlp2): Linear(in_features=256, out_features=1, bias=True)
)
training samples: 133784, model size: 1.696772,
        hidden_layer_size: 256
Epoch 0 took 90.57, Avg Loss: 0.012292
Epoch 1 took 95.94, Avg Loss: 0.005715
Epoch 2 took 97.58, Avg Loss: 0.0046
Epoch 3 took 93.91, Avg Loss: 0.004018
Epoch 4 took 93.38, Avg Loss: 0.003625
Epoch 5 took 96.69, Avg Loss: 0.003289
Epoch 6 took 91.85, Avg Loss: 0.003012
Epoch 7 took 94.12, Avg Loss: 0.002765
Epo

In [102]:
# Score the MSCN model with every metric in EVAL_FNS on the
# training and validation queries.
eval_alg(mscn, EVAL_FNS, qreps=trainqs, samples_type="train")
eval_alg(mscn, EVAL_FNS, qreps=valqs, samples_type="val")

# TODO: test set predictions; should submit these for the leaderboard?
preds = mscn.test(testqs)

Experiment name will be:  MSCN132345184
Extracting features took:  104.61881613731384
train, MSCN, #samples: 133784, QError: mean: 37.971, median: 2.293, 99p: 45.606
train, MSCN, #samples: 307, SimplePlanCost: mean: 3125948.222, median: 505817.82, 99p: 37967613.474
