In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd 
from vega_datasets import data
from src.space.DataModel import Attribute

# Load the movies dataset shipped with vega_datasets.
df = data.movies()

# Per column: is it stored with the generic pandas "object" dtype?
is_object = {col: df[col].dtype == "object" for col in df.columns}

# name attributes: keep every non-object column, plus those object columns
# that have fewer than 10 distinct values.
name_attrs = [
    col for col in df.columns if not is_object[col] or df[col].nunique() < 10
]

# Type code per column: "C" for object columns, "Q" for everything else
# (presumably categorical / quantitative -- confirm against chart_map).
dic = {col: ("C" if is_object[col] else "Q") for col in df.columns}
# An absent attribute (None) maps to an absent type code.
dic[None] = None

# Restrict the frame to the retained attributes and summarise it.
df = df[name_attrs]
df.info()

In [None]:
from src.oracle import ColumbusProbOracle, OracleWeight
from src.ProbColumbus import (
    ProbColumbus,
    ColumbusConfig,
    SamplingWeight,
    chart_type,
    agg_type,
    ProbabilisticNode,
)

# All four oracle criteria are weighted equally.
oracle_weight = OracleWeight(
    specificity=1.0,
    uniqueness=1.0,
    coverage=1.0,
    interestingness=1.0,
)
oracle = ColumbusProbOracle(oracle_weight)
prob = ProbColumbus(df, ColumbusConfig())
print(len(chart_type), len(agg_type))


# `func` and `func_max` are called by later cells of this notebook, so they
# must be real definitions rather than commented-out code.
# NOTE(review): both rely on `prob.sample_max_n(n_samples, weight)`; confirm
# that method still exists with this signature in src.ProbColumbus.
def func(weight, n_samples, n_dashboards):
    """Score `n_dashboards` dashboards sampled with `weight`.

    Each dashboard is drawn with ``prob.sample_max_n(n_samples, weight)`` and
    scored with ``prob.infer``. The max and mean score are printed.

    Returns:
        The negated mean score (negated so that a minimiser maximises it).
    """
    dashboards = [
        prob.sample_max_n(n_samples, weight)
        for _ in range(n_dashboards)
    ]
    scores = [
        prob.infer(dashboard, oracle, ["attr_IMDB_Votes", "rect"])
        for dashboard in dashboards
    ]
    print(np.max(scores), np.mean(scores))
    sc = np.mean(scores)
    return -sc


def func_max(weight, n_samples):
    """Score a single dashboard sampled with `weight`.

    The dashboard is drawn with ``prob.sample_max_n(n_samples, weight)``,
    scored with ``prob.infer``, and the score is printed.

    Returns:
        The negated score (negated so that a minimiser maximises it).
    """
    dashboard = prob.sample_max_n(n_samples, weight)
    score = prob.infer(dashboard, oracle, ["attr_IMDB_Votes", "rect"])
    print(score)
    return -score


In [None]:
from skopt.space import Categorical, Space
from skopt import Optimizer
from skopt.utils import use_named_args
from src.ChartMap import chart_map
from tqdm import tqdm
import json


EPOCH = 50
NUM_CHARTS = 6

# Candidate attributes; the leading None stands for "no attribute".
attrs = [None] + name_attrs

# Per-chart search space: x must be a real attribute, y and z may be None.
opt_space = Space(
    [
        Categorical(attrs[1:], name="x"),
        Categorical(attrs, name="y"),
        Categorical(attrs, name="z"),
        Categorical(chart_type, name="ct"),
        Categorical(agg_type, name="at"),
    ]
)

# Every (x, y, z, chart type, aggregation) combination, before filtering.
all_space = [
    (x, y, z, ct, at)
    for x in attrs[1:]
    for y in attrs
    for z in attrs
    for ct in chart_type
    for at in agg_type
]

# Turn chart_map into a set of tuples so each membership test is one lookup.
chart_map = {tuple(chart) for chart in chart_map}

# filter all_space using chart_map: keep a combination only if its
# (type_x, type_y, type_z, chart type, aggregation) signature is in chart_map
# and its five components are pairwise distinct. Kept combinations are stored
# as JSON strings.
# NOTE(review): the distinctness test also rejects combinations in which None
# appears twice (e.g. y and z both None) -- confirm that this is intended.
filtered_space = []
for combo in tqdm(all_space):
    checker = (*(dic[attr] for attr in combo[:3]), combo[3], combo[4])
    if checker in chart_map and len(combo) == len(set(combo)):
        filtered_space.append(json.dumps(combo))


In [None]:
# Dashboard-level search space: NUM_CHARTS slots, each one choosing one of the
# JSON-encoded chart combinations in filtered_space.
space = Space([
    Categorical(filtered_space, name=f"chart{i}") for i in range(NUM_CHARTS)
])

# Seeded (same seed as opt_cat) so a Restart & Run All reproduces the search.
opt_dashboard = Optimizer(
    space,
    "gp",
    n_initial_points=10,
    acq_optimizer="auto",
    random_state=42,
)

N_DASHBOARD_ITERS = 200

for iteration in range(N_DASHBOARD_ITERS):
    x = opt_dashboard.ask()
    # Decode each slot back into its [x, y, z, ct, at] list, then into a node.
    x_loads = [json.loads(x_) for x_ in x]
    x_nodes = [prob.get_node(x_) for x_ in x_loads]
    print(x_loads)
    score = prob.infer(x_nodes, oracle, ["attr_IMDB_Votes", "rect"])
    # The optimizer minimises, so report the negated score.
    opt_dashboard.tell(x, -score)

opt_dashboard.get_result()
    

In [None]:
# Warm up a per-chart optimizer with randomly sampled, valid dashboards.
opt_cat = Optimizer(opt_space, random_state=42, base_estimator="GP", acq_func="EI")

for iteration in range(50):
    dashboard: list[ProbabilisticNode] = []
    xs = []
    # Rejection-sample charts until the dashboard holds NUM_CHARTS of them.
    while len(dashboard) < NUM_CHARTS:
        x = opt_space.rvs()[0]
        # (type_x, type_y, type_z, chart type, aggregation) signature of x.
        checker = tuple(dic[attr] for attr in x[:3]) + (x[3], x[4])
        # Compare as tuples so this works whether chart_map holds lists or
        # tuples.
        is_known_chart = any(checker == tuple(chart)[:5] for chart in chart_map)
        # Distinctness is tested on the sampled point itself, mirroring the
        # filter used to build filtered_space. Testing it on the type codes
        # would reject any chart with two attributes of the same type.
        if is_known_chart and len(x) == len(set(x)):
            xs.append(x)
            dashboard.append(prob.get_node(x))
    score = prob.infer(dashboard, oracle, ["attr_IMDB_Votes", "rect"])
    # Every chart in the dashboard is credited with the dashboard's negated
    # score (the optimizer minimises).
    opt_cat.tell(xs, [-score] * NUM_CHARTS)


In [None]:
# Show the optimisation result accumulated so far by opt_cat.
opt_cat.get_result()

In [None]:
def func_cat(x):
    """Return the negated ``prob.infer`` score of dashboard `x`.

    `x` is passed to ``prob.infer`` unchanged, together with the notebook's
    `oracle`. The sign is flipped so that a minimiser maximises the score.
    """
    score = prob.infer(x, oracle, ["attr_IMDB_Votes", "rect"])
    return -score
    
    
for iteration in range(100):
    # Ask for one dashboard's worth of chart points.
    x = opt_cat.ask(n_points=NUM_CHARTS)
    processed_x = [prob.get_node(sample) for sample in x]
    y = func_cat(processed_x)
    # Report the points that were asked for (not the node objects), with one
    # objective value per point: each chart gets the dashboard's score.
    opt_cat.tell(x, [y] * len(x))



In [None]:
from skopt import Optimizer
from skopt.space import Space, Real
from IPython.display import clear_output

# Candidate attributes; the leading None stands for "no attribute".
attrs = [None] + name_attrs

# Number of weights in each group, in the order they are laid out in the
# search space: x has no None option, y and z do.
group_sizes = {
    "x": len(attrs) - 1,
    "y": len(attrs),
    "z": len(attrs),
    "ct": len(chart_type),
    "at": len(agg_type),
}

# One Real dimension in [0.01, 1.0] per weight, named "<group><index>".
opt_weight = Space(
    [
        Real(0.01, 1.0, name=f"{group}{i}")
        for group, size in group_sizes.items()
        for i in range(size)
    ]
)

# Indices at which a flat parameter vector is cut back into the five groups.
split_points = np.cumsum(list(group_sizes.values()))[:-1]

# Seeded so a Restart & Run All reproduces the search.
opt = Optimizer(
    opt_weight,
    base_estimator="GP",
    n_initial_points=10,
    acq_func="EI",
    random_state=42,
)

# Every SamplingWeight evaluated, in evaluation order.
weights = []

for iteration in range(100):
    params = opt.ask()
    w_x, w_y, w_z, w_ct, w_at = np.split(np.array(params), split_points)
    weight = SamplingWeight(
        x=w_x,
        y=w_y,
        z=w_z,
        ct=w_ct,
        at=w_at,
        attr=attrs,
        chart_type=chart_type,
        agg_type=agg_type,
    )

    weights.append(weight)

    # Uncomment to watch the weights evolve while the loop runs:
    # clear_output(wait=True)
    # display(weight.visualize())

    # func_max(weight, n_samples) must be defined in the kernel before this
    # loop runs.
    score = func_max(weight, 6)
    opt.tell(params, score)


In [None]:
# Print the full result, then the index and value of the best (lowest)
# objective seen.
result = opt.get_result()
print(result)
func_vals = result.func_vals
best_index = np.argmin(func_vals)
best_value = np.min(func_vals)
print(best_index, best_value)

In [None]:
import altair as alt

# History of every chart-type weight across iterations: one column per chart
# type, one row per evaluated SamplingWeight.
x = {
    ct_name: [w.ct[idx] for w in weights]
    for idx, ct_name in enumerate(chart_type)
}
x.update(
    epoch=list(range(len(weights))),
    # Objective values were stored negated; flip them back.
    func_vals=[-val for val in opt.get_result().func_vals],
)

# NOTE: this rebinds `data`, which the first cell imports from vega_datasets.
data = pd.DataFrame.from_dict(x)
data.head()

In [None]:
# Pick the weight whose stored (negated) objective is lowest and display it.
best_weight_index = np.argmin(func_vals)
max_weight = weights[best_weight_index]

max_weight.visualize()


In [None]:
# One line chart per row, all sharing the epoch axis: first the objective
# history, then one row per chart-type weight.
repeated_rows = ["func_vals"] + chart_type

line_chart = alt.Chart(data).mark_line()
encoded_chart = line_chart.encode(
    x="epoch",
    y=alt.Y(alt.repeat("row"), type="quantitative"),
    color=alt.datum(alt.repeat("row")),
)
chart = encoded_chart.properties(width=900).repeat(row=repeated_rows)
chart


In [None]:
# Score the best weight found, then a uniform baseline weight for comparison.
func(max_weight, 6, 50)

n_attrs = len(attrs)
n_chart_types = len(chart_type)
n_agg_types = len(agg_type)

# Baseline: every option within a group gets the same weight 1 / group size.
uniform_weight = SamplingWeight(
    attr=attrs,
    chart_type=chart_type,
    agg_type=agg_type,
    x=np.ones(n_attrs - 1) / (n_attrs - 1),
    y=np.ones(n_attrs) / n_attrs,
    z=np.ones(n_attrs) / n_attrs,
    ct=np.ones(n_chart_types) / n_chart_types,
    at=np.ones(n_agg_types) / n_agg_types,
)
func(uniform_weight, 6, 50)
