In [7]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pandas as pd
from src.ChartMap import chart_type, agg_type
from vega_datasets import data

from src.oracle import ColumbusOracle, OracleWeight, OracleResult
from src.generator.Generator import (
    Explorer,
    SamplingWeight,
    VisualizationNode,
)
from IPython.display import clear_output
from collections import Counter
import altair as alt

df = data.movies()
df = df[[
    col
    for col in df.columns
    if (df[col].dtype == "object" and df[col].nunique() < 10)
    or df[col].dtype != "object"
]]


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [13]:


oracle_weight = OracleWeight(
    specificity=1.0,
    interestingness=1.0,
    diversity=1.0,
    coverage=2.0,
    conciseness=1.0,
)
oracle = ColumbusOracle(df, oracle_weight)
expl = Explorer(df)



attr_names = [None if attr == None else attr.name for attr in expl.attrs]


n_epoch = 1000
n_dashboards = 100
halving = 0.1

raw_means = []
raw_maxs = []
norm_means = []
norm_maxs = []

n_charts = []

conjugate_priors = SamplingWeight(
    x=np.ones(len(attr_names)),
    y=np.ones(len(attr_names)),
    z=np.ones(len(attr_names)),
    ct=np.ones(len(chart_type)),
    at=np.ones(len(agg_type)),
    n_chart=8.0,
)

def mean(l):
    return sum(l) / len(l)

for epoch in range(n_epoch):
    
    n_charts = [max(np.random.normal(conjugate_priors.n_chart, 1), 2) for _ in range(n_dashboards)]
    candidate: list[list[VisualizationNode]] = [expl.sample_n(round(n_chart), conjugate_priors) for n_chart in n_charts]
    
    
    results: list[OracleResult] = [oracle.get_result(dashboard, set(["IMDB_Votes", "boxplot"])) for dashboard in candidate]
    
    
    specificity = np.array([result.specificity for result in results])
    interestingness = np.array([result.interestingness for result in results])
    conciseness = np.array([result.conciseness for result in results])
    
    diversity = np.array([result.diversity for result in results])
    coverage = np.array([result.coverage for result in results])
    
    raw_scores = specificity +  interestingness + conciseness + diversity + coverage
    
    # z normalize each scores
    norm_specificity = (specificity - specificity.mean()) / specificity.std()
    norm_interestingness = (interestingness - interestingness.mean()) / interestingness.std()
    norm_conciseness = (conciseness - conciseness.mean()) / conciseness.std()
    norm_diversity = (diversity - diversity.mean()) / diversity.std()
    norm_coverage = (coverage - coverage.mean()) / coverage.std()
    
    normalized_scores = norm_specificity +  norm_interestingness + norm_conciseness + norm_diversity + norm_coverage
    # normalized_scores = list(normalized_scores)
    
    
    candi_n_score = list(zip(candidate, normalized_scores))
    candi_n_score = sorted(candi_n_score, key=lambda x: x[1], reverse=True)
    
    
    raw_maxs.append(raw_scores.max())
    raw_means.append(raw_scores.mean())
    norm_maxs.append(normalized_scores.max())
    norm_means.append(normalized_scores.mean())
    
    # halving
    halved_candidate = candi_n_score[: int(n_dashboards * halving)]
    
    counters = [Counter() for _ in range(5)]
    counters[0][None] = 0
    counters[1][None] = 0
    counters[2][None] = 0
    
    halved_n_charts = []
    
    
    for dashboard in halved_candidate:
        halved_n_charts.append(len(dashboard[0]))
        for s in dashboard[0]:           
            ct, x, y, z, at = s.sample
            chart = [x, y, z, ct, at]
            
            for i in range(3):
                if chart[i] is None:
                    counters[i][None] += 1
                elif chart[i].name in counters[i]:
                    counters[i][chart[i].name] += 1
                else:
                    counters[i][chart[i].name] = 1

            for i in [3,4]:
                if chart[i] in counters[i]:
                    counters[i][chart[i]] += 1
                else:
                    counters[i][chart[i]] = 1
    
    # update conjucate prior with liklihood by counter
    x = np.array([counters[0][attr] for attr in attr_names])
    y = np.array([counters[1][attr] for attr in attr_names])
    z = np.array([counters[2][attr] for attr in attr_names])
    ct = np.array([counters[3][c] for c in chart_type])
    at = np.array([counters[4][a] for a in agg_type])
    halved_n_charts = np.array(halved_n_charts)
    
    observed = int(n_dashboards * halving)
    
    prior_mean = conjugate_priors.n_chart
    prior_var = 1
    
    halved_mean = halved_n_charts.mean()
    halved_var = halved_n_charts.var()
    
    posterior_mean = (prior_mean / prior_var + halved_mean * observed / halved_var) / (1 / prior_var + observed / halved_var)
    posterior_var = (1 / prior_var + observed / halved_var) ** -1
    
    conjugate_priors.x += x
    conjugate_priors.y += y
    conjugate_priors.z += z
    conjugate_priors.ct += ct
    conjugate_priors.at += at
    conjugate_priors.n_chart =  posterior_mean
    
    n_charts.append(posterior_mean)
    # visualize mean and max using altair
   
    print(len(range(epoch+1)))
    print(len(raw_means))
    print(len(raw_maxs))
    print(len(norm_means))
    print(len(norm_maxs))
    print(len(n_charts))
    data = pd.DataFrame(
        {
            "epoch": range(epoch+1),
            "raw_means": raw_means,
            "raw_max": raw_maxs,
            "norm_means": norm_means,
            "norm_max": norm_maxs,
            "n_charts": n_charts
        }
    )
    line = alt.Chart(data).mark_line().encode(
        x="epoch",
    )
    clear_output(wait=True)
    print(f"Epoch {epoch}")
    display(
        line.encode(y=alt.Y("raw_means", scale=alt.Scale(zero=False))) |
        line.encode(y=alt.Y("raw_max",scale=alt.Scale(zero=False))) |
        line.encode(y=alt.Y("n_charts",scale=alt.Scale(zero=False))) 
        )
    display(pd.DataFrame(conjugate_priors.x / conjugate_priors.x.sum(), index=attr_names).T)
    display(pd.DataFrame(conjugate_priors.y / conjugate_priors.y.sum(), index=attr_names).T)
    display(pd.DataFrame(conjugate_priors.z / conjugate_priors.z.sum(), index=attr_names).T)
    display(pd.DataFrame(conjugate_priors.ct / conjugate_priors.ct.sum(), index=chart_type).T)
    display(pd.DataFrame(conjugate_priors.at / conjugate_priors.at.sum(), index=agg_type).T)

    display(pd.DataFrame([specificity.mean(), interestingness.mean(), conciseness.mean(), diversity.mean(), coverage.mean()], index=["specificity", "interestingness", "conciseness", "diversity", "coverage"]).T)
    display(pd.DataFrame([specificity.max(), interestingness.max(), conciseness.max(), diversity.max(), coverage.max()], index=["specificity", "interestingness", "conciseness", "diversity", "coverage"]).T)
    
    maxnodes: list[VisualizationNode] = candi_n_score[0][0]
    max_res = oracle.get_result(maxnodes, set(["IMDB_Votes", "boxplot"]))
    print(max_res)
    altairs = [node.get_altair().properties(width=100, height=100) for node in maxnodes]

    rows: list[alt.HConcatChart] = [
        alt.hconcat(*altairs[i : i + 4]).resolve_scale(
            color="independent"
        )
        for i in range(0, len(altairs), 4)
    ]
    display(alt.vconcat(*rows))
    

1
1
1
1
1
101


ValueError: All arrays must be of the same length

In [None]:
import numpy as np
from numpy.random import dirichlet, multinomial

# Define the Dirichlet prior parameters
alpha = [1, 1, 1]

# Generate some data from a Multinomial distribution
n = 10
p = [0.3, 0.4, 0.3] # true probabilities
counts = multinomial(n, p)

new_counts = multinomial(n, dirichlet(alpha + counts))

print("Observed counts:", counts)
print("Posterior parameters:", new_counts)
print("New counts:", new_counts)



print(multinomial(1, dirichlet(alpha + counts)))

Observed counts: [3 3 4]
Posterior parameters: [5 1 4]
New counts: [5 1 4]
[0 1 0]
