In [2]:
from generate_data import generate
from pyspark.sql import SparkSession

from thundera.metadata import Table
from thundera.metrics import generate_table_metrics


def get_spark() -> SparkSession:
    """Return a local Spark session named "local-tests".

    The session uses the ``local[1]`` master with one executor core, one
    executor instance and a single shuffle partition, and binds the driver
    to 127.0.0.1.

    Returns:
        The session produced by ``getOrCreate()`` on the configured builder.
    """
    spark_options = {
        "spark.executor.cores": "1",
        "spark.executor.instances": "1",
        "spark.sql.shuffle.partitions": "1",
        "spark.driver.bindAddress": "127.0.0.1",
    }
    builder = SparkSession.builder.master("local[1]").appName("local-tests")
    # Apply the options in the order they are listed above.
    for option_name, option_value in spark_options.items():
        builder = builder.config(option_name, option_value)
    return builder.getOrCreate()


# Obtain the Spark session from get_spark().
spark = get_spark()
# Generate the input data; 10_000 is presumably the number of rows to
# generate — TODO confirm against generate_data.generate.
data = generate(spark, 10_000)
# Print a tabular preview of the generated data (columns var_a, var_b, var_c
# in the saved output).
data.show()

                                                                                

+------------------+-------------------+------------------+
|             var_a|              var_b|             var_c|
+------------------+-------------------+------------------+
| 77.67771538053506| 19.837057355315537|15.667246693155075|
|            -999.0|             -101.0|108.08428039264697|
| 94.86255767379843|             -101.0|              -9.0|
|            -999.0|             -101.0| 26.58943753669811|
|107.54339241883065|             -101.0|17.357876768380176|
|107.81127663332678| 17.868750055419184| 12.26477800468196|
| 99.07930705135796|             -101.0| 31.90839603043737|
|            -999.0|-25.319551909461097|16.587545336699765|
| 91.07322451961426|             -201.0| 7.508058710577704|
|101.17535029403079|             -101.0|  36.2727877982264|
| 90.01198041017726|             -101.0|              -9.0|
|             -99.0|             -101.0|10.252950938926164|
|              NULL|             -101.0|26.002635343828143|
|114.78735410210774|             -101.0|

In [3]:
# Load the table metadata from the relative path ./metadata.yml.
table = Table.from_yaml("./metadata.yml")
# Compute the metrics for `data` using that metadata. In the saved output the
# result is a dict keyed by column name, with a 'histogram' entry per column.
metric_data = generate_table_metrics(data, table)
# Bare expression so the notebook displays the computed metrics.
metric_data

                                                                                

{'var_a': {'histogram': {'values': [(1, 66.06343324348504, 1.0),
    (2, 68.45591960684908, 2.0),
    (3, 71.33356492685698, 23.0),
    (4, 73.82702645212825, 34.0),
    (5, 77.06390408174018, 74.0),
    (6, 80.28287663354416, 121.0),
    (7, 82.73517588563716, 164.0),
    (8, 85.03496442723578, 217.0),
    (9, 88.09246503240571, 566.0),
    (10, 91.06759528498849, 474.0),
    (11, 93.23987494237602, 485.0),
    (12, 95.19768175661727, 516.0),
    (13, 97.30840434731583, 624.0),
    (14, 99.56452757144345, 631.0),
    (15, 101.54278413744109, 621.0),
    (16, 103.55843364236765, 583.0),
    (17, 105.65374291657623, 536.0),
    (18, 107.89391592999331, 529.0),
    (19, 110.04884469538214, 365.0),
    (20, 112.15167021917081, 274.0),
    (21, 114.27726789182005, 222.0),
    (22, 116.82817625762068, 244.0),
    (23, 120.0218773896262, 120.0),
    (24, 122.5816379902633, 51.0),
    (25, 124.70865066228637, 26.0),
    (26, 126.85718154812864, 11.0),
    (27, 129.06356387743023, 5.0),
    (2

In [45]:
import altair as alt
import pandas as pd

# Which column's histogram to inspect, and which entry of that histogram to read.
field = "var_a"
domain = "values"

# Use dedicated names instead of rebinding `data`: `data` is the Spark
# DataFrame that the metrics cell passes to generate_table_metrics, so
# overwriting it here would break re-running that cell.
histogram_values = metric_data[field]["histogram"][domain]

# Each entry is a (bin, center, count) tuple. Naming the columns up front
# keeps the column labels even when the list is empty.
histogram_df = pd.DataFrame(histogram_values, columns=["bin", "center", "count"])

# NOTE(review): the saved output of this cell shows an extra `center_diff`
# column that this code does not compute — the output looks stale; re-run
# the cell or restore that column.
histogram_df




    bin      center  count  center_diff
0     1   66.063433    1.0          NaN
1     2   68.455920    2.0     2.392486
2     3   71.333565   23.0     2.877645
3     4   73.827026   34.0     2.493462
4     5   77.063904   74.0     3.236878
5     6   80.282877  121.0     3.218973
6     7   82.735176  164.0     2.452299
7     8   85.034964  217.0     2.299789
8     9   88.092465  566.0     3.057501
9    10   91.067595  474.0     2.975130
10   11   93.239875  485.0     2.172280
11   12   95.197682  516.0     1.957807
12   13   97.308404  624.0     2.110723
13   14   99.564528  631.0     2.256123
14   15  101.542784  621.0     1.978257
15   16  103.558434  583.0     2.015650
16   17  105.653743  536.0     2.095309
17   18  107.893916  529.0     2.240173
18   19  110.048845  365.0     2.154929
19   20  112.151670  274.0     2.102826
20   21  114.277268  222.0     2.125598
21   22  116.828176  244.0     2.550908
22   23  120.021877  120.0     3.193701
23   24  122.581638   51.0     2.559761




In [19]:
import altair as alt
import pandas as pd

# Sample histogram bins, one record per bin: bin index, center and count.
# Kept under its own name rather than `data`, which earlier cells use for the
# Spark DataFrame; rebinding it here would break re-running those cells.
sample_bins = [
    {'bin': 1, 'center': 66.06343324348504, 'count': 1.0},
    {'bin': 2, 'center': 68.45591960684908, 'count': 2.0},
    {'bin': 3, 'center': 71.33356492685698, 'count': 23.0},
    {'bin': 4, 'center': 73.82702645212825, 'count': 34.0},
    {'bin': 5, 'center': 77.06390408174018, 'count': 74.0},
]

sample_bins_df = pd.DataFrame(sample_bins)

# Basic bar chart: center on the x axis and count on the y axis, both
# encoded as quantitative fields.
histogram = (
    alt.Chart(sample_bins_df)
    .mark_bar()
    .encode(
        x=alt.X('center:Q', title='Bin Center'),
        y=alt.Y('count:Q', title='Count'),
    )
    .properties(title='Histogram of Binned Data')
)

# End the cell with the chart so the notebook renders it inline.
histogram
