In [2]:
import duckdb

In [3]:
import polars as pl

In [4]:
from random import choice
import random
from string import ascii_lowercase, digits

import numpy as np
from shapely import box
import geopandas as gpd

from srai.constants import FEATURES_INDEX
from srai.embedders.contextual_count_embedder import ContextualCountEmbedder
from srai.embedders.count_embedder import CountEmbedder
from srai.h3 import ring_buffer_h3_regions_gdf
from srai.joiners.intersection_joiner import IntersectionJoiner
from srai.neighbourhoods.h3_neighbourhood import H3Neighbourhood
from srai.regionalizers.h3_regionalizer import H3Regionalizer
from srai.regionalizers.s2_regionalizer import S2Regionalizer


H3_RESOLUTION = 5
# TODO: increase after rewriting s2 regionalizer
S2_RESOLUTION = 6  # 13
H3_DISTANCE = 6

chars = ascii_lowercase + digits
columns = ["".join(choice(chars) for _ in range(8)) for _ in range(100)]
values = ["".join(choice(chars) for _ in range(8)) for _ in range(100)]

area = gpd.GeoDataFrame(geometry=[box(5.818355, 46.037418, 24.363277, 52.769854)], crs=4326)

h3_regions = H3Regionalizer(resolution=H3_RESOLUTION).transform(area)
print(f"H3 regions: {len(h3_regions)}")
buffered_h3_regions = ring_buffer_h3_regions_gdf(h3_regions, H3_DISTANCE)
print(f"Buffered H3 regions: {len(buffered_h3_regions)}")

s2_regions = S2Regionalizer(resolution=S2_RESOLUTION).transform(area)
print(f"S2 regions: {len(s2_regions)}")

data = np.full((len(s2_regions), len(columns)), None)
for i in range(len(s2_regions)):
    data[i, random.randint(0, len(columns) - 1)] = random.choice(values)

s2_regions[columns] = data
s2_regions.index.rename(FEATURES_INDEX, inplace=True)

joint = IntersectionJoiner().transform(buffered_h3_regions, s2_regions)
print(f"Joint: {len(joint)}")

count_embeddings = CountEmbedder(
    count_subcategories=True,
).transform(buffered_h3_regions, s2_regions, joint)

count_embeddings

# embeddings = ContextualCountEmbedder(
#     neighbourhood=H3Neighbourhood(),
#     neighbourhood_distance=H3_DISTANCE,
#     count_subcategories=True,
#     concatenate_vectors=False,
# ).transform(buffered_h3_regions, s2_regions, joint)

H3 regions: 4332
┌────────────────────┐
│     region_id      │
│       uint64       │
├────────────────────┤
│ 599534078974230527 │
│ 599534084342939647 │
│ 599534884280598527 │
│ 599534886428082175 │
│ 599533068583174143 │
│ 599534259362856959 │
│ 599533061066981375 │
│ 599534972327428095 │
│ 599534832740990975 │
│ 599533044960854015 │
│          ·         │
│          ·         │
│          ·         │
│ 599507235428630527 │
│ 599507215027535871 │
│ 599522371967123455 │
│ 599522213053333503 │
│ 599521966092713983 │
│ 599522311837581311 │
│ 599507304148107263 │
│ 599512430191575039 │
│ 599522282846552063 │
│ 599522028369739775 │
├────────────────────┤
│     6078 rows      │
│     (20 shown)     │
└────────────────────┘

┌────────────────────┬───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│     region_id      │                                                                

Saving count embeddings: 100%|██████████| 6078/6078 [00:00<00:00, 18729.03it/s]


ParquetDataTable (6078 rows, 74 columns)
  Parquet files (29.7K):
    files/pdt/CountEmbedder_20251005_222829_910431_embeddings/20251005_222829_924131.parquet (29.7K)
  Index columns:
    region_id
  Persists: False
┌────────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬───────────────────┬──

In [5]:
count_embeddings.parquet_paths

[PosixPath('files/pdt/CountEmbedder_20251005_222829_910431_embeddings/20251005_222829_924131.parquet')]

In [6]:
print("Dense array size:", count_embeddings.to_dataframe().values.nbytes / 1e6, "MB")

Dense array size: 1.799088 MB


In [7]:
columns = count_embeddings.columns

csr_rows = []
csr_cols = []
csr_values = []

print("Columns:", columns)
for batch in count_embeddings.to_duckdb(with_row_number=True).fetch_arrow_reader():
    pl_df = pl.from_arrow(batch)
    for row in pl_df.iter_rows(named=True):
        row_idx = row["row_number"] - 1  # duckdb row_number starts from 1
        for col_idx, column_name in enumerate(columns):
            val = row[column_name]
            if val > 0:
                csr_rows.append(row_idx)
                csr_cols.append(col_idx)
                csr_values.append(val)
            # if row[column_name] is not None:
            #     assert row[column_name] in values, f"Unexpected value {row[column_name]} in column {column_name}"
    # for _, row in batch:
    #     print(row)
    #     break

print("CSR size:", len(csr_values), "elements")

Columns: ['0u9mmdus_r4rh2fnj', '0zfgzr8p_s2v79i0n', '2qw1z51w_gzhm2r24', '3mlu2axu_9cqeb29w', '3rcq344v_xp9rrp6q', '468xdj0b_avm15fg2', '4f1tw0ub_q24612h6', '5ldhw295_zytfrh0a', '6japdd26_lswyb219', '6japdd26_upgdw64n', '776oid2s_w5b6cy21', '8y99bap8_0iowkapl', '8y99bap8_gva6ua0l', '9fygvmuj_o8vc1jbg', '9iww5myc_gzhm2r24', '9iww5myc_p5nofpon', '9qwcqwwa_zv5blt45', 'bbr0hwjx_a1l3lupe', 'bbr0hwjx_b5wfatud', 'byjv7h24_5xtxbknw', 'byjv7h24_8l8f7nc1', 'dxlv9lfn_gva6ua0l', 'dzg5v1bd_wlaz0jdw', 'f1k6m28c_4uycys5t', 'f1k6m28c_y78yd0k1', 'ffvmll8t_e3a5nyle', 'g45o6382_vnedbdqt', 'gdcbikpc_68pspoh8', 'hi1lche4_avm15fg2', 'hzaysf27_1vn21nku', 'hzaysf27_hxlyeg9c', 'idyk4kpu_avm15fg2', 'idyk4kpu_vfke2xq5', 'j4inguh7_avm15fg2', 'jj0jf1tg_81600mpz', 'jj0jf1tg_9ars2xxm', 'kip9qv00_ei54kv5x', 'kip9qv00_zgapoly3', 'kjwugzuc_5xtxbknw', 'lbdpbit6_uh9j7vqg', 'lnfio4mo_8hszcggo', 'lnfio4mo_phvg273e', 'lnfio4mo_w5b6cy21', 'lw84b354_gdptbh0q', 'mpsymros_zv5blt45', 'n2sh6ysy_174colw4', 'n2sh6ysy_xf24732r', 'nl

In [8]:
max(csr_cols), max(csr_rows), len(data), len(columns)

(73, 6077, 74, 74)

In [9]:
import scipy.sparse as sp

n_rows = count_embeddings.rows
n_cols = len(count_embeddings.columns)
coo = sp.coo_matrix((csr_values, (csr_rows, csr_cols)), shape=(n_rows, n_cols))
csr = coo.tocsr()

In [10]:
# 2️⃣ Sparse memory (CSR)
csr_mem = (
    csr.data.nbytes +
    csr.indices.nbytes +
    csr.indptr.nbytes
)
print("CSR matrix size:", csr_mem / 1e6, "MB")

# 3️⃣ Sparse memory (COO)
coo_mem = (
    coo.data.nbytes +
    coo.row.nbytes +
    coo.col.nbytes
)
print("COO matrix size:", coo_mem / 1e6, "MB")

CSR matrix size: 0.10864 MB
COO matrix size: 0.112432 MB


In [11]:
import pandas as pd

df = pd.DataFrame.sparse.from_spmatrix(
    csr,
    index=count_embeddings.to_duckdb().select("region_id").fetchnumpy()["region_id"],
    columns=count_embeddings.columns,
)
df

Unnamed: 0,0u9mmdus_r4rh2fnj,0zfgzr8p_s2v79i0n,2qw1z51w_gzhm2r24,3mlu2axu_9cqeb29w,3rcq344v_xp9rrp6q,468xdj0b_avm15fg2,4f1tw0ub_q24612h6,5ldhw295_zytfrh0a,6japdd26_lswyb219,6japdd26_upgdw64n,...,vt0vf5js_y456swfv,wg3l1rxy_dmblerzf,xhu6rxfx_im4n67wj,xxk6dxd8_e0azhft1,y1afl3mu_wkvnf0x4,yyazroev_vfke2xq5,zf6lhvbh_1tn72adh,zf6lhvbh_zgapoly3,zuk8e6t0_p5nofpon,zuk8e6t0_tvp38aeh
599534250772922367,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
599534246477955071,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
599534258289115135,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
599534259362856959,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
599534260436598783,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599522028369739775,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
599522026222256127,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
599522014411096063,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
599512171419795455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [14]:
df

Unnamed: 0,0u9mmdus_r4rh2fnj,0zfgzr8p_s2v79i0n,2qw1z51w_gzhm2r24,3mlu2axu_9cqeb29w,3rcq344v_xp9rrp6q,468xdj0b_avm15fg2,4f1tw0ub_q24612h6,5ldhw295_zytfrh0a,6japdd26_lswyb219,6japdd26_upgdw64n,...,vt0vf5js_y456swfv,wg3l1rxy_dmblerzf,xhu6rxfx_im4n67wj,xxk6dxd8_e0azhft1,y1afl3mu_wkvnf0x4,yyazroev_vfke2xq5,zf6lhvbh_1tn72adh,zf6lhvbh_zgapoly3,zuk8e6t0_p5nofpon,zuk8e6t0_tvp38aeh
599534250772922367,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
599534246477955071,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
599534258289115135,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
599534259362856959,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
599534260436598783,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599522028369739775,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
599522026222256127,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
599522014411096063,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
599512171419795455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [13]:
import pyarrow as pa

arrow_sparse_csr_matrix = pa.SparseCSRMatrix.from_scipy(csr)
arrow_sparse_csr_matrix

<pyarrow.SparseCSRMatrix>
type: int64
shape: (6078, 74)

In [15]:
ddf = count_embeddings.to_dataframe()

print(
    "DF dense size:",
    (ddf.index.nbytes + ddf.columns.nbytes + sum(ddf[c].values.nbytes for c in ddf.columns)) / 1e6,
    "MB",
)

DF dense size: 1.848304 MB


In [16]:
print("sparse: {:0.2f} mb".format(ddf.memory_usage().sum() / 1e6))

sparse: 1.85 mb


In [17]:
print("sparse: {:0.2f} mb".format(df.memory_usage().sum() / 1e6))

sparse: 0.13 mb


In [19]:
ddf.join(df, rsuffix="_sparse") #.memory_usage().sum() / 1e6

Unnamed: 0_level_0,0u9mmdus_r4rh2fnj,0zfgzr8p_s2v79i0n,2qw1z51w_gzhm2r24,3mlu2axu_9cqeb29w,3rcq344v_xp9rrp6q,468xdj0b_avm15fg2,4f1tw0ub_q24612h6,5ldhw295_zytfrh0a,6japdd26_lswyb219,6japdd26_upgdw64n,...,vt0vf5js_y456swfv_sparse,wg3l1rxy_dmblerzf_sparse,xhu6rxfx_im4n67wj_sparse,xxk6dxd8_e0azhft1_sparse,y1afl3mu_wkvnf0x4_sparse,yyazroev_vfke2xq5_sparse,zf6lhvbh_1tn72adh_sparse,zf6lhvbh_zgapoly3_sparse,zuk8e6t0_p5nofpon_sparse,zuk8e6t0_tvp38aeh_sparse
region_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
599534250772922367,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
599534246477955071,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
599534258289115135,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
599534259362856959,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
599534260436598783,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
599522028369739775,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
599522026222256127,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
599522014411096063,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
599512171419795455,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
print(
    "DF sparse size:",
    (df.index.nbytes + df.columns.nbytes + sum(df[c].values.nbytes for c in df.columns)) / 1e6,
    "MB",
)


DF sparse size: 0.133532 MB


In [69]:
csr

<6078x73 sparse matrix of type '<class 'numpy.int64'>'
	with 7027 stored elements in Compressed Sparse Row format>

In [None]:
import numpy as np
import scipy.sparse as sp

rows = []
cols = []
values = []

count_embeddings.to_dataframe().values

# Imagine you iterate over your data rows
for row_idx, (region, feature_dict) in enumerate(data):
    # feature_dict = {col_idx: value, ...}, e.g. {12: 0.7, 1134: 0.2}
    for col_idx, val in feature_dict.items():
        if val != 0:
            rows.append(row_idx)
            cols.append(col_idx)
            values.append(val)

# Build sparse matrix once
n_rows = len(data)
n_cols = 5000  # number of feature columns
coo = sp.coo_matrix((values, (rows, cols)), shape=(n_rows, n_cols))
csr = coo.tocsr()

In [36]:
rel = count_embeddings.to_duckdb()
rel.columns[1:]

from rq_geo_toolkit.duckdb import sql_escape

cols = list(map(lambda c: f'"{c}"', rel.columns[1:4346]))

print(f"""
    SELECT x.region_id, {', '.join(cols)}
    FROM ({rel.sql_query()}) x
    """)

r = duckdb.sql(
    f"""
    SELECT region_id, array_value({', '.join(cols)}) AS embedding
    FROM read_parquet('files/pdt/CountEmbedder_20251005_132917_045489_embeddings/20251005_132917_049375.parquet')
    """
)

# rel.to_parquet('column_embeddings.parquet')
# r.to_parquet('array_embeddings.parquet')
r


    SELECT x.region_id, "0b16al5m_pc6lq9wc", "0nixrlu9_bgns60vw", "3wgpd5ad_3y2clp84", "3wgpd5ad_w5buyetg", "48vy2w1k_2ys8pd5g", "48vy2w1k_74s33y8e", "4smv2jgh_shfmgqi8", "4smv2jgh_ym9sco76", "4tzfyead_wxuk6e44", "4tzfyead_xntadju4", "50bt2na8_brxrtjpa", "60zgda5f_5gain4pv", "67c9sc9g_7rde8gjc", "67c9sc9g_pzf02ewq", "6c8oveka_ezpad6o3", "6c8oveka_i8mkxhsl", "7jt067pt_nq9k9wbm", "7n5je5gh_eggy2uln", "7n5je5gh_mo6f7mq5", "7n5je5gh_rjrli8w5", "8ahd0ng1_3isifew0", "8dat1vu5_hqk6ti27", "cxczdr8y_7rde8gjc", "cxczdr8y_hqk6ti27", "ezc3hsfa_dpilnhaq", "ezc3hsfa_et1yo75u", "ezc3hsfa_pzf02ewq", "f7zhotb3_0yv8nhid", "fn2q80tc_eggy2uln", "fv6wohyj_48ivbwww", "fv6wohyj_lywptyo2", "fv6wohyj_ml80psmi", "fv6wohyj_q7zwxk6f", "j29qhpqq_2ys8pd5g", "j29qhpqq_7bdfdfy2", "j29qhpqq_xrmajflf", "j29qhpqq_z3jyilxm", "jwtg1bwd_871rzt5o", "jyfpfjg0_qiutw38v", "k90exuss_mo6f7mq5", "k90exuss_nfg843sf", "kj8hiitl_e92rwd5e", "lkiepu34_4tsb1nuf", "lkiepu34_w5bq3et4", "lkiepu34_wdjjm1zs", "lx74klw2_wl39y35s", "mco83znt

┌────────────────────┬─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┐
│     region_id      │                                                                                                          embedding                                                                                                          │
│       uint64       │                                                                                                         integer[73]                                                                                                         │
├────────────────────┼─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────┤
│ 599534250772922367

In [33]:
duckdb.read_parquet("array_embeddings.parquet").pl().cast(
    {"embedding": pl.Array(pl.Int32, shape=73)}
).head(2).select(pl.col("embedding")).sum() #.group_by("region_id").agg(pl.col("embedding").sum())

embedding
"array[i32, 73]"
""


In [18]:
pl.read_parquet('column_embeddings.parquet')

region_id,0b16al5m_pc6lq9wc,0nixrlu9_bgns60vw,3wgpd5ad_3y2clp84,3wgpd5ad_w5buyetg,48vy2w1k_2ys8pd5g,48vy2w1k_74s33y8e,4smv2jgh_shfmgqi8,4smv2jgh_ym9sco76,4tzfyead_wxuk6e44,4tzfyead_xntadju4,50bt2na8_brxrtjpa,60zgda5f_5gain4pv,67c9sc9g_7rde8gjc,67c9sc9g_pzf02ewq,6c8oveka_ezpad6o3,6c8oveka_i8mkxhsl,7jt067pt_nq9k9wbm,7n5je5gh_eggy2uln,7n5je5gh_mo6f7mq5,7n5je5gh_rjrli8w5,8ahd0ng1_3isifew0,8dat1vu5_hqk6ti27,cxczdr8y_7rde8gjc,cxczdr8y_hqk6ti27,ezc3hsfa_dpilnhaq,ezc3hsfa_et1yo75u,ezc3hsfa_pzf02ewq,f7zhotb3_0yv8nhid,fn2q80tc_eggy2uln,fv6wohyj_48ivbwww,fv6wohyj_lywptyo2,fv6wohyj_ml80psmi,fv6wohyj_q7zwxk6f,j29qhpqq_2ys8pd5g,j29qhpqq_7bdfdfy2,j29qhpqq_xrmajflf,j29qhpqq_z3jyilxm,jwtg1bwd_871rzt5o,jyfpfjg0_qiutw38v,k90exuss_mo6f7mq5,k90exuss_nfg843sf,kj8hiitl_e92rwd5e,lkiepu34_4tsb1nuf,lkiepu34_w5bq3et4,lkiepu34_wdjjm1zs,lx74klw2_wl39y35s,mco83znt_0yv8nhid,mco83znt_4j65job5,np0fikhz_dpilnhaq,rfawy5i4_q7zwxk6f,rz13h953_krhu9ytl,rz13h953_pc6lq9wc,s2b74iyy_3y2clp84,s8ubpxq7_ml80psmi,s8ubpxq7_nq9k9wbm,sg1ldket_3y2clp84,sg1ldket_kgfggin6,si3mzhrb_3244n3e0,si3mzhrb_shfmgqi8,vk9fboc0_6hef0xil,vtb006xw_pbmwqrj6,xxvz3y93_2ys8pd5g,xyrisv0x_v8lui8q4,yad8umuq_38uirniw,yad8umuq_5l3urahe,yc43qqwm_r6ni9zm8,yg50uwjt_hpc70mvu,yrxg5c5u_6benz02z,ywnfb582_pc6lq9wc,ywnfb582_r6ni9zm8,zwvnk2p0_6hef0xil,zwwmei0m_0yv8nhid,zwwmei0m_lywptyo2
u64,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32,i32
599534250772922367,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
599534246477955071,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
599534258289115135,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
599534259362856959,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
599534260436598783,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…,…
599522028369739775,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
599522026222256127,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
599522014411096063,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
599512171419795455,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [17]:
pl.read_parquet('array_embeddings.parquet')

region_id,embedding
u64,list[struct[1]]
599534250772922367,"[{0}, {0}, … {0}]"
599534246477955071,"[{0}, {0}, … {0}]"
599534258289115135,"[{0}, {0}, … {0}]"
599534259362856959,"[{0}, {0}, … {0}]"
599534260436598783,"[{0}, {0}, … {0}]"
…,…
599522028369739775,"[{0}, {0}, … {0}]"
599522026222256127,"[{0}, {0}, … {0}]"
599522014411096063,"[{0}, {0}, … {0}]"
599512171419795455,"[{0}, {0}, … {0}]"


In [16]:
pl.read_parquet('array_embeddings.parquet', schema={'region_id': pl.UInt64, 'embedding': pl.Array(pl.Int32, shape=73)})

SchemaError: data type mismatch for column embedding: expected: array[i32, 73], found: list[struct[1]]

In [None]:
pl_df = count_embeddings.to_duckdb().pl()
cols = pl_df.columns[1:]
pl_df.select(pl.str)
# pl_df

TypeError: unhashable type: 'list'

In [None]:
pl_df

In [4]:
duckdb.sql(
    """
    WITH arrays as (
        SELECT UNNEST([
            [1, 5, 7],
            [4, 6, 1],
            [6, 0, 7],
            [5, 3, 2]
        ]) as values
    )
    SELECT SUM(values) FROM arrays
    """
)

BinderException: Binder Error: No function matches the given name and argument types 'sum(INTEGER[])'. You might need to add explicit type casts.
	Candidate functions:
	sum(DECIMAL) -> DECIMAL
	sum(BOOLEAN) -> HUGEINT
	sum(SMALLINT) -> HUGEINT
	sum(INTEGER) -> HUGEINT
	sum(BIGINT) -> HUGEINT
	sum(HUGEINT) -> HUGEINT
	sum(DOUBLE) -> DOUBLE
