In [None]:
from typing import Iterable, Optional, Set, List
from pathlib import Path
import hashlib
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc
from deltalake import DeltaTable, write_deltalake

In [None]:
# ---------- helpers ----------
def _ensure_array(col) -> pa.Array:
    """Return ``col`` as a ``pa.Array``.

    A ``pa.ChunkedArray`` holding exactly one chunk yields that chunk
    directly; a ``pa.ChunkedArray`` with any other chunk count is flattened
    via ``combine_chunks()``. Any other input is handed back unchanged.
    """
    # Not chunked: nothing to flatten.
    if not isinstance(col, pa.ChunkedArray):
        return col
    # Single chunk: reuse it directly instead of going through combine_chunks().
    if col.num_chunks == 1:
        return col.chunk(0)
    return col.combine_chunks()

def _resolve_cols(
    all_cols: Set[str],
    include_cols: Optional[Iterable[str]],
    exclude_cols: Optional[Iterable[str]],
) -> List[str]:
    if include_cols is None:
        chosen = set(all_cols)
    else:
        include_cols = set(include_cols)
        missing = include_cols - all_cols
        if missing:
            raise ValueError(f"Column(S) not found: {sorted(missing)}")
        chosen = set(include_cols)

    if exclude_cols:
        exclude_cols = set(exclude_cols)
        missing_ex = exclude_cols - all_cols
        if missing_ex:
            raise ValueError(f"Excluded column(s) not found: {sorted(missing_ex)}")
        chosen -= exclude_cols

    cols = sorted(chosen)
    if not cols:
        raise ValueError("No columns available to be hashed (check include/exclude).")
    return cols

def _prepare_arrays_for_hash(tbl: pa.Table, cols: Iterable[str], null_token: str) -> List[pa.Array]:
    """Turn each selected column into an Arrow string array without nulls.

    Every column named in ``cols`` is taken from ``tbl``, flattened with
    ``_ensure_array``, cast to a string type and has its nulls replaced by
    ``null_token``. Dictionary and binary columns are cast to
    ``large_string``; every other column is cast to ``string``.

    Returns the converted arrays in the same order as ``cols``.
    """
    # NOTE(review): the result can mix string and large_string arrays, and the
    # callers feed all of them into a single binary_join_element_wise call —
    # confirm the installed PyArrow accepts mixed string widths there.
    def _as_text(name: str) -> pa.Array:
        column = _ensure_array(tbl[name])
        wide = pa.types.is_dictionary(column.type) or pa.types.is_binary(column.type)
        text_type = pa.large_string() if wide else pa.string()
        return pc.fill_null(pc.cast(column, text_type), null_token)

    return [_as_text(name) for name in cols]

def _hash_joined(joined: pa.Array, algo: str) -> pa.Array:
    """
    Hash every element of ``joined`` with Python's hashlib.

    ``algo`` is matched case-insensitively and must be 'sha256' or 'md5';
    anything else raises ValueError. Each element is encoded as UTF-8 before
    hashing. Returns a ``pa.string()`` array of hex digests, one per element,
    in the original order.
    """
    name = algo.lower()
    total = len(joined)
    batch_size = 200_000  # rows converted to Python objects per step; tune as needed

    make_digest = {"sha256": hashlib.sha256, "md5": hashlib.md5}.get(name)
    if make_digest is None:
        raise ValueError("Unsupported algo. Gebruik 'sha256' of 'md5'.")

    # Work through the array in slices so only one batch of Python strings
    # is materialised at a time.
    hex_digests: List[str] = []
    for offset in range(0, total, batch_size):
        rows = joined.slice(offset, min(batch_size, total - offset)).to_pylist()
        hex_digests.extend(make_digest(text.encode("utf-8")).hexdigest() for text in rows)
    return pa.array(hex_digests, type=pa.string())

In [None]:
def add_row_hash_to_parquet(
    src_path: str,
    dest_path: str,
    include_cols: Optional[Iterable[str]] = None,  # None => use every column
    exclude_cols: Optional[Iterable[str]] = None,
    new_col: str = "row_hash",
    algo: str = "sha256",      # 'sha256' or 'md5'
    null_token: str = "‚àÖ",
    sep: str = "\x1f",
    parquet_write_kwargs: Optional[dict] = None,
) -> None:
    """Read a Parquet file, append a per-row hash column and write the result.

    The whole file at ``src_path`` is loaded into memory. The columns chosen
    through ``include_cols`` / ``exclude_cols`` are converted to strings
    (nulls become ``null_token``), joined per row with ``sep`` and hashed with
    ``algo``. The hex digests are appended as column ``new_col`` and the table
    is written to ``dest_path``; ``parquet_write_kwargs`` is forwarded to
    ``pq.write_table``.

    Raises:
        ValueError: If ``new_col`` already exists in the source file. The
            helpers also raise ValueError for unknown columns, an empty
            selection or an unsupported ``algo``.
    """
    source_table: pa.Table = pq.read_table(src_path).combine_chunks()
    existing_names = source_table.column_names
    if new_col in existing_names:
        raise ValueError(f"Kolom '{new_col}' bestaat al in het Parquetbestand.")

    selected = _resolve_cols(set(existing_names), include_cols, exclude_cols)
    hash_inputs = _prepare_arrays_for_hash(source_table, selected, null_token)

    # The arrays go in as positional arguments with the separator last.
    row_text = pc.binary_join_element_wise(*hash_inputs, sep)

    # Digests are computed in Python via hashlib.
    row_digests = _hash_joined(row_text, algo)

    extra_options = parquet_write_kwargs if parquet_write_kwargs else {}
    pq.write_table(source_table.append_column(new_col, row_digests), dest_path, **extra_options)

In [None]:
def add_row_hash_to_delta_table(
    table_path: str,
    include_cols: Optional[Iterable[str]] = None,  # None => use every column
    exclude_cols: Optional[Iterable[str]] = None,
    new_col: str = "row_hash",
    algo: str = "sha256",
    null_token: str = "‚àÖ",
    sep: str = "\x1f",
    mode: str = "overwrite",
    write_kwargs: Optional[dict] = None,
) -> int:
    """Append a per-row hash column to a Delta table and write it back.

    The table at ``table_path`` is loaded fully into memory. The columns
    chosen through ``include_cols`` / ``exclude_cols`` are converted to
    strings (nulls become ``null_token``), joined per row with ``sep`` and
    hashed with ``algo``. The table plus the new ``new_col`` column is then
    written to the same ``table_path`` with the given ``mode``;
    ``write_kwargs`` is forwarded to ``write_deltalake``.

    Returns:
        The version reported by a freshly opened ``DeltaTable`` after the write.

    Raises:
        ValueError: If ``new_col`` already exists in the table. The helpers
            also raise ValueError for unknown columns, an empty selection or
            an unsupported ``algo``.
    """
    # NOTE(review): the written table has one more column than the stored one.
    # Depending on the deltalake version, the write may need a schema option
    # passed through write_kwargs — confirm against the installed version.
    current: pa.Table = DeltaTable(table_path).to_pyarrow_table().combine_chunks()
    existing_names = current.column_names
    if new_col in existing_names:
        raise ValueError(f"Kolom '{new_col}' bestaat al in de Delta-tabel.")

    selected = _resolve_cols(set(existing_names), include_cols, exclude_cols)
    hash_inputs = _prepare_arrays_for_hash(current, selected, null_token)

    row_text = pc.binary_join_element_wise(*hash_inputs, sep)
    row_digests = _hash_joined(row_text, algo)

    extra_options = write_kwargs if write_kwargs else {}
    write_deltalake(
        table_path,
        current.append_column(new_col, row_digests),
        mode=mode,
        **extra_options,
    )
    # Re-open the table so the version reflects the commit just made.
    return DeltaTable(table_path).version()

In [None]:
# # Usage example
# src = "/lakehouse/default/Files/test/ods_reports/2025/10/03/run_20251003T060110071/AOV_PRODUCT_OVERZICHT/AOV_PRODUCT_OVERZICHT_00000.parquet"
# dest = "/lakehouse/default/Files/test/ods_reports/2025/10/03/run_20251003T060110071/AOV_PRODUCT_OVERZICHT/AOV_PRODUCT_OVERZICHT_00000_hashed.parquet"

# add_row_hash_to_parquet(src, dest, include_cols=None, exclude_cols=None)


In [1]:
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from collections import defaultdict

def calculate_size_by_level(base_path, aggregation_level, level_filters=None, max_workers=20):
    """
    Compute total file size and file count per folder at a given depth.

    The tree under ``base_path`` is walked once. Branches that cannot satisfy
    ``level_filters`` are pruned during the walk. The remaining folders are
    then scanned in parallel, and each file is attributed to the first
    ``aggregation_level`` components of its folder's path relative to
    ``base_path``.

    Args:
        base_path: Root directory to scan.
        aggregation_level: Number of leading path components that form the
            grouping key (1 = first level below ``base_path``).
        level_filters: Optional dict mapping a 1-based depth to the exact
            folder name required at that depth. ``None`` means no filtering.
        max_workers: Thread count for the per-folder scan.

    Returns:
        A dict mapping the relative path (joined with ``os.sep``) to
        ``{'size': total_bytes, 'files': file_count}``. Folders shallower than
        ``aggregation_level`` are keyed by their own relative path; files
        directly in ``base_path`` are keyed by ``'.'``.

    Note:
        Folders and files that cannot be read (``OSError``) are skipped. Any
        other error, for example a non-integer ``aggregation_level``, is
        raised rather than silently producing an empty result.
    """
    if level_filters is None:
        level_filters = {}

    size_by_path = defaultdict(lambda: {'size': 0, 'files': 0})

    def get_aggregation_key(full_path, base, level):
        """Return the first ``level`` components of ``full_path`` relative to ``base``."""
        relative = os.path.relpath(full_path, base)
        parts = relative.split(os.sep)
        return os.sep.join(parts[:level]) if len(parts) >= level else relative

    def process_folder(folder_path):
        """Sum size and count of the files located directly in one folder."""
        local_sizes = defaultdict(lambda: {'size': 0, 'files': 0})
        # Every file in this folder shares the same key, so compute it once.
        key = get_aggregation_key(folder_path, base_path, aggregation_level)

        try:
            # scandir returns the file type with each entry, which avoids a
            # separate isfile() lookup per item.
            with os.scandir(folder_path) as entries:
                for entry in entries:
                    try:
                        if not entry.is_file():
                            continue
                        size = entry.stat().st_size
                    except OSError:
                        # File vanished or is unreadable: skip it, keep going.
                        continue
                    local_sizes[key]['size'] += size
                    local_sizes[key]['files'] += 1
        except OSError:
            # Folder vanished or is unreadable: it contributes nothing.
            pass

        return local_sizes

    # Collect the folders to scan, pruning filtered-out branches during the walk.
    all_folders = []

    for root, dirs, _files in os.walk(base_path):
        # Depth of the current folder below base_path (0 for base_path itself).
        relative = os.path.relpath(root, base_path)
        current_parts = [] if relative == '.' else relative.split(os.sep)
        current_level = len(current_parts)

        # Editing dirs in place tells os.walk not to descend into the
        # sub-directories that fail the filter for the next level.
        next_level = current_level + 1
        if next_level in level_filters:
            wanted = level_filters[next_level]
            dirs[:] = [d for d in dirs if d == wanted]

        # Only filters at or above the current depth can be checked. Folders
        # shallower than a filtered level are kept, so files stored directly
        # in them are still counted.
        matches = all(
            current_parts[level - 1] == wanted
            for level, wanted in level_filters.items()
            if level <= current_level
        )

        if matches:
            all_folders.append(root)

    print(f"üîç Relevante folders na filtering: {len(all_folders):,}")
    print(f"üìä Aggregatie niveau: {aggregation_level}")
    if level_filters:
        print("üîé Filters actief:")
        level_names = {1: "source", 2: "year", 3: "month", 4: "day", 5: "run_ts", 6: "table"}
        for level, value in sorted(level_filters.items()):
            print(f"   - Niveau {level} ({level_names.get(level, 'unknown')}): {value}")
    print("‚öôÔ∏è  Berekenen...\n")

    # Scan folders in parallel and merge the partial totals in this thread.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_folder, folder) for folder in all_folders]

        for future in as_completed(futures):
            for key, data in future.result().items():
                size_by_path[key]['size'] += data['size']
                size_by_path[key]['files'] += data['files']

    return dict(size_by_path)

# ============================================================================
# CONFIGURATION
# ============================================================================
# This cell configures and runs calculate_size_by_level, then prints the 20
# largest aggregated paths plus overall totals. All names assigned here are
# notebook-global.

base_path = "/lakehouse/default/Files/greenhouse_sources"

# Set the aggregation level
aggregation_level = 3  # 1=source, 2=year, 3=month, 4=day, 5=run_ts, 6=table

# Set the filters (None for no filter, or a dict of level: value)
# Examples:
# level_filters = None                              # No filters
# level_filters = {1: "anva_concern"}              # Only anva_concern
# level_filters = {1: "anva_concern", 2: "2025"}   # anva_concern in 2025
# level_filters = {2: "2025", 3: "10"}             # All sources in 2025/10
#level_filters = None
level_filters = {2: "2025", 3: "10"}

# Compute
results = calculate_size_by_level(base_path, aggregation_level, level_filters)

# Sort by total size, largest first, and show the results
sorted_results = sorted(results.items(), key=lambda x: x[1]['size'], reverse=True)

print(f"\n{'='*93}")
print(f"üìà Resultaten (top 20 - gesorteerd op grootte)")
print(f"{'='*93}")
print(f"{'Path':<50} {'Files':>10} {'Size (MB)':>15} {'Size (GB)':>15}")
print(f"{'-'*93}")

# Only the 20 largest paths are listed; sizes are converted from bytes using
# powers of 1024.
for path, data in sorted_results[:20]:
    size_mb = data['size'] / (1024**2)
    size_gb = data['size'] / (1024**3)
    print(f"{path:<50} {data['files']:>10,} {size_mb:>15.2f} {size_gb:>15.2f}")

# Totals (over all results, not just the 20 rows printed above)
total_size = sum(d['size'] for d in results.values())
total_files = sum(d['files'] for d in results.values())
print(f"{'-'*93}")
print(f"{'TOTAAL':<50} {total_files:>10,} {total_size/(1024**2):>15.2f} {total_size/(1024**3):>15.2f}")
print(f"\n‚úì Aantal unieke paden op niveau {aggregation_level}: {len(results):,}")


StatementMeta(, 20c1d4c1-3390-4047-a06c-d08d6d296c91, 3, Finished, Available, Finished)

üîç Relevante folders na filtering: 96
üìä Aggregatie niveau: 3
üîé Filters actief:
   - Niveau 2 (year): 2025
   - Niveau 3 (month): 10
‚öôÔ∏è  Berekenen...


üìà Resultaten (top 20 - gesorteerd op grootte)
Path                                                    Files       Size (MB)       Size (GB)
---------------------------------------------------------------------------------------------
vizier/2025/10                                             53         2328.62            2.27
anva_meeus/2025/10                                         70          131.60            0.13
---------------------------------------------------------------------------------------------
TOTAAL                                                    123         2460.21            2.40

‚úì Aantal unieke paden op niveau 3: 2
