# Minimal Gzip + JSON Indexer (Overlap-Safe, Unique Top-Level Keys)

This notebook indexes **top-level keys** in large `.json.gz` files without a full JSON parse.

## What was fixed
- **No array-as-scalar bugs**: value classification happens only after the first non-whitespace token after `:`.
- **Unique top-level keys persisted across chunks**: each top-level key is processed/logged once across the whole stream.
- **Overlap-safe state retention**: if a token spans chunks (e.g., an incomplete key or string scalar), the notebook preserves only the unfinished fragment; otherwise it keeps a small fixed overlap tail.
- **Progress reporting** based on compressed bytes read.


## Install dependency

```bash
pip install indexed_gzip
```

In [1]:
import json
import logging
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, Optional, Set, Tuple

import indexed_gzip as igzip

# ---------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s"
)
LOG = logging.getLogger(__name__)

# ---------------------------------------------------------------------
# Data Structures
# ---------------------------------------------------------------------

@dataclass
class JsonIndex:
    """Minimal structural index for a top-level JSON object."""
    top_level_offsets: Dict[str, int] = field(default_factory=dict)
    scalar_values: Dict[str, str] = field(default_factory=dict)

    # Persisted across all chunks: each top-level key should be observed once.
    seen_top_level_keys: Set[str] = field(default_factory=set)

# ---------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------

def is_whitespace(ch: str) -> bool:
    return ch in " \t\n\r"

# ---------------------------------------------------------------------
# Core Indexing Logic
#   - Defers value classification until first non-whitespace token
#   - Logs progress based on compressed bytes
#   - Persists unique top-level keys (only log/process once across all chunks)
#   - Overlap-safe: if a token is incomplete at a chunk boundary, keep exactly
#     the unfinished fragment; otherwise keep a fixed overlap tail.
# ---------------------------------------------------------------------

def build_index(
    path_gz: Path,
    json_index_file: Path,
    gzip_index_file: Optional[Path] = None,
    spacing: int = 300 * 1024,
    chunk_size: int = 256 * 1024,
    overlap_size: int = 4096,
    progress_every_mb: int = 256,
    log_new_keys: bool = True,
) -> JsonIndex:
    """Build a minimal index of top-level JSON keys from a .json.gz file.

    Captures:
      - Offsets for top-level arrays (position where '[' begins in decompressed stream)
      - Scalar values for top-level scalars (strings, numbers, true/false/null)

    Key guarantees:
      - Each top-level key is processed once across all chunks (tracked in seen_top_level_keys).
      - Overlap handling retains state only when needed: if a token spans chunks,
        the unfinished fragment is preserved; otherwise we keep the last overlap_size chars.

    Progress:
      - Based on compressed bytes read from the .gz file (fh.fileobj.tell()).
    """

    gz_size = path_gz.stat().st_size
    report_interval = max(1, progress_every_mb) * 1024 * 1024
    next_report = report_interval

    LOG.info(
        "Indexing %s (compressed size %.2f GB)",
        path_gz,
        gz_size / (1024 ** 3),
    )

    index = JsonIndex()

    with igzip.IndexedGzipFile(
        filename=str(path_gz),
        index_file=str(gzip_index_file) if gzip_index_file else None,
        spacing=spacing,
    ) as fh:

        buf = ""
        # pos is the decompressed character position corresponding to buf[0]
        pos = 0

        depth = 0
        in_string = False
        escape = False

        current_key: Optional[str] = None
        awaiting_value = False

        while True:
            chunk = fh.read(chunk_size)
            if not chunk:
                break

            # -----------------------------
            # Progress reporting (compressed bytes)
            # -----------------------------
            try:
                comp_pos = fh.fileobj.tell()
                if comp_pos >= next_report:
                    pct = (comp_pos / gz_size) * 100 if gz_size else 0.0
                    LOG.info(
                        "Progress: %.1f%% (%.2f / %.2f GB compressed) | unique_top_keys=%d",
                        pct,
                        comp_pos / (1024 ** 3),
                        gz_size / (1024 ** 3),
                        len(index.seen_top_level_keys),
                    )
                    next_report += report_interval
            except Exception:
                pass

            text = chunk.decode("utf-8", errors="ignore")
            buf += text

            i = 0
            keep_from: Optional[int] = None  # if set, preserve buf[keep_from:] (unfinished token)

            while i < len(buf):
                ch = buf[i]

                # ---------------------------------------------------------
                # Awaiting value classification (only for a NEW top-level key)
                # ---------------------------------------------------------
                if awaiting_value and depth == 1 and current_key is not None:
                    if is_whitespace(ch):
                        i += 1
                        continue

                    # Array value -> record offset at '['
                    if ch == "[":
                        index.top_level_offsets[current_key] = pos + i
                        current_key = None
                        awaiting_value = False
                        i += 1
                        continue

                    # Object value at top level -> not a scalar; ignore but maintain depth
                    if ch == "{":
                        current_key = None
                        awaiting_value = False
                        depth += 1
                        i += 1
                        continue

                    # Scalar value
                    start = i

                    if ch == '"':
                        # Need to parse a quoted string scalar safely across chunks
                        in_string = True
                        escape = False
                        i += 1
                        while i < len(buf):
                            c2 = buf[i]
                            if escape:
                                escape = False
                            elif c2 == "\\":
                                escape = True
                            elif c2 == '"':
                                in_string = False
                                i += 1
                                break
                            i += 1

                        if in_string:
                            # string scalar spans chunks: keep from its starting quote
                            keep_from = start
                            break

                        # After string, consume whitespace; do NOT eat delimiters
                        while i < len(buf) and is_whitespace(buf[i]):
                            i += 1

                        value = buf[start:i].strip()
                        index.scalar_values[current_key] = value
                        current_key = None
                        awaiting_value = False
                        continue

                    else:
                        # Non-string scalar: read until ',' or '}' (top-level object end)
                        while i < len(buf) and buf[i] not in ",}":
                            i += 1
                        value = buf[start:i].strip()
                        index.scalar_values[current_key] = value
                        current_key = None
                        awaiting_value = False
                        continue

                # ---------------------------------------------------------
                # General string handling (outside of the scalar-string path)
                # ---------------------------------------------------------
                if in_string:
                    if escape:
                        escape = False
                    elif ch == "\\":
                        escape = True
                    elif ch == '"':
                        in_string = False
                    i += 1
                    continue

                # ---------------------------------------------------------
                # Key detection (top-level keys only): "key" :
                # ---------------------------------------------------------
                if ch == '"':
                    key_quote_start = i  # for boundary preservation if needed
                    start = i + 1
                    end = buf.find('"', start)
                    if end == -1:
                        # key name spans chunks: keep from starting quote
                        keep_from = key_quote_start
                        break

                    key = buf[start:end]
                    i = end + 1

                    # Skip whitespace
                    while i < len(buf) and is_whitespace(buf[i]):
                        i += 1

                    # Confirm ':' and top-level depth
                    if i < len(buf) and buf[i] == ":" and depth == 1:
                        # Persist only once across all chunks
                        if key not in index.seen_top_level_keys:
                            index.seen_top_level_keys.add(key)
                            if log_new_keys:
                                LOG.info("Discovered top-level key: %s", key)

                            current_key = key
                            awaiting_value = True
                        else:
                            # Key already handled; do not enter awaiting_value
                            current_key = None
                            awaiting_value = False

                        i += 1  # consume ':'
                    continue

                # ---------------------------------------------------------
                # Structural depth tracking
                # ---------------------------------------------------------
                if ch == "{":
                    depth += 1
                elif ch == "}":
                    depth -= 1

                i += 1

            # ---------------------------------------------------------
            # Overlap retention
            #   - If token unfinished, keep exactly from keep_from onward.
            #   - Otherwise keep a fixed tail overlap.
            # ---------------------------------------------------------
            if keep_from is not None:
                # Advance pos by the amount we are discarding
                pos += keep_from
                buf = buf[keep_from:]
            else:
                if len(buf) > overlap_size:
                    discard = len(buf) - overlap_size
                    pos += discard
                    buf = buf[-overlap_size:]

    # Persist index (include seen keys count for auditing)
    json_index_file.write_text(
        json.dumps(
            {
                "offsets": index.top_level_offsets,
                "scalars": index.scalar_values,
                "seen_top_level_keys_count": len(index.seen_top_level_keys),
                "seen_top_level_keys": sorted(index.seen_top_level_keys),
            },
            indent=2,
        )
    )

    LOG.info("Index written to %s", json_index_file)
    return index

# ---------------------------------------------------------------------
# Convenience Wrapper (avoid FileNotFoundError if .gzidx doesn't exist)
# ---------------------------------------------------------------------

def build_index_wrapper(input_file: Path, output_dir: Optional[Path] = None) -> JsonIndex:
    output_dir = output_dir or input_file.parent

    json_index_file = output_dir / f"{input_file.name}.index.json"
    gzip_index_file = output_dir / f"{input_file.name}.gzidx"

    # Only pass index_file if it already exists (some versions error otherwise).
    gzip_index_arg = gzip_index_file if gzip_index_file.exists() else None

    return build_index(
        path_gz=input_file,
        json_index_file=json_index_file,
        gzip_index_file=gzip_index_arg,
    )


## Example usage

Update the path below to point to your `.json.gz` file.


In [2]:
from pathlib import Path

# Example:
input_file = Path(r"D:\\_ingested_2026-01_720_27B0_in-network-rates_01_of_57.json.gz")
idx = build_index_wrapper(input_file)
print(idx.top_level_offsets)
print(idx.scalar_values)


2026-01-26 15:05:37,890 | INFO | Indexing D:\_ingested_2026-01_720_27B0_in-network-rates_01_of_57.json.gz (compressed size 0.02 GB)
2026-01-26 15:05:37,898 | INFO | Discovered top-level key: reporting_entity_name
2026-01-26 15:05:37,899 | INFO | Discovered top-level key: reporting_entity_type
2026-01-26 15:05:37,899 | INFO | Discovered top-level key: last_updated_on
2026-01-26 15:05:37,900 | INFO | Discovered top-level key: version
2026-01-26 15:05:37,901 | INFO | Discovered top-level key: provider_references
2026-01-26 15:05:37,928 | INFO | Discovered top-level key: in_network
2026-01-26 15:05:43,416 | INFO | Discovered top-level key: negotiation_arrangement
2026-01-26 15:05:43,417 | INFO | Discovered top-level key: name
2026-01-26 15:05:43,418 | INFO | Discovered top-level key: billing_code_type
2026-01-26 15:05:43,418 | INFO | Discovered top-level key: billing_code_type_version
2026-01-26 15:05:43,419 | INFO | Discovered top-level key: billing_code
2026-01-26 15:05:43,420 | INFO | D

{'provider_references': 195, 'in_network': 539926, 'negotiated_rates': 39167079, 'negotiated_prices': 40104521, 'billing_code_modifier': 41414778, 'service_code': 41415061}
{'reporting_entity_name': '"Blue Cross and Blue Shield of Minnesota"', 'reporting_entity_type': '"Health insurance Issuer"', 'last_updated_on': '"2025-11-23"', 'version': '"1.3.1"', 'negotiation_arrangement': '"ffs"', 'name': '"OUTPATIENT CARE"', 'billing_code_type': '"CPT"', 'billing_code_type_version': '"2025"', 'billing_code': '"99283"', 'description': '"EMERGENCY DEPARTMENT VISIT FOR THE EVALUATION AND MANAGEMENT OF A PATIENT, WHICH REQUIRES A MEDICALLY APPROPRIATE HISTORY AND/OR EXAMINATION AND LOW LEVEL OF MEDICAL DECISION MAKING "', 'billing_class': '"professional"', 'negotiated_type': '"negotiated"', 'negotiated_rate': '73.77', 'expiration_date': '"9999-12-31"'}
