# Random-access `.json.gz` scanning with igzip index + chunked key counting

This notebook does **two** things on a large `json.gz`:

1. Builds an **indexed gzip** (igzip) random-access index so you can later seek to decompressed offsets efficiently.
2. Streams the **decompressed** JSON text in chunks, uses a regex to extract **JSON object keys** (`"key":`), tracks whether each key has been seen **once or more than once**, and records the **byte offset** (in the decompressed stream) of every key that occurs exactly once.

Design notes:
- The scan is performed on the **decompressed** byte stream (the same address space the igzip index typically targets).
- To handle keys split across chunk boundaries, the scanner keeps a small **tail overlap** (default 2 KB) and counts any regex match whose **end** crosses into the new chunk.
- Offsets are recorded as:
  - `abs_offset`: decompressed byte offset from the start of the JSON text
  - `chunk_index`: `abs_offset // chunk_size`
  - `offset_in_chunk`: `abs_offset % chunk_size`


In [1]:
# --- Dependencies ---
# The most common Python package providing "igzip-like" random access is `indexed_gzip`.
# If it's not installed in your environment, uncomment and run the pip install cell.

# !pip -q install indexed_gzip

import re
import json
from collections import Counter, defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, Iterable, List, Tuple, Optional
import indexed_gzip
import os, time

In [2]:
# --- Open an indexed gzip file (igzip) ---
# `indexed_gzip` provides an IndexedGzipFile that can build and export an index.

def open_indexed_gzip(gz_path: str):
    """Open ``gz_path`` as an ``indexed_gzip.IndexedGzipFile``.

    Only the path is handed to the constructor, so every other option
    (``drop_handles`` included) keeps whatever default the library uses.
    The handle is returned open; closing it is left to the caller.
    """
    return indexed_gzip.IndexedGzipFile(gz_path)


In [3]:
# --- Build + export igzip index ---

def build_and_export_index(gz_path: str, index_path: str):
    """Build a full index and export it to a file.

    Args:
        gz_path: Path of the gzip file to index.
        index_path: Destination of the exported index. Converted with
            ``str()`` first, so path-like objects are accepted.

    Returns:
        ``index_path`` as a string.

    Notes:
      - Building a full index can take time on huge files, but enables fast seeking later.
      - Index format is specific to `indexed_gzip`.
      - The gzip handle is closed even when building or exporting fails.
    """
    index_path = str(index_path)
    f = open_indexed_gzip(gz_path)
    try:
        # Build full index over the compressed file
        f.build_full_index()
        # Export index to disk
        f.export_index(index_path)
    finally:
        # Release the handle on both the success and the failure path.
        f.close()
    return index_path


def open_with_existing_index(gz_path: str, index_path: str):
    """Open an indexed gzip using a pre-built exported index.

    Args:
        gz_path: Path of the gzip file.
        index_path: Path of an index previously written by ``export_index``.

    Returns:
        The open ``IndexedGzipFile`` with the index loaded. The caller is
        responsible for closing it.

    Raises:
        Whatever ``import_index`` raises; the freshly opened handle is
        closed before the error propagates so it does not leak.
    """
    # Imported locally, so this function does not rely on the module-level import.
    import indexed_gzip
    f = indexed_gzip.IndexedGzipFile(gz_path)
    try:
        f.import_index(index_path)
    except BaseException:
        f.close()
        raise
    return f


## Chunked regex scan for JSON keys

We treat the gzip stream as UTF-8 JSON text.

Regex:
- We look for JSON object keys: `"<string>":`
- Pattern is tolerant of escaped quotes inside strings.

Boundary handling:
- Keep a **tail overlap** (`overlap_bytes`, default 2048).
- For each chunk, scan `tail + chunk`.
- Count any match whose **end position** is **after** the tail length; this guarantees we count boundary-spanning keys exactly once.

Offset attribution:
- `abs_offset` is computed as `chunk_start_abs - len(tail) + match.start()`
  - `chunk_start_abs` is the decompressed offset where the current chunk begins.


In [33]:
# --- Scanner implementation ---

# Bytes regex for a JSON object key followed by its colon:
#   - an opening double quote,
#   - 1 to 512 units, each either a backslash escape (backslash plus any byte
#     except newline) or a single byte that is neither a quote nor a backslash,
#   - a closing double quote, optional whitespace, then ':'.
# Group 1 captures the raw (still JSON-escaped) key bytes without the quotes.
KEY_PATTERN = re.compile(rb'"((?:\\.|[^"\\]){1,512})"\s*:')

@dataclass
class KeyHit:
    """Location of one JSON object key in the decompressed stream."""

    # Key text as produced by `_decode_json_string_bytes`.
    key: str
    abs_offset: int          # decompressed absolute byte offset
    # `abs_offset // chunk_size`, as computed by the scanner.
    chunk_index: int
    # `abs_offset % chunk_size`, as computed by the scanner.
    offset_in_chunk: int


def _decode_json_string_bytes(b: bytes) -> str:
    """Decode the raw bytes between a JSON string's quotes into a ``str``.

    The bytes are expected to be the still-escaped string content (what
    ``KEY_PATTERN`` captures), so escapes such as ``\\"``, ``\\t`` or
    ``\\u00e9`` are resolved with the JSON parser.

    Args:
        b: Raw string content without the surrounding quotes.

    Returns:
        The unescaped text. Invalid UTF-8 is replaced with U+FFFD. If the
        content is not a valid JSON string body, the UTF-8 decoded text is
        returned unchanged.
    """
    s = b.decode("utf-8", errors="replace")

    # First try the content as-is: quotes inside it are already escaped, and
    # escaping them a second time would turn `\"` into `\\"` and break parsing.
    try:
        return json.loads('"' + s + '"')
    except ValueError:
        pass

    # Fallback for content carrying bare quotes: escape them, then parse.
    if '"' in s:
        try:
            return json.loads('"' + s.replace('"', '\\"') + '"')
        except ValueError:
            pass

    return s


def scan_unique_keys_with_offsets(
    gz_path: str,
    index_path: str | None = None,
    chunk_size: int = 8 * 1024 * 1024,
    overlap_bytes: int = 2048,
    progress_every_chunks: int = 20,
) -> "dict[str, KeyHit]":
    """Stream the decompressed data and locate keys that occur exactly once.

    The stream is read in ``chunk_size`` pieces. Each piece is scanned with
    ``KEY_PATTERN`` together with the last ``overlap_bytes`` bytes of the
    previously scanned data, so a key split across two reads is still matched.
    Matches that end inside the carried-over tail are skipped because they
    were already visible in the previous iteration.

    Args:
        gz_path: Path of the gzip file.
        index_path: Optional exported index; when given (and truthy) the file
            is opened with ``open_with_existing_index``, otherwise with
            ``open_indexed_gzip``.
        chunk_size: Number of decompressed bytes requested per read. Also the
            divisor used for ``chunk_index`` / ``offset_in_chunk``.
        overlap_bytes: Size of the tail carried into the next scan. Values
            ``<= 0`` disable the overlap.
        progress_every_chunks: Print a progress line every this many chunks;
            a falsy value disables progress output.

    Returns:
        A dict mapping each key seen exactly once to its ``KeyHit``. Keys seen
        more than once are dropped.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If ``gz_path`` cannot be stat'ed.
    """
    if chunk_size <= 0:
        # A zero size would read nothing and a negative size would request the
        # whole stream in a single read; neither is a meaningful chunking.
        raise ValueError("chunk_size must be a positive integer")

    total_compressed = os.path.getsize(gz_path)
    start_time = time.time()

    if index_path:
        f = open_with_existing_index(gz_path, index_path)
    else:
        f = open_indexed_gzip(gz_path)

    # key -> offset of its only occurrence so far
    seen_once: dict[str, int] = {}
    # keys already known to repeat; kept so they are never re-added
    seen_multiple: set[str] = set()

    tail = b""
    abs_read = 0
    chunk_index = 0

    try:
        while True:
            chunk = f.read(chunk_size)
            if not chunk:
                break

            chunk_start_abs = abs_read
            abs_read += len(chunk)

            combined = tail + chunk
            tail_len = len(tail)
            # Decompressed offset of combined[0]. Never negative: the tail is
            # always cut from bytes that were already counted in abs_read.
            combined_start_abs = chunk_start_abs - tail_len

            for m in KEY_PATTERN.finditer(combined):
                # Entirely inside the tail -> already handled last iteration.
                if m.end() <= tail_len:
                    continue

                key = _decode_json_string_bytes(m.group(1))

                if key in seen_multiple:
                    continue

                if key in seen_once:
                    # Second sighting: the key is no longer unique.
                    del seen_once[key]
                    seen_multiple.add(key)
                else:
                    seen_once[key] = combined_start_abs + m.start()

            tail = combined[-overlap_bytes:] if overlap_bytes > 0 else b""
            chunk_index += 1

            if progress_every_chunks and (chunk_index % progress_every_chunks == 0):
                elapsed = time.time() - start_time
                msg = (
                    f"Scanned {chunk_index:,} chunks | "
                    f"{abs_read:,} decompressed bytes | "
                )

                # --- compressed % progress ---
                # Best effort only: any failure to obtain the compressed
                # position (or a zero-byte file) just degrades the message.
                try:
                    compressed_pos = f.fileobj().tell()
                    pct = 100.0 * compressed_pos / total_compressed
                    msg += f"{pct:6.2f}% compressed | "
                except Exception:
                    msg += "compressed % unavailable | "

                msg += f"{elapsed:,.1f}s elapsed"
                print(msg)
    finally:
        # Close the handle even if reading or scanning raised.
        f.close()

    unique_hits = {
        k: KeyHit(
            key=k,
            abs_offset=o,
            chunk_index=o // chunk_size,
            offset_in_chunk=o % chunk_size,
        )
        for k, o in seen_once.items()
    }

    return unique_hits

In [34]:
# --- Example usage ---

# 1) Set paths (the exported index is stored next to the gzip file)
gz_path = "D:\\2026-01_254_39D0_in-network-rates_4_of_5.json.gz"
index_path = f"{gz_path}.gzi"

# 2) Build index (optional if already built)
build_and_export_index(gz_path, index_path)

# 3) Scan keys + offsets; the result maps each key seen exactly once to its KeyHit
unique_hits = scan_unique_keys_with_offsets(
    gz_path=gz_path,
    index_path=index_path,
    chunk_size=8*1024*1024,
    overlap_bytes=2048,
)

# Last expression of the cell: number of unique keys and the (key, KeyHit) pairs.
len(unique_hits), list(unique_hits.items())

Scanned 20 chunks, 167,772,160 decompressed bytes...
Scanned 40 chunks, 335,544,320 decompressed bytes...
Scanned 60 chunks, 503,316,480 decompressed bytes...
Scanned 80 chunks, 671,088,640 decompressed bytes...
Scanned 100 chunks, 838,860,800 decompressed bytes...
Scanned 120 chunks, 1,006,632,960 decompressed bytes...
Scanned 140 chunks, 1,174,405,120 decompressed bytes...
Scanned 160 chunks, 1,342,177,280 decompressed bytes...
Scanned 180 chunks, 1,509,949,440 decompressed bytes...
Scanned 200 chunks, 1,677,721,600 decompressed bytes...
Scanned 220 chunks, 1,845,493,760 decompressed bytes...
Scanned 240 chunks, 2,013,265,920 decompressed bytes...
Scanned 260 chunks, 2,181,038,080 decompressed bytes...
Scanned 280 chunks, 2,348,810,240 decompressed bytes...
Scanned 300 chunks, 2,516,582,400 decompressed bytes...
Scanned 320 chunks, 2,684,354,560 decompressed bytes...
Scanned 340 chunks, 2,852,126,720 decompressed bytes...
Scanned 360 chunks, 3,019,898,880 decompressed bytes...
Scanne

Scanned 2900 chunks, 24,326,963,200 decompressed bytes...
Scanned 2920 chunks, 24,494,735,360 decompressed bytes...
Scanned 2940 chunks, 24,662,507,520 decompressed bytes...
Scanned 2960 chunks, 24,830,279,680 decompressed bytes...
Scanned 2980 chunks, 24,998,051,840 decompressed bytes...
Scanned 3000 chunks, 25,165,824,000 decompressed bytes...
Scanned 3020 chunks, 25,333,596,160 decompressed bytes...
Scanned 3040 chunks, 25,501,368,320 decompressed bytes...
Scanned 3060 chunks, 25,669,140,480 decompressed bytes...
Scanned 3080 chunks, 25,836,912,640 decompressed bytes...
Scanned 3100 chunks, 26,004,684,800 decompressed bytes...
Scanned 3120 chunks, 26,172,456,960 decompressed bytes...
Scanned 3140 chunks, 26,340,229,120 decompressed bytes...
Scanned 3160 chunks, 26,508,001,280 decompressed bytes...
Scanned 3180 chunks, 26,675,773,440 decompressed bytes...
Scanned 3200 chunks, 26,843,545,600 decompressed bytes...
Scanned 3220 chunks, 27,011,317,760 decompressed bytes...
Scanned 3240 c

Scanned 5740 chunks, 48,150,609,920 decompressed bytes...
Scanned 5760 chunks, 48,318,382,080 decompressed bytes...
Scanned 5780 chunks, 48,486,154,240 decompressed bytes...
Scanned 5800 chunks, 48,653,926,400 decompressed bytes...
Scanned 5820 chunks, 48,821,698,560 decompressed bytes...
Scanned 5840 chunks, 48,989,470,720 decompressed bytes...
Scanned 5860 chunks, 49,157,242,880 decompressed bytes...
Scanned 5880 chunks, 49,325,015,040 decompressed bytes...
Scanned 5900 chunks, 49,492,787,200 decompressed bytes...
Scanned 5920 chunks, 49,660,559,360 decompressed bytes...
Scanned 5940 chunks, 49,828,331,520 decompressed bytes...
Scanned 5960 chunks, 49,996,103,680 decompressed bytes...
Scanned 5980 chunks, 50,163,875,840 decompressed bytes...
Scanned 6000 chunks, 50,331,648,000 decompressed bytes...
Scanned 6020 chunks, 50,499,420,160 decompressed bytes...
Scanned 6040 chunks, 50,667,192,320 decompressed bytes...
Scanned 6060 chunks, 50,834,964,480 decompressed bytes...
Scanned 6080 c

(5,
 [('reporting_entity_name',
   KeyHit(key='reporting_entity_name', abs_offset=1, chunk_index=0, offset_in_chunk=1)),
  ('reporting_entity_type',
   KeyHit(key='reporting_entity_type', abs_offset=56, chunk_index=0, offset_in_chunk=56)),
  ('last_updated_on',
   KeyHit(key='last_updated_on', abs_offset=108, chunk_index=0, offset_in_chunk=108)),
  ('version',
   KeyHit(key='version', abs_offset=140, chunk_index=0, offset_in_chunk=140)),
  ('in_network',
   KeyHit(key='in_network', abs_offset=62964453082, chunk_index=7505, offset_in_chunk=7950042))])

In [38]:
unique_hits

{'reporting_entity_name': KeyHit(key='reporting_entity_name', abs_offset=1, chunk_index=0, offset_in_chunk=1),
 'reporting_entity_type': KeyHit(key='reporting_entity_type', abs_offset=56, chunk_index=0, offset_in_chunk=56),
 'last_updated_on': KeyHit(key='last_updated_on', abs_offset=108, chunk_index=0, offset_in_chunk=108),
 'version': KeyHit(key='version', abs_offset=140, chunk_index=0, offset_in_chunk=140),
 'in_network': KeyHit(key='in_network', abs_offset=62964453082, chunk_index=7505, offset_in_chunk=7950042)}

In [7]:
def read_window_at_decompressed_offset(
    gz_path: str,
    abs_offset: int,
    index_path: Optional[str] = None,
    window: int = 4096,
):
    """Read ``window`` bytes starting at a decompressed offset.

    Args:
        gz_path: Path of the gzip file.
        abs_offset: Offset to seek to in the decompressed stream.
        index_path: Optional exported index; when given (and truthy) the file
            is opened with ``open_with_existing_index``, otherwise with
            ``open_indexed_gzip``.
        window: Number of bytes to request from the read.

    Returns:
        Whatever the read returns for that window.

    The handle is opened for this call only and is closed even when the seek
    or the read raises.
    """
    if index_path:
        f = open_with_existing_index(gz_path, index_path)
    else:
        f = open_indexed_gzip(gz_path)

    try:
        f.seek(abs_offset)
        return f.read(window)
    finally:
        f.close()

In [25]:
def read_span_between_offsets(
    f,
    start_offset: int,
    end_offset: int | None,
    extra: int = 256,
    max_bytes: int = 16 * 1024 * 1024,  # safety cap
) -> bytes:
    if start_offset < 0:
        raise ValueError("start_offset must be >= 0")

    f.seek(start_offset)

    if end_offset is not None:
        if end_offset <= start_offset:
            raise ValueError("end_offset must be greater than start_offset")
        n = (end_offset - start_offset) + extra
        return f.read(n)

    # end_offset is None → bounded read
    return f.read(max_bytes)

In [9]:
def _skip_whitespace(text: str, pos: int) -> int:
    """Return the index of the first non-whitespace character at or after ``pos``."""
    end = len(text)
    while pos < end and text[pos].isspace():
        pos += 1
    return pos


def _find_value_start(span_bytes: bytes, key_bytes: bytes):
    """Return the index where the value of ``key_bytes`` starts, or ``None``.

    An occurrence of the quoted key only counts when nothing but whitespace
    separates it from a following colon; occurrences that are string values
    (followed by ``,``, ``}`` ...) are skipped and the search continues.
    """
    whitespace = b" \t\r\n"
    limit = len(span_bytes)
    search_from = 0

    while True:
        hit = span_bytes.find(key_bytes, search_from)
        if hit < 0:
            return None

        pos = hit + len(key_bytes)
        while pos < limit and span_bytes[pos] in whitespace:
            pos += 1

        if pos < limit and span_bytes[pos:pos + 1] == b":":
            # Move to the first non-whitespace byte after the colon.
            pos += 1
            while pos < limit and span_bytes[pos] in whitespace:
                pos += 1
            return pos if pos < limit else None

        # Not used as a key here; look for a later occurrence.
        search_from = hit + 1


def _parse_up_to_n_array_elements_best_effort(text: str, array_start_idx: int, n: int):
    """
    Best-effort parse of up to n elements from a JSON array starting at '['.
    Returns a list with 0..n elements depending on how much is available in `text`.
    Never raises due to truncation; it stops when it can't continue.

    Raises ValueError when `array_start_idx` does not point at a '[' inside
    `text` (a caller error, unrelated to truncation).
    """
    if not 0 <= array_start_idx < len(text) or text[array_start_idx] != "[":
        raise ValueError("array_start_idx must be the index of '[' in text")

    decoder = json.JSONDecoder()
    end = len(text)
    pos = array_start_idx + 1  # after '['
    out = []

    while len(out) < n:
        pos = _skip_whitespace(text, pos)
        # Ran out of text, or reached the end of the array (covers "[]").
        if pos >= end or text[pos] == "]":
            break

        # Decode one element in place (no slicing, so the text is not copied
        # per element); a failure most likely means truncation, so stop.
        try:
            val, pos = decoder.raw_decode(text, pos)
        except (ValueError, RecursionError):
            break
        out.append(val)

        # Only a comma lets us continue; ']' or anything unexpected ends it.
        pos = _skip_whitespace(text, pos)
        if pos >= end or text[pos] != ",":
            break
        pos += 1

    return out


def extract_scalar_or_first_n_from_span(key: str, span_bytes: bytes, n: int = 5):
    """
    Best-effort extraction from span_bytes containing '"key": <value>'.

    Behavior:
      - If key not found in span_bytes (as a key, i.e. followed by ':'): return None
      - If value is scalar and fully present: return scalar
      - If scalar is truncated/unparseable: return None
      - If value is an object: return None
      - If value is array: return up to first n elements that can be parsed (0..n)
        If array elements are truncated: returns what it can (possibly [])

    Limitation: a number cut off exactly at the end of span_bytes still parses
    as a (shorter) number, so it cannot be told apart from a complete one.
    """
    if not span_bytes:
        return None

    key_bytes = b'"' + key.encode("utf-8") + b'"'

    value_start = _find_value_start(span_bytes, key_bytes)
    if value_start is None:
        return None

    # Decode only from the value onward, so encoding problems earlier in the
    # span are irrelevant; a multi-byte character cut by truncation becomes
    # U+FFFD instead of raising.
    tail_text = span_bytes[value_start:].decode("utf-8", errors="replace")

    # Array case: parse up to n elements best-effort
    if tail_text[0] == "[":
        return _parse_up_to_n_array_elements_best_effort(tail_text, 0, n)

    # Scalar/object case: parse one JSON value best-effort
    try:
        value, _ = json.JSONDecoder().raw_decode(tail_text)
    except (ValueError, RecursionError):
        return None

    # Only scalars are returned from this branch.
    if isinstance(value, (list, dict)):
        return None

    return value

In [43]:
# Pair every unique key with the offset where the next unique key begins.
targets = list(unique_hits.items())
last_position = len(targets) - 1
for i, (key_name, hit) in enumerate(targets):
    start = hit.abs_offset
    # The span ends where the following key starts; the final key has no
    # known end, which is reported as None.
    end = targets[i + 1][1].abs_offset if i < last_position else None
    print(key_name)
    print(start,end)
    # Follow-up step, currently disabled: read the span and extract the value.
#     span = read_span_between_offsets(f, start, end, extra=512)
#     val = extract_scalar_or_first_n_from_span(key_name, span, n=1)

reporting_entity_name
1 56
reporting_entity_type
56 108
last_updated_on
108 140
version
140 62964453082
in_network
62964453082 None


In [29]:
# NOTE(review): `val` is not assigned by any live code in the cells shown here
# (its only assignment is commented out) — presumably it holds the list returned
# by extract_scalar_or_first_n_from_span from an earlier run; confirm before re-running.
val[0]

{'negotiation_arrangement': 'ffs',
 'name': 'dmin influenza virus vac',
 'billing_code_type': 'HCPCS',
 'billing_code_type_version': '2025',
 'billing_code': 'G0008',
 'description': 'Administration of Influenza Virus Vaccine',
 'negotiated_rates': [{'provider_groups': [{'npi': [1699730218],
     'tin': {'type': 'ein', 'value': '20-0336147'}},
    {'npi': [1780799684], 'tin': {'type': 'ein', 'value': '27-1913972'}},
    {'npi': [1538572144, 1235340779, 1588616551],
     'tin': {'type': 'ein', 'value': '43-1980122'}},
    {'npi': [1114033016, 1346509718, 1669057535],
     'tin': {'type': 'ein', 'value': '47-3756681'}},
    {'npi': [1891880928, 1518910892],
     'tin': {'type': 'ein', 'value': '63-0697924'}},
    {'npi': [1114362688,
      1174943138,
      1740283266,
      1528379716,
      1386847374,
      1861495384],
     'tin': {'type': 'ein', 'value': '63-0772886'}},
    {'npi': [1366570640, 1528052818, 1518317106, 1033103189],
     'tin': {'type': 'ein', 'value': '63-0795136'}},