# Compare `main_schema.json` vs Spark-Inferred Schema by Group

This notebook intentionally **does not use `main_schema.json` for parsing**.
It infers the schema directly from each group's `*.json.gz` payload files and then compares the result to that group's `main_schema.json`.

In [1]:
from pathlib import Path
import os
import json
import yaml

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, ArrayType

# Load notebook settings from the YAML config one directory above the working directory.
CONFIG_PATH = Path("../config.yaml")
with CONFIG_PATH.open("r", encoding="utf-8") as f:
    cfg = yaml.safe_load(f)

plan_download_dir = Path(cfg["paths"]["plan_download_directory"])

# The `spark` section is optional. It may also be present but empty, in which case
# YAML yields None; `or {}` treats both cases as "no Spark overrides" instead of
# raising AttributeError on None.
python_executable = (cfg.get("spark") or {}).get("python_executable")
if python_executable:
    # Export the configured interpreter through PySpark's environment variables.
    os.environ["PYSPARK_PYTHON"] = python_executable
    os.environ["PYSPARK_DRIVER_PYTHON"] = python_executable
    print(f"spark.python_executable: {python_executable}")
print(f"plan_download_directory: {plan_download_dir}")
print(f"exists: {plan_download_dir.exists()}")

spark.python_executable: C:\Users\ichoi\Documents\notitle\.venv\Scripts\python.exe
plan_download_directory: D:\payer_mrf_2\workspace
exists: True


In [2]:
# Collect the sub-directories of the download directory whose names start with "group_".
group_dirs = sorted(
    entry
    for entry in plan_download_dir.iterdir()
    if entry.is_dir() and entry.name.startswith("group_")
)
print(f"Found {len(group_dirs)} group folder(s)")

# For each group, report how many *.json.gz files it holds and whether main_schema.json exists.
for group_dir in group_dirs:
    payload_count = len(list(group_dir.glob("*.json.gz")))
    has_main_schema = (group_dir / "main_schema.json").exists()
    print(f"[group={group_dir.name}] files={payload_count} main_schema_exists={has_main_schema}")

Found 1 group folder(s)
[group=group_e2a4f730934b] files=26 main_schema_exists=True


In [3]:
# Best-effort stop of a session left over from an earlier run of this cell.
# Any error is ignored, including `spark` not being defined yet on a fresh kernel.
try:
    spark.stop()
except Exception:
    pass

builder = SparkSession.builder.appName("SchemaCompare-Inferred-vs-Main")
if python_executable:
    # Apply the configured interpreter to both PySpark python settings.
    for conf_key in ("spark.pyspark.python", "spark.pyspark.driver.python"):
        builder = builder.config(conf_key, python_executable)
spark = builder.getOrCreate()
spark.sparkContext.setLogLevel("WARN")
print("Spark session ready")

Spark session ready


In [4]:
def flatten_schema(dtype, prefix=""):
    """Flatten a Spark data type into a ``{path: simpleString}`` mapping.

    Struct fields are walked recursively and joined to ``prefix`` with ``.``.
    An array is recorded at ``prefix`` as ``array<element>``; when its element
    is a struct, the element's fields are also flattened under ``prefix[]``.
    Any other type is recorded at ``prefix`` with its ``simpleString()``.
    """
    if isinstance(dtype, StructType):
        flat = {}
        for field in dtype.fields:
            child_path = field.name if not prefix else f"{prefix}.{field.name}"
            flat.update(flatten_schema(field.dataType, child_path))
        return flat

    if not isinstance(dtype, ArrayType):
        # Leaf type: a single entry keyed by the path accumulated so far.
        return {prefix: dtype.simpleString()}

    element = dtype.elementType
    flat = {prefix: f"array<{element.simpleString()}>"}
    if isinstance(element, StructType):
        # Expose the element struct's fields as well, under a "[]" suffix.
        flat.update(flatten_schema(element, f"{prefix}[]"))
    return flat

def infer_schema_from_group(group_dir: Path):
    """Let Spark infer a schema from every ``*.json.gz`` file in ``group_dir``.

    The files are read with the ``multiLine`` JSON option and without passing
    any schema to the reader. Returns the resulting DataFrame's schema, or
    ``None`` when the folder contains no ``*.json.gz`` files.
    """
    payload_paths = [str(path) for path in sorted(group_dir.glob("*.json.gz"))]
    if payload_paths:
        # No schema is supplied here, so main_schema.json plays no part in the result.
        reader = spark.read.option("multiLine", "true")
        return reader.json(payload_paths).schema
    return None

def load_main_schema(group_dir: Path):
    """Read ``main_schema.json`` from ``group_dir`` as a Spark ``StructType``.

    Returns ``None`` when the file does not exist.
    """
    schema_path = group_dir / "main_schema.json"
    if schema_path.exists():
        schema_json = json.loads(schema_path.read_text(encoding="utf-8"))
        return StructType.fromJson(schema_json)
    return None

In [5]:
# One summary record per group; later cells display and persist this list.
rows = []

for group_dir in group_dirs:
    tag = f"[group={group_dir.name}]"
    print(f"\n{tag} starting comparison")

    # Resolve both schemas up front; each helper returns None when its input is absent.
    inferred = infer_schema_from_group(group_dir)
    main_schema = load_main_schema(group_dir)

    # A missing payload is reported ahead of a missing main schema.
    if inferred is None:
        print(f"{tag} no json.gz files found")
        rows.append({"group": group_dir.name, "status": "no_json_files"})
        continue
    if main_schema is None:
        print(f"{tag} main_schema.json missing")
        rows.append({"group": group_dir.name, "status": "missing_main_schema"})
        continue

    inferred_types = flatten_schema(inferred)
    main_types = flatten_schema(main_schema)

    # Dict key views support set algebra directly.
    inferred_paths = inferred_types.keys()
    main_paths = main_types.keys()

    only_in_inferred = sorted(inferred_paths - main_paths)
    only_in_main = sorted(main_paths - inferred_paths)
    # Paths present on both sides whose flattened type strings differ.
    type_mismatch = sorted(
        path
        for path in inferred_paths & main_paths
        if inferred_types[path] != main_types[path]
    )

    inferred_count = len(inferred_types)
    main_count = len(main_types)
    print(f"{tag} inferred_fields={inferred_count} main_fields={main_count}")
    print(
        f"{tag} only_in_inferred={len(only_in_inferred)} "
        f"only_in_main={len(only_in_main)} type_mismatch={len(type_mismatch)}"
    )

    # Samples are capped at the first 10 entries of each sorted list.
    mismatch_sample = [
        {"field": path, "inferred": inferred_types[path], "main": main_types[path]}
        for path in type_mismatch[:10]
    ]
    rows.append({
        "group": group_dir.name,
        "status": "ok",
        "inferred_field_count": inferred_count,
        "main_field_count": main_count,
        "only_in_inferred_count": len(only_in_inferred),
        "only_in_main_count": len(only_in_main),
        "type_mismatch_count": len(type_mismatch),
        "only_in_inferred_sample": only_in_inferred[:10],
        "only_in_main_sample": only_in_main[:10],
        "type_mismatch_sample": mismatch_sample,
    })


[group=group_e2a4f730934b] starting comparison
[group=group_e2a4f730934b] inferred_fields=23 main_fields=20
[group=group_e2a4f730934b] only_in_inferred=7 only_in_main=4 type_mismatch=3


In [6]:
# Show the per-group summary as a table; if anything in that path fails
# (e.g. pandas cannot be imported), print each record as plain text instead.
try:
    import pandas as pd

    summary_frame = pd.DataFrame(rows)
    display(summary_frame)
except Exception:
    for row in rows:
        print(row)

Unnamed: 0,group,status,inferred_field_count,main_field_count,only_in_inferred_count,only_in_main_count,type_mismatch_count,only_in_inferred_sample,only_in_main_sample,type_mismatch_sample
0,group_e2a4f730934b,ok,23,20,7,4,3,"[last_updated_on, provider_references, provide...","[last_updated_on.value, reporting_entity_name....","[{'field': 'in_network', 'inferred': 'array<st..."


In [7]:
# Persist the per-group comparison records as indented JSON in the working directory.
output_path = Path("schema_compare_results.json")
report_text = json.dumps(rows, indent=2)
output_path.write_text(report_text, encoding="utf-8")
print(f"Wrote comparison report: {output_path.resolve()}")

Wrote comparison report: C:\Users\ichoi\Documents\Unraveled\mrf_final_output\notebooks\schema_compare_results.json
