In [0]:
import dlt
from pyspark.sql.functions import (
    col, expr, to_date, year, current_date, upper, trim, regexp_replace, when, lit, sha2, concat_ws, lower
)
from pyspark.sql.types import (
    StringType, IntegerType, DoubleType, DateType, TimestampType
)

# --- Bronze Claims (Batch Read) ---
@dlt.table(
    name="bronze_claims_batch",
    comment="Raw claims ingested from CSV file using batch read.",
    table_properties={"quality": "bronze", "data_source": "claims_csv_batch"},
)
def bronze_claims_batch():
    """Batch-read the claims CSV file for the ``bronze_claims_batch`` table.

    The CSV is read with the ``header`` option enabled and an explicit DDL
    schema string applied to the reader.

    Returns:
        The DataFrame produced by the batch CSV reader.
    """
    claims_ddl = "ClaimID STRING, MemberID STRING, ProviderID STRING, ClaimDate DATE, ServiceDate DATE, Amount Double, status STRING, ICD10Codes STRING, CPTCodes STRING, ClaimType STRING, SubmissionChannel STRING, Notes STRING, IngestTimestamp TIMESTAMP"
    source_path = "/Volumes/jk_unity/default/volume/claims_batch.csv"

    # Configure the reader first, then bind the schema and load the file.
    csv_reader = spark.read.format("csv").option("header", "true")
    return csv_reader.schema(claims_ddl).load(source_path)

# --- Bronze Members (Batch Read) ---
@dlt.table(
    name="bronze_members_batch",
    comment="Raw members ingested from CSV file using batch read.",
    table_properties={"quality": "bronze", "data_source": "members_batch"},
)
def bronze_members_batch():
    """Batch-read the members CSV file for the ``bronze_members_batch`` table.

    The CSV is read with the ``header`` option enabled and an explicit DDL
    schema string applied to the reader.

    Returns:
        The DataFrame produced by the batch CSV reader.
    """
    members_ddl = "MemberID STRING, Name STRING, DOB DATE, Gender STRING, Region STRING, PlanType STRING, EffectiveDate DATE, Email STRING, IsActive INTEGER, LastUpdated DATE"
    source_path = "/Volumes/jk_unity/default/volume/members.csv"

    # Configure the reader first, then bind the schema and load the file.
    csv_reader = spark.read.format("csv").option("header", "true")
    return csv_reader.schema(members_ddl).load(source_path)

# --- Bronze diagnosis_ref (Batch Read) ---
@dlt.table(
    name="bronze_diagnosis_ref_batch",
    comment="Raw diagnosis_ref ingested from CSV file using batch read.",
    table_properties={"quality": "bronze", "data_source": "diagnosis_ref_batch"},
)
def bronze_diagnosis_ref_batch():
    """Batch-read the diagnosis reference CSV for ``bronze_diagnosis_ref_batch``.

    The CSV is read with the ``header`` option enabled and a two-column
    DDL schema (``Code``, ``Description``) applied to the reader.

    Returns:
        The DataFrame produced by the batch CSV reader.
    """
    diagnosis_ddl = "Code STRING, Description STRING"
    source_path = "/Volumes/jk_unity/default/volume/diagnosis_ref.csv"

    # Configure the reader first, then bind the schema and load the file.
    csv_reader = spark.read.format("csv").option("header", "true")
    return csv_reader.schema(diagnosis_ddl).load(source_path)

# --- Bronze Providers (Batch Read) ---
@dlt.table(
    name="bronze_providers_batch",
    comment="Raw providers ingested from JSON file using batch read.",
    table_properties={"quality": "bronze", "data_source": "providers_batch"},
)
def bronze_providers_batch():
    """Batch-read the providers JSON file for ``bronze_providers_batch``.

    No explicit schema and no reader options are passed here; the JSON
    reader is used with its defaults.

    Returns:
        The DataFrame produced by the batch JSON reader.
    """
    source_path = "/Volumes/jk_unity/default/volume/providers.json"

    json_reader = spark.read.format("json")
    return json_reader.load(source_path)

# --- Bronze Claims (Streaming Read) ---
@dlt.table(
    name="bronze_claims_stream",
    comment="Raw claims ingested from JSON file using streaming read.",
    table_properties={"quality": "bronze", "data_source": "claims_json_stream"},
)
def bronze_claims_stream():
    """Stream claims JSON data for the ``bronze_claims_stream`` table.

    Uses ``spark.readStream`` with the JSON format and an explicit DDL
    schema string; the load path is the ``claims_stream/`` location
    rather than an individual file.

    Returns:
        The streaming DataFrame produced by the JSON stream reader.
    """
    claims_ddl = "ClaimID STRING, MemberID STRING, ProviderID STRING, ClaimDate DATE, ServiceDate DATE, Amount Double, status STRING, ICD10Codes STRING, CPTCodes STRING, ClaimType STRING, SubmissionChannel STRING, Notes STRING, IngestTimestamp TIMESTAMP"
    source_dir = "/Volumes/jk_unity/default/volume/claims_stream/"

    # Bind format and schema on the stream reader, then attach the source path.
    stream_reader = spark.readStream.format("json").schema(claims_ddl)
    return stream_reader.load(source_dir)
        
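# NOTE: bronze_claims_stream passes a directory (not a single file) to
# spark.readStream.load(); pointing it at a file was the cause of an earlier
# error, which was fixed by switching the path to a directory.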