In [0]:
# This cell ingests data into bronze via batch processing
from pyspark.sql.types import StructType, StructField, StringType, ArrayType, BooleanType

# Volume directory that holds the source files to ingest
github_files = "/Volumes/leomar/1bronze/github_files"

# List every entry found directly under the source directory
Files = dbutils.fs.ls(github_files)

# Snapshot of the listing; this is the batch the ingestion loop walks
batch_files = list(Files)

# Explicit schema for provider records; every field is declared nullable.

# Struct describing one element of the Locations array
location_struct = (
    StructType()
    .add("Address", StringType(), True)
    .add("City", StringType(), True)
    .add("State", StringType(), True)
)

provider_schema = (
    StructType()
    .add("ProviderID", StringType(), True)
    .add("Name", StringType(), True)
    .add("Specialties", ArrayType(StringType()), True)
    .add("Locations", ArrayType(location_struct), True)
    .add("IsActive", BooleanType(), True)
    .add("TIN", StringType(), True)
    # LastVerified is kept as a raw string; no date parsing happens here
    .add("LastVerified", StringType(), True)
)

# Ingest every supported file in batch_files into its own Bronze Delta table.
#
# For each entry: pick a reader from the file extension (.csv / .json), add
# lineage columns (_source_file, _ingestion_timestamp), and overwrite the table
# leomar.1bronze.<file name up to the first '.'>. Entries with any other
# extension (including directories) are reported and skipped.

# Imported once here instead of on every loop iteration.
from pyspark.sql.functions import current_timestamp, lit

ingested_count = 0
skipped_files = []  # names of entries that had no supported reader

for file_info in batch_files:
    file_path = file_info.path
    file_name = file_info.name
    # Table name is everything before the FIRST dot, so "a.b.csv" -> "a".
    # NOTE(review): two files sharing a stem (e.g. x.csv and x.json) target the
    # same table, and because the write mode is "overwrite" the later one wins.
    base_table_name = file_name.split('.')[0]

    print(f"\nStarting ingestion for: {file_name}")

    # Choose the reader from the file extension
    if file_name.endswith('.csv'):
        df = (spark.read
              .format("csv")
              .option("header", "true")
              .option("inferSchema", "true")
              # Quote and escape are both '"', i.e. embedded quotes are doubled
              .option("escape", '"')
              .option("quote", '"')
              .load(file_path)
             )

    elif file_name.endswith('.json'):
        # Single read with the explicit schema. The original code first ran a
        # schema-inferring read and immediately discarded it, which cost an
        # extra pass over the file without affecting the result.
        # NOTE(review): provider_schema is applied to every .json file, and it
        # has no "_corrupt_record" field, so malformed records are not kept in
        # a dedicated column - confirm this is intended for Bronze.
        df = (spark.read
              .schema(provider_schema)
              .option("mode", "PERMISSIVE")  # Spark's default, stated explicitly
              .json(file_path)
             )
    else:
        print(f"Unsupported file format for: {file_name}...")
        skipped_files.append(file_name)
        continue

    # Add metadata columns - USE UC-COMPATIBLE METHOD
    df_with_metadata = (df
        .withColumn("_source_file", lit(file_path))  # Use lit() instead of input_file_name()
        .withColumn("_ingestion_timestamp", current_timestamp())
    )

    full_target_table_name = f"leomar.1bronze.{base_table_name}"

    # Write the DataFrame to the Bronze Delta Table, replacing data and schema
    (df_with_metadata.write
     .format("delta")
     .mode("overwrite")
     .option("overwriteSchema", "true")
     .saveAsTable(full_target_table_name)
    )
    ingested_count += 1

    # Count from the table that was just written rather than df.count(), which
    # would re-scan (and for CSV re-parse) the source file a second time.
    records_written = spark.table(full_target_table_name).count()

    print(f"Successfully ingested {file_name} into table: {full_target_table_name}")
    print(f"Number of records written: {records_written}")

# Only claim full success when nothing was skipped and something was ingested
if skipped_files or ingested_count == 0:
    print(f"\nIngested {ingested_count} file(s); "
          f"skipped {len(skipped_files)} unsupported: {skipped_files}")
else:
    print("\nAll files ingested successfully!")