# Bronze Layer Data Processing

## Ingest Raw Data
Capture incoming data from various sources without applying transformations.

In [None]:
# Ingest every CSV file in the landing zone as-is.
# The first line of each file is treated as the header row; no schema is
# supplied, so no typing or other transformation is applied here.
raw_df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load('/mnt/landing/raw_files/*.csv')
)

## Raw Data Storage
Save the raw data to a persistent format for auditing and reprocessing.

In [None]:
# Append the untouched raw rows to the Bronze table as Parquet files.
# 'append' mode adds to whatever is already stored at the target path
# instead of replacing it.
(
    raw_df.write
    .format('parquet')
    .mode('append')
    .save('/mnt/bronze/raw_table')
)

## Partitioning
Organize the data by a relevant criterion to optimize queries and storage.

In [None]:
# Write data partitioned by ingestion date.
# current_date() is imported here because this cell is the only place that
# uses it and no earlier cell brings it into scope.
from pyspark.sql.functions import current_date

# The chain is wrapped in parentheses rather than joined with backslashes:
# a backslash followed by anything other than a newline is a syntax error.
(
    raw_df
    # Stamp every row with the date on which this load runs.
    .withColumn('ingest_date', current_date())
    .write.format('parquet')
    # One output sub-directory per distinct ingest_date value.
    .partitionBy('ingest_date')
    .mode('append')
    .save('/mnt/bronze/partitioned_raw_table')
)

## Basic Validation
Check for file integrity and record ingestion metadata.

In [None]:
from pyspark.sql.functions import input_file_name, current_timestamp

# Attach ingestion metadata to every raw row:
#   source_file - the file the row was read from
#   ingest_time - the timestamp at which this load is evaluated
# The chain is wrapped in parentheses rather than joined with backslashes:
# a backslash followed by anything other than a newline is a syntax error.
validated_df = (
    raw_df
    .withColumn('source_file', input_file_name())
    .withColumn('ingest_time', current_timestamp())
)

## Initial Deduplication
Remove records that are exact duplicates across all columns.

In [None]:
# Keep one copy of each fully identical row. Every column takes part in the
# comparison, including the source_file / ingest_time metadata columns.
dedup_df = validated_df.distinct()

## Flexible Schema
Apply a minimal or automatically inferred schema for semi-structured data.

In [None]:
from pyspark.sql.types import StructType, StructField, StringType

# Minimal explicit schema: a single nullable string column named 'id'.
schema = StructType([
    StructField('id', StringType(), True),
])

# Read the JSON landing folder with that schema. multiLine=True lets a single
# JSON record span several lines instead of requiring one record per line.
flex_df = spark.read.json(
    '/mnt/landing/json_files/',
    schema=schema,
    multiLine=True,
)