# Bronze to Silver Data Pipeline

This notebook ingests raw sales data from a CSV file stored in a Unity Catalog Volume and defines the `bronze_table` table that later processing steps build on.

In [None]:
import dlt
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Obtain the Spark session used by the cells below: getOrCreate() returns the
# active session when there is one and otherwise starts a new session under the
# application name "BronzeToSilver".
spark = (
    SparkSession.builder
    .appName("BronzeToSilver")
    .getOrCreate()
)

In [None]:
# Bronze table definition
@dlt.table(
    name="bronze_table",
    comment="Raw sales data ingested from CSV files"
)
def bronze_table():
    """Load the raw sales CSV and return it with ingestion metadata attached.

    The file is read with its header row and an explicit schema in which every
    field is nullable; schema inference is switched off. Two columns are then
    appended after the source columns:

    * ``ingestion_timestamp`` -- the value of ``current_timestamp()``.
    * ``source_file`` -- the constant string ``"sample_sales_data.csv"``
      (hard-coded here, not derived from the path that was actually read).

    Returns:
        The DataFrame handed to the ``@dlt.table`` decorator, which is
        configured with the name ``bronze_table``.
    """
    # (column name, Spark type) pairs describing the sales CSV layout.
    column_specs = [
        ("transaction_id", StringType()),
        ("date", StringType()),
        ("category", StringType()),
        ("product_name", StringType()),
        ("sales_amount", DoubleType()),
        ("quantity", IntegerType()),
        ("customer_id", StringType()),
    ]
    sales_schema = StructType(
        [
            StructField(field_name, field_type, True)
            for field_name, field_type in column_specs
        ]
    )

    # The source file lives in a Unity Catalog Volume.
    csv_reader = (
        spark.read
        .option("header", "true")
        .option("inferSchema", "false")
        .schema(sales_schema)
    )
    raw_sales = csv_reader.csv("/Volumes/main/default/demo/sample_sales_data.csv")

    # Tag every row with when it was ingested and which file it is attributed to.
    stamped = raw_sales.withColumn("ingestion_timestamp", current_timestamp())
    tagged = stamped.withColumn("source_file", lit("sample_sales_data.csv"))

    return tagged

In [None]:
# Optional one-time helper that generates sample sales data for testing.
# The code is wrapped in a triple-quoted string so it does NOT execute; remove
# the surrounding quotes to run it.
# NOTE(review): Spark's DataFrameWriter.csv() normally writes a directory of
# part files at the target path rather than a single CSV file -- confirm the
# bronze reader's path and its "source_file" value still make sense after
# running this.
"""
sample_data = [
    ("TXN001", "2024-01-01", "Electronics", "Laptop", 1200.00, 1, "CUST001"),
    ("TXN002", "2024-01-02", "Clothing", "T-Shirt", 25.99, 2, "CUST002"),
    ("TXN003", "2024-01-03", "Electronics", "Mouse", 15.50, 1, "CUST003"),
    ("TXN004", "2024-01-04", "Books", "Python Guide", 45.00, 1, "CUST001"),
    ("TXN005", "2024-01-05", "Clothing", "Jeans", 89.99, 1, "CUST004"),
    ("TXN006", "2024-01-06", "Electronics", "Keyboard", 75.00, 1, "CUST005"),
    ("TXN007", "2024-01-07", "Books", "Data Science", 55.00, 2, "CUST002"),
    ("TXN008", "2024-01-08", "Clothing", "Jacket", 199.99, 1, "CUST003"),
    ("TXN009", "2024-01-09", "Electronics", "Monitor", 350.00, 1, "CUST006"),
    ("TXN010", "2024-01-10", "Books", "Machine Learning", 65.00, 1, "CUST004")
]

columns = ["transaction_id", "date", "category", "product_name", "sales_amount", "quantity", "customer_id"]
sample_df = spark.createDataFrame(sample_data, columns)

# Save sample data to Unity Catalog Volume (run this once to create test data)
sample_df.coalesce(1).write.mode("overwrite").option("header", "true").csv("/Volumes/main/default/demo/sample_sales_data.csv")
"""

# Status message printed whenever this cell runs; it only reports the configured
# source path and does not itself trigger or verify any ingestion.
print("Bronze layer setup complete. Data will be ingested from /Volumes/main/default/demo/sample_sales_data.csv")