In [0]:
# ==============================================================================
# STEP 1: DEFINE FILE PATHS (Configuration)
# ==============================================================================
# WHAT: Name the location of each raw CRM CSV file under the '/Volumes' mount.
# WHY:  Keeping the paths together, apart from the read logic, means a moved or
#       renamed file only requires a change in this one place.
cust_info_path = '/Volumes/workspace/bronze/raw_resources/crm/cust_info.csv'
prd_info_path = '/Volumes/workspace/bronze/raw_resources/crm/prd_info.csv'
sales_details_path = '/Volumes/workspace/bronze/raw_resources/crm/sales_details.csv'

# ==============================================================================
# STEP 2: READ RAW DATA (Ingestion)
# ==============================================================================
# WHAT: Load each CSV into its own Spark DataFrame with one shared set of options.
#       header=True: the first row of the file supplies the column names.
#       inferSchema=True: Spark derives each column's type from the file contents.
# WHY:  Sharing the options guarantees all three CRM files are parsed the same way.
#       Note: schema inference costs an additional pass over the data, which is
#       acceptable for an initial raw load but worth revisiting for very large files.
_crm_csv_options = {'header': True, 'inferSchema': True}

cust_info_df = spark.read.csv(cust_info_path, **_crm_csv_options)
prd_info_df = spark.read.csv(prd_info_path, **_crm_csv_options)
sales_details_df = spark.read.csv(sales_details_path, **_crm_csv_options)

# ==============================================================================
# STEP 3: VISUALIZE (Sanity Check)
# ==============================================================================
# WHAT: Render a preview of every freshly loaded CRM DataFrame in the notebook output.
# WHY:  A quick visual pass catches misaligned headers or garbled characters
#       before anything is persisted.
for _crm_frame in (cust_info_df, prd_info_df, sales_details_df):
    display(_crm_frame)

# ==============================================================================
# STEP 4: WRITE TO BRONZE (Persistence)
# ==============================================================================
# WHAT: Persist each CRM DataFrame as a table in the 'workspace.bronze' schema.
#       mode('overwrite'): an existing table's contents are replaced, not appended to.
# WHY:  1. 'saveAsTable' registers the data as a named table that can be queried later.
#       2. Replacing instead of appending keeps reruns from piling up duplicate rows.
#       3. The 'crm_' prefix records which source system each table came from.
# NOTE(review): column types are inferred at read time, so they could differ between
#       runs; confirm how the overwrite behaves if the existing table's schema no
#       longer matches.
_crm_bronze_targets = (
    (cust_info_df, 'workspace.bronze.crm_cust_info'),
    (prd_info_df, 'workspace.bronze.crm_prd_info'),
    (sales_details_df, 'workspace.bronze.crm_sales_details'),
)

for _crm_frame, _crm_table in _crm_bronze_targets:
    _crm_frame.write.mode('overwrite').saveAsTable(_crm_table)

# ==============================================================================
# STEP 5: VERIFY (Post-Write Check)
# ==============================================================================
# WHAT: Load each CRM Bronze table back by name and display it.
# WHY:  Reading from the catalog, rather than reusing the in-memory DataFrames,
#       shows the tables exist and are readable for downstream (Silver) processing.
for _crm_table in (
    'workspace.bronze.crm_cust_info',
    'workspace.bronze.crm_prd_info',
    'workspace.bronze.crm_sales_details',
):
    display(spark.table(_crm_table))

In [0]:
# Locations of the raw ERP CSV files, one per source extract
erp_cust_az12_path = '/Volumes/workspace/bronze/raw_resources/erp/CUST_AZ12.csv'
erp_loc_a101_path = '/Volumes/workspace/bronze/raw_resources/erp/LOC_A101.csv'
erp_px_cat_g1v2_path = '/Volumes/workspace/bronze/raw_resources/erp/PX_CAT_G1V2.csv'

# Load every ERP file into its own DataFrame with identical parsing options:
# the first row provides column names and column types are inferred from the data
_erp_csv_options = {'header': True, 'inferSchema': True}

erp_cust_az12_df = spark.read.csv(erp_cust_az12_path, **_erp_csv_options)
erp_loc_a101_df = spark.read.csv(erp_loc_a101_path, **_erp_csv_options)
erp_px_cat_g1v2_df = spark.read.csv(erp_px_cat_g1v2_path, **_erp_csv_options)

# Preview each loaded ERP DataFrame in the notebook output before persisting it
for _erp_frame in (erp_cust_az12_df, erp_loc_a101_df, erp_px_cat_g1v2_df):
    display(_erp_frame)

# Persist each ERP DataFrame as a Bronze table; the 'erp_' prefix marks the source
# system, and 'overwrite' replaces existing contents so reruns do not append duplicates
_erp_bronze_targets = (
    (erp_cust_az12_df, 'workspace.bronze.erp_cust_az12'),
    (erp_loc_a101_df, 'workspace.bronze.erp_loc_a101'),
    (erp_px_cat_g1v2_df, 'workspace.bronze.erp_px_cat_g1v2'),
)

for _erp_frame, _erp_table in _erp_bronze_targets:
    _erp_frame.write.mode('overwrite').saveAsTable(_erp_table)

# Confirm the writes by loading each ERP Bronze table back by name and displaying it
for _erp_table in (
    'workspace.bronze.erp_cust_az12',
    'workspace.bronze.erp_loc_a101',
    'workspace.bronze.erp_px_cat_g1v2',
):
    display(spark.table(_erp_table))