In [0]:
%sql
-- Bronze layer ingestion: session setup.
-- Point the SQL session at the catalog and schema in which the Bronze
-- tables are saved, so unqualified table names resolve there.

USE CATALOG vsqproject;
USE SCHEMA bronze;

In [0]:
import requests
import json
from pyspark.sql import functions as F

Configs

In [0]:
# Fetch the database connection details from the secret scope so that no
# credential is ever written into the notebook itself.
SECRET_SCOPE = "krsna-scope"

jdbc_url = dbutils.secrets.get(scope=SECRET_SCOPE, key="jdbcurl")
username = dbutils.secrets.get(scope=SECRET_SCOPE, key="dbusername")
password = dbutils.secrets.get(scope=SECRET_SCOPE, key="postgresdbpass")

Summary 
--
In this notebook, we connect securely to a PostgreSQL database over JDBC, using credentials stored in Azure Key Vault and retrieved through a Databricks secret scope.
We ingest raw product data into our Bronze layer, which acts as the raw landing zone. As part of this process, approximately 600 rows are ingested into the table, preserving the original structure and content.
This helps maintain data integrity for auditability and traceability before applying any transformations in the Silver layer.

In [0]:
# Land the raw stores CSV extract in Bronze as-is; the first line of the file
# supplies the column names.
stores_csv_path = 's3://sourcekrsna/vsqproject/raw_stores/raw_stores.csv'

df = (
    spark.read
    .format('csv')
    .option('header', 'true')
    .load(stores_csv_path)
)

# Each run replaces the previous contents of the table.
df.write.mode("overwrite").saveAsTable("raw_stores")

In [0]:
%python
connection_properties = {
  "user": username,
  "password": password,
  "driver": "org.postgresql.Driver"
}

df = spark.read.jdbc(url=jdbc_url, table="vsqproject.products", properties=connection_properties)
df.write.mode("overwrite").option("mergeSchema", "true").saveAsTable("raw_products")

In [0]:
# Pull the raw customers feed from the Mockaroo API, persist the payload to S3
# as the raw landing copy, then load that copy into the Bronze table.
# NOTE(review): the API key is embedded in the URL below; it should live in the
# secret scope like the database credentials -- confirm a secret exists for it.
url = "https://my.api.mockaroo.com/raw_customers.json?key=519b2bf0"
file_path = "s3://sourcekrsna/vsqproject/raw_customers/raw_customers.json"
REQUEST_TIMEOUT_SECONDS = 30

# The timeout stops the notebook hanging indefinitely on a stalled connection.
# raise_for_status() aborts on HTTP errors so that an error payload is never
# written over the landing file and the raw_customers table.
response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
response.raise_for_status()
data = response.json()

json_data = json.dumps(data, indent = 4)
dbutils.fs.put(file_path, json_data, overwrite=True)

# multiLine is needed because the landing file is a single pretty-printed JSON
# document (written with indent=4 above), not one JSON record per line.
df = spark.read.format("json").option('multiLine', True).load(file_path)
# customerid is cast to string before being saved to the Bronze table.
df = df.withColumn("customerid", F.col('customerid').cast('string'))
df.write.mode("overwrite").saveAsTable("raw_customers")

In [0]:
# Incrementally ingest the raw sales CSV files with Auto Loader (cloudFiles).
# Auto Loader keeps its schema information under cloudFiles.schemaLocation.
df = (
    spark.readStream
    .format('cloudFiles')
    .option('cloudFiles.format', 'csv')
    .option('cloudFiles.schemaLocation', 's3://bronzebucketkrsna/vsqproject/raw_sales/schema/')
    .load('s3://sourcekrsna/vsqproject/raw_sales/')
)

# Remove the _rescued_data column so it is not stored in the Bronze table.
# NOTE(review): presumably this is Auto Loader's rescued-data column; dropping
# it discards any values that did not fit the schema -- confirm this is intended.
df_clean = df.drop('_rescued_data')

# availableNow processes everything currently available and then stops; it
# replaces the deprecated trigger(once=True). The checkpoint location holds the
# stream's progress between runs.
sales_query = (
    df_clean.writeStream
    .format('delta')
    .option('checkpointLocation', 's3://bronzebucketkrsna/vsqproject/raw_sales/checkpoint/')
    .trigger(availableNow=True)
    .toTable('vsqproject.bronze.raw_sales')
)

# Block until the run finishes so that any streaming failure is raised in this
# cell instead of being lost in a background query.
sales_query.awaitTermination()