In [5]:
import sys
# Make the project checkout importable from this notebook.
project_root = "/mnt/c/Users/HP/Learning/DE_Development"
already_on_path = project_root in sys.path
if not already_on_path:
    # Appended at the end, so entries already on sys.path keep precedence.
    sys.path.append(project_root)

In [8]:
from pyspark.sql import functions as F, types as T
from pyspark.sql import SparkSession
import yaml, os, sys
from src.utils.spark_utils import create_spark


    
# --- Pipeline configuration ---
# Storage locations and Spark settings are all read from one YAML file.
config_path = '/mnt/c/Users/HP/Learning/DE_Development/config/pipeline_config_wsl.yaml'

with open(config_path, "r") as f:
    config_file = yaml.safe_load(f)

paths_cfg = config_file["paths"]
# Location entries are expected to embed this literal token, which is replaced by hand
# below with the configured base path. Entries without the token are left unchanged.
base_placeholder = "${paths.base_path}"

base_path = paths_cfg["base_path"]
print(base_path)

# Landing area: the directory comes from config, the file name is fixed to one daily extract.
landing_batch = paths_cfg["landing"]["batch"].replace(base_placeholder, base_path)
landing_batch_sales = os.path.join(landing_batch, "daily_sales_20240115.csv")
print(landing_batch_sales)

# Bronze target directory for the sales dataset.
bronze_sales_path = paths_cfg["bronze"]["sales"].replace(base_placeholder, base_path)
print(bronze_sales_path)

# Session is built by the project helper from the app name and config mapping in the YAML.
spark_cfg = config_file["spark"]
spark = create_spark(spark_cfg["app_name"], spark_cfg["configs"])



# Explicit schema for the sales CSV, declared as (column name, Spark type) pairs.
# Every field is created nullable.
sales_fields = [
    ("transaction_id", T.StringType()),
    ("date", T.StringType()),            # read as text, not as a DateType
    ("product_id", T.StringType()),
    ("store_id", T.StringType()),
    ("quantity", T.IntegerType()),
    ("total_amount", T.DoubleType()),
    ("payment_method", T.StringType()),  # kept as string; conversion is deferred
    ("region", T.StringType()),
]
sales_schema = T.StructType(
    [T.StructField(field_name, field_type, True) for field_name, field_type in sales_fields]
)


# Read the landing CSV using the declared schema; the file's first line is a header row.
csv_reader = spark.read.option("header", True).schema(sales_schema)
sales_raw = csv_reader.csv(landing_batch_sales)
sales_raw.show(5)



# Row fingerprint: SHA-256 over every source column, cast to string and joined with "||".
# NULLs are replaced by "" first, so a missing value still occupies its slot in the
# joined text instead of being dropped by concat_ws.
hash_inputs = [
    F.coalesce(F.col(col_name).cast("string"), F.lit(""))
    for col_name in sales_raw.columns
]
record_hash = F.sha2(F.concat_ws("||", *hash_inputs), 256)

# Bronze = raw rows plus audit columns (load time, originating file, batch date, fingerprint).
sales_bronze = (
    sales_raw
    .withColumn("ingestion_timestamp", F.current_timestamp())
    .withColumn("source_file", F.input_file_name())
    .withColumn("batch_date", F.current_date())
    .withColumn("record_hash", record_hash)
)

# Basic DQ: required identifiers must be present and total_amount must not be negative.
#
# The null guard sits on the same column as the comparison on purpose. Spark SQL uses
# three-valued logic, so `total_amount >= 0` is NULL for a NULL amount. Guarding a
# different column (such as quantity) would leave dq_is_valid NULL for those rows and
# would let negative amounts pass whenever that other column was missing.
# NOTE(review): a missing total_amount is accepted by this rule; tighten it to
# isNotNull() if amounts are mandatory.
required_cols = [F.col("transaction_id").isNotNull(), F.col("product_id").isNotNull()]
amount_ok = F.col("total_amount").isNull() | (F.col("total_amount") >= F.lit(0.0))
dq_conditions = required_cols + [amount_ok]

# AND all rules together so the flag is derived from the single list of rules above
# rather than from a hand-copied duplicate of them.
dq_valid = dq_conditions[0]
for dq_rule in dq_conditions[1:]:
    dq_valid = dq_valid & dq_rule

# Rows are only flagged, never dropped: every input row is kept in sales_bronze.
sales_bronze = sales_bronze.withColumn("dq_is_valid", dq_valid)


sales_bronze.show(5, truncate = False)


# Persist the bronze batch as Parquet, one sub-directory per batch_date value.
#
# On a partitioned path, mode("overwrite") replaces the whole dataset by default
# (static partition overwrite), which would delete every earlier batch_date partition
# on each run. With dynamic partition overwrite only the partitions present in this
# DataFrame are replaced: re-running the same day overwrites that day's data, and
# previously loaded days are left in place.
(sales_bronze
  .repartition(1)  # collapse to one in-memory partition before writing
  .write
  .mode("overwrite")
  .option("partitionOverwriteMode", "dynamic")
  .partitionBy("batch_date")
  .parquet(bronze_sales_path))

# Sanity check: reload what is stored at the bronze path, show a sample, report the row count.
bronze_sales_read = spark.read.parquet(bronze_sales_path)
print("=== BRONZE DATA READ BACK ===")
bronze_sales_read.show(n=10, truncate=False)
bronze_row_count = bronze_sales_read.count()
print(f"Total rows in bronze: {bronze_row_count}")


/mnt/c/Users/HP/Learning/DE_Development
/mnt/c/Users/HP/Learning/DE_Development/data/landing/batch/daily_sales_20240115.csv
/mnt/c/Users/HP/Learning/DE_Development/data/bronze/sales
+--------------+----------+----------+--------+--------+------------+--------------+------+
|transaction_id|      date|product_id|store_id|quantity|total_amount|payment_method|region|
+--------------+----------+----------+--------+--------+------------+--------------+------+
|        TXN001|15-01-2024|      P001|    S101|       5|      1225.0|   Credit Card| North|
|        TXN002|15-01-2024|      P002|    S102|       3|       555.0|          Cash| South|
|        TXN003|15-01-2024|      P004|    S101|      10|       280.0|           UPI| North|
|        TXN004|15-01-2024|      P001|    S103|       2|       490.0|    Debit Card|  West|
|        TXN005|15-01-2024|      P005|    S104|       8|       960.0|           UPI|  East|
+--------------+----------+----------+--------+--------+------------+-------------

NameError: name 'product_bronze' is not defined