# Pipeline and Medallion Architecture

In this notebook we are going to combine the ideas from:
- [Example medallion architecture](https://docs.databricks.com/aws/en/lakehouse/medallion#example-medallion-architecture)
- [Load data with Lakeflow Declarative Pipelines](https://docs.databricks.com/aws/en/ldp/load)
- [Manage data quality with pipeline expectations](https://docs.databricks.com/aws/en/ldp/expectations)

In [None]:
from pyspark.sql import functions as F
from pyspark import pipelines as dp

# Landing zone directories to read the raw data

- In this example we are using the fake generated data stored in the Managed Volume.
- In a real scenario, the landing zone could be:
    - A cloud object storage path: S3, ADLS, GCS
    - An external volume path: access existing cloud object storage using volume-like paths

In [None]:
# Pipeline parameters read from the Spark configuration; each one falls back
# to an empty string when its key is not set.
volume, bronze_schema, silver_schema, gold_schema = (
    spark.conf.get(conf_key, "")
    for conf_key in (
        "landing_zone_volume",
        "bronze_schema",
        "silver_schema",
        "gold_schema",
    )
)

# One landing-zone sub-directory per source dataset.
customers_directory = f"{volume}/customers"
products_directory = f"{volume}/products"
transactions_directory = f"{volume}/transactions"

# Bronze Tables

In the bronze layer we are supposed to only:

- Load the raw data into tables
- Avoid transformations, changes and filters in the data.
- Keep the data in the original format.

### Customers table

In [None]:
@dp.table(name=f"{bronze_schema}.customers_raw")
def bronze_customers():
    """Incrementally ingest the raw customer files with Auto Loader.

    Returns:
        A streaming DataFrame that reads the Parquet files under
        ``customers_directory`` through the ``cloudFiles`` source.
    """
    # Column types passed to Auto Loader as schema hints.
    hints = "customer_id STRING, name STRING, country STRING, registration_date DATE, customer_segment STRING"

    reader = spark.readStream.format("cloudFiles")
    reader = reader.option("cloudFiles.format", "parquet")
    reader = reader.option("cloudFiles.schemaHints", hints)
    return reader.load(customers_directory)

NameError: name 'dp' is not defined

### Products table

In [None]:
@dp.table(name=f"{bronze_schema}.products_raw")
def bronze_products():
    """Incrementally ingest the raw product files with Auto Loader.

    Returns:
        A streaming DataFrame that reads the Parquet files under
        ``products_directory`` through the ``cloudFiles`` source.
    """
    # Column types passed to Auto Loader as schema hints.
    hints = "product_id STRING, product_name STRING, category STRING, price DOUBLE, cost DOUBLE"

    # Auto Loader options, applied in the order listed.
    loader_options = (
        ("cloudFiles.format", "parquet"),
        ("cloudFiles.schemaHints", hints),
    )

    reader = spark.readStream.format("cloudFiles")
    for option_key, option_value in loader_options:
        reader = reader.option(option_key, option_value)
    return reader.load(products_directory)

### Transactions table

In [None]:
@dp.table(name=f"{bronze_schema}.transactions_raw")
def bronze_transactions():
    """Incrementally ingest the raw transaction files with Auto Loader.

    The dataset is published under the name given to ``@dp.table``
    (``<bronze_schema>.transactions_raw``); the Python function name only has
    to be unique within the notebook.

    Returns:
        A streaming DataFrame that reads the Parquet files under
        ``transactions_directory`` through the ``cloudFiles`` source.
    """
    # NOTE(review): `quantity_id`, `category`, `price` and `cost` look carried
    # over from the products hints, while the silver `sales_detail` table reads
    # `quantity` and `discount_applied` -- confirm these hints against the
    # actual transaction files.
    SCHEMA_HINTS = "transaction_id STRING, customer_id STRING, product_id STRING, quantity_id STRING, category STRING, price DOUBLE, cost DOUBLE"

    df = (
        spark.readStream
            .format("cloudFiles")
            .option("cloudFiles.format", "parquet")
            .option("cloudFiles.schemaHints", SCHEMA_HINTS)
            .load(transactions_directory)
    )
    return df

## Silver Tables

In this stage you are supposed to:

- Clean the data.
- Transform the data.
- Apply business and data quality rules to the data.

The only rule in this example is that the ID columns cannot be null downstream, so records with a null ID are dropped.

### Customers table with a data quality expectation

In [None]:
@dp.table(name=f"{silver_schema}.customers")
@dp.expect_or_drop("valid_customer_id", "customer_id IS NOT NULL")
def silver_customers():
    """Stream the bronze customers into the silver layer.

    The ``valid_customer_id`` expectation declared on the table requires a
    non-null ``customer_id``.

    Returns:
        A streaming DataFrame of the bronze customers without the
        ``_rescued_data`` column.
    """
    bronze_stream = spark.readStream.table(f"{bronze_schema}.customers_raw")

    # `_rescued_data` is generated by Auto Loader and is not needed in silver.
    return bronze_stream.drop("_rescued_data")

### Products table with a data quality expectation

In [None]:
@dp.table(name=f"{silver_schema}.products")
@dp.expect_or_drop("valid_product_id", "product_id IS NOT NULL")
def silver_products():
    """Stream the bronze products into the silver layer.

    The ``valid_product_id`` expectation declared on the table requires a
    non-null ``product_id``.

    Returns:
        A streaming DataFrame of the bronze products without the
        ``_rescued_data`` column.
    """
    source_table = f"{bronze_schema}.products_raw"

    # `_rescued_data` is generated by Auto Loader and is not needed in silver.
    return spark.readStream.table(source_table).drop("_rescued_data")

### Transactions table with data quality expectations

In [None]:
@dp.table(name=f"{silver_schema}.transactions")
@dp.expect_or_drop("valid_transaction_id", "transaction_id IS NOT NULL")
@dp.expect_or_drop("valid_product_id", "product_id IS NOT NULL")
@dp.expect_or_drop("valid_customer_id", "customer_id IS NOT NULL")
def silver_transactions():
    """Stream the bronze transactions into the silver layer.

    Three expectations are declared on the table, requiring non-null
    ``transaction_id``, ``product_id`` and ``customer_id`` values.

    Returns:
        A streaming DataFrame of the bronze transactions without the
        ``_rescued_data`` column.
    """
    raw_transactions = spark.readStream.table(f"{bronze_schema}.transactions_raw")

    # `_rescued_data` is generated by Auto Loader and is not needed in silver.
    cleaned_transactions = raw_transactions.drop("_rescued_data")
    return cleaned_transactions

## Sales detailed

In [None]:
@dp.table(name=f"{silver_schema}.sales_detail")
def sales_detail():
    """Enrich every silver transaction with its customer and product data.

    Transactions are inner-joined to customers on ``customer_id`` and to
    products on ``product_id``; a ``revenue`` column is then added as
    ``price * quantity * (1 - discount_applied)``.

    Returns:
        A streaming DataFrame with the joined columns plus ``revenue``.
    """
    # Silver tables
    customers_df = spark.readStream.table(f"{silver_schema}.customers")
    products_df = spark.readStream.table(f"{silver_schema}.products")
    transactions_df = spark.readStream.table(f"{silver_schema}.transactions")

    sales_detail_df = (
        transactions_df
        .join(customers_df, on="customer_id")
        .join(products_df, on="product_id")
    )

    # Every column reference goes through `F`: the notebook imports the
    # functions module only under that alias, so a bare `col(...)` would raise
    # NameError when the pipeline evaluates this function.
    revenue = F.col("price") * F.col("quantity") * (1 - F.col("discount_applied"))

    return sales_detail_df.withColumn("revenue", revenue)

## Sales per country

In [None]:
@dp.materialized_view(name=f"{silver_schema}.revenue_transactions")
def revenue_transactions():
    """Aggregate revenue and order metrics per country.

    For each ``country`` in ``sales_detail`` this computes:

    - ``total_revenue``: sum of ``revenue``, rounded to 2 decimals
    - ``total_transactions``: count of ``transaction_id``
    - ``unique_customers``: distinct count of ``customer_id``
    - ``avg_order_value``: ``total_revenue / total_transactions``, rounded
      to 2 decimals

    Returns:
        A batch DataFrame with one row per country.
    """
    # Batch read on purpose: exact distinct counts (`countDistinct`) are not
    # supported on streaming DataFrames, so this aggregate is defined as a
    # materialized view computed over the full `sales_detail` table.
    sales_detail_df = spark.read.table(f"{silver_schema}.sales_detail")

    revenue_transactions_df = sales_detail_df.groupBy("country").agg(
        F.round(F.sum("revenue"), 2).alias("total_revenue"),
        F.count("transaction_id").alias("total_transactions"),
        F.countDistinct("customer_id").alias("unique_customers"),
    )

    # Derived from the already-rounded total, as in the original definition.
    revenue_transactions_df = revenue_transactions_df.withColumn(
        "avg_order_value",
        F.round(F.col("total_revenue") / F.col("total_transactions"), 2),
    )

    return revenue_transactions_df

## Segments per country

In [None]:
@dp.materialized_view(name=f"{silver_schema}.segments_per_country")
def segments_per_country():
    """Find the most frequent customer segment for every country.

    Rows of ``sales_detail`` are counted per (``country``,
    ``customer_segment``); within each country the segment with the highest
    count is kept and exposed as ``top_customer_segment``.

    Returns:
        A batch DataFrame with columns ``country`` and
        ``top_customer_segment``, one row per country.
    """
    # `Window` lives in `pyspark.sql`, not in `pyspark.sql.functions`, so it
    # cannot be reached through the `F` alias.
    from pyspark.sql import Window

    # Batch read on purpose: ranking window functions such as `row_number`
    # are not supported on streaming DataFrames.
    sales_detail_df = spark.read.table(f"{silver_schema}.sales_detail")

    segment_counts = sales_detail_df.groupBy("country", "customer_segment").agg(
        F.count("*").alias("segment_count")
    )

    # Highest count first; the segment name breaks ties so that the selected
    # row is deterministic across refreshes.
    ranking = Window.partitionBy("country").orderBy(
        F.desc("segment_count"), F.asc("customer_segment")
    )

    segment_per_country = (
        segment_counts
        .withColumn("rank", F.row_number().over(ranking))
        .filter(F.col("rank") == 1)
        .select("country", F.col("customer_segment").alias("top_customer_segment"))
    )

    return segment_per_country

# Gold layer

In [None]:
@dp.materialized_view(name=f"{gold_schema}.sales_per_country")
def sales_per_country():
    """Combine the per-country revenue metrics with the top customer segment.

    Inner-joins ``revenue_transactions`` and ``segments_per_country`` on
    ``country`` and orders the result by ``total_revenue`` descending.

    Returns:
        A batch DataFrame with one row per country present in both inputs.
    """
    # Batch reads on purpose: both inputs are aggregates, and sorting a
    # non-aggregated streaming DataFrame is not supported.
    revenue_transactions_df = spark.read.table(f"{silver_schema}.revenue_transactions")
    segments_per_country_df = spark.read.table(f"{silver_schema}.segments_per_country")

    sales_per_country_df = (
        revenue_transactions_df
        .join(segments_per_country_df, on="country")
        .orderBy(F.desc("total_revenue"))
    )

    return sales_per_country_df