In [None]:
# `spark` is not created anywhere in this notebook; it is presumably the session
# object pre-defined by the kernel (TODO confirm). Evaluating it shows the session.
spark

# OBT (One Big Table)

For large-scale data processing, reducing the number of joins between data models makes processing easier. **OBT** is a rather counter-intuitive design compared to a **star schema**, but it is very useful. Read more about OBT [here](https://www.fivetran.com/blog/star-schema-vs-obt).

In this lab we will build an OBT from `orders`, `orderdetails` and `products`.

![Data Models](https://www.mysqltutorial.org/wp-content/uploads/2009/12/MySQL-Sample-Database-Schema.png)

In [None]:
from pyspark.sql import functions as F

## Load data for all related models

In [None]:
def load_bronze_table(table):
    """Read one classicmodels table from the bronze area of the datalake.

    Relies on the notebook-level ``spark`` object, which this notebook does not
    define itself (presumably supplied by the kernel).

    Args:
        table: Table name without extension, e.g. ``"orders"``; it is inserted
            into the path ``.../bronze/classicmodels/<table>.parquet``.

    Returns:
        Whatever ``spark.read.format("parquet").load(...)`` returns for that
        path (a Spark DataFrame when ``spark`` is a SparkSession).
    """
    source_path = f"s3a://datalake/exercises/bronze/classicmodels/{table}.parquet"
    parquet_reader = spark.read.format("parquet")
    return parquet_reader.load(source_path)

In [None]:
# Load the three bronze tables that will be merged into the OBT.
orders, orderdetails, products = (
    load_bronze_table(name) for name in ("orders", "orderdetails", "products")
)

In [None]:
# Row counts of the source tables, in the order (orders, orderdetails, products).
tuple(source.count() for source in (orders, orderdetails, products))

## Join related models together

In [None]:
%%local
# Because of the `%%local` magic on the first line of this cell, this code is
# meant for the local IPython kernel (sparkmagic behaviour — TODO confirm),
# which is why the IPython import lives in this cell.
from IPython.display import display, HTML
# Inject a CSS rule that forces elements with class `container` to 100% width.
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
# Build the OBT: every order, repeated once per matching orderdetails row and
# enriched with the matching product's columns. Left outer joins keep rows from
# the left side even when nothing matches on the right.
orderfulldetails = (
    orders
        .join(
            orderdetails,
            on=orders["order_number"] == orderdetails["order_number"],
            how="leftouter",
        )
        .join(
            products,
            on=orderdetails["product_code"] == products["product_code"],
            how="leftouter",
        )
        # Each join key exists on both sides; drop the right-hand copy so the
        # result has a single `order_number` and a single `product_code`.
        .drop(orderdetails["order_number"])
        .drop(products["product_code"])
)

In [None]:
# Print the schema of the joined table.
orderfulldetails.printSchema()

In [None]:
# Peek at a single row; the vertical layout prints one column per line.
orderfulldetails.show(n=1, vertical=True)

In [None]:
# Row count of the joined table (the final cell asserts the saved copy has 2996 rows).
orderfulldetails.count()

## Save OBT model to datalake

The expected destination path is: `s3a://datalake/exercises/bronze/classicmodels/orderfulldetails.parquet`

In [None]:
# Persist the OBT as parquet at the expected destination; "overwrite" replaces
# any output already present there.
orderfulldetails.write.save(
    "s3a://datalake/exercises/bronze/classicmodels/orderfulldetails.parquet",
    format="parquet",
    mode="overwrite",
)

In [None]:
# Read the OBT back from the expected destination and verify its size.
# The path is spelled out here on purpose so the check reads exactly the
# location the exercise expects the table to be saved to.
EXPECTED_ROW_COUNT = 2996  # expected number of rows in the saved OBT
saved_path = "s3a://datalake/exercises/bronze/classicmodels/orderfulldetails.parquet"
row_count = spark.read.format("parquet").load(saved_path).count()

# Report expected vs. actual so a failure can be diagnosed without re-running.
assert row_count == EXPECTED_ROW_COUNT, (
    f"Orderfulldetails was not valid: expected {EXPECTED_ROW_COUNT} rows, got {row_count}"
)