# 1. dimDeliveryMethods

In [39]:
%%sql
-- Gold-layer dimension for delivery methods; created only when it does not exist yet.
-- start_date / end_date / is_active are carried over from the Silver layer by the load cell.
CREATE TABLE IF NOT EXISTS gold.dim_DeliveryMethods (
    DeliveryMethodKey   INT,
    DeliveryMethodID    INT,
    DeliveryMethodName  STRING,
    start_date          TIMESTAMP,
    end_date            TIMESTAMP,
    is_active           BOOLEAN
);

StatementMeta(sparkpool, 74, 1, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [40]:
%%sql
-- Full reload: empty gold.dim_DeliveryMethods before it is repopulated from Silver.
TRUNCATE TABLE gold.dim_DeliveryMethods;

StatementMeta(sparkpool, 74, 2, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [41]:
%%sql
-- Load the delivery-method dimension from the Silver layer.
-- The target columns are listed explicitly so the load no longer depends on the
-- physical column order of gold.dim_DeliveryMethods.
INSERT INTO gold.dim_DeliveryMethods (
    DeliveryMethodKey,
    DeliveryMethodID,
    DeliveryMethodName,
    start_date,
    end_date,
    is_active
)
SELECT
    DeliveryMethodKey,
    DeliveryMethodID,
    DeliveryMethodName,
    start_date,
    end_date,
    is_active
FROM silver.DeliveryMethods;

StatementMeta(sparkpool, 74, 3, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [42]:
# Export gold.dim_DeliveryMethods to the Gold folder of the storage account as Parquet.
# repartition(1) collapses the data to a single partition; "overwrite" replaces any earlier export.
df_dim_DeliveryMethods = spark.sql("SELECT * FROM gold.dim_DeliveryMethods")
(
    df_dim_DeliveryMethods
    .repartition(1)
    .write
    .mode("overwrite")
    .parquet("abfss://final@bibik224161840.dfs.core.windows.net/gold/gold.dimDeliveryMethods.parquet")
)

StatementMeta(sparkpool, 74, 5, Finished, Available, Finished)

# 2. dimPaymentMethods

In [43]:
%%sql
-- Gold-layer dimension for payment methods; created only when it does not exist yet.
-- start_date / end_date / is_active are carried over from the Silver layer by the load cell.
CREATE TABLE IF NOT EXISTS gold.dim_PaymentMethods (
    PaymentMethodKey   INT,
    PaymentMethodID    INT,
    PaymentMethodName  STRING,
    start_date         TIMESTAMP,
    end_date           TIMESTAMP,
    is_active          BOOLEAN
);

StatementMeta(sparkpool, 74, 6, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [44]:
%%sql
-- Full reload: empty gold.dim_PaymentMethods before it is repopulated from Silver.
TRUNCATE TABLE gold.dim_PaymentMethods;

StatementMeta(sparkpool, 74, 7, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [45]:
%%sql
-- Load the payment-method dimension from the Silver layer.
-- The target columns are listed explicitly so the load no longer depends on the
-- physical column order of gold.dim_PaymentMethods.
INSERT INTO gold.dim_PaymentMethods (
    PaymentMethodKey,
    PaymentMethodID,
    PaymentMethodName,
    start_date,
    end_date,
    is_active
)
SELECT
    PaymentMethodKey,
    PaymentMethodID,
    PaymentMethodName,
    start_date,
    end_date,
    is_active
FROM silver.PaymentMethods;

StatementMeta(sparkpool, 74, 8, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [46]:
# Export gold.dim_PaymentMethods to the Gold folder of the storage account as Parquet.
# repartition(1) collapses the data to a single partition; "overwrite" replaces any earlier export.
df_dim_PaymentMethods = spark.sql("SELECT * FROM gold.dim_PaymentMethods")
(
    df_dim_PaymentMethods
    .repartition(1)
    .write
    .mode("overwrite")
    .parquet("abfss://final@bibik224161840.dfs.core.windows.net/gold/gold.dimPaymentMethods.parquet")
)

StatementMeta(sparkpool, 74, 9, Finished, Available, Finished)

# 3. dimEmployees

In [47]:
%%sql
-- Gold-layer dimension for employees / people; created only when it does not exist yet.
-- The load cell fills EmployeeKey / EmployeeID from PersonKey / PersonID of silver.People.
CREATE TABLE IF NOT EXISTS gold.dim_Employees (
    EmployeeKey              INT,
    EmployeeID               INT,
    FullName                 STRING,
    PreferredName            STRING,
    SearchName               STRING,
    IsPermittedToLogon       BOOLEAN,
    LogonName                STRING,
    IsExternalLogonProvider  BOOLEAN,
    HashedPassword           BINARY,
    IsSystemUser             BOOLEAN,
    IsEmployee               BOOLEAN,
    IsSalesperson            BOOLEAN,
    UserPreferences          STRING,
    PhoneNumber              STRING,
    FaxNumber                STRING,
    EmailAddress             STRING,
    start_date               TIMESTAMP,
    end_date                 TIMESTAMP,
    is_active                BOOLEAN
);

StatementMeta(sparkpool, 74, 10, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [48]:
%%sql
-- Full reload: empty gold.dim_Employees before it is repopulated from Silver.
TRUNCATE TABLE
    gold.dim_Employees;

StatementMeta(sparkpool, 74, 11, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [49]:
%%sql
-- Load the employee dimension from silver.People.
-- PersonKey / PersonID are renamed to EmployeeKey / EmployeeID; no row filter is applied,
-- so every row of silver.People is copied.
-- The target columns are listed explicitly so the load no longer depends on the
-- physical column order of gold.dim_Employees.
-- NOTE(review): HashedPassword is propagated into the Gold layer (and exported to Parquet
-- by a later cell) - confirm this is intended.
INSERT INTO gold.dim_Employees (
    EmployeeKey,
    EmployeeID,
    FullName,
    PreferredName,
    SearchName,
    IsPermittedToLogon,
    LogonName,
    IsExternalLogonProvider,
    HashedPassword,
    IsSystemUser,
    IsEmployee,
    IsSalesperson,
    UserPreferences,
    PhoneNumber,
    FaxNumber,
    EmailAddress,
    start_date,
    end_date,
    is_active
)
SELECT
    PersonKey AS EmployeeKey,
    PersonID  AS EmployeeID,
    FullName,
    PreferredName,
    SearchName,
    IsPermittedToLogon,
    LogonName,
    IsExternalLogonProvider,
    HashedPassword,
    IsSystemUser,
    IsEmployee,
    IsSalesperson,
    UserPreferences,
    PhoneNumber,
    FaxNumber,
    EmailAddress,
    start_date,
    end_date,
    is_active
FROM silver.People;


StatementMeta(sparkpool, 74, 12, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [50]:
# Export gold.dim_Employees to the Gold folder of the storage account as Parquet.
# repartition(1) collapses the data to a single partition; "overwrite" replaces any earlier export.
df_dim_Employees = spark.sql("SELECT * FROM gold.dim_Employees")
(
    df_dim_Employees
    .repartition(1)
    .write
    .mode("overwrite")
    .parquet("abfss://final@bibik224161840.dfs.core.windows.net/gold/gold.dimEmployees.parquet")
)

StatementMeta(sparkpool, 74, 13, Finished, Available, Finished)

# 4. dimLocation

In [51]:
%%sql
-- Gold-layer location dimension; created only when it does not exist yet.
-- The load cell fills it with city rows plus their state/province and country attributes.
CREATE TABLE IF NOT EXISTS gold.dim_Location (
    CityKey            INT,
    CityID             INT,
    CityName           STRING,
    StateProvinceName  STRING,
    StateProvinceCode  STRING,
    CountryName        STRING,
    Continent          STRING,
    Region             STRING,
    Subregion          STRING,
    start_date         TIMESTAMP,
    end_date           TIMESTAMP,
    is_active          BOOLEAN
);

StatementMeta(sparkpool, 74, 14, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [52]:
%%sql
-- Full reload: empty gold.dim_Location before it is repopulated from Silver.
TRUNCATE TABLE
    gold.dim_Location;

StatementMeta(sparkpool, 74, 15, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [53]:
%%sql
-- Load the location dimension: one row per city, denormalised with its state/province
-- and country attributes. The validity columns (start_date, end_date, is_active) are
-- taken from the city row.
-- Both joins are inner joins, so a city without a matching state/province or country
-- is not loaded.
-- The target columns are listed explicitly so the load no longer depends on the
-- physical column order of gold.dim_Location.
INSERT INTO gold.dim_Location (
    CityKey,
    CityID,
    CityName,
    StateProvinceName,
    StateProvinceCode,
    CountryName,
    Continent,
    Region,
    Subregion,
    start_date,
    end_date,
    is_active
)
SELECT
    c.CityKey,
    c.CityID,
    c.CityName,
    s.StateProvinceName,
    s.StateProvinceCode,
    co.CountryName,
    co.Continent,
    co.Region,
    co.Subregion,
    c.start_date,
    c.end_date,
    c.is_active
FROM silver.Cities AS c
INNER JOIN silver.StateProvinces AS s
    ON c.StateProvinceID = s.StateProvinceID
INNER JOIN silver.Countries AS co
    ON s.CountryID = co.CountryID;

StatementMeta(sparkpool, 74, 16, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [54]:
# Export gold.dim_Location to the Gold folder of the storage account as Parquet.
# repartition(1) collapses the data to a single partition; "overwrite" replaces any earlier export.
df_dim_Location = spark.sql("SELECT * FROM gold.dim_Location")
(
    df_dim_Location
    .repartition(1)
    .write
    .mode("overwrite")
    .parquet("abfss://final@bibik224161840.dfs.core.windows.net/gold/gold.dimLocation.parquet")
)

StatementMeta(sparkpool, 74, 17, Finished, Available, Finished)

# 5. dimStatus

In [55]:
%%sql
-- Gold-layer order-status dimension; created only when it does not exist yet.
-- Its rows are seeded with fixed values by a later cell rather than loaded from Silver.
CREATE TABLE IF NOT EXISTS gold.dim_Status (
    StatusKey    INT,
    StatusID     INT,
    Status       STRING,
    Description  STRING,
    start_date   TIMESTAMP,
    end_date     TIMESTAMP,
    is_active    BOOLEAN
);

StatementMeta(sparkpool, 74, 18, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [56]:
%%sql
-- Full reload: empty gold.dim_Status before the fixed status rows are re-inserted.
TRUNCATE TABLE gold.dim_Status;

StatementMeta(sparkpool, 74, 19, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [57]:
%%sql
-- Seed the status dimension with its two fixed rows:
--   StatusKey 1 / StatusID 0 = 'Cancelled'
--   StatusKey 2 / StatusID 1 = 'Completed'
-- start_date is the current timestamp passed through FROM_UTC_TIMESTAMP for
-- 'Asia/Ho_Chi_Minh'; end_date is left NULL and both rows are flagged active.
INSERT INTO gold.dim_Status (
    StatusKey,
    StatusID,
    Status,
    Description,
    start_date,
    end_date,
    is_active
)
VALUES
    (
        1, 0, 'Cancelled', 'Order has been cancelled',
        FROM_UTC_TIMESTAMP(CURRENT_TIMESTAMP(), 'Asia/Ho_Chi_Minh'), NULL, TRUE
    ),
    (
        2, 1, 'Completed', 'Order has been finalized',
        FROM_UTC_TIMESTAMP(CURRENT_TIMESTAMP(), 'Asia/Ho_Chi_Minh'), NULL, TRUE
    );

StatementMeta(sparkpool, 74, 20, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [58]:
# Export gold.dim_Status to the Gold folder of the storage account as Parquet.
# repartition(1) collapses the data to a single partition; "overwrite" replaces any earlier export.
df_dim_Status = spark.sql("SELECT * FROM gold.dim_Status")
(
    df_dim_Status
    .repartition(1)
    .write
    .mode("overwrite")
    .parquet("abfss://final@bibik224161840.dfs.core.windows.net/gold/gold.dimStatus.parquet")
)

StatementMeta(sparkpool, 74, 21, Finished, Available, Finished)

# 6. dimStock

In [59]:
%%sql
-- Gold-layer stock-item dimension; created only when it does not exist yet.
-- StockGroupName is filled by the load cell from the stock-group tables; the other
-- attributes come from the stock-item rows.
CREATE TABLE IF NOT EXISTS gold.dim_Stock (
    StockItemKey            INT,
    StockItemID             INT,
    StockItemName           STRING,
    StockGroupName          STRING,
    Brand                   STRING,
    Size                    STRING,
    LeadTimeDays            INT,
    QuantityPerOuter        INT,
    IsChillerStock          BOOLEAN,
    Barcode                 STRING,
    TaxRate                 DECIMAL(18,3),
    UnitPrice               DECIMAL(18,2),
    RecommendedRetailPrice  DECIMAL(18,2),
    TypicalWeightPerUnit    DECIMAL(18,3),
    start_date              TIMESTAMP,
    end_date                TIMESTAMP,
    is_active               BOOLEAN
);

StatementMeta(sparkpool, 74, 22, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [60]:
%%sql
-- Full reload: empty gold.dim_Stock before it is repopulated from Silver.
TRUNCATE TABLE
    gold.dim_Stock;

StatementMeta(sparkpool, 74, 23, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [61]:
%%sql
-- Load the stock-item dimension from silver.StockItems, adding the stock-group name
-- through the StockItemStockGroups bridge table. Both joins are LEFT joins, so items
-- without a group are kept with a NULL StockGroupName.
-- The target columns are listed explicitly so the load no longer depends on the
-- physical column order of gold.dim_Stock.
-- NOTE(review): if a stock item can belong to more than one stock group, this produces
-- one row per (item, group) pair, i.e. repeated StockItemKey / StockItemID values, which
-- would multiply rows in any fact load that joins on StockItemID - confirm the intended grain.
INSERT INTO gold.dim_Stock (
    StockItemKey,
    StockItemID,
    StockItemName,
    StockGroupName,
    Brand,
    Size,
    LeadTimeDays,
    QuantityPerOuter,
    IsChillerStock,
    Barcode,
    TaxRate,
    UnitPrice,
    RecommendedRetailPrice,
    TypicalWeightPerUnit,
    start_date,
    end_date,
    is_active
)
SELECT
    si.StockItemKey,
    si.StockItemID,
    si.StockItemName,
    sg.StockGroupName,
    si.Brand,
    si.Size,
    si.LeadTimeDays,
    si.QuantityPerOuter,
    si.IsChillerStock,
    si.Barcode,
    si.TaxRate,
    si.UnitPrice,
    si.RecommendedRetailPrice,
    si.TypicalWeightPerUnit,
    si.start_date,
    si.end_date,
    si.is_active
FROM silver.StockItems AS si
LEFT JOIN silver.StockItemStockGroups AS sisg
    ON si.StockItemID = sisg.StockItemID
LEFT JOIN silver.StockGroups AS sg
    ON sisg.StockGroupID = sg.StockGroupID;

StatementMeta(sparkpool, 74, 24, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [62]:
# Export gold.dim_Stock to the Gold folder of the storage account as Parquet.
# repartition(1) collapses the data to a single partition; "overwrite" replaces any earlier export.
df_dim_Stock = spark.sql("SELECT * FROM gold.dim_Stock")
(
    df_dim_Stock
    .repartition(1)
    .write
    .mode("overwrite")
    .parquet("abfss://final@bibik224161840.dfs.core.windows.net/gold/gold.dimStock.parquet")
)

StatementMeta(sparkpool, 74, 25, Finished, Available, Finished)

# 7. dimSuppliers

In [63]:
%%sql
-- Gold-layer supplier dimension; created only when it does not exist yet.
-- All columns are copied one-to-one from silver.Suppliers by the load cell.
CREATE TABLE IF NOT EXISTS gold.dim_Suppliers (
    SupplierKey           INT,
    SupplierID            INT,
    SupplierReference     STRING,
    BankAccountName       STRING,
    BankAccountBranch     STRING,
    BankAccountNumber     STRING,
    PaymentDays           INT,
    PhoneNumber           STRING,
    FaxNumber             STRING,
    DeliveryAddressLine1  STRING,
    DeliveryAddressLine2  STRING,
    DeliveryPostalCode    STRING,
    DeliveryLocation      STRING,
    start_date            TIMESTAMP,
    end_date              TIMESTAMP,
    is_active             BOOLEAN
);

StatementMeta(sparkpool, 74, 26, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [64]:
%%sql
-- Full reload: empty gold.dim_Suppliers before it is repopulated from Silver.
TRUNCATE TABLE
    gold.dim_Suppliers;

StatementMeta(sparkpool, 74, 27, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [65]:
%%sql
-- Load the supplier dimension from the Silver layer (straight column-for-column copy).
-- The target columns are listed explicitly so the load no longer depends on the
-- physical column order of gold.dim_Suppliers.
INSERT INTO gold.dim_Suppliers (
    SupplierKey,
    SupplierID,
    SupplierReference,
    BankAccountName,
    BankAccountBranch,
    BankAccountNumber,
    PaymentDays,
    PhoneNumber,
    FaxNumber,
    DeliveryAddressLine1,
    DeliveryAddressLine2,
    DeliveryPostalCode,
    DeliveryLocation,
    start_date,
    end_date,
    is_active
)
SELECT
    SupplierKey,
    SupplierID,
    SupplierReference,
    BankAccountName,
    BankAccountBranch,
    BankAccountNumber,
    PaymentDays,
    PhoneNumber,
    FaxNumber,
    DeliveryAddressLine1,
    DeliveryAddressLine2,
    DeliveryPostalCode,
    DeliveryLocation,
    start_date,
    end_date,
    is_active
FROM silver.Suppliers;

StatementMeta(sparkpool, 74, 28, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [66]:
# Export gold.dim_Suppliers to the Gold folder of the storage account as Parquet.
# repartition(1) collapses the data to a single partition; "overwrite" replaces any earlier export.
df_dim_Suppliers = spark.sql("SELECT * FROM gold.dim_Suppliers")
(
    df_dim_Suppliers
    .repartition(1)
    .write
    .mode("overwrite")
    .parquet("abfss://final@bibik224161840.dfs.core.windows.net/gold/gold.dimSuppliers.parquet")
)

StatementMeta(sparkpool, 74, 29, Finished, Available, Finished)

# 8. dimTime

In [67]:
from pyspark.sql.functions import col, date_format, year, quarter, month, dayofweek, lit, expr, unix_timestamp
from datetime import datetime, timedelta

# Build the calendar (date) dimension: one row per day from 2005-01-01 through 2030-12-31.
start_date = datetime(2005, 1, 1)
end_date = datetime(2030, 12, 31)
date_list = [(start_date + timedelta(days=x)) for x in range((end_date - start_date).days + 1)]

# One-column DataFrame ("Date") holding every generated day.
df = spark.createDataFrame([(date,) for date in date_list], ["Date"])

# Derive the calendar attributes from "Date".
#  - DateKey is the integer yyyyMMdd form of the date.
#  - StartYear / EndYear are Jan 1 and Dec 31 of the row's year.
#  - EndQuarter: date_trunc('quarter', ...) yields the FIRST day of the quarter, so two
#    months are added before last_day(); without that, last_day() returns the end of the
#    quarter's first month (e.g. Jan 31 instead of Mar 31 for Q1).
#  - DayofWeek follows Spark's dayofweek(): 1 = Sunday ... 7 = Saturday.
#  - IsHoliday is true for Saturdays and Sundays only (no public-holiday calendar is applied).
df = df.withColumn("DateKey", date_format(col("Date"), "yyyyMMdd").cast("int")) \
       .withColumn("Year", year(col("Date"))) \
       .withColumn("StartYear", unix_timestamp(expr("concat(Year, '-01-01')"), "yyyy-MM-dd").cast("timestamp")) \
       .withColumn("EndYear", unix_timestamp(expr("concat(Year, '-12-31')"), "yyyy-MM-dd").cast("timestamp")) \
       .withColumn("Quarter", quarter(col("Date"))) \
       .withColumn("QuarterName", expr("concat('Q', Quarter)")) \
       .withColumn("StartQuarter", expr("date_trunc('quarter', Date)")) \
       .withColumn("EndQuarter", expr("last_day(add_months(date_trunc('quarter', Date), 2))")) \
       .withColumn("Month", month(col("Date"))) \
       .withColumn("MonthName", date_format(col("Date"), "MMMM")) \
       .withColumn("MonthNameShort", date_format(col("Date"), "MMM")) \
       .withColumn("StartMonth", expr("date_trunc('month', Date)")) \
       .withColumn("EndMonth", expr("last_day(Date)")) \
       .withColumn("Day", date_format(col("Date"), "dd").cast("int")) \
       .withColumn("DayofWeek", dayofweek(col("Date"))) \
       .withColumn("IsHoliday", expr("case when DayofWeek IN (1, 7) then 1 else 0 end").cast("boolean"))

# Move DateKey to the first position; all other columns keep their relative order.
# The loop variable is deliberately not called `col`, to avoid shadowing the imported function.
df = df.select("DateKey", *[name for name in df.columns if name != "DateKey"])

StatementMeta(sparkpool, 74, 30, Finished, Available, Finished)

In [68]:
# Persist the calendar DataFrame as the Parquet-backed table gold.dim_Time,
# replacing the table contents if it already exists.
(
    df.write
    .format("parquet")
    .mode("overwrite")
    .saveAsTable("gold.dim_Time")
)

StatementMeta(sparkpool, 74, 31, Finished, Available, Finished)

In [69]:
# Export gold.dim_Time to the Gold folder of the storage account as Parquet.
# repartition(1) collapses the data to a single partition; "overwrite" replaces any earlier export.
df_dim_Time = spark.sql("SELECT * FROM gold.dim_Time")
(
    df_dim_Time
    .repartition(1)
    .write
    .mode("overwrite")
    .parquet("abfss://final@bibik224161840.dfs.core.windows.net/gold/gold.dimTime.parquet")
)

StatementMeta(sparkpool, 74, 32, Finished, Available, Finished)

# 9. factPurchaseOrders

In [86]:
%%sql
-- Gold-layer purchase-order fact table; created only when it does not exist yet.
-- The *Key columns hold keys taken from the gold dimensions by the load cell.
-- OrderDate, ExpectedDeliveryDate and LastReceiptDate are INTs that the load cell
-- fills with the yyyyMMdd form of the source dates.
CREATE TABLE IF NOT EXISTS gold.fact_PurchaseOrders (
    PurchaseOrderID            INT,
    StockItemKey               INT,
    SupplierKey                INT,
    EmployeeKey                INT,
    PaymentMethodKey           INT,
    DeliveryMethodKey          INT,
    CityKey                    INT,
    StatusKey                  INT,
    OrderDate                  INT,
    ExpectedDeliveryDate       INT,
    OrderedOuters              INT,
    ReceivedOuters             INT,
    ExpectedUnitPricePerOuter  DECIMAL(18,2),
    LastReceiptDate            INT,
    TaxAmount                  DECIMAL(18,2),
    TransactionAmount          DECIMAL(18,2)
);

StatementMeta(sparkpool, 74, 49, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [87]:
%%sql
-- Full reload: empty the fact table gold.fact_PurchaseOrders before it is repopulated.
TRUNCATE TABLE
    gold.fact_PurchaseOrders;

StatementMeta(sparkpool, 74, 50, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [88]:
%%sql
-- Load the purchase-order fact table.
-- Source rows: silver.PurchaseOrders joined to their order lines and supplier
-- transactions; the gold dimensions are joined on their source IDs to pick up the
-- dimension keys stored in the fact.
-- All joins are inner joins, so an order missing any of the lookups is not loaded.
-- The target columns are listed explicitly so the load does not depend on the
-- physical column order of gold.fact_PurchaseOrders.
-- NOTE(review): the dimension joins do not filter on is_active; if a dimension ever
-- holds several versions of the same ID, fact rows will be duplicated - confirm.
-- NOTE(review): TaxAmount / TransactionAmount come from the supplier transaction and are
-- repeated on every order line of the same purchase order - confirm the intended grain.
INSERT INTO gold.fact_PurchaseOrders (
    PurchaseOrderID,
    StockItemKey,
    SupplierKey,
    EmployeeKey,
    PaymentMethodKey,
    DeliveryMethodKey,
    CityKey,
    StatusKey,
    OrderDate,
    ExpectedDeliveryDate,
    OrderedOuters,
    ReceivedOuters,
    ExpectedUnitPricePerOuter,
    LastReceiptDate,
    TaxAmount,
    TransactionAmount
)
SELECT
    po.PurchaseOrderID,
    ds.StockItemKey,
    dsu.SupplierKey,
    de.EmployeeKey,
    dpm.PaymentMethodKey,
    ddm.DeliveryMethodKey,
    dl.CityKey,
    -- gold.dim_Status is seeded with StatusKey 1 = 'Cancelled' and StatusKey 2 = 'Completed'
    -- ("Order has been finalized"), so finalized orders must map to 2 and all others to 1.
    CASE
        WHEN po.IsOrderFinalized = 'TRUE' THEN 2
        ELSE 1
    END AS StatusKey,
    -- Dates are stored as yyyyMMdd integers.
    CAST(DATE_FORMAT(po.OrderDate, 'yyyyMMdd') AS INT) AS OrderDate,
    CAST(DATE_FORMAT(po.ExpectedDeliveryDate, 'yyyyMMdd') AS INT) AS ExpectedDeliveryDate,
    pol.OrderedOuters,
    pol.ReceivedOuters,
    pol.ExpectedUnitPricePerOuter,
    CAST(DATE_FORMAT(pol.LastReceiptDate, 'yyyyMMdd') AS INT) AS LastReceiptDate,
    st.TaxAmount,
    st.TransactionAmount
FROM silver.PurchaseOrders AS po
INNER JOIN silver.PurchaseOrderLines AS pol
    ON po.PurchaseOrderID = pol.PurchaseOrderID
INNER JOIN gold.dim_Stock AS ds
    ON pol.StockItemID = ds.StockItemID
INNER JOIN gold.dim_Suppliers AS dsu
    ON po.SupplierID = dsu.SupplierID
INNER JOIN gold.dim_Employees AS de
    ON po.ContactPersonID = de.EmployeeID
INNER JOIN silver.supplierTransactions AS st
    ON po.PurchaseOrderID = st.PurchaseOrderID
INNER JOIN gold.dim_PaymentMethods AS dpm
    ON st.PaymentMethodID = dpm.PaymentMethodID
-- Match on the source ID, like every other dimension lookup (previously compared the
-- order's DeliveryMethodID with the dimension's DeliveryMethodKey).
INNER JOIN gold.dim_DeliveryMethods AS ddm
    ON po.DeliveryMethodID = ddm.DeliveryMethodID
-- silver.suppliers is joined only to obtain the supplier's DeliveryCityID ...
INNER JOIN silver.suppliers AS s
    ON po.SupplierID = s.SupplierID
-- ... which is then used to look up the city in the location dimension.
INNER JOIN gold.dim_Location AS dl
    ON s.DeliveryCityID = dl.CityID;


StatementMeta(sparkpool, 74, 51, Finished, Available, Finished)

<Spark SQL result set with 0 rows and 0 fields>

In [83]:
# Export gold.fact_PurchaseOrders to the Gold folder of the storage account as Parquet.
# repartition(1) collapses the data to a single partition; "overwrite" replaces any earlier export.
df_fact_PurchaseOrders = spark.sql("SELECT * FROM gold.fact_PurchaseOrders")
(
    df_fact_PurchaseOrders
    .repartition(1)
    .write
    .mode("overwrite")
    .parquet("abfss://final@bibik224161840.dfs.core.windows.net/gold/gold.factPurchaseOrders.parquet")
)

StatementMeta(sparkpool, 74, 46, Finished, Available, Finished)