In [12]:
from pyspark import sql
from lib import common_functions
from lib import configuration

In [2]:
# Obtain the Spark session from the project helper. 'dp203' is the argument handed to it
# (presumably the application name -- confirm in lib.common_functions).
spark = common_functions.get_spark_session('dp203')
# Last expression of the cell, so whatever it returns is what the notebook displays.
# NOTE(review): active() is invoked on the instance here; in recent PySpark it is a
# classmethod returning the currently active session -- confirm the PySpark version supports it.
spark.active()

Variables

In [50]:
# Locations of the three source CSV extracts; all of them live in the "01" folder
# under the configured DP-203 input directory.
input_dir_01 = f"{configuration.dp203_input_path}01/"

fact_InternetSales = f"{input_dir_01}FactInternetSales.csv"
dim_Product = f"{input_dir_01}DimProduct.csv"
dim_Date = f"{input_dir_01}DimDate.csv"

# Echo one resolved path as the cell output to sanity-check the configuration.
fact_InternetSales

'/home/jovyan/code/files/input/dp203/01/FactInternetSales.csv'

DataFrames

In [51]:
def _load_csv(path):
    """Read one comma-delimited CSV that has a header row, letting Spark infer column types."""
    reader = spark.read.format("csv")
    reader = reader.options(header='true', inferSchema='true', delimiter=',')
    return reader.load(path)


# Same reader settings for the fact table and both dimensions.
fact_InternetSales_df = _load_csv(fact_InternetSales)
dimProduct_df = _load_csv(dim_Product)
dim_Date_df = _load_csv(dim_Date)

# Report the row count of each loaded table, in load order.
for table_label, table_df in (
    ("FactInternetSales", fact_InternetSales_df),
    ("DimProduct", dimProduct_df),
    ("DimDate", dim_Date_df),
):
    print(f'{table_label} rows - {table_df.count()}')

# Preview the first 100 fact rows.
fact_InternetSales_df.show(n=100)

FactInternetSales rows - 5000
DimProduct rows - 606
DimDate rows - 1461
+----------------+--------------------+-----------+----------+------------+----------+-----------+------------+-----------+-----------------+-------------+---------+--------------+--------------------+--------------+-------------------+----------------+-----------+---------+-------------+---------------------+----------------+--------------+
|SalesOrderNumber|SalesOrderLineNumber|CustomerKey|ProductKey|OrderDateKey|DueDateKey|ShipDateKey|PromotionKey|CurrencyKey|SalesTerritoryKey|OrderQuantity|UnitPrice|ExtendedAmount|UnitPriceDiscountPct|DiscountAmount|ProductStandardCost|TotalProductCost|SalesAmount|TaxAmount|FreightAmount|CarrierTrackingNumber|CustomerPONumber|RevisionNumber|
+----------------+--------------------+-----------+----------+------------+----------+-----------+------------+-----------+-----------------+-------------+---------+--------------+--------------------+--------------+-------------------+----

In [52]:
# DISABLED CELL: everything below is commented out and does not run.
# It sketches an alternative load that supplies an explicit schema instead of
# inferSchema and reads every *.csv under the configured input path.
# NOTE(review): the fields listed here (OrderDate, CustomerName, Email, Item, ...)
# differ from the FactInternetSales columns printed elsewhere in this notebook, so this
# looks like it targets a different dataset -- confirm before re-enabling.
# from pyspark.sql.types import *
# from pyspark.sql.functions import *

# orderSchema = StructType([
#     StructField("SalesOrderNumber", StringType()),
#     StructField("SalesOrderLineNumber", IntegerType()),
#     StructField("OrderDate", DateType()),
#     StructField("CustomerName", StringType()),
#     StructField("Email", StringType()),
#     StructField("Item", StringType()),
#     StructField("Quantity", IntegerType()),
#     StructField("UnitPrice", FloatType()),
#     StructField("Tax", FloatType())
#     ])

# df = spark.read.option("header", True).load(f'{configuration.dp203_input_path}/*.csv', format='csv', schema=orderSchema)
# df.show()
# df.count()

Schema

In [53]:
# Print the column names and the types Spark inferred for the fact table
# (the CSV was loaded with inferSchema='true', so no schema was declared by hand).
fact_InternetSales_df.printSchema()

root
 |-- SalesOrderNumber: string (nullable = true)
 |-- SalesOrderLineNumber: integer (nullable = true)
 |-- CustomerKey: integer (nullable = true)
 |-- ProductKey: integer (nullable = true)
 |-- OrderDateKey: integer (nullable = true)
 |-- DueDateKey: integer (nullable = true)
 |-- ShipDateKey: integer (nullable = true)
 |-- PromotionKey: integer (nullable = true)
 |-- CurrencyKey: integer (nullable = true)
 |-- SalesTerritoryKey: integer (nullable = true)
 |-- OrderQuantity: integer (nullable = true)
 |-- UnitPrice: double (nullable = true)
 |-- ExtendedAmount: double (nullable = true)
 |-- UnitPriceDiscountPct: double (nullable = true)
 |-- DiscountAmount: integer (nullable = true)
 |-- ProductStandardCost: double (nullable = true)
 |-- TotalProductCost: double (nullable = true)
 |-- SalesAmount: double (nullable = true)
 |-- TaxAmount: double (nullable = true)
 |-- FreightAmount: double (nullable = true)
 |-- CarrierTrackingNumber: string (nullable = true)
 |-- CustomerPONumber: 

Partitions

In [79]:
from pyspark.sql.functions import year, month, col, to_date

# OrderDateKey is inferred as an integer column, so cast it to string explicitly instead of
# relying on implicit coercion inside to_date, and parse it with the canonical four-digit-year
# pattern "yyyyMMdd" (the previous "yyyMMdd" only worked because of lenient year-width parsing).
# The parsed date expression is built once and reused for both derived columns.
order_date = to_date(col("OrderDateKey").cast("string"), "yyyyMMdd")

# Derive the two partition columns from the order date.
# Rows whose key cannot be parsed would get null Year/Month values.
dated_df = (
    fact_InternetSales_df
    .withColumn("Year", year(order_date))
    .withColumn("Month", month(order_date))
)
dated_df.show(n=5)

# Write the fact table as Parquet, one directory level per Year and per Month.
# mode("overwrite") replaces whatever already exists at the target path.
partitioned_output_path = f"{configuration.dp203_output_path}01/FactInternetSalesPartitioned"
dated_df.write.partitionBy("Year", "Month").mode("overwrite").parquet(partitioned_output_path)
print("Fact Internet Sales Partitioned data saved!")

+----------------+--------------------+-----------+----------+------------+----------+-----------+------------+-----------+-----------------+-------------+---------+--------------+--------------------+--------------+-------------------+----------------+-----------+---------+-------------+---------------------+----------------+--------------+----+-----+
|SalesOrderNumber|SalesOrderLineNumber|CustomerKey|ProductKey|OrderDateKey|DueDateKey|ShipDateKey|PromotionKey|CurrencyKey|SalesTerritoryKey|OrderQuantity|UnitPrice|ExtendedAmount|UnitPriceDiscountPct|DiscountAmount|ProductStandardCost|TotalProductCost|SalesAmount|TaxAmount|FreightAmount|CarrierTrackingNumber|CustomerPONumber|RevisionNumber|Year|Month|
+----------------+--------------------+-----------+----------+------------+----------+-----------+------------+-----------+-----------------+-------------+---------+--------------+--------------------+--------------+-------------------+----------------+-----------+---------+-------------+-

In [84]:
# Point directly at the Year=2020 / Month=12 partition directory of the partitioned output,
# so only that single month of fact rows is loaded.
# NOTE(review): when a leaf partition directory is read like this, the Year and Month
# partition columns are presumably absent from the resulting frame -- confirm if they are needed.
december_2020_path = (
    f"{configuration.dp203_output_path}01/FactInternetSalesPartitioned/Year=2020/Month=12"
)
parquet_FactInternetSales_df = spark.read.parquet(december_2020_path)

# Row count of the selected partition, shown as the cell output.
parquet_FactInternetSales_df.count()

85

Temp views

In [85]:
# Expose the frames to Spark SQL under the table names used by the query cell.
# Note that 'FactInternetSales' is backed by the single-partition Parquet read,
# not by the full CSV fact table.
temp_views = (
    ('FactInternetSales', parquet_FactInternetSales_df),
    ('DimProduct', dimProduct_df),
    ('DimDate', dim_Date_df),
)
for view_name, view_df in temp_views:
    view_df.createOrReplaceTempView(view_name)

SQL

In [86]:
# Units sold per product per calendar month: join the fact view to the date and product
# dimensions, sum the order quantities, and sort by month number.
# Adjacent string literals are concatenated by Python; each piece keeps its trailing space.
product_sales_query = (
    "SELECT d.CalendarYear, d.MonthNumberOfYear, d.EnglishMonthName, "
    "p.EnglishProductName AS Product, SUM(o.OrderQuantity) AS UnitsSold "
    "FROM FactInternetSales AS o "
    "JOIN DimDate AS d ON o.OrderDateKey = d.DateKey "
    "JOIN DimProduct AS p ON o.ProductKey = p.ProductKey "
    "GROUP BY d.CalendarYear, d.MonthNumberOfYear, d.EnglishMonthName, p.EnglishProductName "
    "ORDER BY d.MonthNumberOfYear"
)
aggregate_product_sales_df = spark.sql(product_sales_query)

# Number of result rows, followed by a preview of the aggregated rows.
aggregate_row_count = aggregate_product_sales_df.count()
print(aggregate_row_count)
aggregate_product_sales_df.show()

23
+------------+-----------------+----------------+--------------------+---------+
|CalendarYear|MonthNumberOfYear|EnglishMonthName|             Product|UnitsSold|
+------------+-----------------+----------------+--------------------+---------+
|        2020|               12|        December|    Road-250 Red, 58|        5|
|        2020|               12|        December|Road-550-W Yellow...|        3|
|        2020|               12|        December|Road-550-W Yellow...|        4|
|        2020|               12|        December|Mountain-200 Silv...|        5|
|        2020|               12|        December|  Road-650 Black, 44|        2|
|        2020|               12|        December|  Road-250 Black, 48|        2|
|        2020|               12|        December|  Road-650 Black, 60|        3|
|        2020|               12|        December|  Road-650 Black, 52|        2|
|        2020|               12|        December|Mountain-200 Silv...|        9|
|        2020|           

In [58]:
# Persist the aggregated result as Parquet, replacing any previous output at the same path.
aggregate_output_path = f'{configuration.dp203_output_path}01/AggregateProductSales.parquet'
aggregate_writer = aggregate_product_sales_df.write.mode("overwrite")
aggregate_writer.parquet(aggregate_output_path)
print("Product Sales data saved!")

Product Sales data saved!
