In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

# Create (or reuse) the Spark session used by every cell below.
spark = SparkSession.builder.appName('infosys722-i4-dp').getOrCreate()


def _load_cleaned(path):
    """Read one parquet dataset from `path` using a fresh reader."""
    reader = spark.read.options(compression='gzip').format('parquet')
    return reader.load(path)


# Cleaned inputs produced by an earlier stage (versioned '_v001' folders).
selected_products = _load_cleaned('./Cleaned Datasets/products_v001')
selected_inventory = _load_cleaned('./Cleaned Datasets/inventory_v001')
selected_sales = _load_cleaned('./Cleaned Datasets/sales_v001')
selected_purchase_detail = _load_cleaned('./Cleaned Datasets/purchase_detail_v001')

In [2]:
# Integrate product information into purchase details.
# Product columns dropped here are not carried over from the product table.
purchase_product_info = selected_products.drop(
    'Size', 'VendorNumber', 'PurchasePrice', 'Classification')
integrated_purchase_detail = selected_purchase_detail.join(
    purchase_product_info, on='Brand', how='left')

# Integrate product information into sales records and construct the ProfitRate field.
sales_product_info = selected_products.drop(
    'Size', 'Volume', 'Classification', 'VendorNumber')
sales_with_products = selected_sales.join(
    sales_product_info, on='Brand', how='left')

# ProfitRate = (SalesPrice - PurchasePrice) / PurchasePrice
unit_margin = F.col('SalesPrice') - F.col('PurchasePrice')
integrated_sales = sales_with_products.withColumn(
    'ProfitRate', unit_margin / F.col('PurchasePrice'))

In [3]:
def transform_data(dataframe, columns, transform_function = F.log10, suffix='_log10'):
    """Add a transformed copy of each listed column to `dataframe`.

    Args:
        dataframe: Spark DataFrame to extend.
        columns: iterable of column names to transform.
        transform_function: callable applied to `F.col(name)` for each
            column; defaults to `F.log10`.
        suffix: text appended to the source column name to form the name
            of the derived column; defaults to '_log10'.

    Returns:
        The DataFrame obtained after one `withColumn` call per entry of
        `columns`, each storing the transformed values under
        `<column><suffix>`.
    """
    result = dataframe
    for source_name in columns:
        derived_name = source_name + suffix
        derived_values = transform_function(F.col(source_name))
        result = result.withColumn(derived_name, derived_values)
    return result

# Columns that receive a '_log10' companion column (transform_data defaults).
purchase_log_columns = ['PurchasePrice', 'Volume', 'Price']
sales_log_columns = ['PurchasePrice', 'Volume', 'SalesPrice']

integrated_purchase_detail = transform_data(
    integrated_purchase_detail, columns=purchase_log_columns)
integrated_sales = transform_data(
    integrated_sales, columns=sales_log_columns)

In [6]:
def save_ready_data(dataframe, name, folder = './Ready Datasets/', version = '_v001'):
    """Write `dataframe` as gzip-compressed parquet, overwriting any existing output.

    Args:
        dataframe: Spark DataFrame to persist.
        name: dataset name used to build the output path.
        folder: output folder prefix; it is concatenated as-is, so it must
            already end with a path separator.
        version: suffix appended after `name` in the output path.
    """
    target_path = folder + name + version
    writer = dataframe.write.mode('overwrite').option('compression', 'gzip')
    writer.parquet(target_path)

In [7]:
# Persist both integrated datasets with the default folder and version.
for ready_frame, dataset_name in (
    (integrated_purchase_detail, 'purchase_detail'),
    (integrated_sales, 'sales'),
):
    save_ready_data(ready_frame, dataset_name)