In [0]:
#import libraries

from pyspark.sql.functions import *
from pyspark.sql.types import *

### Data Reading

In [0]:
# Read the raw car-sales dataset (Parquet) from the 'raw' folder of the storage account.
# Parquet files store column names and types in their own metadata, so the CSV-only
# reader options "header" and "inferSchema" are intentionally not set: they have no
# effect on a Parquet source and only suggest schema inference that never happens.

car_sales = spark.read.format("parquet")\
                        .load("abfss://projects@projectstorageaccount1.dfs.core.windows.net/raw/CAR SALES/")

In [0]:
# Preview the raw data using the notebook's rich table renderer.
display(car_sales)

Branch_ID,Dealer_ID,Model_ID,Revenue,Units_Sold,Date_ID,Day,Month,Year,BranchName,DealerName,Product_Name
BR9546,DLR0060,Jee-M10,7223451,1,DT01246,28,5,2020,Premier Motors,"Fisker, Karma Motors",Jeep
BR9666,DLR0062,Jee-M12,22093020,3,DT01246,30,5,2020,Puma Motors,Ford Australia Motors,Jeep
BR9726,DLR0063,Jee-M13,22372413,3,DT01247,31,5,2020,Power Ranger Motors,Ford do Brasil Motors,Jeep
XYZ9726,XYZ0063,ZYXM13,22372413,3,DT01247,31,5,2020,DataFam Motors,Datafam Dealers,Surprise


### Data Transformation

In [0]:
# Derive 'Model_Category' as the part of 'Model_ID' before the first '-'.
# When 'Model_ID' contains no '-', the whole value is kept (e.g. 'ZYXM13' stays 'ZYXM13').

model_id_parts = split(col("Model_ID"), "-")
car_sales = car_sales.withColumn("Model_Category", model_id_parts.getItem(0))

In [0]:
# Preview 'Units_Sold' cast to string type.
# The result is only displayed; it is not assigned back, so 'car_sales' keeps its original column type.

display(car_sales.withColumn("Units_Sold", col("Units_Sold").cast("string")))

Branch_ID,Dealer_ID,Model_ID,Revenue,Units_Sold,Date_ID,Day,Month,Year,BranchName,DealerName,Product_Name,Model_Category
BR9546,DLR0060,Jee-M10,7223451,1,DT01246,28,5,2020,Premier Motors,"Fisker, Karma Motors",Jeep,Jee
BR9666,DLR0062,Jee-M12,22093020,3,DT01246,30,5,2020,Puma Motors,Ford Australia Motors,Jeep,Jee
BR9726,DLR0063,Jee-M13,22372413,3,DT01247,31,5,2020,Power Ranger Motors,Ford do Brasil Motors,Jeep,Jee
XYZ9726,XYZ0063,ZYXM13,22372413,3,DT01247,31,5,2020,DataFam Motors,Datafam Dealers,Surprise,ZYXM13


In [0]:
# Create a new column 'ItemPrice' by dividing 'Revenue' by 'Units_Sold'.
# The division is guarded: a row with Units_Sold = 0 gets a NULL ItemPrice instead of
# raising a DIVIDE_BY_ZERO error when ANSI mode is enabled. With ANSI mode disabled the
# result is unchanged, because Spark already returns NULL for a division by zero.
# A NULL 'Units_Sold' also yields NULL, exactly as the plain division would.

car_sales = car_sales.withColumn(
    "ItemPrice",
    when(col("Units_Sold") != 0, col("Revenue") / col("Units_Sold")),
)

In [0]:
# Total units sold per year and branch: years ascending, highest totals first within a year.
# NOTE: `sum` here is pyspark.sql.functions.sum (from the wildcard import), not the Python builtin.
display(
    car_sales
    .groupBy("Year", "BranchName")
    .agg(sum("Units_Sold").alias("Total_Units_Sold"))
    .orderBy(col("Year").asc(), col("Total_Units_Sold").desc())
)

Year,BranchName,Total_Units_Sold
2020,DataFam Motors,3
2020,Puma Motors,3
2020,Power Ranger Motors,3
2020,Premier Motors,1


Databricks visualization. Run in Databricks to view.

### Data Writing

In [0]:
# Persist the transformed data as Parquet in the 'curated' folder.
# "overwrite" replaces whatever already exists at the target path.
car_sales.write\
         .mode("overwrite")\
         .parquet("abfss://projects@projectstorageaccount1.dfs.core.windows.net/curated/CAR SALES/")

### Querying Curated Data

In [0]:
%sql
-- Sanity check: count the rows in the curated Parquet output by querying the files directly.
SELECT COUNT(*)
FROM parquet.`abfss://projects@projectstorageaccount1.dfs.core.windows.net/curated/CAR SALES/`;

count(1)
4
