In [0]:
#import libraries

from pyspark.sql.functions import *
from pyspark.sql.types import *
from delta.tables import DeltaTable

###CREATING FLAG PARAMETER

In [0]:
dbutils.widgets.text("incremental_flag","0")

In [0]:
#incremental_flag = 0 means first load

incremental_flag = dbutils.widgets.get("incremental_flag")

###CREATING DIMENSION BRANCH  

In [0]:
%sql
SELECT * FROM parquet.`abfss://projects@projectstorageaccount1.dfs.core.windows.net/curated/CAR SALES/`;

Branch_ID,Dealer_ID,Model_ID,Revenue,Units_Sold,Date_ID,Day,Month,Year,BranchName,DealerName,Product_Name,Model_Category,ItemPrice
BR9546,DLR0060,Jee-M10,7223451,1,DT01246,28,5,2020,Premier Motors,"Fisker, Karma Motors",Jeep,Jee,7223451.0
BR9666,DLR0062,Jee-M12,22093020,3,DT01246,30,5,2020,Puma Motors,Ford Australia Motors,Jeep,Jee,7364340.0
BR9726,DLR0063,Jee-M13,22372413,3,DT01247,31,5,2020,Power Ranger Motors,Ford do Brasil Motors,Jeep,Jee,7457471.0
XYZ9726,XYZ0063,ZYXM13,22372413,3,DT01247,31,5,2020,DataFam Motors,Datafam Dealers,Surprise,ZYXM13,7457471.0


###Fetch related columns

In [0]:
branch_src_df = spark.sql('''
                         SELECT DISTINCT Branch_ID, BranchName 
                         FROM parquet.`abfss://projects@projectstorageaccount1.dfs.core.windows.net/curated/CAR SALES/`;
                         '''
                        )


In [0]:
branch_src_df.display()

Branch_ID,BranchName
BR9666,Puma Motors
BR9726,Power Ranger Motors
BR9546,Premier Motors
XYZ9726,DataFam Motors


###Dim Branch sink - Initial and Incremental

In [0]:
if spark.catalog.tableExists("car_sales_catalog.refined.dim_branch"):
    
    branch_sink_df  = spark.sql('''
                            SELECT Dim_Branch_Key, Branch_ID, BranchName
                            FROM car_sales_catalog.refined.dim_branch
                            '''
                        )
else:
    branch_sink_df  = spark.sql('''
                            SELECT 1 as Dim_Branch_Key, Branch_ID, BranchName 
                            FROM parquet.`abfss://projects@projectstorageaccount1.dfs.core.windows.net/curated/CAR SALES/`
                            WHERE 1 = 0;
                            '''
                        )

In [0]:
branch_sink_df.display()

Dim_Branch_Key,Branch_ID,BranchName
1,BR0131,Audi Motors
2,BR0760,Healey Motors
3,BR0789,Hillman Motors
4,BR0938,Isotta Fraschini Motors
5,BR1040,Lada Motors
6,BR1693,Saleen Motors
7,BR1792,Simca do Brasil Motors
8,BR1799,Simca do Brasil Motors
9,BR1955,Toyota Motors
10,BR1978,Turner Motors


###Filtering New Records and Old Records

In [0]:
df_filter = branch_src_df.join(branch_sink_df, branch_src_df.Branch_ID == branch_sink_df.Branch_ID, "left")\
                            .select(branch_src_df.Branch_ID, branch_src_df.BranchName, branch_sink_df.Dim_Branch_Key)  

In [0]:
df_filter.display()

Branch_ID,BranchName,Dim_Branch_Key
BR9666,Puma Motors,
BR9726,Power Ranger Motors,
BR9546,Premier Motors,
XYZ9726,DataFam Motors,


**df_filter_old**

In [0]:
df_filter_old = df_filter.filter(col('Dim_Branch_Key').isNotNull())  

In [0]:
df_filter_old.display()

Branch_ID,BranchName,Dim_Branch_Key


**df_filter_new**

In [0]:
df_filter_new = df_filter.filter(col('Dim_Branch_Key').isNull()).select(branch_src_df.Branch_ID, branch_src_df.BranchName)

In [0]:
df_filter_new.display()

Branch_ID,BranchName
BR9666,Puma Motors
BR9726,Power Ranger Motors
BR9546,Premier Motors
XYZ9726,DataFam Motors


###CREATE Surrogate key

####Fetch max surrogate key from existing table

In [0]:
if incremental_flag == '0':
    max_value = 1
else:
    max_value = spark.sql("SELECT max(Dim_Branch_Key) FROM car_sales_catalog.refined.dim_branch").collect()[0][0] + 1

In [0]:
df_filter_new = df_filter_new.withColumn('Dim_Branch_Key', max_value + monotonically_increasing_id())

In [0]:
df_filter_new.display()

Branch_ID,BranchName,Dim_Branch_Key
BR9666,Puma Motors,1837
BR9726,Power Ranger Motors,1838
BR9546,Premier Motors,1839
XYZ9726,DataFam Motors,1840


###Creating final df - df_filter_old + df_filter_new

In [0]:
df_final = df_filter_new.union(df_filter_old)

In [0]:
df_final.display()

Branch_ID,BranchName,Dim_Branch_Key
BR9666,Puma Motors,1837
BR9726,Power Ranger Motors,1838
BR9546,Premier Motors,1839
XYZ9726,DataFam Motors,1840


###SCD Type 1 UPSERT

In [0]:
#Incremental run

if spark.catalog.tableExists('car_sales_catalog.refined.dim_branch'):
    deltaTable = DeltaTable.forPath(spark, "abfss://projects@projectstorageaccount1.dfs.core.windows.net/refined/CAR SALES/dim_branch")

    deltaTable.alias("target").merge(df_final.alias("source"), "target.Dim_Branch_Key == source.Dim_Branch_Key")\
                                .whenMatchedUpdateAll()\
                                .whenNotMatchedInsertAll()\
                                .execute()    

#Initial run
else:
    df_final.write.format("delta")\
                    .mode("overwrite")\
                    .option("path","abfss://projects@projectstorageaccount1.dfs.core.windows.net/refined/CAR SALES/dim_branch" )\
                    .saveAsTable("car_sales_catalog.refined.dim_branch")