In [0]:
from pyspark.sql.functions import col, lit, current_timestamp,coalesce
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, FloatType, TimestampType
from datetime import datetime

# Columns shared by the source feed and the target DataFrame. Every field is
# declared non-nullable (third StructField argument is False).
employee_fields = [
    StructField("Emp_ID", IntegerType(), False),
    StructField("First_Name", StringType(), False),
    StructField("Last_Name", StringType(), False),
    StructField("Salary", FloatType(), False),
    StructField("Nationality", StringType(), False),
]

# Source schema: the employee columns plus the change timestamp that the
# deduplication step orders by to pick the latest row per Emp_ID.
source_schema = StructType(
    employee_fields + [StructField("timestamp", TimestampType(), False)]
)

# Target schema: the employee columns only (no timestamp).
target_schema = StructType(employee_fields)

# Initial source rows. The timestamps are datetime objects and the schema
# already declares the column as TimestampType, so no cast is needed after
# the DataFrame is created.
initial_data = [
    (1, 'Scott', 'Tiger', 1000.0, 'India', datetime(2023, 1, 1, 0, 0, 0)),
    (2, 'John', 'Clair', 2000.0, 'UK', datetime(2023, 1, 1, 0, 0, 0))
]
employee_source = spark.createDataFrame(initial_data, schema=source_schema)

# Create an empty target DataFrame
employee_target = spark.createDataFrame([], schema=target_schema)





In [0]:
# Deduplicate the source DataFrame: keep only the newest row per Emp_ID.
# NOTE: this reads whatever `employee_source` currently holds, so re-run this
# cell after new rows have been appended to the source.
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

# Rank rows within each employee from newest to oldest timestamp.
windowSpec = Window.partitionBy("Emp_ID").orderBy(col("timestamp").desc())
deduped_employee_source = (
    employee_source
    .withColumn("row_num", row_number().over(windowSpec))
    .filter(col("row_num") == 1)   # row 1 == latest record for that Emp_ID
    .drop("row_num")
)

# Merge operation: update existing records and insert new ones.
# Full outer join so that employees present on only one side are kept.
# The join uses an explicit equality condition (not on='Emp_ID') so that both
# `target.Emp_ID` and `source.Emp_ID` remain ordinary, addressable columns in
# the join output; the select below refers to each side by its alias.
merged_df = employee_target.alias('target').join(
    deduped_employee_source.alias('source'),
    on=col('target.Emp_ID') == col('source.Emp_ID'),
    how='outer'
)

# Create updated DataFrame: for every target column prefer the source value
# and fall back to the existing target value when the source side is NULL
# (i.e. the employee has no row in the deduplicated source).
updated_df = merged_df.select(
    *[
        coalesce(col(f'source.{name}'), col(f'target.{name}')).alias(name)
        for name in employee_target.columns
    ]
)

# Show the updated target DataFrame
updated_df.show()

+------+----------+---------+-------+-----------+
|Emp_ID|First_Name|Last_Name| Salary|Nationality|
+------+----------+---------+-------+-----------+
|     1|     Scott|    Tiger|1000.13|        USA|
|     2|      John|    Clair| 2000.0|         UK|
+------+----------+---------+-------+-----------+



In [0]:
# New source record for Emp_ID 1 with a later timestamp than the initial row,
# carrying a changed Salary and Nationality.
new_data = [
    (1, 'Scott', 'Tiger', 1000.13, 'USA', datetime(2023, 1, 2, 0, 0, 0))
]
new_employee_source = spark.createDataFrame(new_data, schema=source_schema)

# Append the new rows to the existing source data. unionByName matches columns
# by name rather than by position, so a future change in column order raises
# an error instead of silently putting values into the wrong columns.
# NOTE: this rebinds `employee_source`, so each run of this cell appends the
# row again.
employee_source = employee_source.unionByName(new_employee_source)

In [0]:
# Remove the DBFS directory for employee_target; the second argument (True)
# asks for a recursive delete.
# NOTE(review): this deletes files only. Presumably any table registered under
# this name still exists in the metastore; confirm whether DROP TABLE is
# what is actually intended here.
employee_target_path = "dbfs:/user/hive/warehouse/employee_target"
dbutils.fs.rm(employee_target_path, True)

Out[10]: True

In [0]:
%sql
-- Create the source table if it does not exist, including the timestamp column
CREATE TABLE IF NOT EXISTS employee_source (
    Emp_ID INT,
    First_Name STRING,
    Last_Name STRING,
    Salary FLOAT,
    Nationality STRING,
    timestamp TIMESTAMP
);

-- Insert initial data into the source table.
-- NOTE: this INSERT is unconditional, so every run of the cell appends these
-- two rows again; the "latest record" view below keeps one row per Emp_ID.
INSERT INTO employee_source VALUES
    (1, 'Scott', 'Tiger', 1000.0, 'India', CURRENT_TIMESTAMP()),
    (2, 'John', 'Clair', 2000.0, 'UK', CURRENT_TIMESTAMP());

-- Create the target table if it does not exist.
-- start_date / end_date bound the validity of a version; current_flag marks
-- the version that is currently in effect.
CREATE TABLE IF NOT EXISTS employee_target (
    Emp_ID INT,
    First_Name STRING,
    Last_Name STRING,
    Salary FLOAT,
    Nationality STRING,
    start_date DATE,
    end_date DATE,
    current_flag BOOLEAN
);

-- Latest record per employee from the source table.
-- This is a temporary view rather than a CTE: a CTE is visible only to the
-- single statement it prefixes, while both the MERGE and the follow-up INSERT
-- below need to read this result.
CREATE OR REPLACE TEMPORARY VIEW latest_employee_records AS
SELECT Emp_ID, First_Name, Last_Name, Salary, Nationality, timestamp
FROM (
    SELECT
        *,
        ROW_NUMBER() OVER (PARTITION BY Emp_ID ORDER BY timestamp DESC) AS rn
    FROM employee_source
) ranked
WHERE rn = 1;

-- SCD Type 2, step 1: expire the current version of every employee whose
-- attributes changed, and insert employees that have no current version.
-- Change detection uses the null-safe equality operator (<=>) so that a value
-- changing to or from NULL counts as a change; a plain != evaluates to NULL
-- in that case and the row would be skipped.
MERGE INTO employee_target AS target
USING latest_employee_records AS source
ON target.Emp_ID = source.Emp_ID AND target.current_flag = TRUE
WHEN MATCHED
    AND NOT (
        target.First_Name <=> source.First_Name AND
        target.Last_Name <=> source.Last_Name AND
        target.Salary <=> source.Salary AND
        target.Nationality <=> source.Nationality
    )
THEN
    UPDATE SET
        target.current_flag = FALSE,
        -- end_date is a DATE column; make the conversion explicit.
        target.end_date = CAST(source.timestamp AS DATE)
WHEN NOT MATCHED THEN
    INSERT (
        Emp_ID,
        First_Name,
        Last_Name,
        Salary,
        Nationality,
        start_date,
        end_date,
        current_flag
    )
    VALUES (
        source.Emp_ID,
        source.First_Name,
        source.Last_Name,
        source.Salary,
        source.Nationality,
        CAST(source.timestamp AS DATE),
        NULL,
        TRUE
    );

-- SCD Type 2, step 2: insert the new current version for the employees whose
-- previous version was just expired. After step 1 these are exactly the source
-- employees that have no current row in the target, so an anti-join on the
-- current row selects them. This avoids comparing the DATE end_date with the
-- TIMESTAMP source value (which only matches at exactly midnight) and makes
-- the statement safe to re-run: once the new row exists, nothing is selected.
INSERT INTO employee_target (Emp_ID, First_Name, Last_Name, Salary, Nationality, start_date, end_date, current_flag)
SELECT
    source.Emp_ID,
    source.First_Name,
    source.Last_Name,
    source.Salary,
    source.Nationality,
    CAST(source.timestamp AS DATE),
    CAST(NULL AS DATE),
    TRUE
FROM latest_employee_records source
LEFT ANTI JOIN employee_target target
ON target.Emp_ID = source.Emp_ID AND target.current_flag = TRUE;

-- Select data from the target table to verify changes
SELECT * FROM employee_target ORDER BY Emp_ID, start_date;


[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
[0;32m<command-1101136203510778>[0m in [0;36m<cell line: 1>[0;34m()[0m
[1;32m     10[0m     [0mdisplay[0m[0;34m([0m[0mdf[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[1;32m     11[0m     [0;32mreturn[0m [0mdf[0m[0;34m[0m[0;34m[0m[0m
[0;32m---> 12[0;31m   [0m_sqldf[0m [0;34m=[0m [0m____databricks_percent_sql[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[1;32m     13[0m [0;32mfinally[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[1;32m     14[0m   [0;32mdel[0m [0m____databricks_percent_sql[0m[0;34m[0m[0;34m[0m[0m

[0;32m<command-1101136203510778>[0m in [0;36m____databricks_percent_sql[0;34m()[0m
[1;32m      6[0m     [0mspark[0m[0;34m.[0m[0msql[0m[0;34m([0m[0mbase64[0m[0;34m.[0m[0mstandard_b64decode[0m[0;34m([0m[0;34m"LS0gQ3JlYXRlIHRoZSB0YXJnZXQ

In [0]:
%sql
-- Perform the SCD Type 2 MERGE operation: expire the current version of every
-- employee whose attributes changed, and insert employees that have no
-- current version in the target.
-- NOTE(review): employee_target1 and latest_employee_records1 are not created
-- in the cells visible here; they must already exist when this cell runs.
MERGE INTO employee_target1 AS target
USING (
    SELECT
        Emp_ID,
        First_Name,
        Last_Name,
        Salary,
        Nationality,
        timestamp
    FROM latest_employee_records1
) AS source
ON target.Emp_ID = source.Emp_ID AND target.current_flag = TRUE
-- Change detection uses the null-safe equality operator (<=>): with a plain
-- != the comparison evaluates to NULL when either side is NULL, so a value
-- changing to or from NULL would not be detected.
WHEN MATCHED
    AND NOT (
        target.First_Name <=> source.First_Name AND
        target.Last_Name <=> source.Last_Name AND
        target.Salary <=> source.Salary AND
        target.Nationality <=> source.Nationality
    )
THEN
    UPDATE SET
        target.current_flag = FALSE,
        target.end_date = source.timestamp
WHEN NOT MATCHED THEN
    INSERT (
        Emp_ID,
        First_Name,
        Last_Name,
        Salary,
        Nationality,
        start_date,
        end_date,
        current_flag
    )
    VALUES (
        source.Emp_ID,
        source.First_Name,
        source.Last_Name,
        source.Salary,
        source.Nationality,
        source.timestamp,
        NULL,
        TRUE
    );


In [0]:
%sql
-- Insert new records for the updated entries.
-- Once the MERGE has expired the changed versions, the employees that still
-- need a new current version are exactly the source employees with no current
-- row in the target, so an anti-join on the current row selects them.
-- This replaces an equality join between target.end_date and source.timestamp,
-- which matches nothing if end_date is a DATE column (as it is in
-- employee_target) and the source timestamp is not exactly midnight, and
-- which inserts the same row again each time the statement is re-run.
-- NOTE(review): employee_target1 and latest_employee_records1 are not created
-- in the cells visible here; they must already exist when this cell runs.
INSERT INTO employee_target1 (Emp_ID, First_Name, Last_Name, Salary, Nationality, start_date, end_date, current_flag)
SELECT
    source.Emp_ID,
    source.First_Name,
    source.Last_Name,
    source.Salary,
    source.Nationality,
    source.timestamp,
    NULL,
    TRUE
FROM latest_employee_records1 source
LEFT ANTI JOIN employee_target1 target
ON target.Emp_ID = source.Emp_ID AND target.current_flag = TRUE;

-- Select data from the target table to verify changes
SELECT * FROM employee_target1 ORDER BY Emp_ID, start_date;
