In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit

In [21]:
spark = SparkSession.builder.appName("SCD_Type_II").getOrCreate()

In [None]:
# Load data to DataFrame
customer_df = spark.read.option("header", True).csv("dataset/customer.csv")
new_customer_df = spark.read.option("header", True).csv("dataset/newcustomer.csv")

In [4]:
customer_df.show()

+-----------+------------+-------------+----------+--------+----------+
|Customer_ID|        Name|      Address|Start_Date|End_Date|Is_Current|
+-----------+------------+-------------+----------+--------+----------+
|        101|    John Doe|     New York|01-01-2022|    NULL|         Y|
|        102|  Jane Smith|      Chicago|15-03-2023|    NULL|         Y|
|        103| Michael Lee|San Francisco|20-07-2021|    NULL|         Y|
|        104| Emily Davis|       Boston|10-01-2024|    NULL|         Y|
|        105|David Wilson|      Seattle|05-11-2023|    NULL|         Y|
+-----------+------------+-------------+----------+--------+----------+



In [5]:
new_customer_df.show()

+-----------+----------+-----------+----------+
|Customer_ID|      Name|    Address|Start_Date|
+-----------+----------+-----------+----------+
|        101|  John Doe|Los Angeles|20-07-2025|
|        102|Jane Smith|     Dallas|20-07-2025|
+-----------+----------+-----------+----------+



In [6]:
current_date = "2025-07-20"
previous_date = "2025-07-19"

In [7]:
joined_df = customer_df.alias("cust").join(
    new_customer_df.alias("new"),
    on="Customer_ID"
).select(
    col("cust.Customer_ID"),
    col("cust.Name").alias("old_name"),
    col("cust.Address").alias("old_address"),
    col("cust.Start_Date"),
    col("cust.End_Date"),
    col("cust.Is_Current"),
    col("new.Name").alias("new_name"),
    col("new.Address").alias("new_address"),
    col("new.Start_Date").alias("new_start_date")
)


In [8]:
joined_df.show()

+-----------+----------+-----------+----------+--------+----------+----------+-----------+--------------+
|Customer_ID|  old_name|old_address|Start_Date|End_Date|Is_Current|  new_name|new_address|new_start_date|
+-----------+----------+-----------+----------+--------+----------+----------+-----------+--------------+
|        101|  John Doe|   New York|01-01-2022|    NULL|         Y|  John Doe|Los Angeles|    20-07-2025|
|        102|Jane Smith|    Chicago|15-03-2023|    NULL|         Y|Jane Smith|     Dallas|    20-07-2025|
+-----------+----------+-----------+----------+--------+----------+----------+-----------+--------------+



In [9]:
changed_rows = joined_df.filter(col("old_address") != col("new_address"))

In [11]:
changed_rows.show()

+-----------+----------+-----------+----------+--------+----------+----------+-----------+--------------+
|Customer_ID|  old_name|old_address|Start_Date|End_Date|Is_Current|  new_name|new_address|new_start_date|
+-----------+----------+-----------+----------+--------+----------+----------+-----------+--------------+
|        101|  John Doe|   New York|01-01-2022|    NULL|         Y|  John Doe|Los Angeles|    20-07-2025|
|        102|Jane Smith|    Chicago|15-03-2023|    NULL|         Y|Jane Smith|     Dallas|    20-07-2025|
+-----------+----------+-----------+----------+--------+----------+----------+-----------+--------------+



In [12]:
expired_records = customer_df.join(
    changed_rows.select("Customer_ID"), on="Customer_ID"
).withColumn("End_Date", lit(previous_date)) \
 .withColumn("Is_Current", lit("N"))

In [13]:
expired_records.show()

+-----------+----------+--------+----------+----------+----------+
|Customer_ID|      Name| Address|Start_Date|  End_Date|Is_Current|
+-----------+----------+--------+----------+----------+----------+
|        101|  John Doe|New York|01-01-2022|2025-07-19|         N|
|        102|Jane Smith| Chicago|15-03-2023|2025-07-19|         N|
+-----------+----------+--------+----------+----------+----------+



In [14]:
new_records = changed_rows.select(
    col("Customer_ID"),
    col("new_name").alias("Name"),
    col("new_address").alias("Address"),
    col("new_start_date").alias("Start_Date")
).withColumn("End_Date", lit("")) \
 .withColumn("Is_Current", lit("Y"))

In [15]:
new_records.show()

+-----------+----------+-----------+----------+--------+----------+
|Customer_ID|      Name|    Address|Start_Date|End_Date|Is_Current|
+-----------+----------+-----------+----------+--------+----------+
|        101|  John Doe|Los Angeles|20-07-2025|        |         Y|
|        102|Jane Smith|     Dallas|20-07-2025|        |         Y|
+-----------+----------+-----------+----------+--------+----------+



In [16]:
unchanged_records = customer_df.join(
    changed_rows.select("Customer_ID"), on="Customer_ID", how="left_anti"
)

In [17]:
unchanged_records.show()

+-----------+------------+-------------+----------+--------+----------+
|Customer_ID|        Name|      Address|Start_Date|End_Date|Is_Current|
+-----------+------------+-------------+----------+--------+----------+
|        103| Michael Lee|San Francisco|20-07-2021|    NULL|         Y|
|        104| Emily Davis|       Boston|10-01-2024|    NULL|         Y|
|        105|David Wilson|      Seattle|05-11-2023|    NULL|         Y|
+-----------+------------+-------------+----------+--------+----------+



In [18]:
final_df = expired_records.union(new_records).union(unchanged_records)

In [19]:
final_df.show()

+-----------+------------+-------------+----------+----------+----------+
|Customer_ID|        Name|      Address|Start_Date|  End_Date|Is_Current|
+-----------+------------+-------------+----------+----------+----------+
|        101|    John Doe|     New York|01-01-2022|2025-07-19|         N|
|        102|  Jane Smith|      Chicago|15-03-2023|2025-07-19|         N|
|        101|    John Doe|  Los Angeles|20-07-2025|          |         Y|
|        102|  Jane Smith|       Dallas|20-07-2025|          |         Y|
|        103| Michael Lee|San Francisco|20-07-2021|      NULL|         Y|
|        104| Emily Davis|       Boston|10-01-2024|      NULL|         Y|
|        105|David Wilson|      Seattle|05-11-2023|      NULL|         Y|
+-----------+------------+-------------+----------+----------+----------+



In [20]:
final_df.orderBy("Customer_ID", "Start_Date").show(truncate=False)

+-----------+------------+-------------+----------+----------+----------+
|Customer_ID|Name        |Address      |Start_Date|End_Date  |Is_Current|
+-----------+------------+-------------+----------+----------+----------+
|101        |John Doe    |New York     |01-01-2022|2025-07-19|N         |
|101        |John Doe    |Los Angeles  |20-07-2025|          |Y         |
|102        |Jane Smith  |Chicago      |15-03-2023|2025-07-19|N         |
|102        |Jane Smith  |Dallas       |20-07-2025|          |Y         |
|103        |Michael Lee |San Francisco|20-07-2021|NULL      |Y         |
|104        |Emily Davis |Boston       |10-01-2024|NULL      |Y         |
|105        |David Wilson|Seattle      |05-11-2023|NULL      |Y         |
+-----------+------------+-------------+----------+----------+----------+

