# Slowly Changing Dimensions - Type 2

In [1]:
import os
import shutil
from datetime import datetime

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, DataFrame
import pyspark.sql.types as T
import pyspark.sql.functions as F

In [3]:
sc = SparkContext()
spark = SparkSession.builder.appName('scd2').getOrCreate()

## 1. Preperation steps
In the following cells we will perform the following steps:
1. Read-in our target dataframe
2. Add our technical columns to this dataframe
3. Save the target dataframe as our 'source' dataframe (initial load)

### Read-in dataframe

In [89]:
# Define schema
schema = T.StructType([
    T.StructField('INDEX', T.IntegerType(), True), 
    T.StructField('CUSTOMER_ID', T.StringType(), True), 
    T.StructField('FIRST_NAME', T.StringType(), True), 
    T.StructField('LAST_NAME', T.StringType(), True), 
    T.StructField('COMPANY', T.StringType(), True), 
    T.StructField('CITY', T.StringType(), True), 
    T.StructField('COUNTRY', T.StringType(), True), 
    T.StructField('PHONE_1', T.StringType(), True), 
    T.StructField('PHONE_2', T.StringType(), True), 
    T.StructField('EMAIL', T.StringType(), True), 
    T.StructField('SUBSCRIPTION_DATE', T.DateType(), True), 
    T.StructField('WEBSITE', T.StringType(), True)
])

# Read-in dataframe
df = (
    spark
    .read
    .option('header', True)
    .option('schema', schema)
    .csv('scd2_data/source.csv')
)

# Show dataframe
df.show(5)

+---------------+----------+---------+-----------------+--------------------+--------------------+--------------------+
|    CUSTOMER_ID|FIRST_NAME|LAST_NAME|             CITY|             COUNTRY|               PHONE|               EMAIL|
+---------------+----------+---------+-----------------+--------------------+--------------------+--------------------+
|DD37Cf93aecA6Dc|    Sheryl|   Baxter|     East Leonard|               Chile|        229.077.5154|zunigavanessa@smi...|
|1Ef7b82A4CAAD10|   Preston|   Lozano|East Jimmychester|            Djibouti|          5153435776|     vmata@colon.com|
|5Cef8BFA16c5e3c|     Linda|    Olsen|       Bensonview|  Dominican Republic|001-808-617-6467x...|stanleyblackwell@...|
|053d585Ab6b3159|    Joanna|   Bender|   West Priscilla|Slovakia (Slovak ...|001-234-203-0635x...|colinalvarado@mil...|
|2d08FB17EE273F4|     Aimee|    Downs|    Chavezborough|Bosnia and Herzeg...| (283)437-3886x88321| louis27@gilbert.com|
+---------------+----------+---------+--

### Add technical columns
We start be defining helper functions to add our technical columns to the dataframe.
Here, the following columns are added:
* `VALID_FROM`: indicates the timestamp at which this row was valid.
* `VALID_TO`: indicates the timestamp at which this row was/is no longer valid.
* `isCurrent`: indicates whether this row is currently 'active' or not.

This can be done using the `pyspark.sql.types` and `pyspark.sql.function` modules

In [90]:
def add_technical_columns(df: DataFrame, curr_timestamp: datetime = datetime.now()) -> DataFrame:
    return (
        df
        .withColumn('VALID_FROM', F.lit(curr_timestamp).cast(T.TimestampType()))
        .withColumn('VALID_TO', F.lit('9999-12-31 23:59:59').cast(T.TimestampType()))
        .withColumn('isCurrent', F.lit('Y').cast(T.StringType()))
    )

In [91]:
df = add_technical_columns(df)

In [92]:
df.show(5)

+---------------+----------+---------+-----------------+--------------------+--------------------+--------------------+--------------------+-------------------+---------+
|    CUSTOMER_ID|FIRST_NAME|LAST_NAME|             CITY|             COUNTRY|               PHONE|               EMAIL|          VALID_FROM|           VALID_TO|isCurrent|
+---------------+----------+---------+-----------------+--------------------+--------------------+--------------------+--------------------+-------------------+---------+
|DD37Cf93aecA6Dc|    Sheryl|   Baxter|     East Leonard|               Chile|        229.077.5154|zunigavanessa@smi...|2024-05-15 17:42:...|9999-12-31 23:59:59|        Y|
|1Ef7b82A4CAAD10|   Preston|   Lozano|East Jimmychester|            Djibouti|          5153435776|     vmata@colon.com|2024-05-15 17:42:...|9999-12-31 23:59:59|        Y|
|5Cef8BFA16c5e3c|     Linda|    Olsen|       Bensonview|  Dominican Republic|001-808-617-6467x...|stanleyblackwell@...|2024-05-15 17:42:...|9999-

### Saving dataframe as 'output'
PySpark has a particular way of saving parquet, delta, and csv files.
Because of this, we need to create a helper function, so that our output is saved as a single csv file.
Do not worry to much about understanding this code.

In [93]:
def save_as_csv(df: DataFrame, file_path: str):
    tmp_folder = file_path + 'tmp'
    
    # Save DataFrame to a temporary folder
    (
        df
        .coalesce(1)  # Ensure a single partition
        .write
        .mode('overwrite')
        .format('csv')
        .option('header', True)
        .save(tmp_folder)
    )
    
    # Find the single partition file
    for file_name in os.listdir(tmp_folder):
        if file_name.endswith('.csv'):
            tmp_file_path = os.path.join(tmp_folder, file_name)
            break
    
    # Move and rename the file to the final destination
    shutil.move(tmp_file_path, file_path)
    
    # Remove the temporary folder
    shutil.rmtree(tmp_folder)

In [94]:
save_as_csv(df=df, file_path='scd2_data/target.csv')

## 2. Starting the SCD2 Proces
Now, we will begin with the implementation of the Slowly Changing Dimensions type 1. We will be implementing the following steps:
1. Change the target dataframe by adding or editing some rows.
2. Read-in the target and source dataframe.
3. Select the rows in source dataframe that are new.
4. Select the rows in source dataframe that have been deleted.
5. Select the rows in source dataframe that are updated.
6. Insert, update, and/or delete the selected rows in the source dataframe.

### Step 1: Change the target dataframe by adding or editing some rows

* Make some alterations to the source data.
* You can find this file under notebooks/scd2_data/source.csv

### Step 2: Read-in the target and source dataframe.

In [121]:
# Read-in the source dataframe
source_schema = T.StructType([
    T.StructField('INDEX', T.IntegerType(), True), 
    T.StructField('CUSTOMER_ID', T.StringType(), True), 
    T.StructField('FIRST_NAME', T.StringType(), True), 
    T.StructField('LAST_NAME', T.StringType(), True), 
    T.StructField('COMPANY', T.StringType(), True), 
    T.StructField('CITY', T.StringType(), True), 
    T.StructField('COUNTRY', T.StringType(), True), 
    T.StructField('PHONE_1', T.StringType(), True), 
    T.StructField('PHONE_2', T.StringType(), True), 
    T.StructField('EMAIL', T.StringType(), True), 
    T.StructField('SUBSCRIPTION_DATE', T.DateType(), True), 
    T.StructField('WEBSITE', T.StringType(), True)
])
source_df = (
    spark
    .read
    .option('header', True)
    .option('schema', schema)
    .csv('scd2_data/source.csv')
)

# Read-in the target dataframe
target_schema = T.StructType([
    T.StructField('INDEX', T.IntegerType(), True), 
    T.StructField('CUSTOMER_ID', T.StringType(), True), 
    T.StructField('FIRST_NAME', T.StringType(), True), 
    T.StructField('LAST_NAME', T.StringType(), True), 
    T.StructField('COMPANY', T.StringType(), True), 
    T.StructField('CITY', T.StringType(), True), 
    T.StructField('COUNTRY', T.StringType(), True), 
    T.StructField('PHONE_1', T.StringType(), True), 
    T.StructField('PHONE_2', T.StringType(), True), 
    T.StructField('EMAIL', T.StringType(), True), 
    T.StructField('SUBSCRIPTION_DATE', T.DateType(), True), 
    T.StructField('WEBSITE', T.StringType(), True),
    T.StructField('VALID_FROM', T.TimestampType(), True),
    T.StructField('VALID_TO', T.TimestampType(), True),
    T.StructField('isCurrent', T.StringType(), True),
])
target_df = (
    spark
    .read
    .option('header', True)
    .option('schema', schema)
    .csv('scd2_data/target.csv')
)

In [122]:
source_df.show(5)

+---------------+----------+---------+-----------------+--------------------+--------------------+--------------------+
|    CUSTOMER_ID|FIRST_NAME|LAST_NAME|             CITY|             COUNTRY|               PHONE|               EMAIL|
+---------------+----------+---------+-----------------+--------------------+--------------------+--------------------+
|1Ef7b82A4CAAD10|  Prestoni|  Lozanoi|East Jimmychester|            Djibouti|          5153435776|     vmata@colon.com|
|XXXXXXXXXXXXXXX|     Puppi| Schtinki|East Jimmychester|            Djibouti|          5153435776|     vmata@colon.com|
|5Cef8BFA16c5e3c|     Linda|    Olsen|       Bensonview|  Dominican Republic|001-808-617-6467x...|stanleyblackwell@...|
|053d585Ab6b3159|    Joanna|   Bender|   West Priscilla|Slovakia (Slovak ...|001-234-203-0635x...|colinalvarado@mil...|
|2d08FB17EE273F4|     Aimee|    Downs|    Chavezborough|Bosnia and Herzeg...| (283)437-3886x88321| louis27@gilbert.com|
+---------------+----------+---------+--

In [123]:
target_df.show(5)

+---------------+----------+---------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------+
|    CUSTOMER_ID|FIRST_NAME|LAST_NAME|             CITY|             COUNTRY|               PHONE|               EMAIL|          VALID_FROM|            VALID_TO|isCurrent|
+---------------+----------+---------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+---------+
|DD37Cf93aecA6Dc|    Sheryl|   Baxter|     East Leonard|               Chile|        229.077.5154|zunigavanessa@smi...|2024-05-15T17:42:...|9999-12-31T23:59:...|        Y|
|1Ef7b82A4CAAD10|   Preston|   Lozano|East Jimmychester|            Djibouti|          5153435776|     vmata@colon.com|2024-05-15T17:42:...|9999-12-31T23:59:...|        Y|
|5Cef8BFA16c5e3c|     Linda|    Olsen|       Bensonview|  Dominican Republic|001-808-617-6467x...|stanleyblackwell@...|2024-05-15T17:42:...|

### Step 3: Select the rows in source dataframe that are new.

In [124]:
def get_inserts(source_df: DataFrame, target_df: DataFrame):
    # Find rows in source_df that are not present in target_df
    insert_df = source_df.join(target_df, on='CUSTOMER_ID', how='leftanti')
    
    return insert_df

In [125]:
insert_df = get_inserts(source_df, target_df)

In [126]:
insert_df.show(5)

+---------------+----------+---------+-----------------+--------+----------+---------------+
|    CUSTOMER_ID|FIRST_NAME|LAST_NAME|             CITY| COUNTRY|     PHONE|          EMAIL|
+---------------+----------+---------+-----------------+--------+----------+---------------+
|XXXXXXXXXXXXXXX|     Puppi| Schtinki|East Jimmychester|Djibouti|5153435776|vmata@colon.com|
+---------------+----------+---------+-----------------+--------+----------+---------------+



### Step 4: Select the rows in source dataframe that have been deleted.

In [127]:
def get_deletes(source_df: DataFrame, target_df: DataFrame):
    # Find rows in target_df that are not present in source_df
    delete_df = target_df.join(source_df, on='CUSTOMER_ID', how='leftanti')
    
    return delete_df

In [128]:
delete_df = get_deletes(source_df, target_df).select(*source_df.columns) # extra select to keep column order

In [129]:
delete_df.show(5)

+---------------+----------+---------+------------+-------+------------+--------------------+
|    CUSTOMER_ID|FIRST_NAME|LAST_NAME|        CITY|COUNTRY|       PHONE|               EMAIL|
+---------------+----------+---------+------------+-------+------------+--------------------+
|DD37Cf93aecA6Dc|    Sheryl|   Baxter|East Leonard|  Chile|229.077.5154|zunigavanessa@smi...|
+---------------+----------+---------+------------+-------+------------+--------------------+



### Step 5: Select the rows in source dataframe that are updated.

In [130]:
def add_hash_column(df: DataFrame, columns: list, hash_column_name: str = 'CTC_HASH') -> DataFrame:
    # Add a hash column to the DataFrame based on the specified columns.
    return df.withColumn(hash_column_name, F.sha2(F.concat_ws('||', *columns), 256))

def get_updates(source_df: DataFrame, target_df: DataFrame, ctc_cols: list):
    # Add hash columns based on the specified columns
    source_df_hash = add_hash_column(source_df, ctc_cols)
    target_df_hash = add_hash_column(target_df, ctc_cols)
    
    # Find corresponding rows between source_df and target_df
    overlap_df = source_df_hash.alias('src').join(target_df_hash.alias('tgt'), on='CUSTOMER_ID', how='inner')
    
    # Apply filter to get rows where hash values are different
    update_df = (
        overlap_df
        .filter(F.col('src.CTC_HASH') != F.col('tgt.CTC_HASH'))
        .select('src.*')
        .drop('CTC_HASH')
    )
    
    return update_df

In [133]:
key_cols = ['COSTUMER_ID']
ctc_cols = [col for col in source_df.columns if col not in key_cols]

In [134]:
update_df = get_updates(source_df, target_df, ctc_cols).select(*source_df.columns) # extra select to keep column order

In [135]:
update_df.show(5)

+---------------+----------+---------+-----------------+--------+----------+---------------+
|    CUSTOMER_ID|FIRST_NAME|LAST_NAME|             CITY| COUNTRY|     PHONE|          EMAIL|
+---------------+----------+---------+-----------------+--------+----------+---------------+
|1Ef7b82A4CAAD10|  Prestoni|  Lozanoi|East Jimmychester|Djibouti|5153435776|vmata@colon.com|
+---------------+----------+---------+-----------------+--------+----------+---------------+



### Step 6: Insert, update, and/or delete the selected rows in the source dataframe.

In [136]:
curr_timestamp = datetime.now() # we need this for our inserts, updates and deletes

#### Handling deletes

In [137]:
def handle_deletes(target_df: DataFrame, delete_df: DataFrame, curr_timestamp: datetime):
    # Step 1: Find the rows with the same key in the target_df
    records_to_invalidate = (
        target_df.alias('tgt')
        .join(delete_df.alias('inv'), on='CUSTOMER_ID', how='inner')
        .select('tgt.*')
    )
    
    # Step 2: Create new dataframe with updated VALID_TO and isCurrent = 'N'
    invalidated_records = (
        records_to_invalidate
        .withColumn("VALID_TO", F.lit(curr_timestamp).cast(T.TimestampType()))
        .withColumn("isCurrent", F.lit("N").cast(T.StringType()))
    )

    # Step 3: Delete current active records from the dataframe
    target_df = target_df.join(invalidated_records, on='CUSTOMER_ID', how='leftanti')
    
    # Step 4: Add new invalidated records to target_df
    target_df = target_df.union(invalidated_records)

    return target_df

In [138]:
target_df = handle_deletes(target_df, delete_df, curr_timestamp)

In [139]:
# Show the last 5 rows of the dataframe
last_five_df = spark.createDataFrame(target_df.tail(5), target_df.schema)
last_five_df.show()

+---------------+----------+---------+---------------+---------------+--------------------+--------------------+--------------------+--------------------+---------+
|    CUSTOMER_ID|FIRST_NAME|LAST_NAME|           CITY|        COUNTRY|               PHONE|               EMAIL|          VALID_FROM|            VALID_TO|isCurrent|
+---------------+----------+---------+---------------+---------------+--------------------+--------------------+--------------------+--------------------+---------+
|CeD220bdAaCfaDf|      Lynn| Atkinson|   New Bradview|      Sri Lanka|     +1-846-706-2218|   vkemp@ferrell.com|2024-05-15T17:42:...|9999-12-31T23:59:...|        Y|
|28CDbC0dFe4b1Db|      Fred|   Guerra|     Ortegaland|Solomon Islands|+1-753-067-8419x7170|    swagner@kane.org|2024-05-15T17:42:...|9999-12-31T23:59:...|        Y|
|c23d1D9EE8DEB0A|    Yvonne|   Farmer|Lake Elijahview|          Aruba|       (530)311-9786|mccarthystephen@h...|2024-05-15T17:42:...|9999-12-31T23:59:...|        Y|
|2354a0E33

#### Handling updates

In [143]:
# Update the target DataFrame
def handle_updates(target_df: DataFrame, update_df: DataFrame, curr_timestamp: datetime) -> DataFrame:
    # Step 1: Invalidate existing rows in target_df with rows from update_df (hint: use handle_deletes())
    target_df = handle_deletes(target_df, update_df, curr_timestamp)

    # Step 2: Add technical columns to update_df (hint: use add_technical_columns())
    update_df = add_technical_columns(update_df, curr_timestamp)
    
    # Step 3: Insert new rows from update_df to target_df
    target_df = target_df.union(update_df)

    return target_df

In [144]:
target_df = handle_updates(target_df, update_df, curr_timestamp)

In [145]:
# Show the last 5 rows of the dataframe
last_five_df = spark.createDataFrame(target_df.tail(5), target_df.schema)
last_five_df.show()

+---------------+----------+---------+-----------------+-------------+------------+--------------------+--------------------+--------------------+---------+
|    CUSTOMER_ID|FIRST_NAME|LAST_NAME|             CITY|      COUNTRY|       PHONE|               EMAIL|          VALID_FROM|            VALID_TO|isCurrent|
+---------------+----------+---------+-----------------+-------------+------------+--------------------+--------------------+--------------------+---------+
|7Ce381e4Afa4ba9|   Gabriel|    Mejia|    Port Annatown|Liechtenstein|  4077245425|coleolson@jenning...|2024-05-15T17:42:...|9999-12-31T23:59:...|        Y|
|DD37Cf93aecA6Dc|    Sheryl|   Baxter|     East Leonard|        Chile|229.077.5154|zunigavanessa@smi...|2024-05-15T17:42:...|2024-05-15 17:52:...|        N|
|1Ef7b82A4CAAD10|   Preston|   Lozano|East Jimmychester|     Djibouti|  5153435776|     vmata@colon.com|2024-05-15T17:42:...|2024-05-15 17:52:...|        N|
|1Ef7b82A4CAAD10|  Prestoni|  Lozanoi|East Jimmychester|  

#### Handling inserts

In [146]:
def handle_inserts(target_df: DataFrame, insert_df: DataFrame, curr_timestamp: datetime):
    # Step 1: Add technical columns to update_df (hint: use add_technical_columns())
    update_df = add_technical_columns(insert_df, curr_timestamp)
    
    # Step 2: insert rows from insert_df into target_df
    target_df = target_df.union(insert_df)
    
    return target_df

In [147]:
target_df = handle_updates(target_df, insert_df, curr_timestamp)

In [149]:
# Show the last 5 rows of the dataframe
last_five_df = spark.createDataFrame(target_df.tail(5), target_df.schema)
last_five_df.show()

+---------------+----------+---------+-----------------+--------+------------+--------------------+--------------------+--------------------+---------+
|    CUSTOMER_ID|FIRST_NAME|LAST_NAME|             CITY| COUNTRY|       PHONE|               EMAIL|          VALID_FROM|            VALID_TO|isCurrent|
+---------------+----------+---------+-----------------+--------+------------+--------------------+--------------------+--------------------+---------+
|DD37Cf93aecA6Dc|    Sheryl|   Baxter|     East Leonard|   Chile|229.077.5154|zunigavanessa@smi...|2024-05-15T17:42:...|2024-05-15 17:52:...|        N|
|1Ef7b82A4CAAD10|   Preston|   Lozano|East Jimmychester|Djibouti|  5153435776|     vmata@colon.com|2024-05-15T17:42:...|2024-05-15 17:52:...|        N|
|1Ef7b82A4CAAD10|  Prestoni|  Lozanoi|East Jimmychester|Djibouti|  5153435776|     vmata@colon.com|2024-05-15 17:52:...|2024-05-15 17:52:...|        N|
|1Ef7b82A4CAAD10|  Prestoni|  Lozanoi|East Jimmychester|Djibouti|  5153435776|     vmata

#### Write to file

In [150]:
save_as_csv(df=target_df, file_path='scd2_data/target.csv')