In [47]:
import os
import shutil
from datetime import datetime

In [48]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, DataFrame
import pyspark.sql.types as T
import pyspark.sql.functions as F

In [None]:
sc = SparkContext()
spark = SparkSession.builder.appName('example app').getOrCreate()

#### Preperation steps
In the following cells we will perform the following steps:
1. Read-in our target dataframe
2. Add our technical columns to this dataframe
3. Save the target dataframe as our 'source' dataframe (initial load)

##### Read-in dataframe

In [49]:
# Define schema
schema = T.StructType([
    T.StructField('INDEX', T.IntegerType(), True), 
    T.StructField('CUSTOMER_ID', T.StringType(), True), 
    T.StructField('FIRST_NAME', T.StringType(), True), 
    T.StructField('LAST_NAME', T.StringType(), True), 
    T.StructField('COMPANY', T.StringType(), True), 
    T.StructField('CITY', T.StringType(), True), 
    T.StructField('COUNTRY', T.StringType(), True), 
    T.StructField('PHONE_1', T.StringType(), True), 
    T.StructField('PHONE_2', T.StringType(), True), 
    T.StructField('EMAIL', T.StringType(), True), 
    T.StructField('SUBSCRIPTION_DATE', T.DateType(), True), 
    T.StructField('WEBSITE', T.StringType(), True)
])

# Read-in dataframe
df = (
    spark
    .read
    .option('header', True)
    .option('schema', schema)
    .csv('scd_data/source.csv')
)

# Show dataframe
df.show(5)

+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|INDEX|    CUSTOMER_ID|FIRST_NAME|LAST_NAME|             COMPANY|             CITY|             COUNTRY|             PHONE_1|             PHONE_2|               EMAIL|SUBSCRIPTION_DATE|             WEBSITE|
+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|    1|DD37Cf93aecA6Dc|    Sheryl|   Baxter|     Rasmussen Group|     East Leonard|               Chile|        229.077.5154|    397.884.0519x718|zunigavanessa@smi...|       2020-08-24|http://www.stephe...|
|    2|1Ef7b82A4CAAD10|   Preston|   Lozano|         Vega-Gentry|East Jimmychester|            Djibouti|          5153435776|    686-620-1820x944|     vmata@colon.com|     

##### Add technical columns
We start be defining helper functions to add our technical columns to the dataframe.
Here, the following columns are added:
* `VALID_FROM`: indicates the timestamp at which this row was valid.
* `VALID_TO`: indicates the timestamp at which this row was/is no longer valid.

This can be done using the `pyspark.sql.types` and `pyspark.sql.function` modules.

In [50]:
def add_technical_columns(df: DataFrame) -> DataFrame:
    return (
        df
        .withColumn('VALID_FROM', F.current_timestamp().cast(T.TimestampType()))
        .withColumn('VALID_TO', F.lit('9999-12-31 23:59:59').cast(T.TimestampType()))
    )

In [51]:
df = add_technical_columns(df)

In [52]:
df.show(5)

+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+-------------------+
|INDEX|    CUSTOMER_ID|FIRST_NAME|LAST_NAME|             COMPANY|             CITY|             COUNTRY|             PHONE_1|             PHONE_2|               EMAIL|SUBSCRIPTION_DATE|             WEBSITE|          VALID_FROM|           VALID_TO|
+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+-------------------+
|    1|DD37Cf93aecA6Dc|    Sheryl|   Baxter|     Rasmussen Group|     East Leonard|               Chile|        229.077.5154|    397.884.0519x718|zunigavanessa@smi...|       2020-08-24|http://www.stephe...|2024-05-13 21:19:...|9999-12-31 23:59:59|
|    2|1

##### Saving dataframe as 'output'
PySpark has a particular way of saving parquet, delta, and csv files.
Because of this, we need to create a helper function, so that our output is saved as a single csv file.
Do not worry to much about understanding this code.

In [53]:
def save_as_csv(df: DataFrame, file_path: str):
    tmp_folder = file_path + 'tmp'
    
    # Save DataFrame to a temporary folder
    (
        df
        .coalesce(1)  # Ensure a single partition
        .write
        .mode('overwrite')
        .format('csv')
        .option('header', True)
        .save(tmp_folder)
    )
    
    # Find the single partition file
    for file_name in os.listdir(tmp_folder):
        if file_name.endswith('.csv'):
            tmp_file_path = os.path.join(tmp_folder, file_name)
            break
    
    # Move and rename the file to the final destination
    shutil.move(tmp_file_path, file_path)
    
    # Remove the temporary folder
    shutil.rmtree(tmp_folder)

In [54]:
save_as_csv(df=df, file_path='scd_data/target.csv')

#### Starting the SCD1 Proces
Now, we will begin with the implementation of the Slowly Changing Dimensions type 1. We will be implementing the following steps:
1. Change the target dataframe by adding or editing some rows
2. Read-in the target and source dataframe.
3. Select the rows that have changed from the target dataframe
4. Insert the new rows in the source dataframe.

##### Step 1

In [88]:
# Read-in the source dataframe
source_schema = T.StructType([
    T.StructField('INDEX', T.IntegerType(), True), 
    T.StructField('CUSTOMER_ID', T.StringType(), True), 
    T.StructField('FIRST_NAME', T.StringType(), True), 
    T.StructField('LAST_NAME', T.StringType(), True), 
    T.StructField('COMPANY', T.StringType(), True), 
    T.StructField('CITY', T.StringType(), True), 
    T.StructField('COUNTRY', T.StringType(), True), 
    T.StructField('PHONE_1', T.StringType(), True), 
    T.StructField('PHONE_2', T.StringType(), True), 
    T.StructField('EMAIL', T.StringType(), True), 
    T.StructField('SUBSCRIPTION_DATE', T.DateType(), True), 
    T.StructField('WEBSITE', T.StringType(), True)
])
source_df = (
    spark
    .read
    .option('header', True)
    .option('schema', schema)
    .csv('scd_data/source.csv')
)

# Read-in the target dataframe
target_schema = T.StructType([
    T.StructField('INDEX', T.IntegerType(), True), 
    T.StructField('CUSTOMER_ID', T.StringType(), True), 
    T.StructField('FIRST_NAME', T.StringType(), True), 
    T.StructField('LAST_NAME', T.StringType(), True), 
    T.StructField('COMPANY', T.StringType(), True), 
    T.StructField('CITY', T.StringType(), True), 
    T.StructField('COUNTRY', T.StringType(), True), 
    T.StructField('PHONE_1', T.StringType(), True), 
    T.StructField('PHONE_2', T.StringType(), True), 
    T.StructField('EMAIL', T.StringType(), True), 
    T.StructField('SUBSCRIPTION_DATE', T.DateType(), True), 
    T.StructField('WEBSITE', T.StringType(), True),
    T.StructField('VALID_FROM', T.TimestampType(), True), 
    T.StructField('VALID_TO', T.TimestampType(), True), 
])
target_df = (
    spark
    .read
    .option('header', True)
    .option('schema', schema)
    .csv('scd_data/target.csv')
)

In [89]:
source_df.show(5)

+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|INDEX|    CUSTOMER_ID|FIRST_NAME|LAST_NAME|             COMPANY|             CITY|             COUNTRY|             PHONE_1|             PHONE_2|               EMAIL|SUBSCRIPTION_DATE|             WEBSITE|
+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+
|    1|DD37Cf93aecA6Dc|   Sheryll|   Baxter|     Rasmussen Group|     East Leonard|               Chile|        229.077.5154|    397.884.0519x718|zunigavanessa@smi...|       2020-08-24|http://www.stephe...|
|    2|1Ef7b82A4CAAD10|   Preston|   Lozano|         Vega-Gentry|East Jimmychester|            Djibouti|          5153435776|    686-620-1820x944|     vmata@colon.com|     

In [90]:
target_df.show(5)

+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+
|INDEX|    CUSTOMER_ID|FIRST_NAME|LAST_NAME|             COMPANY|             CITY|             COUNTRY|             PHONE_1|             PHONE_2|               EMAIL|SUBSCRIPTION_DATE|             WEBSITE|          VALID_FROM|            VALID_TO|
+-----+---------------+----------+---------+--------------------+-----------------+--------------------+--------------------+--------------------+--------------------+-----------------+--------------------+--------------------+--------------------+
|    1|DD37Cf93aecA6Dc|    Sheryl|   Baxter|     Rasmussen Group|     East Leonard|               Chile|        229.077.5154|    397.884.0519x718|zunigavanessa@smi...|       2020-08-24|http://www.stephe...|2024-05-13T21:19:...|9999-12-31T23:59:...|
|   

##### Step 2 

In [91]:
key_cols = ['COSTUMER_ID']
tech_cols = ['VALID_FROM', 'VALID_TO']
ctc_cols = [col for col in source_df.columns if col not in key_cols + tech_cols]

In [92]:
def add_hash_column(df: DataFrame, columns: list, hash_column_name: str = 'CTC_HASH') -> DataFrame:
    # Add a hash column to the DataFrame based on the specified columns.
    return df.withColumn(hash_column_name, F.sha2(F.concat_ws('||', *columns), 256))

In [93]:
def get_changed_records(source_df: DataFrame, target_df: DataFrame, ctc_cols: list):
    # Add hash columns based on the specified columns
    source_df = add_hash_column(source_df, ctc_cols)
    target_df = add_hash_column(target_df, ctc_cols)
    
    # Perform an inner join on CUSTOMER_ID
    joined_df = source_df.alias('src').join(target_df.alias('tgt'), on='CUSTOMER_ID', how='inner')
    
    # Filter for rows where hash values are different
    changed_df = joined_df.filter(F.col('src.CTC_HASH') != F.col('tgt.CTC_HASH')).select('src.*')
    
    return changed_df

In [94]:
update_df = get_changed_records(source_df, target_df, ctc_cols)

In [95]:
update_df.show(5)

+---------------+-----+----------+---------+---------------+------------+-------+------------+----------------+--------------------+-----------------+--------------------+--------------------+
|    CUSTOMER_ID|INDEX|FIRST_NAME|LAST_NAME|        COMPANY|        CITY|COUNTRY|     PHONE_1|         PHONE_2|               EMAIL|SUBSCRIPTION_DATE|             WEBSITE|            CTC_HASH|
+---------------+-----+----------+---------+---------------+------------+-------+------------+----------------+--------------------+-----------------+--------------------+--------------------+
|DD37Cf93aecA6Dc|    1|   Sheryll|   Baxter|Rasmussen Group|East Leonard|  Chile|229.077.5154|397.884.0519x718|zunigavanessa@smi...|       2020-08-24|http://www.stephe...|e304136e0ed441ef5...|
+---------------+-----+----------+---------+---------------+------------+-------+------------+----------------+--------------------+-----------------+--------------------+--------------------+

