# Slowly Changing Dimensions - Type 1

In [None]:
import os
import shutil
from datetime import datetime

In [None]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, DataFrame
import pyspark.sql.types as T
import pyspark.sql.functions as F

In [None]:
# Build (or reuse) the session through the builder instead of calling
# SparkContext() directly: a bare SparkContext() raises when a context already
# exists, which happens as soon as this cell is executed a second time.
spark = SparkSession.builder.appName('scd1').getOrCreate()

# Keep `sc` available for later cells; it is the context owned by the session.
sc = spark.sparkContext

## 1. Preparation steps
In the following cells we will perform the following steps:
1. Read in our source dataframe
2. Save the source dataframe as our 'target' dataframe (initial load)

### Read-in dataframe

In [None]:
# Define schema: every customer attribute is read as a nullable string
schema = T.StructType([
    T.StructField(column_name, T.StringType(), True)
    for column_name in (
        'CUSTOMER_ID',
        'FIRST_NAME',
        'LAST_NAME',
        'CITY',
        'COUNTRY',
        'PHONE',
        'EMAIL',
    )
])

# Read-in dataframe.
# The schema must be passed through .schema(); .option('schema', ...) is not a
# CSV reader option, so the schema would silently not be applied.
df = (
    spark
    .read
    .option('header', True)
    .schema(schema)
    .csv('scd1_data/source.csv')
)

# Show dataframe
df.show(5)

### Saving dataframe as 'output'
PySpark has a particular way of saving parquet, delta, and csv files.
Because of this, we need to create a helper function, so that our output is saved as a single csv file.
Do not worry too much about understanding this code.

In [None]:
def save_as_csv(df: DataFrame, file_path: str):
    """Write ``df`` to ``file_path`` as one CSV file with a header row.

    Spark's CSV writer produces a folder of part files. This helper writes a
    single partition into a temporary folder next to ``file_path``
    (``file_path + 'tmp'``), moves the resulting part file to ``file_path``
    and removes the temporary folder again.

    Args:
        df: DataFrame to write.
        file_path: Destination path of the final CSV file.

    Raises:
        FileNotFoundError: If Spark did not produce any ``.csv`` part file.
    """
    tmp_folder = file_path + 'tmp'

    try:
        # Save DataFrame to a temporary folder
        (
            df
            .coalesce(1)  # Ensure a single partition
            .write
            .mode('overwrite')
            .format('csv')
            .option('header', True)
            .save(tmp_folder)
        )

        # Find the single partition file; fail with a clear message instead of
        # an UnboundLocalError when the writer produced no CSV part file.
        part_file_name = next(
            (name for name in os.listdir(tmp_folder) if name.endswith('.csv')),
            None,
        )
        if part_file_name is None:
            raise FileNotFoundError(
                f"No CSV part file was written to '{tmp_folder}'"
            )

        # Move and rename the file to the final destination
        shutil.move(os.path.join(tmp_folder, part_file_name), file_path)
    finally:
        # Remove the temporary folder, also when writing or moving failed
        shutil.rmtree(tmp_folder, ignore_errors=True)

In [None]:
# Initial load: persist the freshly read dataframe as the target file
save_as_csv(df, 'scd1_data/target.csv')

## 2. Starting the SCD1 Process
Now, we will begin with the implementation of Slowly Changing Dimensions type 1. We will be implementing the following steps:
1. Change the source data by adding or editing some rows.
2. Read in the target and source dataframe.
3. Select the rows in the source dataframe that are new.
4. Select the rows that have been deleted from the source dataframe.
5. Select the rows in the source dataframe that are updated.
6. Insert, update, and/or delete the selected rows in the target dataframe.

### Step 1: Change the source data by adding or editing some rows

* Make some alterations to the source data.
* You can find this file under notebooks/scd1_data/source.csv

### Step 2: Read-in the target and source dataframe.

In [None]:
# Read-in the source dataframe
source_schema = T.StructType([
    T.StructField(column_name, T.StringType(), True)
    for column_name in (
        'CUSTOMER_ID',
        'FIRST_NAME',
        'LAST_NAME',
        'CITY',
        'COUNTRY',
        'PHONE',
        'EMAIL',
    )
])
# Apply the schema defined for this read via .schema(); .option('schema', ...)
# is not a CSV reader option and would leave the schema unused.
source_df = (
    spark
    .read
    .option('header', True)
    .schema(source_schema)
    .csv('scd1_data/source.csv')
)

# Read-in the target dataframe
target_schema = T.StructType([
    T.StructField(column_name, T.StringType(), True)
    for column_name in (
        'CUSTOMER_ID',
        'FIRST_NAME',
        'LAST_NAME',
        'CITY',
        'COUNTRY',
        'PHONE',
        'EMAIL',
    )
])
target_df = (
    spark
    .read
    .option('header', True)
    .schema(target_schema)
    .csv('scd1_data/target.csv')
)

In [None]:
# Preview the first rows of the source dataframe
source_df.show(n=5)

In [None]:
# Preview the first rows of the target dataframe
target_df.show(n=5)

### Step 3: Select the rows in source dataframe that are new.

In [None]:
def get_inserts(source_df: DataFrame, target_df: DataFrame):
    """Return the rows of ``source_df`` whose CUSTOMER_ID is not in ``target_df``.

    Args:
        source_df: DataFrame holding the current state of the source data.
        target_df: DataFrame holding the previously loaded target data.

    Returns:
        DataFrame with the columns of ``source_df`` containing only the rows
        that have to be inserted into the target.
    """
    # Find rows in source_df that are not present in target_df.
    # A left anti join keeps only the left-hand rows without a match on the key.
    insert_df = source_df.join(target_df, on='CUSTOMER_ID', how='leftanti')

    return insert_df

In [None]:
# Rows that exist in the source but not yet in the target
insert_df = get_inserts(source_df=source_df, target_df=target_df)

In [None]:
# Preview the first rows selected for insertion
insert_df.show(n=5)

### Step 4: Select the rows that have been deleted from the source dataframe.

In [None]:
def get_deletes(source_df: DataFrame, target_df: DataFrame):
    """Return the rows of ``target_df`` whose CUSTOMER_ID is not in ``source_df``.

    Args:
        source_df: DataFrame holding the current state of the source data.
        target_df: DataFrame holding the previously loaded target data.

    Returns:
        DataFrame with the columns of ``target_df`` containing only the rows
        that no longer exist in the source and have to be deleted.
    """
    # Find rows in target_df that are not present in source_df.
    # A left anti join keeps only the left-hand rows without a match on the key.
    delete_df = target_df.join(source_df, on='CUSTOMER_ID', how='leftanti')

    return delete_df

In [None]:
# Rows that exist in the target but no longer in the source
delete_df = get_deletes(source_df=source_df, target_df=target_df)
# Extra select to keep the column order of the source dataframe
delete_df = delete_df.select(source_df.columns)

In [None]:
# Preview the first rows selected for deletion
delete_df.show(n=5)

### Step 5: Select the rows in source dataframe that are updated.

In [None]:
def add_hash_column(df: DataFrame, columns: list, hash_column_name: str = 'CTC_HASH') -> DataFrame:
    """Add a SHA-256 hash column to ``df`` computed over ``columns``.

    The column values are cast to string, joined with ``'||'`` and hashed.
    NULL values are replaced by a fixed marker before joining, because
    ``concat_ws`` skips NULLs: without the marker, a value moving from one
    nullable column to another would produce the same hash and the change
    would go undetected.

    Args:
        df: DataFrame to extend.
        columns: Column names (or Column expressions) to include in the hash.
        hash_column_name: Name of the hash column that is added.

    Returns:
        ``df`` with the additional hash column.
    """
    # NOTE: a real value equal to this marker is indistinguishable from NULL.
    null_marker = F.lit('<NULL>')

    hash_inputs = [
        F.coalesce(
            (F.col(column) if isinstance(column, str) else column).cast('string'),
            null_marker,
        )
        for column in columns
    ]

    return df.withColumn(hash_column_name, F.sha2(F.concat_ws('||', *hash_inputs), 256))

def get_updates(source_df: DataFrame, target_df: DataFrame, ctc_cols: list):
    """Return the rows of ``source_df`` that differ from their target counterpart.

    Rows are matched on CUSTOMER_ID. A row counts as updated when the hash over
    ``ctc_cols`` differs between the source and the target version.

    Args:
        source_df: DataFrame holding the current state of the source data.
        target_df: DataFrame holding the previously loaded target data.
        ctc_cols: Columns to compare (the columns included in the hash).

    Returns:
        DataFrame with the columns of ``source_df`` containing only the rows
        whose compared columns changed.
    """
    # Add hash columns based on the specified columns
    # (uses the default hash column name 'CTC_HASH', which the filter below relies on)
    source_df_hash = add_hash_column(source_df, ctc_cols)
    target_df_hash = add_hash_column(target_df, ctc_cols)

    # Find corresponding rows between source_df and target_df.
    # Both sides are aliased so the identically named columns can be told apart
    # as 'src.<column>' and 'tgt.<column>' after the join.
    overlap_df = (
        source_df_hash.alias('src')
        .join(
            target_df_hash.alias('tgt'),
            on=F.col('src.CUSTOMER_ID') == F.col('tgt.CUSTOMER_ID'),
            how='inner',
        )
    )

    # Apply filter to get rows where hash values are different
    update_df = (
        overlap_df
        .filter(F.col('src.CTC_HASH') != F.col('tgt.CTC_HASH'))
        .select('src.*')
        .drop('CTC_HASH')
    )

    return update_df

In [None]:
# Key column(s) identifying a customer; fixed the 'COSTUMER_ID' typo, which
# meant the key column was never excluded from the compared columns below.
key_cols = ['CUSTOMER_ID']

# Columns to compare for changes: every source column that is not a key
ctc_cols = [column_name for column_name in source_df.columns if column_name not in key_cols]

In [None]:
# Rows present in both dataframes whose compared columns changed
update_df = get_updates(source_df=source_df, target_df=target_df, ctc_cols=ctc_cols)

In [None]:
# Preview the first rows selected for update
update_df.show(n=5)

### Step 6: Insert, update, and/or delete the selected rows in the target dataframe.

In [None]:
# Update the target DataFrame
def update_target_df(target_df: DataFrame, insert_df: DataFrame, delete_df: DataFrame, update_df: DataFrame) -> DataFrame:
    """Apply deletes, inserts and updates to ``target_df`` (SCD type 1).

    Rows are identified by CUSTOMER_ID. Updated rows overwrite the existing
    target rows; no history is kept.

    Args:
        target_df: DataFrame holding the previously loaded target data.
        insert_df: Rows to add to the target.
        delete_df: Rows to remove from the target.
        update_df: New versions of rows that already exist in the target.

    Returns:
        The updated target DataFrame.
    """
    # Step 1: Delete rows in target_df that are present in delete_df
    target_df = target_df.join(delete_df, on='CUSTOMER_ID', how='leftanti')

    # Step 2: Insert new rows from insert_df to target_df.
    # unionByName matches columns by name; a positional union would silently
    # mix up values if the dataframes list their columns in a different order.
    target_df = target_df.unionByName(insert_df)

    # Step 3: Update existing rows in target_df with rows from update_df
    # (drop the outdated versions first, then append the new ones)
    target_df = target_df.join(update_df, on='CUSTOMER_ID', how='leftanti')
    target_df = target_df.unionByName(update_df)

    return target_df

In [None]:
# Apply the selected deletes, inserts and updates to the target dataframe
target_df = update_target_df(target_df, insert_df, delete_df, update_df)
# Bring the columns back into the order defined by the target schema
target_df = target_df.select(target_schema.fieldNames())

In [None]:
# Preview the first rows of the updated target dataframe
target_df.show(n=5)

#### Write to file

In [None]:
# Persist the updated target dataframe, replacing the previous target file
save_as_csv(target_df, 'scd1_data/target.csv')