In [None]:
from awsglue.context import GlueContext
from awsglue.dynamicframe import DynamicFrame
from awsglue.job import Job
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, current_date

# Build (or reuse) the Spark session named "CDCLoad", wrap its SparkContext
# in a GlueContext, and create the Glue Job handle committed at the end.
spark = (
    SparkSession.builder
    .appName("CDCLoad")
    .getOrCreate()
)
glueContext = GlueContext(spark.sparkContext)
job = Job(glueContext)

# Read the source policy table from the Glue Data Catalog as a DynamicFrame,
# then convert it to a Spark DataFrame.
source_table = glueContext.create_dynamic_frame.from_catalog(
    database="insurance_db",
    table_name="tb_policy",
)
source_df = source_table.toDF()

# Identify new partitions with update_date greater than load_date.
#
# load_date is the high-water mark of the previous run: the latest load_date
# already stored in the target table.  MAX() over an empty table returns NULL,
# which is collected here as None.
load_date = (
    spark
    .sql("""SELECT MAX(load_date) as max_load_date 
                        FROM insurance_prd.policy_prd""")
    .collect()[0]["max_load_date"]
)

# Comparing a column with NULL evaluates to NULL, and filter() drops such
# rows, so with an empty target nothing would ever be selected.  When there
# is no previous load, treat every source partition as new instead.
changed_rows = (
    source_df
    if load_date is None
    else source_df.filter(col("update_date") > load_date)
)

# Distinct (year, month, day) partitions that contain at least one changed row.
source_partitions = (
    changed_rows
    .select(col("year"), col("month"), col("day"))
    .distinct()
)

# Load only new partitions into the target.
# head(1) returns a (possibly empty) list of rows, so it doubles as an
# emptiness check that does not depend on DataFrame.isEmpty() (Spark >= 3.3).
if source_partitions.head(1):
    # Every source row that lives in a changed partition, stamped with today's
    # date as its load date.
    new_data = (
        source_df
        .join(source_partitions, on=["year", "month", "day"], how="inner")
        .withColumn("load_date", current_date())  # Assuming the load date is the current date
    )

    # Perform merge/upsert
    target_table = glueContext.create_dynamic_frame.from_catalog(
        database="insurance_prd",
        table_name="policy_prd",
    )
    target_df = target_table.toDF()

    # Columns that identify one logical row for the upsert.
    merge_keys = ["policy_id", "year", "month", "day"]

    if target_df.columns:
        # Remove target rows that are superseded by an incoming row first.
        # Relying on dropDuplicates() alone after the union keeps an arbitrary
        # row per key, so a stale target row could win over the fresh one.
        incoming_keys = new_data.select(*merge_keys).distinct()
        retained_df = target_df.join(incoming_keys, on=merge_keys, how="left_anti")

        # unionByName matches columns by name; a positional union() would
        # silently misalign values when the two sides order columns
        # differently (load_date is appended last on the source side).
        combined_df = retained_df.unionByName(new_data)
    else:
        # NOTE(review): presumably an empty target yields a DataFrame with no
        # columns (DynamicFrame schemas are inferred from data) -- confirm.
        # There is nothing to merge with in that case.
        combined_df = new_data

    merged_df = (
        combined_df
        .dropDuplicates(merge_keys)  # still collapses duplicates within one side
        .coalesce(1)
    )

    # Write the merged data back to the target table
    additionalOptions = {
        "enableUpdateCatalog": True,
        "updateBehavior": "UPDATE_IN_DATABASE",
        "partitionKeys": ["year", "month", "day"],
    }

    # NOTE(review): this writes the full merged set (retained target rows plus
    # new rows) to the table it was read from.  Glue catalog sinks are
    # understood to append rather than overwrite -- confirm the target
    # location is purged beforehand, otherwise retained rows are stored twice.
    #
    # The Glue writer accepts DynamicFrames only, so convert the DataFrame
    # first and pass the catalog-update options that were previously unused.
    merged_frame = DynamicFrame.fromDF(merged_df, glueContext, "merged_df")
    glueContext.write_dynamic_frame.from_catalog(
        frame=merged_frame,
        database="insurance_prd",
        table_name="policy_prd",
        additional_options=additionalOptions,
    )

# Commit the job
job.commit()
