In [131]:
%iam_role arn:aws:iam::331504768406:role/service-role/AWSGlueServiceRole
%region us-east-1
%idle_timeout 5
%glue_version 4.0
%worker_type G.1X
%number_of_workers 2

Current iam_role is arn:aws:iam::331504768406:role/service-role/AWSGlueServiceRole
iam_role has been set to arn:aws:iam::331504768406:role/service-role/AWSGlueServiceRole.
Previous region: us-east-1
Setting new region to: us-east-1
Region is set to: us-east-1
Current idle_timeout is 5 minutes.
idle_timeout has been set to 5 minutes.
Setting Glue version to: 4.0
Previous worker type: G.1X
Setting new worker type to: G.1X
Previous number of workers: 2
Setting new number of workers to: 2


In [133]:
%%configure -f
{
    "conf": "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "datalake-formats":"delta",
    #"additional-python-modules": "awswrangler",
    'enable-auto-scaling': 'false',
    'JOB_NAME': 'glue-test-job',
    'start_date': '2024-01-24'
}

The following configurations have been updated: {'conf': 'spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog', 'datalake-formats': 'delta', 'enable-auto-scaling': 'false', 'JOB_NAME': 'glue-test-job', 'start_date': '2024-01-24'}


In [1]:
import sys
from pyspark.context import SparkContext
from pyspark.sql import functions as sqlFunc
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue import DynamicFrame
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from datetime import datetime, date, timedelta



Trying to create a Glue session for the kernel.
Session Type: etl
Worker Type: G.1X
Number of Workers: 2
Session ID: aa4ec36e-e95b-450a-98d1-9c5a970fcdb0
Applying the following default arguments:
--glue_kernel_version 1.0.4
--enable-glue-datacatalog true
--conf spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog
--datalake-formats delta
--enable-auto-scaling false
--JOB_NAME glue-test-job
--start_date 2024-01-24
Waiting for session aa4ec36e-e95b-450a-98d1-9c5a970fcdb0 to get into ready status...
Session aa4ec36e-e95b-450a-98d1-9c5a970fcdb0 has been created.



In [2]:
# apply groupBy and aggregate functions in a DyF
def sparkAggregate(glueContext, parentFrame, groups, aggs, transformation_ctx) -> DynamicFrame:
    """Group a DynamicFrame and apply Spark SQL aggregate functions to it.

    Args:
        glueContext: GlueContext used to build the resulting DynamicFrame.
        parentFrame: DynamicFrame to aggregate.
        groups: Column names to group by. An empty sequence aggregates the
            whole frame without grouping.
        aggs: Iterable of ``(column, func)`` pairs, where ``func`` is the name
            of a function in ``pyspark.sql.functions`` (resolved with
            ``getattr``) and ``column`` is the column it is applied to.
        transformation_ctx: Name passed through to ``DynamicFrame.fromDF``.

    Returns:
        DynamicFrame with the grouping columns (if any) followed by one column
        per aggregate, each named ``<func>_<column>``.
    """
    # Name every aggregate on the expression itself. Renaming result columns by
    # position afterwards is wrong whenever `groups` is non-empty: the grouping
    # columns come first in the result, so they were the ones being renamed
    # while the aggregates kept their default names.
    aggsFuncs = [
        getattr(sqlFunc, func)(column).alias(func + '_' + column)
        for column, func in aggs
    ]

    parent_df = parentFrame.toDF()
    result_df = (
        parent_df.groupBy(*groups).agg(*aggsFuncs)
        if len(groups) > 0
        else parent_df.agg(*aggsFuncs)
    )

    return DynamicFrame.fromDF(result_df, glueContext, transformation_ctx)





In [3]:

# Resolve the job arguments this notebook needs from the session's argv.
required_args = ["JOB_NAME", "start_date"]
args = getResolvedOptions(sys.argv, required_args)

# Reuse the session's SparkContext (or create one) and wrap it for Glue.
spark_context = SparkContext.getOrCreate()
glueContext = GlueContext(spark_context)
spark = glueContext.spark_session

# Initialise the Glue job with its name and the resolved arguments.
job = Job(glueContext)
job.init(args["JOB_NAME"], args)





In [4]:
# --- Run parameters -----------------------------------------------------------
start_date  = args['start_date']
days_ago    = 1
bucket_name = 'jamil-datalake-dev'
catalog     = "glue-catalog"
source_db   = 'insurance_db'
target_db   = "insurance_prd"
source_table= "policy"
target_table= "policy_prod"
primary_keys= ["policy_id"]
partitions  = ["year", "month", "day"]
target_path = f"s3://{bucket_name}/{catalog}/{target_db}/{target_table}/"

if start_date == 'cron' or not start_date:
    # Scheduled run (or no date given): process the day `days_ago` days back.
    start_date = (date.today() - timedelta(days=days_ago)).strftime('%Y-%m-%d')
else:
    # Fail fast on a malformed date (strptime raises ValueError) and normalise
    # to zero-padded YYYY-MM-DD, so the fixed-position slices below cannot
    # silently pick the wrong characters (e.g. for '2024-1-5').
    start_date = datetime.strptime(start_date, '%Y-%m-%d').strftime('%Y-%m-%d')

# Partition values for the run date, as zero-padded strings.
part_year = start_date[:4]
part_month= start_date[5:7]
part_day  = start_date[8:]

# True when the target table is already registered in the target database.
target_exists = target_table in [tb.name for tb in spark.catalog.listTables(dbName=target_db)]





In [5]:


# Read the source table through the Glue Data Catalog.
source_df = glueContext.create_data_frame.from_catalog(
    database=source_db,
    table_name=source_table,
)

source_dyf = DynamicFrame.fromDF(source_df, glueContext, "source_dyf")

if target_exists:
    # Incremental run: keep only the partition of the run date.
    # The predicate is expressed on the Spark DataFrame instead of a Python
    # lambda in DynamicFrame.filter, so Spark evaluates it (and may push it
    # down) rather than calling back into Python for every record. It reuses
    # the part_* values already derived from start_date.
    partition_filter = (
        (sqlFunc.col(partitions[0]) == part_year)
        & (sqlFunc.col(partitions[1]) == part_month)
        & (sqlFunc.col(partitions[2]) == part_day)
    )
    src_dyf = DynamicFrame.fromDF(
        source_df.where(partition_filter),
        glueContext,
        "src_dyf",
    )
else:
    # Initial load: collect the distinct (year, month, day) combinations
    # present in the source and join the source back onto them.
    ymd_dyf = sparkAggregate(
        glueContext=glueContext,
        parentFrame=source_dyf,
        groups=['year', 'month', 'day'],
        aggs=[['year_month_day', 'first']],
        transformation_ctx='ymd_dyf',
    )

    src_dyf = source_dyf.join(
        frame2=ymd_dyf,
        paths1=['year', 'month', 'day'],
        paths2=['year', 'month', 'day'],
    )

# TODO:
## src_dyf deduplication

# Columns kept for the target table, in output order; each one is mapped to
# itself with an unchanged type.
policy_columns = [
    ("policy_id", "long"),
    ("expiry_date", "date"),
    ("location_name", "string"),
    ("state_code", "string"),
    ("region_name", "string"),
    ("insured_value", "double"),
    ("business_type", "string"),
    ("flood", "string"),
    ("file_name", "string"),
    ("year_month_day", "string"),
    ("year", "string"),
    ("month", "string"),
    ("day", "string"),
]

src_dyf = ApplyMapping.apply(
    frame=src_dyf,
    mappings=[(name, dtype, name, dtype) for name, dtype in policy_columns],
)





In [6]:

## check whether the target table exists
## if it does not, skip the full read of the target

src_df = src_dyf.toDF()

if not target_exists:
    # First load: the incoming rows are the whole table.
    target_df = src_df
else:
    existing_df = (glueContext.create_dynamic_frame
                              .from_catalog(database  = target_db,
                                            table_name= target_table
                                            )
                  ).toDF()

    same_policy     = sqlFunc.col("target.policy_id") == sqlFunc.col("src.policy_id")
    target_is_newer = sqlFunc.col("target.expiry_date") > sqlFunc.col("src.expiry_date")

    # Stored rows to keep: policies with no incoming row, or whose stored row
    # has a later expiry_date than the incoming one.
    not_match_df = (existing_df.alias('target')
                               .join(src_df.alias('src'), same_policy, 'left')
                               .where('src.policy_id is null or target.expiry_date > src.expiry_date')
                               .select('target.*')
                   )

    # Incoming rows to apply: drop the ones that lose to a newer stored row.
    # Without this, such a policy_id would appear twice in the result (once
    # from the stored row kept above and once from the incoming row).
    winning_src_df = (src_df.alias('src')
                            .join(existing_df.alias('target'),
                                  same_policy & target_is_newer,
                                  'left_anti')
                     )

    # unionByName matches columns by name, so a different column order between
    # the catalog table and the mapped source cannot silently misalign values.
    target_df = not_match_df.unionByName(winning_src_df)

            
    




In [9]:

# When the target table already exists, target_df is lazily derived from that
# table's own files. A plain count() computes the rows but does not keep them,
# so purging first and writing afterwards would make the write re-read files
# that were just deleted. Materialise the result and cut its lineage before
# touching the existing data.
# NOTE: local checkpoints live on the executors; they protect against
# recomputation from the purged files, not against executor loss.
if target_exists:
    target_snapshot_df = target_df.localCheckpoint(eager=True)
else:
    target_snapshot_df = target_df

target_dyf = DynamicFrame.fromDF(target_snapshot_df, glueContext, 'target_dyf')

# delete existing data
# Only purge when there is a table to purge (the first run has none, and
# purging a missing table fails) and there is data to replace it with.
if target_exists and target_snapshot_df.count() > 0:
    glueContext.purge_table(target_db,
                            target_table,
                            {"retentionPeriod": 0,
                             "manifestFilePath": target_path + "manifest/"},
                            'target_dyf'
                            )





In [10]:
# Catalog-update settings handed to the sink below.
additional_options = {
    "enableUpdateCatalog": True,
    "updateBehavior": "UPDATE_IN_DATABASE",
}

# S3 sink partitioned by the configured partition columns, snappy-compressed.
s3sink = glueContext.getSink(
    connection_type="s3",
    path=target_path,
    partitionKeys=partitions,
    compression="snappy",
    **additional_options,
)

# Register the output under the target database/table in the Data Catalog.
s3sink.setCatalogInfo(catalogDatabase=target_db, catalogTableName=target_table)
s3sink.setFormat("glueparquet", useGlueParquetWriter=True)

# Write the merged frame to the sink.
final_dyf = s3sink.writeFrame(target_dyf)






In [11]:

# Commit the Glue job that was initialised with job.init() earlier in the notebook.
job.commit()




In [136]:
%stop_session

Stopping session: aa4ec36e-e95b-450a-98d1-9c5a970fcdb0
Stopped session.
