In [None]:
%iam_role arn:aws:iam::331504768406:role/service-role/AWSGlueServiceRole
%region us-east-1
%idle_timeout 5
%glue_version 4.0
%worker_type G.1X
%number_of_workers 2

In [None]:
%%configure -f
{
    "conf": "spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension --conf spark.sql.catalog.spark_catalog=org.apache.spark.sql.delta.catalog.DeltaCatalog",
    "datalake-formats":"delta",
    "enable-auto-scaling": "false",
    "JOB_NAME": "glue-test-job",
    "start_date": "cron"
}

In [None]:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from awsglue import DynamicFrame
from datetime import datetime, date, timedelta



In [None]:

# Resolve the job parameters from the session arguments, then bootstrap the
# Glue/Spark contexts and the Job object used by the rest of the notebook.
args = getResolvedOptions(sys.argv, ["JOB_NAME", "start_date"])

spark_context = SparkContext.getOrCreate()
glueContext = GlueContext(spark_context)
spark = glueContext.spark_session

job = Job(glueContext)
job.init(args["JOB_NAME"], args)


In [None]:
# Run parameters and target location for the policy load.
start_date  = args['start_date']        # 'cron' or empty => derive the date from today
days_ago    = 2                         # lag (in days) applied when the date is derived
bucket_name = 'jamil-datalake-dev'
catalog     = "glue-catalog"
db_src      = 'insurance_db'            # catalog database the policy table is read from
db_name     = "insurance_prd"           # catalog database the sink registers the table in
table_name  = "policy"
partitions  = ["year", "month", "day"]
target_path = f"s3://{bucket_name}/{catalog}/{db_name}/{table_name}/"

if start_date == 'cron' or not start_date:
    run_date = date.today() - timedelta(days=days_ago)
else:
    # Fail fast on malformed input. The rest of the notebook slices start_date
    # positionally ([:4], [5:7], [8:]), so a value such as '2024-1-5' or
    # '2024-01-05T00:00' would otherwise silently select the wrong partition.
    # strptime raises ValueError when the value is not a year-month-day date.
    run_date = datetime.strptime(start_date, '%Y-%m-%d').date()

# Canonical zero-padded YYYY-MM-DD form that the positional slices rely on.
start_date = run_date.strftime('%Y-%m-%d')

# Show the year / month / day partition values that will be processed.
print(start_date[:4])
print(start_date[5:7])
print(start_date[8:])

In [None]:

# Load the source policy table from the Glue Data Catalog as a Spark DataFrame.
policy_df = glueContext.create_data_frame.from_catalog(
    database=db_src,
    table_name=table_name,
)

# Restrict to the run date's partition while the data is still a Spark
# DataFrame, so the predicate is evaluated by Spark instead of only by a
# per-record Python callback after the whole table has been converted to a
# DynamicFrame.
# NOTE(review): assumes year/month/day are string-typed columns holding
# zero-padded values, matching the string slices of start_date - confirm
# against the catalog schema.
run_partition_df = policy_df.filter(
    (policy_df["year"] == start_date[:4])
    & (policy_df["month"] == start_date[5:7])
    & (policy_df["day"] == start_date[8:])
)

# Wrap the selected rows in a Glue DynamicFrame for the Glue transforms below.
_dyf = DynamicFrame.fromDF(
    dataframe=run_partition_df,
    glue_ctx=glueContext,
    name="_dyf",
)


In [None]:
# Keep only the records whose year/month/day fields match the run date.
target_year, target_month, target_day = start_date[:4], start_date[5:7], start_date[8:]


def _in_target_partition(record):
    """Return True when the record's year, month and day equal the run date's."""
    if record['year'] != target_year:
        return False
    if record['month'] != target_month:
        return False
    return record['day'] == target_day


_dyf = _dyf.filter(f=_in_target_partition)


In [None]:

# Columns carried through to the target table as (name, type) pairs. Every
# column keeps both its name and its type, so the source and target halves of
# each ApplyMapping entry are identical.
policy_columns = [
    ("policy_id", "long"),
    ("expiry_date", "date"),
    ("location_name", "string"),
    ("state_code", "string"),
    ("region_name", "string"),
    ("insured_value", "double"),
    ("business_type", "string"),
    ("flood", "string"),
    ("file_name", "string"),
    ("year_month_day", "string"),
    ("year", "string"),
    ("month", "string"),
    ("day", "string"),
]

chg_dyf = ApplyMapping.apply(
    frame=_dyf,
    mappings=[(name, dtype, name, dtype) for name, dtype in policy_columns],
)

# Print the resulting schema for a visual check.
chg_dyf.printSchema()


In [None]:
# Catalog-update settings for the sink. They are passed to getSink below
# rather than being repeated there as separate keyword arguments, so there is
# a single place to change them.
additional_options = {
    "enableUpdateCatalog": True,
    "updateBehavior": "UPDATE_IN_DATABASE",
}

# S3 sink writing snappy-compressed output under target_path, partitioned by
# the `partitions` list from the configuration cell (year / month / day).
s3sink = glueContext.getSink(
    connection_type="s3",
    path=target_path,
    partitionKeys=partitions,
    compression="snappy",
    **additional_options,
)

# Catalog database/table the sink is associated with.
s3sink.setCatalogInfo(
    catalogDatabase=db_name,
    catalogTableName=table_name,
)

s3sink.setFormat("glueparquet", useGlueParquetWriter=True)

# Write the mapped frame through the sink.
final_dyf = s3sink.writeFrame(chg_dyf)



In [None]:

# Commit the Glue job that was initialised with job.init(...) earlier in the notebook.
job.commit()

In [None]:
%stop_session