In [0]:
import os
import logging
import yaml
from datetime import datetime
from pyspark.sql.functions import current_timestamp

# Configure the root logger once for the whole notebook: emit records at INFO
# level and above, formatted as "time-logger-level-message".
logging.basicConfig(
    format="%(asctime)s-%(name)s-%(levelname)s-%(message)s",
    level=logging.INFO,
)

def safe_load_yaml(file_path):
    """Load a YAML configuration file with ``yaml.safe_load``.

    Args:
        file_path: Path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.

    Raises:
        FileNotFoundError: If ``file_path`` does not point to an existing file.
        Exception: Any error raised while opening or parsing the file is
            logged and re-raised unchanged.
    """
    # Guard clause kept outside the try block: the missing-file case is logged
    # exactly once here instead of being caught and logged a second time by
    # the generic handler below.
    if not os.path.isfile(file_path):
        logging.error(f"Configuration file not found: {file_path}")
        raise FileNotFoundError(f"Missing configuration file: {file_path}")

    try:
        # Explicit encoding so the result does not depend on the platform's
        # default locale.
        with open(file_path, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except Exception as e:
        logging.error(f"Error loading YAML file: {e}")
        raise

def get_s3_data(file_path,file_format):
    """Read a dataset through the ambient ``spark`` session.

    Args:
        file_path: Location of the data, passed straight to ``load``.
        file_format: Format name handed to ``spark.read.format``. For
            ``'csv'`` the file is read with a header row and with schema
            inference disabled; every other format is loaded with the
            reader's defaults.

    Returns:
        The object returned by ``spark.read.format(...).load(...)``.

    Raises:
        RuntimeError: If anything goes wrong while reading. The original
            exception is attached as ``__cause__``.
    """
    try:
        reader = spark.read.format(file_format)
        if file_format == 'csv':
            return reader.load(file_path, header=True, inferschema=False)
        return reader.load(file_path)
    except Exception as e:
        # Chain explicitly so the traceback shows the underlying failure as
        # the direct cause rather than as an error raised during handling.
        raise RuntimeError(f"Error in reading the file from s3 at {file_path} : ,{e}") from e


# Load the YAML configuration, validate every value the job depends on, read
# the customers CSV and MERGE it into the configured target table.


def _require_str(config, keys, error_message):
    """Return the string found by following ``keys`` through ``config``.

    Args:
        config: Parsed YAML content (expected to be nested dicts).
        keys: Sequence of keys describing the path to the wanted value.
        error_message: Message of the ``ValueError`` raised on failure.

    Raises:
        ValueError: If a key on the path is missing, an intermediate value is
            not a dict (e.g. the YAML file was empty), or the final value is
            not a string.
    """
    value = config
    for key in keys:
        if not isinstance(value, dict) or key not in value:
            raise ValueError(error_message)
        value = value[key]
    if not isinstance(value, str):
        raise ValueError(error_message)
    return value


#load yaml config
global_config=safe_load_yaml('/Workspace/Users/hritikraj143@gmail.com/Retail-Analytics/Config/global_config.yaml')
logging.info('global config loaded successfully')
# A missing key now produces the intended ValueError instead of a KeyError
# raised by the lookup before the type check could run.
s3_customers_file_path=_require_str(
    global_config, ['customers_file_path'], 'customers file path not found in config'
)
catalog=_require_str(global_config, ['catalog'], 'catalog not found in config')

bronze_config=safe_load_yaml('/Workspace/Users/hritikraj143@gmail.com/Retail-Analytics/Config/bronze_config.yaml')
logging.info('bronze config loaded successfully')
target_schema=_require_str(
    bronze_config,
    ['bronze', 'customers_landing', 'target_schema'],
    'target_schema not found in config',
)
target_table=_require_str(
    bronze_config,
    ['bronze', 'customers_landing', 'target_table'],
    'target_table not found in config',
)

# Read the source file and stamp every row with the load time.
customers_df=get_s3_data(s3_customers_file_path,'csv')
logging.info('customers.csv from s3 is loaded successfully into the memory')
customers_df = customers_df.withColumn("UpdatedAt", current_timestamp())
logging.info('UpdatedAt column added with current timestamp')

# Expose the DataFrame to SQL so it can be used as the MERGE source.
customers_df.createOrReplaceTempView("customers_updates")

# Upsert keyed on CustomerID: matching rows are overwritten with all source
# columns, unmatched rows are inserted.
# NOTE(review): presumably CustomerID is unique within the CSV; a MERGE
# typically fails when several source rows match one target row - confirm.
update_table_query=f"""MERGE INTO {catalog}.{target_schema}.{target_table} as trg
using customers_updates as src
ON trg.CustomerID = src.CustomerID
WHEN MATCHED THEN
UPDATE SET *
WHEN NOT MATCHED THEN
INSERT *
"""
try:
    spark.sql(update_table_query)
    logging.info(f"query to update the {target_table} ran successfully")
except Exception as e:
    # Chain explicitly so the Spark error is reported as the direct cause.
    raise RuntimeError(f"failed to run the query : {update_table_query}, {e}") from e