# In [0]:
import os
import logging
import yaml
from datetime import datetime
from pyspark.sql.functions import(col, coalesce,lit,trim,regexp_replace,to_timestamp,when,from_unixtime,current_timestamp)

#configure the root logger once for this notebook: INFO and above, formatted as time-logger-level-message
logging.basicConfig(
    format='%(asctime)s-%(name)s-%(levelname)s-%(message)s',
    level=logging.INFO,
)

def safe_load_yaml(file_path):
    """
    Load a YAML configuration file and return its parsed content.

    Args:
        file_path: path of the YAML file to read.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file content.

    Raises:
        FileNotFoundError: if ``file_path`` is not an existing regular file.
        Exception: any error raised while opening or parsing the file is
            logged and re-raised unchanged.
    """
    #checked outside the try block so a missing file is logged once, not twice
    if not os.path.isfile(file_path):
        logging.error(f"Configuration file not found: {file_path}")
        raise FileNotFoundError(f"Missing configuration file: {file_path}")
    try:
        #explicit encoding so parsing does not depend on the platform default
        with open(file_path, 'r', encoding='utf-8') as f:
            return yaml.safe_load(f)
    except Exception as e:
        logging.error(f"Error loading YAML file: {e}")
        raise

def load_bronze_table(catalog,source_schema,source_table):
    """
    Read the table ``catalog.source_schema.source_table`` into a dataframe.

    Uses the notebook-provided global ``spark`` session.

    Args:
        catalog: catalog name.
        source_schema: schema that holds the bronze table.
        source_table: bronze table name.

    Returns:
        The dataframe returned by ``spark.read.table``.

    Raises:
        RuntimeError: if the read call fails; the original exception is
            attached as the cause.
    """
    try:
        bronze_df = spark.read.table(f"{catalog}.{source_schema}.{source_table}")
    except Exception as e:
        raise RuntimeError(f"Error loading bronze table: {e}") from e
    #log before returning; a statement placed after the return would never run
    logging.info(f"bronze table {source_table} loaded successfully into dataframe")
    return bronze_df

def write_silver_table(df,catalog,target_schema,target_table):
    """
    Overwrite the table ``catalog.target_schema.target_table`` with ``df``.

    Args:
        df: dataframe to write.
        catalog: catalog name.
        target_schema: schema that holds the silver table.
        target_table: silver table name.

    Raises:
        RuntimeError: if the write fails; the original exception is attached
            as the cause.
    """
    #NOTE(review): per the PySpark docs insertInto writes into an existing table
    #whose schema matches the dataframe - confirm the silver table is created beforehand
    try:
        #the writer method is insertInto (lower-case i); overwrite=True already
        #selects overwrite mode, so no separate mode() call is needed
        df.write.insertInto(f"{catalog}.{target_schema}.{target_table}",overwrite=True)
    except Exception as e:
        raise RuntimeError(f"Error writing silver table {target_table}: {e}") from e
    logging.info(f"silver table {target_table} written successfully")

def normalize_simple_date(df,raw_col,out_col,status_col):
    """
    Simple normalization of a loosely formatted date column.

    The raw value is cast to string, trimmed, stripped of one leading and one
    trailing quote character, and trimmed again. It is then matched against a
    fixed list of layouts (first match wins):
        -text: yyyy-MM-dd, yyyy-MM-dd HH:mm:ss, yyyy-MM-ddTHH:mm:ss, yyyy/MM/dd
        -epoch seconds (10 digits) and epoch milliseconds (13 digits)
    Day-first / month-first layouts such as 01/02/2024 are deliberately not
    guessed because they are ambiguous.

    Args:
        df: input dataframe.
        raw_col: name of the column holding the raw date value.
        out_col: name of the timestamp column to add.
        status_col: name of the string column to add; one of 'null_input',
            'parsed_text', 'parsed_epoch_s', 'parsed_epoch_ms', 'unparsed'.

    Returns:
        df with out_col and status_col added. The '_raw' scratch column used
        internally is dropped before returning.

    NOTE(review): values that match a layout but are not real calendar dates
    (e.g. 2024-02-30) are expected to come back as NULL/'unparsed' when ANSI
    mode is off; with ANSI mode on Spark may raise instead - confirm on the
    target runtime. Millisecond precision of 13-digit epochs is truncated to
    whole seconds.
    """
    scratch='_raw'
    #make sure we have a trimmed string version
    df=df.withColumn(scratch,trim(col(raw_col).cast('string')))
    #remove surrounding quotes if any, then trim whitespace that was inside the quotes
    df=df.withColumn(scratch,trim(regexp_replace(col(scratch),r"^[\'\"]|[\'\"]$","")))
    raw=col(scratch)

    #(status label, shape regex, parser); the regex guard ensures each parser
    #only sees strings that already have the layout it expects
    rules=[
        ('parsed_text',r'^\d{4}-\d{2}-\d{2}$',to_timestamp(raw,'yyyy-MM-dd')),
        ('parsed_text',r'^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}$',to_timestamp(raw,'yyyy-MM-dd HH:mm:ss')),
        ('parsed_text',r'^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}$',to_timestamp(raw,"yyyy-MM-dd'T'HH:mm:ss")),
        ('parsed_text',r'^\d{4}/\d{2}/\d{2}$',to_timestamp(raw,'yyyy/MM/dd')),
        ('parsed_epoch_s',r'^\d{10}$',to_timestamp(from_unixtime(raw.cast('long')))),
        ('parsed_epoch_ms',r'^\d{13}$',to_timestamp(from_unixtime((raw.cast('long')/1000).cast('long')))),
    ]

    #build two parallel CASE WHEN chains: the parsed value and the label of the rule that matched
    first_label,first_pattern,first_parser=rules[0]
    parsed=when(raw.rlike(first_pattern),first_parser)
    matched_label=when(raw.rlike(first_pattern),lit(first_label))
    for label,pattern,parser in rules[1:]:
        parsed=parsed.when(raw.rlike(pattern),parser)
        matched_label=matched_label.when(raw.rlike(pattern),lit(label))

    df=df.withColumn(out_col,parsed)
    #a rule label is reported only when parsing actually produced a value
    df=df.withColumn(
        status_col,
        when(raw.isNull(),lit('null_input'))
        .when(col(out_col).isNotNull(),matched_label)
        .otherwise(lit('unparsed')),
    )
    return df.drop(scratch)

#load configs
config_dir='/Workspace/Users/hritikraj143@gmail.com/Retail-Analytics/Config'
global_config=safe_load_yaml(f'{config_dir}/global_config.yaml')
catalog=global_config['catalog']
silver_config=safe_load_yaml(f'{config_dir}/silver_config.yaml')
#all settings for this job live under silver -> sales_transformed
sales_config=silver_config['silver']['sales_transformed']
source_schema=sales_config['source_schema']
source_table=sales_config['source_table']
target_schema=sales_config['target_schema']
target_table=sales_config['target_table']
#load bronze table
sales_landing=load_bronze_table(catalog,source_schema,source_table)
#remove invalid or null orderid rows
#cast first, then filter: a value that cannot be read as BIGINT turns into NULL
#during the cast (with ANSI mode off), so filtering before the cast would let it through
sales_landing=sales_landing.withColumn('OrderID',col('OrderID').cast('BIGINT'))
sales_landing=sales_landing.filter(col('OrderID').isNotNull())
#keep rows with missing customer/product ids but mark them with explicit placeholders
sales_landing=sales_landing.withColumn('CustomerID',coalesce(col('CustomerID'),lit('UNKNOWN_CUSTOMER')))
sales_landing=sales_landing.withColumn('ProductID',coalesce(col('ProductID'),lit('UNKNOWN_PRODUCT')))
display(sales_landing)