In [None]:
import findspark
# Set up the environment for PySpark via findspark; this runs before the
# `pyspark` import in the next cell.
# NOTE(review): presumably init() is what makes `pyspark` importable here —
# confirm against the findspark documentation.
findspark.init()
# Last expression of the cell, so its return value is shown as the cell output.
findspark.find()

In [None]:
import logging
import os
from pyspark.sql import SparkSession

In [None]:
log_file = "bronze_layer.log"

# Detach AND close every handler already attached to the root logger (for
# example the file handler left behind by a previous run of this cell).
# basicConfig() does nothing while the root logger still has handlers, and a
# handler that is removed without being closed keeps its log file open.
for handler in logging.root.handlers[:]:  # iterate over a copy: the list is mutated
    logging.root.removeHandler(handler)
    handler.close()

# Route INFO-and-above records to the log file.
logging.basicConfig(
    filename=log_file,
    level=logging.INFO,
    filemode="w",  # Overwrites log file on each run
    format="%(asctime)s - %(levelname)s - %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S"
)

logging.info("Logging system initialized.")


In [None]:
logging.info("Starting Spark session")
# getOrCreate() hands back the already-running session when there is one, so
# re-running this cell does not start a second session. Only the notebook-level
# name `spark` is bound; nothing is attached to the SparkSession class itself.
spark = SparkSession.builder.appName('bronze_layer_ingestion').getOrCreate()
logging.info("Spark session created")

In [None]:
def bronze_layer_ingestion(input_path, output_root="hdfs://0.0.0.0:19000/bronze_layer/global_fashion_sales"):
    """Copy one CSV file into the bronze layer and report what was written.

    The file is read with the notebook-level ``spark`` session (comma
    separated, first line treated as the header), its rows are counted, and
    the data is written back as CSV with a header row under
    ``<output_root>/<file name without extension>/`` in "overwrite" mode.

    Parameters
    ----------
    input_path : str
        Path of the CSV file to ingest.
    output_root : str, optional
        Location under which the per-file output folder is created. Trailing
        slashes are ignored. Defaults to the HDFS location this notebook
        writes to.

    Returns
    -------
    dict or None
        On success, a dict with the keys ``no_of_rows``, ``file_name`` and
        ``output_path``. ``None`` when any step fails; the error is logged
        together with its traceback and is not re-raised, so one bad file does
        not abort a loop over many files.
    """
    try:
        data = spark.read.csv(input_path, sep=',', header=True)
        no_of_rows = data.count()
        file_name = os.path.splitext(os.path.basename(input_path))[0]
        logging.info(f"Writing data started for file {file_name}")

        root = output_root.rstrip('/')
        output_path = f"{root}/{file_name}/"
        # coalesce(1) collapses the data to a single partition before writing.
        data.coalesce(1).write.mode("overwrite")\
            .option('header', 'true')\
            .csv(output_path)

        logs = {
            "no_of_rows": no_of_rows,
            "file_name": file_name,
            "output_path": output_path,
        }
        print(logs)
        logging.info(f"File name:{file_name}, no of rows: {no_of_rows}")
        logging.info(f"Output Path = {output_path}")
        return logs

    except Exception as e:
        # The try block covers reading, counting and writing, so the message
        # names the input file rather than a single stage.
        logging.error(f"Error ingesting file {input_path}: {str(e)}", exc_info=True)
        return None


In [None]:
def get_all_files(directory):
    """Return the path of every file found under ``directory``, recursively.

    Each path is built by joining a directory visited by ``os.walk`` with one
    of the file names it contains, in the order ``os.walk`` reports them.
    """
    return [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
    ]

In [None]:
# Directory holding the raw CSV files. The default is the original local
# checkout; set the GFS_DATASET_DIR environment variable to point the notebook
# at another location without editing this cell.
basepath = os.environ.get("GFS_DATASET_DIR", r"D:\Dev\global-fashion-sales-dwh\dataset")
paths = get_all_files(basepath)
if not paths:
    # get_all_files() is built on os.walk, which yields nothing for a missing
    # or empty directory, so without this the run would go on to ingest
    # nothing and still report completion.
    logging.warning(f"No input files found under {basepath}")
paths

In [None]:
# Ingest every discovered file, one at a time.
logging.info("reading data")
for path in paths:
    # bronze_layer_ingestion() catches and logs its own exceptions, so a
    # failing file does not interrupt the loop.
    bronze_layer_ingestion(path)
# Logged once the loop ends, whether or not individual files failed.
logging.info("Writing data completed")

In [None]:
# Tear down: stop the Spark session first, then shut logging down last so the
# final messages are still written to the log.
logging.info("stoping spark")
spark.stop()
logging.info("job completed")
logging.shutdown()  # Close all log handlers