### Chunk 1: Used for first-time load to CSV

#### Logic:

This section handles the primary transaction data. It performs a "move" operation from the raw staging area to a specific chunked directory.

#### Why this code:

We use `dbutils.fs.mv` instead of a copy so that, once files are processed, the source directory is emptied, preventing duplicate processing in future runs. The cell is safe to run more than once: if the source is already empty or missing, it reports that instead of failing.

In [0]:
# 1. Define source and target paths for the first chunk (Transactions).
# source_path: raw folder the transaction files are moved out of.
# target_path: chunk1 folder they are moved into. It carries a trailing "/",
#   so strip it before joining with "/" + file name to avoid "//" in paths.
# NOTE: the Chunk 2 cell reassigns these same two names; the move cell acts on
# whichever definition ran last, so run the cells in order.
source_path = "/Volumes/vstone-catalog/vstone_schema/raw_data/transactions"
target_path = "/Volumes/vstone-catalog/vstone_schema/chunked_data/chunk1/"


In [0]:

# 2. Ensure the destination directory exists to avoid path errors.
# dbutils.fs.mkdirs is idempotent (won't error if it already exists).
dbutils.fs.mkdirs(target_path)

# Strip any trailing "/" once so joined destinations never contain "//".
target_root = target_path.rstrip("/")

# 3. List the source directory.
# Only the listing is guarded: a missing source folder is the expected state
# on a re-run, whereas a failure during the move itself must surface instead
# of being reported as "already moved".
source_missing = False
try:
    files = dbutils.fs.ls(source_path)
except Exception as e:
    if "FileNotFoundException" not in str(e):
        # Unexpected issue (permissions, bad path, ...): keep the traceback.
        raise
    source_missing = True
    files = []

if source_missing:
    print("Source directory not found. The files have likely already been moved.")
elif not files:
    print(f"No files found in {source_path}. They may have already been moved.")
else:
    for entry in files:
        # Full destination path for this entry.
        destination = f"{target_root}/{entry.name}"

        # 4. Perform the move operation.
        # A move removes the entry from the source, so a later run will not
        # pick it up again.
        print(f"Moving: {entry.name}...")
        dbutils.fs.mv(entry.path, destination)

    print("Move operation complete.")

### Chunk 2: Used for incremental load to CSV

#### Logic:

This chunk processes `transaction_items`. Like Chunk 1, it uses move logic, but it is kept separate so that it can run on a different schedule (incremental updates).
 
#### Why this code:

Separating items from their parent transactions allows more granular data management and easier troubleshooting if the incremental feed fails.

In [0]:
# 1. Define source and destination paths for the second chunk (transaction_items).
# source_path: raw folder the transaction_items files are moved out of.
# target_path: chunk2 folder they are moved into (no trailing "/").
# NOTE: these reassign the same names used by the Chunk 1 cell; the move cell
# acts on whichever definition ran last, so run the cells in order.
source_path = "/Volumes/vstone-catalog/vstone_schema/raw_data/transaction_items"
target_path = "/Volumes/vstone-catalog/vstone_schema/chunked_data/chunk2/transaction_items"


In [0]:

# 2. Idempotent directory creation.
# This won't fail if the directory already exists.
dbutils.fs.mkdirs(target_path)

# Strip any trailing "/" once so joined destinations never contain "//".
target_root = target_path.rstrip("/")

# 3. List the source directory.
# Only the listing is guarded: a missing source folder is an expected state
# (nothing new to load), whereas a failure during the move itself must not be
# mistaken for "already moved".
source_missing = False
try:
    files = dbutils.fs.ls(source_path)
except Exception as e:
    if "FileNotFoundException" not in str(e):
        # Report the problem, then re-raise so a scheduled run is marked as
        # failed instead of silently succeeding with nothing moved.
        print(f"An unexpected error occurred: {e}")
        raise
    source_missing = True
    files = []

if source_missing:
    print("Source path not found. Check if files were already moved to chunk2.")
elif not files:
    print(f"Notice: No files found in {source_path}. They may have already been moved in a previous run.")
else:
    for entry in files:
        # Full destination path for this entry.
        destination = f"{target_root}/{entry.name}"

        # mv (not cp) so the raw_data folder stays clean for the next delta load.
        print(f"Moving: {entry.name}")
        dbutils.fs.mv(entry.path, destination)

    print("Success: All transaction_items moved to chunk2.")

### Chunk 3: Used for JSON format conversion

#### Logic:

This section converts standard CSV user data into JSON format. 

#### Why this code:

Downstream applications (such as web services or NoSQL databases) often require JSON. We use `spark.read` and `df.write.json` to handle the schema conversion automatically, and then use `dbutils` to "flatten" the Spark output from a folder into a single clean `.json` file. Note that Spark writes JSON Lines (one JSON object per line), not a single JSON array.

In [0]:
# List of folders containing data that requires JSON formatting
folders = [
    "users",
]

base_raw_path = "/Volumes/vstone-catalog/vstone_schema/raw_data"
base_target_path = "/Volumes/vstone-catalog/vstone_schema/chunked_data/chunk3/users/"

# Ensure the main output directory exists
dbutils.fs.mkdirs(base_target_path)

# Strip the trailing "/" once so joined paths never contain "//".
target_root = base_target_path.rstrip("/")


def _csv_to_single_json(csv_path, final_path, temp_dir):
    """Convert one CSV file into one JSON file at final_path.

    Spark writes a directory of part files, so the output goes to temp_dir
    first and the single part file is then copied to final_path. temp_dir is
    removed whether or not the conversion succeeds.

    Raises:
        RuntimeError: if Spark produced no "part-" file in temp_dir.
    """
    try:
        # 1. Read CSV source (first row is the header).
        df = spark.read.format("csv").option("header", "true").load(csv_path)

        # 2. Write to JSON format.
        # coalesce(1) ensures we only get one part file.
        df.coalesce(1).write.mode("overwrite").json(temp_dir)

        # 3. Rename/Extract: Spark names the file 'part-000...'.
        # Copy it out to final_path with the correct name.
        part_file = next(
            (p for p in dbutils.fs.ls(temp_dir) if p.name.startswith("part-")),
            None,
        )
        if part_file is None:
            raise RuntimeError(f"Spark wrote no part file to {temp_dir}")
        dbutils.fs.cp(part_file.path, final_path)
    finally:
        # 4. Cleanup: always remove the temporary Spark folder. Best-effort,
        # so a cleanup problem never hides the real conversion error.
        try:
            dbutils.fs.rm(temp_dir, recurse=True)
        except Exception as cleanup_error:
            print(f"Warning: could not remove {temp_dir}: {cleanup_error}")


# Folders that hit an error, so the final summary can be accurate.
failed_folders = []

for folder in folders:
    source_dir = f"{base_raw_path}/{folder}"

    try:
        # Keep plain files only: skip directories and Spark metadata files
        # (names starting with "_").
        files = [
            f for f in dbutils.fs.ls(source_dir)
            if not f.isDir() and not f.name.startswith("_")
        ]

        print(f"--- Processing {len(files)} files in folder: {folder} ---")

        for file in files:
            # Output name: source file name with only its last extension
            # replaced by .json. Dropping just the last extension keeps
            # names such as "users.2024.csv" and "users.2025.csv" distinct.
            file_name_clean = file.name.rsplit(".", 1)[0]
            final_name = f"{file_name_clean}.json"
            final_path = f"{target_root}/{final_name}"

            # Spark writes to a directory; temp_dir holds that output.
            temp_dir = f"{target_root}/_temp_{final_name}"

            _csv_to_single_json(file.path, final_path, temp_dir)

            print(f"Created: {final_name}")

    except Exception as e:
        # Best-effort: report the folder and continue with the next one.
        print(f"Skipping {folder}: {e}")
        failed_folders.append(folder)

if failed_folders:
    print(f"\nConversion finished with errors. Folders not fully converted: {', '.join(failed_folders)}")
else:
    print("\nConversion complete. Every raw file now has a matching JSON file.")

### Chunk 4: Used for XML format conversion


#### Logic:

This chunk processes reference data (vouchers, stores, etc.) into XML. 

#### Why this code:

Many legacy systems and third-party integrations require XML. This script uses Spark's XML writer to wrap each CSV row in a `rowTag` element inside a single `rootTag` element, so the output is well-formed for XML parsers.

In [0]:
# 1. Configuration for reference data tables
# folders_to_convert: sub-folders of base_raw_path whose CSV files are converted.
# base_raw_path / target_chunk4 both carry a trailing "/", so strip it before
#   joining with "/" + name to avoid "//" in the resulting paths.
# NOTE: base_raw_path is also assigned in the Chunk 3 cell (there without the
# trailing "/"); this cell overwrites that value.
folders_to_convert = ["vouchers", "payment_methods", "menu_items", "stores"]
base_raw_path = "/Volumes/vstone-catalog/vstone_schema/raw_data/"
target_chunk4 = "/Volumes/vstone-catalog/vstone_schema/chunked_data/chunk4/"


In [0]:

# Ensure target directory exists
dbutils.fs.mkdirs(target_chunk4)

# Strip trailing "/" once so joined paths never contain "//".
raw_root = base_raw_path.rstrip("/")
target_root = target_chunk4.rstrip("/")


def _csv_to_single_xml(csv_path, final_path, temp_dir, root_tag):
    """Convert one CSV file into one XML file at final_path.

    Spark writes a directory of part files, so the output goes to temp_dir
    first and the single .xml part file is then copied to final_path.
    temp_dir is removed whether or not the conversion succeeds.

    Raises:
        RuntimeError: if Spark produced no "part-*.xml" file in temp_dir.
    """
    try:
        # 2. Read file into a Spark DataFrame (first row is the header).
        df = spark.read.format("csv").option("header", "true").load(csv_path)

        # 3. Write to XML (this creates a folder with a part file inside).
        # 'rootTag' and 'rowTag' define the XML structure.
        (
            df.coalesce(1)
            .write.format("xml")
            .option("rootTag", root_tag)
            .option("rowTag", "item")
            .mode("overwrite")
            .save(temp_dir)
        )

        # 4. Extract the XML part file and copy it to the final destination,
        # so we end up with a single .xml file rather than a folder.
        xml_part = next(
            (
                p for p in dbutils.fs.ls(temp_dir)
                if p.name.startswith("part-") and p.name.endswith(".xml")
            ),
            None,
        )
        if xml_part is None:
            raise RuntimeError(f"Spark wrote no XML part file to {temp_dir}")
        dbutils.fs.cp(xml_part.path, final_path)
    finally:
        # 5. Always remove Spark's temporary folder and metadata. Best-effort,
        # so a cleanup problem never hides the real conversion error.
        try:
            dbutils.fs.rm(temp_dir, recurse=True)
        except Exception as cleanup_error:
            print(f"Warning: could not remove {temp_dir}: {cleanup_error}")


# Folders that hit an error, so the final summary can be accurate.
failed_folders = []

for folder in folders_to_convert:
    source_dir = f"{raw_root}/{folder}"

    try:
        # Get all individual files (ignoring directories and hidden/metadata files)
        files = [
            f for f in dbutils.fs.ls(source_dir)
            if not f.isDir() and not f.name.startswith("_")
        ]

        print(f"--- Converting {len(files)} files from {folder} ---")

        for file in files:
            # Output name: source file name with only its last extension
            # replaced by .xml, so multi-dot names stay distinct.
            file_name_base = file.name.rsplit(".", 1)[0]
            final_xml_name = f"{file_name_base}.xml"
            final_path = f"{target_root}/{final_xml_name}"

            # Temporary folder for Spark output
            temp_dir = f"{target_root}/_temp_{final_xml_name}"

            _csv_to_single_xml(file.path, final_path, temp_dir, f"{folder}_data")

            print(f"Successfully created: {final_xml_name}")

    except Exception as e:
        # Best-effort: report the folder and continue with the next one.
        print(f"Error processing folder {folder}: {e}")
        failed_folders.append(folder)

if failed_folders:
    print(f"\nTask finished with errors. Folders not fully converted: {', '.join(failed_folders)}")
print(f"\nTask Complete! XML files are located in: {target_chunk4}")