In [0]:
import json
import os

# Housekeeping: purge leftovers from dbfs:/tmp before staging new inputs.
# Anything whose path ends (case-insensitively) in ".txt" or
# "_text_analysis.csv" is removed.
print("Cleaning up existing files...")
tmp_files = dbutils.fs.ls("dbfs:/tmp/")
for entry in tmp_files:
    stale_path = entry.path
    # str.endswith accepts a tuple, so one call covers both suffixes.
    if not stale_path.lower().endswith((".txt", "_text_analysis.csv")):
        continue
    print(f"Deleting: {stale_path}")
    dbutils.fs.rm(stale_path)

# The source volume location is supplied through the "volume_path" widget.
volume_path = dbutils.widgets.get("volume_path")

# Collect every entry of the Unity Catalog volume listing whose path ends
# in ".txt" (case-insensitive match).
src_paths = []
for entry in dbutils.fs.ls(volume_path):
    if entry.path.lower().endswith(".txt"):
        src_paths.append(entry.path)
print(f"Found {len(src_paths)} source text files")

# Paths of the files that were copied AND confirmed through the /dbfs path.
copied_paths = []

# Stage each source file under dbfs:/tmp. A failure on one file is reported
# and skipped so the remaining files are still processed (best effort).
for src in src_paths:
    base_name = src.rsplit("/", 1)[-1]
    dbfs_uri = f"dbfs:/tmp/{base_name}"    # destination form passed to dbutils.fs
    local_path = f"/dbfs/tmp/{base_name}"  # same destination, checked via os.path

    print(f"Copying file: {base_name}")

    try:
        # Copy the single file from the volume into DBFS.
        dbutils.fs.cp(src, dbfs_uri, recurse=False)

        # Only record the file once it is visible through the local path.
        if not os.path.exists(local_path):
            print(f"  ⚠️ File not found locally after copy: {local_path}")
            continue

        size_in_bytes = os.path.getsize(local_path)
        print(f"  ✓ Successfully copied: {local_path} ({size_in_bytes} bytes)")
        copied_paths.append(local_path)
    except Exception as err:
        print(f"  ❌ Error copying file: {str(err)}")

# Report how many files were found versus successfully staged.
print("\nSummary:")
print(f"Total source files: {len(src_paths)}")
print(f"Files copied: {len(copied_paths)}")

# Publish the staged paths as the task value "file_paths" for downstream use.
dbutils.jobs.taskValues.set("file_paths", copied_paths)