In [1]:
def create_and_save_file(filename, content):
    """Write ``content`` to ``filename`` as UTF-8 text and report the outcome.

    The file is opened in write mode, so an existing file is overwritten.
    This is a best-effort helper: failures are printed rather than raised.

    Args:
        filename: Path of the file to create or overwrite.
        content: Text to write into the file.

    Returns:
        None. A success or failure message is printed to stdout.
    """
    try:
        # Pin the encoding so the bytes written do not depend on the
        # platform's locale default.
        with open(filename, 'w', encoding='utf-8') as file:
            file.write(content)
    except (OSError, TypeError, ValueError) as e:
        # OSError: path/permission problems; TypeError: non-text content or
        # invalid filename type; ValueError: bad path characters or
        # characters that cannot be encoded.
        print(f"An error occurred: {e}")
    else:
        print(f"File '{filename}' created and saved successfully.")

# Create the log file for the HITL MarkdownReview notebook with a one-line header.
filename = "hitl_markdown_log.txt"
text_content = "This contains the logs for HITL MarkdownReview.ipynb."

create_and_save_file(filename=filename, content=text_content)

File 'hitl_markdown_log.txt' created and saved successfully.


In [None]:
import boto3
from botocore.exceptions import ClientError
import time
import logging
import sys
import os

# Module-level logger, plus root logging setup: INFO and above, with a
# timestamp and level name on every record (handy when debugging the polling).
logger = logging.getLogger(__name__)
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def check_s3_file_exists_head(bucket_name: str, object_key: str) -> bool:
    """
    Report whether an object is present in a bucket by issuing head_object.

    A new S3 client is built on every call. It targets the endpoint named by
    the ``minio_url`` environment variable and uses fixed access credentials.

    Args:
        bucket_name: The bucket to look in.
        object_key: The full key (path including filename) of the object.

    Returns:
        True when head_object succeeds. False when it fails for any reason:
        a '404' error code is the expected "not there yet" case and is not
        logged; every other failure is logged as an error first. Returning
        False instead of raising lets a polling caller simply try again.
    """
    try:
        client = boto3.client(
            's3',
            endpoint_url=os.environ.get('minio_url'),
            # NOTE(review): credentials are hard-coded here; consider loading
            # them from configuration instead.
            aws_access_key_id='minio',
            aws_secret_access_key='minio123',
        )
        client.head_object(Bucket=bucket_name, Key=object_key)
    except ClientError as err:
        code = err.response.get('Error', {}).get('Code')
        # Anything other than a plain "not found" (e.g. permissions,
        # throttling, missing bucket) is worth surfacing in the log.
        if code != '404':
            logger.error(f"Error checking object '{object_key}' in bucket '{bucket_name}': {err}. Error Code: {code}")
        return False
    except Exception as err:
        # Non-ClientError failures are logged and reported as "not found" too.
        logger.error(f"A general error occurred checking object '{object_key}': {err}")
        return False
    return True

def wait_for_s3_file(bucket_name: str, object_key: str, poll_interval_seconds: int = 10, max_attempts: int | None = None) -> bool:
    """
    Polls an S3 bucket until a specific object key is found.

    Each attempt calls check_s3_file_exists_head. That helper returns False
    both when the object is missing and when it logs an error, so errors are
    retried exactly like "not found".

    Args:
        bucket_name: The name of the S3 bucket.
        object_key: The key (full path including filename) of the object to wait for.
        poll_interval_seconds: The number of seconds to wait between checks.
        max_attempts: Optional. The maximum number of times to check before giving up.
                      If None, it will loop indefinitely until the file is found.

    Returns:
        True if the object was found, False if max_attempts checks were made
        without finding it.
    """
    logger.info(f"Waiting for object '{object_key}' to appear in bucket '{bucket_name}'...")
    attempts = 0
    while True:
        attempts += 1
        logger.info(f"Checking for '{object_key}'... (Attempt {attempts})")

        if check_s3_file_exists_head(bucket_name, object_key):
            logger.info(f"Success! Object '{object_key}' found in bucket '{bucket_name}'.")
            return True

        # Give up once the attempt budget is spent; report it to the caller
        # instead of silently returning as if the object had been found.
        if max_attempts is not None and attempts >= max_attempts:
            logger.warning(f"Maximum attempts ({max_attempts}) reached. Object '{object_key}' not found.")
            return False

        # File not found yet, wait before the next attempt
        logger.info(f"Object not found yet. Waiting {poll_interval_seconds} seconds before retrying...")
        time.sleep(poll_interval_seconds)

# --- Example Usage ---
if __name__ == "__main__":
    # --- Configuration ---
    target_bucket = 'data-files-bucket'  # <--- !! Replace with your bucket name !!
    target_file_key = 'docling-markdown-approved.md' # <--- !! Replace with the specific file key !!
    check_interval = 10  # Check every 10 seconds
    # Set max_attempts to None to wait forever, or set a number (e.g., 60 for ~10 minutes)
    maximum_checks = 60 # Example: Try 60 times (60 * 10s = 600s = 10 minutes)

    # --- Start Waiting ---
    try:
        found = wait_for_s3_file(
            bucket_name=target_bucket,
            object_key=target_file_key,
            poll_interval_seconds=check_interval,
            max_attempts=maximum_checks
        )
    except Exception as e:
        logger.error(f"An unexpected error occurred during the waiting process: {e}")
        sys.exit(1) # Exit script with error if something unexpected happens

    if not found:
        # Timed out: do not claim success, and do not run the follow-up steps.
        logger.error(f"Polling finished without finding object '{target_file_key}'.")
        sys.exit(1)

    logger.info("Polling finished successfully.")
    # Add any code here that should run *after* the file is found

    print("Script finished.")