# S3 Bucket Explorer (Using Pubtator's S3IOUtil)

This notebook uses Pubtator's `S3IOUtil` class directly. The settings normally read from `configs/aws.yaml` are supplied as an inline Python dict through a mock config loader.

**Bucket:** `gilead-edp-kite-rd-dev-us-west-2-kite-benchling-text-sql`  
**Target Folder:** `benchling_unstructured/` (the listing cell below narrows this to `benchling_unstructured/RD_Biovia_ELNs/`)

In [0]:
import sys
from pathlib import Path

# Put the parent of the current working directory at the front of the import
# search path so the `src.` imports used later in this notebook can resolve.
notebook_dir = Path.cwd()
project_root = notebook_dir.parent
sys.path[:0] = [str(project_root)]

# ============================================================
# INLINE YAML CONFIG (replaces configs/aws.yaml)
# ============================================================
# Built bottom-up: the S3 settings first, then the per-platform section that
# wraps them, keyed by the same platform name stored under "platform_type".
_PLATFORM = "databricks"
_S3_SETTINGS = {
    "bucket_name": "gilead-edp-kite-rd-dev-us-west-2-kite-benchling-text-sql",
    "bucket_region": "us-west-2",
}
AWS_CONFIG = {
    "aws": {
        "platform_type": _PLATFORM,
        _PLATFORM: {"s3": _S3_SETTINGS},
    },
}

# ============================================================
# Mock YAMLConfigLoader before importing S3IOUtil
# ============================================================
class MockYAMLConfigLoader:
    """Singleton mock config loader serving inline config instead of YAML files.

    Every construction returns the same shared instance. ``get_config("aws")``
    hands back the module-level ``AWS_CONFIG`` dict; any other name yields an
    empty dict.
    """

    # The one shared instance, created lazily on first construction.
    _instance = None

    def __new__(cls, *args, **kwargs):
        # Accept and ignore constructor arguments. Python passes the same
        # arguments to __new__ and __init__, so a zero-argument __new__ made
        # MockYAMLConfigLoader(config_dir=...) raise TypeError even though
        # __init__ declares that parameter.
        if cls._instance is None:
            cls._instance = super().__new__(cls)
        return cls._instance

    def __init__(self, config_dir=None):
        """Accept ``config_dir`` for call compatibility; the value is ignored."""

    def get_config(self, name: str):
        """Return the inline config registered under ``name``.

        Args:
            name: Config name to look up; only ``"aws"`` is known.

        Returns:
            ``AWS_CONFIG`` itself (not a copy) when ``name`` is ``"aws"``,
            otherwise a new empty dict.
        """
        if name == "aws":
            return AWS_CONFIG
        return {}

# Swap the loader class on the config_reader module BEFORE s3_io_util is
# imported below; the order of these statements matters.
# NOTE(review): presumably s3_io_util looks the class up at import time — confirm.
import src.pubtator_utils.config_handler.config_reader as config_reader_module
config_reader_module.YAMLConfigLoader = MockYAMLConfigLoader

# Also point s3_io_util's module-level `config_loader` at the mock singleton.
import src.pubtator_utils.file_handler.s3_io_util as s3_io_util_module
s3_io_util_module.config_loader = MockYAMLConfigLoader()

# The status marker was mis-encoded ("‚úÖ"); it is the check-mark emoji.
print("✅ Inline config injected successfully")

In [0]:
# Import Pubtator's S3IOUtil (now using our inline config)
from src.pubtator_utils.file_handler.s3_io_util import S3IOUtil

# Take the platform name from the inline config rather than repeating the
# literal, so the two cannot drift apart. Requires the config cell to have run.
s3_util = S3IOUtil(platform_type=AWS_CONFIG["aws"]["platform_type"])

# The status marker was mis-encoded ("‚úÖ"); it is the check-mark emoji.
print(f"✅ Connected to bucket: {s3_util.bucket_name}")
print(f"   Region: {s3_util.bucket_region}")

In [0]:
# List the PDFs under PREFIX using Pubtator's list_files method and keep a
# short preview of them in `files` for the cells that follow.
PREFIX = "benchling_unstructured/RD_Biovia_ELNs/"
MAX_PREVIEW = 3  # how many PDF keys to keep in `files`

all_keys = s3_util.list_files(prefix=PREFIX)
pdf_files = [key for key in all_keys if key.lower().endswith(".pdf")]

# `files` stays capped at the first MAX_PREVIEW PDFs, as before. The full
# match list is kept separately so the "more files" count can be computed;
# previously the list was sliced first, so that message could never print.
files = pdf_files[:MAX_PREVIEW]

print(f"\n📁 Files in '{PREFIX}':")
for f in files:
    print(f"   {f}")

remaining = len(pdf_files) - len(files)
if remaining > 0:
    print(f"   ... and {remaining} more files")

In [0]:
# Display as DataFrame with file details
import pandas as pd

def format_size(size_bytes: int) -> str:
    """Render a byte count as a string with one decimal place and a unit.

    The value is divided by 1024 until it drops below 1024 or the largest
    unit (TB) is reached, so very large inputs are reported in TB.

    Args:
        size_bytes: Size in bytes.

    Returns:
        Text such as ``"0.0 B"``, ``"1.5 KB"`` or ``"2048.0 TB"``.
    """
    units = ("B", "KB", "MB", "GB", "TB")
    largest = len(units) - 1
    value = size_bytes
    step = 0
    # Climb one unit per division; stop at TB even if the value is still large.
    while step < largest and not value < 1024:
        value = value / 1024
        step += 1
    return f"{value:.1f} {units[step]}"

# Build one table row per key in `files`: name relative to PREFIX,
# human-readable size, and last-modified timestamp to the minute.
# NOTE(review): s3_util.bucket is presumably a boto3 Bucket resource — confirm.
file_data = []
for key in files:
    obj = s3_util.bucket.Object(key)
    # Strip PREFIX only where it leads the key; str.replace would also remove
    # any later occurrence of the same text inside the key.
    relative_key = key[len(PREFIX):] if key.startswith(PREFIX) else key
    file_data.append({
        "file": relative_key,
        "size": format_size(obj.content_length),
        "last_modified": obj.last_modified.strftime("%Y-%m-%d %H:%M"),
    })

# Naming the columns keeps the headers visible when `files` is empty.
df = pd.DataFrame(file_data, columns=["file", "size", "last_modified"])
display(df)

In [0]:
# Analyze folder structure
from collections import Counter

# Count first-level subfolders and file extensions among the keys in `files`.
# `files` is whatever the listing cell left in scope, so these counts describe
# that selection rather than everything stored under PREFIX.
subfolders = Counter()
extensions = Counter()

for key in files:
    # Strip PREFIX only where it leads the key; str.replace would also remove
    # any later occurrence of the same text inside the key.
    rel_path = key[len(PREFIX):] if key.startswith(PREFIX) else key
    *parent_dirs, filename = rel_path.split("/")

    # A key with at least one "/" after the prefix lives in a subfolder.
    if parent_dirs:
        subfolders[parent_dirs[0]] += 1

    # The extension is the text after the last "." in the file name, lowercased.
    _, dot, suffix = filename.rpartition(".")
    if dot:
        extensions["." + suffix.lower()] += 1

# The header markers were mis-encoded ("üìÅ", "üìÑ"); they are the folder
# and page emoji.
print(f"📁 Subfolders in {PREFIX}:")
for folder, count in subfolders.most_common():
    print(f"   {folder}/: {count} files")

print("\n📄 File types:")
for ext, count in extensions.most_common():
    print(f"   {ext}: {count} files")