# S3 Bucket Explorer

Lightweight S3 explorer using patterns from `src.pubtator_utils.file_handler.s3_handler`.

**Bucket:** `gilead-edp-kite-rd-dev-us-west-2-kite-benchling-text-sql`  
**Target Folder:** `benchling_unstructured/`

In [0]:
"""
S3IOUtil - Lightweight version mirroring src.pubtator_utils.file_handler.s3_io_util
Works without YAML config, suitable for Databricks with direct S3 access.
"""
import os
import boto3
from botocore.exceptions import ClientError

class S3IOUtil:
    """Lightweight S3 utility class (mirrors project's S3IOUtil without YAML dependencies).

    Wraps one bucket with a boto3 bucket resource (used for listing and
    reading objects) and a boto3 client (used for HEAD requests).
    """

    def __init__(self, bucket_name: str, bucket_region: str = "us-west-2"):
        """Build the boto3 handles for ``bucket_name``.

        Args:
            bucket_name: Name of the S3 bucket every method operates on.
            bucket_region: AWS region passed to both the resource and client.
        """
        self.bucket_name = bucket_name
        self.bucket_region = bucket_region
        self.s3 = boto3.resource("s3", region_name=bucket_region)
        self.bucket = self.s3.Bucket(bucket_name)
        self.client = boto3.client("s3", region_name=bucket_region)

    def list_files(self, prefix: str = "") -> list:
        """List all files under a prefix (mirrors project's list_files).

        Args:
            prefix: Key prefix to filter on; the empty string lists the
                whole bucket.

        Returns:
            The object keys under ``prefix``, excluding keys that end with
            "/" (folder placeholder objects). The count is also printed.
        """
        files = [
            obj.key
            for obj in self.bucket.objects.filter(Prefix=prefix)
            if not obj.key.endswith("/")
        ]
        print(f"Found {len(files)} files under '{prefix}'")
        return files

    def download_file(self, object_name: str, file_path: str = None, as_binary: bool = False):
        """Download a file from S3 (mirrors project's download_file).

        Args:
            object_name: Key of the object to fetch.
            file_path: Local destination. When given, the object is written
                there (parent directories are created as needed) and
                ``as_binary`` is ignored.
            as_binary: When no ``file_path`` is given, return raw bytes
                instead of UTF-8 decoded text.

        Returns:
            ``True`` after writing to ``file_path``; otherwise the object's
            content as ``bytes`` (``as_binary=True``) or ``str``. ``None``
            if S3 answers with a ``ClientError`` (the error is printed).

        Raises:
            UnicodeDecodeError: If text was requested but the object is not
                valid UTF-8.
            OSError: If the local file or its directory cannot be written.
        """
        # Only the S3 calls can raise ClientError, so keep the try minimal.
        try:
            response = self.bucket.Object(object_name).get()
            payload = response["Body"].read()
        except ClientError as e:
            print(f"Error downloading {object_name}: {e}")
            return None

        if file_path:
            # A bare filename has an empty dirname, and os.makedirs("")
            # raises FileNotFoundError, so only create a real parent.
            parent_dir = os.path.dirname(file_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            with open(file_path, "wb") as f:
                f.write(payload)
            return True
        if as_binary:
            return payload
        return payload.decode("utf-8")

    def file_exists(self, object_name: str) -> bool:
        """Check if a file exists (mirrors project's file_exists).

        Issues a HEAD request for ``object_name``.

        Returns:
            ``True`` if the HEAD request succeeds. ``False`` on any
            ``ClientError`` -- note this covers every client error the
            request can return, not only a missing key.
        """
        try:
            self.client.head_object(Bucket=self.bucket_name, Key=object_name)
            return True
        except ClientError:
            return False

# Initialize the shared helper used by the cells below.
BUCKET_NAME = "gilead-edp-kite-rd-dev-us-west-2-kite-benchling-text-sql"
s3_util = S3IOUtil(BUCKET_NAME)
# NOTE(review): the constructor only builds boto3 handles; nothing in this
# cell issues a request against the bucket, so this message does not prove
# access -- confirm with a listing if that matters.
print(f"✅ Connected to bucket: {BUCKET_NAME}")

In [0]:
# List the PDF objects under the RD_Biovia_ELNs subfolder of benchling_unstructured/
PREFIX = "benchling_unstructured/RD_Biovia_ELNs/"

# Keep only keys whose extension is .pdf (case-insensitive).
files = [
    key
    for key in s3_util.list_files(PREFIX)
    if key.lower().endswith(".pdf")
]

# Display as DataFrame
import pandas as pd

def format_size(size_bytes: int) -> str:
    """Render a byte count as a one-decimal string in B, KB, MB, GB or TB.

    The value is divided by 1024 until it drops below 1024 or the GB unit
    has been passed; anything larger is reported in TB.
    """
    units = ("B", "KB", "MB", "GB")
    amount = size_bytes
    step = 0
    while step < len(units):
        if amount < 1024:
            return f"{amount:.1f} {units[step]}"
        amount = amount / 1024
        step += 1
    # Fell through every smaller unit: the remaining value is in terabytes.
    return f"{amount:.1f} TB"

# Get file details for a preview table. Only the first three keys are
# inspected; size and timestamp are read from each key's Object resource.
file_data = []
for key in files[:3]:
    obj = s3_util.bucket.Object(key)
    # Strip only the leading prefix: str.replace would also remove any
    # later occurrence of the same text inside the key.
    rel_name = key[len(PREFIX):] if key.startswith(PREFIX) else key
    file_data.append({
        "file": rel_name,
        "size": format_size(obj.content_length),
        "last_modified": obj.last_modified.strftime("%Y-%m-%d %H:%M"),
    })


df = pd.DataFrame(file_data)
# `display` is not defined in this file; presumably the notebook's built-in.
display(df)

In [0]:
# Analyze folder structure and file types of the keys in `files`, relative
# to PREFIX.
# NOTE(review): if `files` is still the PDF-only list produced by the listing
# cell, the file-type tally below can only ever report .pdf -- confirm that
# is intended.
from collections import Counter

subfolders = Counter()
extensions = Counter()

for key in files:
    # Strip only the leading prefix: str.replace would also remove any
    # later occurrence of the same text inside the key.
    rel_path = key[len(PREFIX):] if key.startswith(PREFIX) else key
    parts = rel_path.split("/")

    # A key with at least one "/" after the prefix lives in a subfolder;
    # tally it under its first path component.
    if len(parts) > 1:
        subfolders[parts[0]] += 1

    # Tally the lower-cased extension of the final path component.
    filename = parts[-1]
    if "." in filename:
        ext = "." + filename.rsplit(".", 1)[-1].lower()
        extensions[ext] += 1

print(f"📁 Subfolders in {PREFIX}:")
for folder, count in subfolders.most_common():
    print(f"   {folder}/: {count} files")

print("\n📄 File types:")
for ext, count in extensions.most_common():
    print(f"   {ext}: {count} files")