In [None]:
import os
# Inject the AWS credentials and default region into this process's
# environment in a single call.
# NOTE(review): the key values look like redacted placeholders - keep real
# secrets out of source control.
os.environ.update(
    {
        "AWS_ACCESS_KEY_ID": "------------------------",
        "AWS_SECRET_ACCESS_KEY": "------------------",
        "AWS_DEFAULT_REGION": "us-east-1",
    }
)


In [None]:
# -*- coding: utf-8 -*-
import os
import time
from pathlib import Path
import boto3
from botocore.exceptions import ClientError
from boto3.s3.transfer import TransferConfig

# =========================
# CONFIG
# =========================
# Local folder that is scanned for PDFs; the LOCAL_DIR environment variable
# overrides the default relative folder "data".
LOCAL_DIR = Path(os.getenv("LOCAL_DIR", "data"))
# Destination bucket.
BUCKET = "cun-transcribe-five9"
# Destination key prefix in S3; it is prepended verbatim to each file's
# relative path, and defaults to the empty string.
S3_PREFIX = os.getenv("S3_PREFIX", "")

# Efficient upload settings: the multipart threshold and the part size are
# both 25 MiB, with threaded transfers allowed to use up to 10 concurrent
# workers.
config = TransferConfig(
    use_threads=True,
    max_concurrency=10,
    multipart_threshold=25 * 1024**2,  # 25 MiB
    multipart_chunksize=25 * 1024**2,  # 25 MiB
)

def list_existing_keys(bucket: str, prefix: str) -> set[str]:
    """Collect the keys of every object listed under ``prefix`` in ``bucket``.

    Args:
        bucket: Name of the S3 bucket to list.
        prefix: Key prefix passed to ``list_objects_v2``.

    Returns:
        The set of object keys found across all result pages; empty when no
        page carries a ``Contents`` entry.
    """
    pages = (
        boto3.client("s3")
        .get_paginator("list_objects_v2")
        .paginate(Bucket=bucket, Prefix=prefix)
    )
    # Pages without matches have no "Contents" entry, hence the default.
    return {entry["Key"] for page in pages for entry in page.get("Contents", [])}

def fmt_time(sec: float) -> str:
    """Render a duration in seconds as a short ``h/m/s`` string.

    The value is truncated to whole seconds and negative durations are
    clamped to zero. Leading units that are zero are omitted.

    Args:
        sec: Duration in seconds.

    Returns:
        ``"{h}h {m}m {s}s"`` when there is at least one hour,
        ``"{m}m {s}s"`` when there is at least one minute, else ``"{s}s"``.
    """
    whole = int(sec)
    if whole < 0:
        whole = 0

    hours, leftover = divmod(whole, 3600)
    minutes, seconds = divmod(leftover, 60)

    if hours:
        return f"{hours}h {minutes}m {seconds}s"
    if minutes:
        return f"{minutes}m {seconds}s"
    return f"{seconds}s"

def main():
    """Upload every PDF under ``LOCAL_DIR`` to ``BUCKET``, skipping existing keys.

    Files are discovered recursively, and each one is stored under the key
    ``S3_PREFIX`` + its POSIX-style path relative to ``LOCAL_DIR``. Keys that
    are already listed in the bucket under ``S3_PREFIX`` are skipped. A
    progress line is printed per file and a summary at the end.

    A failure on one file is reported and counted; it does not stop the
    remaining uploads.

    Raises:
        FileNotFoundError: If ``LOCAL_DIR`` does not exist.
        RuntimeError: If ``AWS_ACCESS_KEY_ID`` or ``AWS_SECRET_ACCESS_KEY``
            is missing from the environment.
    """
    # Imported here so this function carries its own dependencies; both
    # modules ship with the boto3/botocore packages the file already uses.
    from boto3.exceptions import S3UploadFailedError
    from botocore.exceptions import BotoCoreError

    if not LOCAL_DIR.exists():
        raise FileNotFoundError(f"No existe la carpeta: {LOCAL_DIR}")

    # Credentials are expected in the environment (recommended).
    if not os.getenv("AWS_ACCESS_KEY_ID") or not os.getenv("AWS_SECRET_ACCESS_KEY"):
        raise RuntimeError("No veo AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY en variables de entorno.")

    s3 = boto3.client("s3")

    # Keep regular files only: a directory whose name ends in ".pdf" also
    # matches the glob but cannot be uploaded.
    pdf_files = sorted(p for p in LOCAL_DIR.rglob("*.pdf") if p.is_file())
    if not pdf_files:
        print("No encontré PDFs en la carpeta.")
        return

    total = len(pdf_files)
    print(f"📄 PDFs encontrados: {total}")
    print("🔎 Revisando cuáles ya existen en S3 para no repetir...")
    existing_keys = list_existing_keys(BUCKET, S3_PREFIX)
    print(f"✅ Ya existen en S3 (bajo {S3_PREFIX}): {len(existing_keys)}\n")

    uploaded = 0
    skipped = 0
    failed = 0

    t0 = time.perf_counter()

    for idx, pdf in enumerate(pdf_files, start=1):
        rel = pdf.relative_to(LOCAL_DIR).as_posix()
        # NOTE(review): the prefix is concatenated verbatim, so it needs its
        # own trailing "/" if a folder-like layout is intended - confirm.
        s3_key = f"{S3_PREFIX}{rel}"

        # Overall progress (uploaded/skipped/failed) *before* this file.
        processed = uploaded + skipped + failed

        # Simple ETA based on the average time per processed item so far.
        elapsed = time.perf_counter() - t0
        rate = processed / elapsed if elapsed > 0 and processed > 0 else 0
        eta = (total - processed) / rate if rate > 0 else 0

        header = (
            f"[{idx}/{total}] "
            f"Subidos:{uploaded}  Saltados:{skipped}  Fallidos:{failed}  "
            f"Faltan:{total - processed}  ETA:{fmt_time(eta)}"
        )

        if s3_key in existing_keys:
            skipped += 1
            print(f"⏭️  {header}\n    Ya existe, salto: {rel}\n")
            continue

        try:
            s3.upload_file(
                Filename=str(pdf),
                Bucket=BUCKET,
                Key=s3_key,
                Config=config,
                ExtraArgs={"ContentType": "application/pdf"},
            )
        except (ClientError, BotoCoreError, S3UploadFailedError, OSError) as e:
            # upload_file reports failed transfers as S3UploadFailedError
            # (not a ClientError); connection problems surface as
            # BotoCoreError and unreadable local files as OSError. Any of
            # them marks this file as failed and the batch continues.
            failed += 1
            print(f"❌ {header}\n    Error: {rel}\n    {e}\n")
        else:
            uploaded += 1
            print(f"✅ {header}\n    Subido: {rel} -> s3://{BUCKET}/{s3_key}\n")

    print("===== RESUMEN FINAL =====")
    print(f"📄 Total PDFs: {total}")
    print(f"✅ Subidos:   {uploaded}")
    print(f"⏭️  Saltados:  {skipped}")
    print(f"❌ Fallidos:  {failed}")

# Run the upload only when this module is executed directly, not when it is
# imported.
if __name__ == "__main__":
    main()
