# In [None]:
import os
import sys
import json
import shutil
import subprocess
from pathlib import Path

# Kaggle dataset slug to fetch and the folder name it is stored under.
FIVES_NAME = "FIVES"
FIVES_SLUG = "nitishsingla0/fives-dataset"

# Root directory for downloaded data; created up front so later steps can
# write into it without checking.
BASE = Path("/content/data")
BASE.mkdir(exist_ok=True, parents=True)

# Utils
def sh(cmd, check=True, echo=True, capture=True):
    """Run ``cmd`` through the shell and return the completed process.

    Args:
        cmd: Command line string, executed with ``shell=True``.
        check: When true, a non-zero exit status raises
            ``subprocess.CalledProcessError``.
        echo: When true, print the command (prefixed with ``$``) before
            running it.
        capture: When true, collect stdout/stderr as text and print them
            once the command has finished (stderr goes to ``sys.stderr``).
            When false, the child writes straight to the inherited streams.

    Returns:
        The ``subprocess.CompletedProcess`` for the command.

    Raises:
        subprocess.CalledProcessError: If ``check`` is true and the command
            exits with a non-zero status.
    """
    if echo:
        print("$", cmd)

    run_kwargs = {"shell": True}
    if capture:
        run_kwargs.update(text=True, capture_output=True)
    proc = subprocess.run(cmd, **run_kwargs)

    if capture:
        # Replay whatever the command produced, keeping the two streams apart.
        if proc.stdout:
            print(proc.stdout)
        if proc.stderr:
            print(proc.stderr, file=sys.stderr)

    failed = proc.returncode != 0
    if failed and check:
        raise subprocess.CalledProcessError(proc.returncode, cmd)
    return proc

def move_into(src_dir: Path, out_dir: Path):
    """Merge the contents of ``src_dir`` into ``out_dir``, then remove ``src_dir``.

    Every entry directly inside ``src_dir`` is copied (with metadata) to the
    same name under ``out_dir``. Directories are merged into any existing
    directory of the same name; files overwrite existing files.

    Args:
        src_dir: Directory whose contents are moved. It is deleted afterwards.
        out_dir: Destination directory; created if it does not exist.

    Raises:
        OSError / shutil.Error: If copying fails. A failure to delete
            ``src_dir`` after a successful copy is only reported on stderr.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    for item in src_dir.iterdir():
        dst = out_dir / item.name
        if item.is_dir():
            # copytree merges into an existing destination and, unlike a
            # files-only walk, also recreates empty sub-directories.
            shutil.copytree(
                item,
                dst,
                dirs_exist_ok=True,
                ignore_dangling_symlinks=True,
            )
        else:
            shutil.copy2(item, dst)

    # Cleanup is best-effort: the data is already in place, so a leftover
    # source directory is reported instead of aborting the caller.
    try:
        shutil.rmtree(src_dir)
    except OSError as err:
        print(f"warning: could not remove {src_dir}: {err}", file=sys.stderr)

# Kaggle auth
def ensure_kaggle_auth():
    """Make sure Kaggle credentials and the ``kaggle`` CLI are available.

    Steps:
      1. If ``/root/.kaggle/kaggle.json`` is missing, ask the user to upload
         it through the Colab file-upload widget and store it there with
         mode ``0o600``.
      2. Export the ``username`` / ``key`` fields of that file as
         ``KAGGLE_USERNAME`` / ``KAGGLE_KEY``. Missing or non-string fields
         are skipped rather than exported as empty strings.
      3. Install the ``kaggle`` package with pip when no ``kaggle``
         executable is found on ``PATH``.

    Raises:
        ImportError: (or whatever the import raised) if the token is missing
            and ``google.colab`` cannot be imported.
        RuntimeError: If the upload dialog returned no file.
        subprocess.CalledProcessError: If the pip install fails.
    """
    kaggle_dir = Path("/root/.kaggle")
    kaggle_json = kaggle_dir / "kaggle.json"
    if not kaggle_json.exists():
        try:
            from google.colab import files
        except Exception:
            print(" google.colab not available. If you are not in Colab,"
                  " place kaggle.json at /root/.kaggle/kaggle.json manually.")
            raise
        print(" Kaggle API token not found. Upload kaggle.json (Kaggle → Account → Create New API Token).")
        kaggle_dir.mkdir(parents=True, exist_ok=True)
        uploaded = files.upload()
        if not uploaded:
            # A cancelled dialog yields no entries; fail with a clear message
            # instead of leaking StopIteration from next(iter(...)).
            raise RuntimeError(
                "No file was uploaded; kaggle.json is required to use the Kaggle API."
            )
        fname = next(iter(uploaded))
        # Whatever the uploaded file is called, it ends up as kaggle.json.
        shutil.move(f"/content/{fname}", kaggle_json)
        kaggle_json.chmod(0o600)

    # Exporting the credentials is best-effort: an unreadable token is
    # reported but does not abort, as the file itself is still in place.
    try:
        creds = json.loads(kaggle_json.read_text())
    except (OSError, ValueError) as err:
        print(f" Could not read credentials from {kaggle_json}: {err}", file=sys.stderr)
        creds = None
    if isinstance(creds, dict):
        for env_name, field in (("KAGGLE_USERNAME", "username"), ("KAGGLE_KEY", "key")):
            value = creds.get(field)
            if isinstance(value, str) and value:
                os.environ[env_name] = value

    # The rest of this file drives the `kaggle` command-line tool, so check
    # for the executable rather than importing the package.
    # NOTE(review): importing `kaggle` presumably also attempts to
    # authenticate at import time - confirm against the installed version.
    if shutil.which("kaggle") is None:
        python = sys.executable or "python3"
        # Install with the running interpreter's pip so the package lands in
        # the environment this code is executing in.
        sh(f'"{python}" -m pip -q install kaggle', check=True)

def kaggle_probe(slug: str) -> bool:
    """Return True when the kaggle CLI can list the files of dataset ``slug``.

    The listing command is run without raising on failure; only its exit
    status is inspected.
    """
    listing_cmd = f'kaggle datasets files -d "{slug}"'
    return sh(listing_cmd, check=False).returncode == 0

def _unzip_into(archive: Path, dest: Path) -> bool:
    """Extract ``archive`` into ``dest``, delete the archive, report success."""
    dest.mkdir(parents=True, exist_ok=True)
    result = sh(f'unzip -q -o "{archive}" -d "{dest}"', check=False)
    # The archive is removed either way so a failed attempt does not leave a
    # stale zip behind that would confuse the next run's "new zip" snapshot.
    archive.unlink(missing_ok=True)
    # unzip exits with 1 when it only had warnings but finished extracting.
    return result.returncode in (0, 1)


def _listed_file_names(listing: str) -> list:
    """Return the file names from ``kaggle datasets files --csv`` output.

    The first CSV column is taken as the name, so names containing spaces or
    commas survive intact. The header row, blank lines and the
    "Next Page Token" line are skipped.
    """
    import csv

    names = []
    for row in csv.reader(listing.splitlines()):
        if not row:
            continue
        first = row[0].strip()
        if (not first) or first == "name" or first.startswith("Next Page Token"):
            continue
        names.append(first)
    return names


def kaggle_download(slug: str, out_dir: Path):
    """Download a Kaggle dataset and place contents into out_dir.

    Strategy:
      1. Probe the dataset; give up if it cannot be listed.
      2. Bulk-download into ``/content``. Any zip that appeared is extracted
         into ``out_dir``; if instead new directories appeared, their
         contents are moved into ``out_dir``.
      3. If the bulk path did not produce data (or extraction failed), fall
         back to downloading each listed file individually.

    Args:
        slug: Kaggle dataset identifier in ``owner/dataset`` form.
        out_dir: Destination directory; created if missing.

    Returns:
        True if at least some data was placed into ``out_dir``, else False.
    """
    staging = Path("/content")
    out_dir.mkdir(parents=True, exist_ok=True)
    if not kaggle_probe(slug):
        print(f" Probe failed for {slug}. Skipping.")
        return False

    # Snapshot zips/dirs before, so whatever the download creates can be
    # told apart from what was already there.
    before_zips = {z.name for z in staging.glob("*.zip")}
    before_dirs = {d.name for d in staging.iterdir() if d.is_dir()}

    # Bulk download of the whole dataset into the staging directory.
    bulk = sh(f'kaggle datasets download -d "{slug}" -p /content', check=False)
    if bulk.returncode == 0:
        new_zips = [z for z in staging.glob("*.zip") if z.name not in before_zips]
        if new_zips:
            # Extract every archive (no short-circuit) and only report
            # success when all of them extracted.
            extracted = [_unzip_into(z, out_dir) for z in new_zips]
            if all(extracted):
                print(f" {slug} → {out_dir}")
                return True
            print(f" Extraction failed for {slug}; trying per-file download.")
        else:
            created = sorted(
                d.name
                for d in staging.iterdir()
                if d.is_dir() and d.name not in before_dirs
            )
            moved_any = False
            for dname in created:
                src = staging / dname
                if any(src.iterdir()):
                    move_into(src, out_dir)
                    moved_any = True
            if moved_any:
                print(f"✓ {slug} → {out_dir}")
                return True

    # per-file download
    # NOTE(review): only the names printed by this single listing call are
    # fetched; a "Next Page Token" line is skipped, so further pages
    # (if any) are presumably not downloaded - confirm for large datasets.
    lst = sh(f'kaggle datasets files -d "{slug}" --csv', check=False)
    if lst.returncode != 0:
        return False
    names = _listed_file_names(lst.stdout or "")

    success_any = False
    for fname in names:
        print(f"  ↓ {fname}")
        q = sh(f'kaggle datasets download -d "{slug}" -f "{fname}" -p /content --force', check=False)
        if q.returncode != 0:
            print(f" Failed: {fname}")
            continue
        base_name = Path(fname).name
        archive = staging / (base_name + ".zip")
        if archive.exists():
            if _unzip_into(archive, out_dir):
                success_any = True
            else:
                print(f" Failed: {fname}")
            continue
        # direct file case
        # NOTE(review): files are placed flat under out_dir by base name, so
        # two listed files with the same base name overwrite each other.
        src = staging / base_name
        if src.exists():
            dst = out_dir / src.name
            dst.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(src), str(dst))
            success_any = True
    if success_any:
        print(f"✓ {slug} → {out_dir}")
    return success_any

# Run FIVES download
# Authenticate first; the download below relies on the kaggle CLI.
print("=== Ensuring Kaggle auth ===")
ensure_kaggle_auth()

# Download only when the target folder is missing or empty.
target = BASE / FIVES_NAME
if not (target.exists() and any(target.iterdir())):
    print(f"\n=== Kaggle: {FIVES_SLUG} → {target} ===")
    ok = kaggle_download(FIVES_SLUG, target)
    print(
        f" FIVES dataset ready at: {target}"
        if ok
        else " FIVES download failed; please check Kaggle access/slug."
    )
else:
    print(f"Skip (exists): {FIVES_NAME} at {target}")