In [5]:
import os
import zipfile
from pathlib import Path
from dotenv import load_dotenv


def check_env() -> str:
    """Report which environment the code is executing in.

    Kaggle is detected through the ``KAGGLE_KERNEL_RUN_TYPE`` environment
    variable: any non-empty value counts as "on Kaggle"; an unset or empty
    value counts as a local run.

    Prints a one-line notice and returns ``"kaggle"`` or ``"local"``.
    """
    on_kaggle = bool(os.environ.get('KAGGLE_KERNEL_RUN_TYPE'))
    label, notice = (
        ("kaggle", "Running on Kaggle")
        if on_kaggle
        else ("local", "Running locally")
    )
    print(notice)
    return label


# Detect the runtime once; the dataset location depends on it.
ENV = check_env()

if ENV != "kaggle":
    # Local run: read settings from .env, fetch the dataset archive from the
    # Hugging Face Hub and unpack it under ./data.
    load_dotenv()

    # Imported lazily so the Kaggle branch never needs huggingface_hub.
    from huggingface_hub import hf_hub_download

    data_dir = Path("./data")
    data_dir.mkdir(parents=True, exist_ok=True)

    hf_repo = os.getenv("HF_DATASET_REPO")
    hf_token = os.getenv("HF_TOKEN")

    if not hf_repo:
        raise ValueError("HF_DATASET_REPO not set in .env")

    # Fetch the archive into data_dir. Per the original author's note the
    # download is cached and skipped when the local copy matches the remote
    # (etag-based) -- NOTE(review): confirm against the installed
    # huggingface_hub version.
    zip_path = hf_hub_download(
        repo_id=hf_repo,
        filename="ka-ocr.zip",
        repo_type="dataset",
        token=hf_token,
        local_dir=data_dir,
    )

    # The ".extracted" marker file records when the archive was last unpacked.
    # Re-extract when the marker is missing or the zip has a newer mtime.
    extract_marker = data_dir / ".extracted"
    zip_file = Path(zip_path)
    if extract_marker.exists():
        needs_extract = (
            zip_file.stat().st_mtime > extract_marker.stat().st_mtime
        )
    else:
        needs_extract = True

    if not needs_extract:
        print("Dataset already extracted, skipping")
    else:
        print("Extracting dataset...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(data_dir)
        # Touch the marker only after a successful extraction so an
        # interrupted run is retried next time.
        extract_marker.touch()
        print("Extraction complete")
else:
    # On Kaggle the dataset is read directly from the input directory.
    data_dir = Path("/kaggle/input/ka-ocr")

# Show the top-level dataset entries, hiding dotfiles (such as the
# ".extracted" marker) and the downloaded archive itself.
print(f"\nDataset contents in {data_dir}:")
# Path.iterdir() yields entries in arbitrary order, so sort by name to keep
# the listing reproducible across runs and machines.
for item in sorted(data_dir.iterdir(), key=lambda entry: entry.name):
    if not item.name.startswith('.') and item.name != "ka-ocr.zip":
        print(f"  {item.name}")

Running locally
Dataset already extracted, skipping

Dataset contents in data:
  3d_unicode
  alkroundedmtav-medium
  alkroundednusx-medium
  ar-archy-regular
  arial_geo
  arial_geo-bold
  arial_geo-bold-italic
  arial_geo-italic
  bpg_algeti
  bpg_algeti_compact
  bpg_arial_2009
  bpg_boxo
  bpg_boxo-boxo
  bpg_classic_medium
  bpg_dedaena
  bpg_dedaena_nonblock
  bpg_excelsior_caps_dejavu_2010
  bpg_excelsior_dejavu_2010
  bpg_extrasquare_2009
  bpg_extrasquare_mtavruli_2009
  bpg_glaho
  bpg_glaho_2008
  bpg_glaho_arial
  bpg_glaho_bold
  bpg_glaho_sylfaen
  bpg_glaho_traditional
  bpg_ingiri_2008
  bpg_irubaqidze
  bpg_mrgvlovani_caps_2010
  bpg_nino_elite_exp
  bpg_nino_elite_ultra
  bpg_nino_elite_ultra_caps
  bpg_nino_medium_caps
  bpg_nino_mtavruli_bold
  bpg_nino_mtavruli_book
  bpg_nino_mtavruli_normal
  bpg_no9
  bpg_nostalgia
  bpg_paata
  bpg_paata_caps
  bpg_paata_cond
  bpg_paata_cond_caps
  bpg_paata_exp
  bpg_phone_sans_bold
  bpg_phone_sans_bold_italic
  bpg_phone_sa