In [None]:
import os
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd "drive/MyDrive/Colab Notebooks/NFFA-SEM/data"
!ls

/content/drive/MyDrive/Colab Notebooks/NFFA-SEM/data


In [6]:
"""b2share_downloader.py (v4: 任意数だけダウンロード)
-------------------------------------------------
* 指定した個数 (`LIMIT_FILES`) だけ .tar を保存
* それ以外はスキップ
* 進捗バー (tqdm)
-------------------------------------------------
インストール:
    pip install tqdm requests

使い方:
    LIMIT_FILES を 3 に設定すると先頭 3 件だけ取得します。
"""
from __future__ import annotations

import hashlib
import logging
import os
import sys
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, List, Tuple

import requests
from requests.adapters import HTTPAdapter, Retry
from urllib3.exceptions import ProtocolError
from http.client import IncompleteRead
from tqdm import tqdm

# ──────────────────────────  CONFIG  ──────────────────────────
BASE_URL = "https://b2share.eudat.eu"
RECORD_ID = "80df8606fcdb4b2bae1656f0dc6db8ba"  # ← 変更可
NUM_THREADS = 2          # 並列数
MAX_RETRIES = 10         # リトライ回数
BACKOFF_START = 10       # 秒
CHUNK = 512 * 1024       # 512 KiB
LIMIT_FILES = 3          # ← ★ ここを変更 (None なら全ファイル)
MIN_EXPECTED_SIZE = 1 * 1024 * 1024  # 1 MiB
# ──────────────────────────────────────────────────────────────

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
)
logger = logging.getLogger("b2share_dl")

session = requests.Session()
retry_strategy = Retry(
    total=7,
    connect=7,
    read=7,
    backoff_factor=1,
    status_forcelist=[500, 502, 503, 504],
    allowed_methods=["HEAD", "GET"],
    raise_on_status=False,
)
session.mount("https://", HTTPAdapter(max_retries=retry_strategy))
session.mount("http://", HTTPAdapter(max_retries=retry_strategy))


def build_true_urls(record_id: str) -> Tuple[List[dict], Dict[str, dict]]:
    api_url = f"{BASE_URL}/api/records/{record_id}"
    logger.info("メタデータ取得: %s", api_url)
    r = session.get(api_url, timeout=30)
    r.raise_for_status()
    record = r.json()
    files = record.get("files", record.get("entries", []))
    tar_infos, md5_lookup = [], {}
    for f in files:
        name = f.get("key") or f.get("fileName") or f.get("filename")
        bucket = f.get("bucket")
        if not name or not bucket:
            continue
        url = f"{BASE_URL}/api/files/{bucket}/{name}"
        if name.endswith(".tar"):
            tar_infos.append({"name": name, "url": url, "size": f.get("size")})
        elif name.endswith(".md5"):
            md5_lookup[name[:-4]] = {"name": name, "url": url}
    return tar_infos, md5_lookup


def request_stream(url: str, headers: dict | None = None):
    return session.get(url, headers=headers or {}, stream=True, timeout=60, allow_redirects=True)


def download_file(url: str, local_path: str) -> bool:
    resume_from = os.path.getsize(local_path) if os.path.exists(local_path) else 0
    attempt = 0
    backoff = BACKOFF_START
    while attempt < MAX_RETRIES:
        attempt += 1
        hdr = {"Range": f"bytes={resume_from}-"} if resume_from else {}
        try:
            with request_stream(url, hdr) as r:
                r.raise_for_status()
                ctype = r.headers.get("Content-Type", "").lower()
                clen  = int(r.headers.get("Content-Length", "0"))
                if ("tar" not in ctype and "octet" not in ctype) or clen < MIN_EXPECTED_SIZE:
                    raise ValueError("Unexpected response")
                total = clen + resume_from
                mode = "ab" if resume_from else "wb"
                with open(local_path, mode) as fp, tqdm(
                    total=total,
                    initial=resume_from,
                    unit="B",
                    unit_scale=True,
                    desc=os.path.basename(local_path),
                ) as bar:
                    for chunk in r.iter_content(CHUNK):
                        if chunk:
                            fp.write(chunk)
                            bar.update(len(chunk))
            logger.info("%s 完了", local_path)
            return True
        except (IncompleteRead, ProtocolError, requests.exceptions.ChunkedEncodingError,
                requests.exceptions.ConnectionError, requests.exceptions.Timeout) as e:
            logger.warning("Read/Conn error %s – retry", e)
        except Exception as e:
            logger.warning("Download error (%s – attempt %d/%d): %s", local_path, attempt, MAX_RETRIES, e)
        time.sleep(backoff)
        backoff *= 2
        resume_from = os.path.getsize(local_path) if os.path.exists(local_path) else 0
    logger.error("Failed to download %s", local_path)
    return False


def verify_md5(path_: str, expected: str) -> bool:
    md5 = hashlib.md5()
    with open(path_, "rb") as fr:
        for chunk in iter(lambda: fr.read(CHUNK), b""):
            md5.update(chunk)
    ok = md5.hexdigest().lower() == expected.lower()
    if not ok:
        logger.error("MD5 mismatch: %s", path_)
    return ok


def task(tar: dict, md5: dict | None):
    name = tar["name"]
    if not download_file(tar["url"], name):
        return name, False
    if md5:
        try:
            exp = session.get(md5["url"], timeout=20).text.strip().split()[0]
            if not verify_md5(name, exp):
                return name, False
        except Exception as e:
            logger.error("MD5取得失敗 %s: %s", name, e)
            return name, False
    return name, True


def main(rec_id: str = RECORD_ID):
    tar_files, md5_map = build_true_urls(rec_id)
    if LIMIT_FILES is not None:
        tar_files = tar_files[:LIMIT_FILES]
    if not tar_files:
        logger.error("対象ファイルがありません。")
        sys.exit(1)
    logger.info("%d 個の .tar をダウンロード開始 (threads=%d)", len(tar_files), NUM_THREADS)
    results = []
    with ThreadPoolExecutor(max_workers=NUM_THREADS) as exe:
        fut2name = {exe.submit(task, t, md5_map.get(t["name"])): t["name"] for t in tar_files}
        for fut in as_completed(fut2name):
            name = fut2name[fut]
            try:
                _, ok = fut.result()
                results.append((name, ok))
            except Exception as exc:
                logger.error("Unhandled exception in %s: %s", name, exc)
                results.append((name, False))
    ok_cnt = sum(ok for _, ok in results)
    logger.info("完了: %d / %d 成功", ok_cnt, len(results))


if __name__ == "__main__":
    main()


Biological.tar:   0%|          | 0.00/704M [00:00<?, ?B/s]
Biological.tar:   0%|          | 1.05M/704M [00:00<07:42, 1.52MB/s]
Biological.tar:   0%|          | 1.57M/704M [00:00<05:20, 2.19MB/s]
Biological.tar:   0%|          | 2.62M/704M [00:01<03:01, 3.86MB/s]
Biological.tar:   1%|          | 5.24M/704M [00:01<01:20, 8.71MB/s]
Biological.tar:   1%|          | 7.86M/704M [00:01<00:55, 12.5MB/s]
Biological.tar:   1%|▏         | 10.5M/704M [00:01<00:44, 15.4MB/s]
Biological.tar:   2%|▏         | 13.1M/704M [00:01<00:39, 17.6MB/s]
Biological.tar:   2%|▏         | 16.3M/704M [00:01<00:33, 20.5MB/s]
Biological.tar:   3%|▎         | 18.9M/704M [00:01<00:32, 21.3MB/s]
Biological.tar:   3%|▎         | 21.5M/704M [00:01<00:31, 21.8MB/s]
Biological.tar:   4%|▎         | 24.6M/704M [00:01<00:28, 23.6MB/s]
Biological.tar:   4%|▍         | 27.3M/704M [00:02<00:29, 23.3MB/s]
Biological.tar:   4%|▍         | 30.4M/704M [00:02<00:27, 24.5MB/s]
Biological.tar:   5%|▍         | 33.0M/704M [00:02<00:27,

In [9]:
!ls -lh

total 3.6G
-rw------- 1 root root 672M May  9 16:04 Biological.tar
-rw------- 1 root root  78M May  9 16:03 Fibres.tar
-rw------- 1 root root 189M May  9 16:03 Films_Coated_Surface.tar
-rw------- 1 root root 2.7G May  9 15:53 Patterned_surface.tar


In [12]:
"""verify_md5.py (v1.2)
-------------------------------------------------
MD5 verifier – now supports **直接ハッシュ指定**
  * `-m / --md5 <hex>` で .md5 ファイルを置かずに比較
  * 従来通りフォルダ・ファイルを複数渡せる（ハッシュ直指定は 1 ファイル用）
-------------------------------------------------
Examples
========
# 1) .md5 ファイルがある場合（従来）
!python verify_md5.py Fibres.tar

# 2) .md5 が無いのでハッシュ直指定
!python verify_md5.py Fibres.tar -m 296cd6be5ac97b2203e807e9552d9aeb
"""
from __future__ import annotations

import argparse
import hashlib
import sys
from pathlib import Path
from typing import Iterable, List, Tuple

from tqdm import tqdm

CHUNK = 1 << 20  # 1 MiB

def compute_md5(path: Path) -> str:
    md5 = hashlib.md5()
    total = path.stat().st_size
    with path.open("rb") as f, tqdm(
        total=total,
        unit="B",
        unit_scale=True,
        desc=path.name,
        leave=False,
    ) as bar:
        for chunk in iter(lambda: f.read(CHUNK), b""):
            md5.update(chunk)
            bar.update(len(chunk))
    return md5.hexdigest()

def find_md5_file(tar_path: Path) -> Path | None:
    for cand in (tar_path.with_suffix(tar_path.suffix + ".md5"), tar_path.with_suffix(".md5")):
        if cand.exists():
            return cand
    return None

def parse_expected(md5_file: Path) -> str:
    return md5_file.read_text().strip().split()[0]

def verify_one(tar_path: Path, expected_hash: str | None = None) -> Tuple[str, bool]:
    if expected_hash:
        expected = expected_hash
    else:
        md5file = find_md5_file(tar_path)
        if md5file is None:
            print(f"[SKIP] {tar_path.name}: .md5 file not found & no -m given")
            return tar_path.name, False
        expected = parse_expected(md5file)
    actual = compute_md5(tar_path)
    if actual.lower() == expected.lower():
        print(f"[PASS] {tar_path.name}")
        return tar_path.name, True
    else:
        print(f"[FAIL] {tar_path.name} – expected {expected}, got {actual}")
        return tar_path.name, False

def collect_tars(paths: Iterable[str]) -> List[Path]:
    files: List[Path] = []
    for p in paths:
        p = Path(p)
        if p.is_dir():
            files.extend(sorted(p.glob("*.tar")))
        elif p.suffix == ".tar":
            files.append(p)
    # unique order
    seen = set()
    unique: List[Path] = []
    for f in files:
        if f not in seen:
            seen.add(f)
            unique.append(f)
    return unique

def main():
    parser = argparse.ArgumentParser(description="MD5 verifier for .tar archives", add_help=False)
    parser.add_argument("paths", nargs="*", default=["."], help="directories or .tar files")
    parser.add_argument("-m", "--md5", help="expected md5 (used when verifying exactly one file)")
    parser.add_argument("-h", "--help", action="help", help="show this help message and exit")

    args, _ = parser.parse_known_args()

    targets = collect_tars(args.paths)
    if not targets:
        print("No .tar files found.")
        sys.exit(1)

    # sanity: if md5 string is given, only first target uses it
    provided_hash = args.md5
    if provided_hash and len(targets) > 1:
        print("[WARN] --md5 は 1 ファイル用です。先頭ファイルにのみ適用します。")

    passed = 0
    for idx, t in enumerate(targets):
        exp = provided_hash if idx == 0 else None
        _, ok = verify_one(t, exp)
        if ok:
            passed += 1
    print(f"Summary: {passed}/{len(targets)} passed")
    sys.exit(0 if passed == len(targets) else 2)

if __name__ == "__main__":
    main()


No .tar files found.


SystemExit: 1

https://b2share.eudat.eu/records/80df8606fcdb4b2bae1656f0dc6db8ba

In [18]:
!ls -lh

total 3.6G
-rw------- 1 root root 672M May  9 16:04 Biological.tar
-rw------- 1 root root  78M May  9 16:03 Fibres.tar
-rw------- 1 root root    0 May  9 16:18 Fibres.tar.md5
-rw------- 1 root root 189M May  9 16:03 Films_Coated_Surface.tar
-rw------- 1 root root 2.7G May  9 15:53 Patterned_surface.tar
-rw------- 1 root root 3.8K May  9 16:17 verify_md5.py


In [19]:
!python verify_md5.py Fibres.tar -m c942e19b8ab221d3c700f44a0c67b306

[PASS] Fibres.tar
Summary: 1/1 passed


In [22]:
!python verify_md5.py Biological.tar -m 296cd6be5ac97b2203e807e9552d9aeb

[PASS] Biological.tar
Summary: 1/1 passed


In [23]:
!python verify_md5.py Films_Coated_Surface.tar -m 3c2e6cb914e1864855d9c3805ef47913

[PASS] Films_Coated_Surface.tar
Summary: 1/1 passed


In [24]:
!python verify_md5.py Patterned_surface.tar -m 44d3964bef4ee7e84ba2f99c84036e9c

[PASS] Patterned_surface.tar
Summary: 1/1 passed
