In [31]:
import os
import requests
from datetime import datetime, timedelta
from zipfile import ZipFile

# ==== 認証情報 ====
JRDB_USER = os.environ.get("JRDB_USER")
JRDB_PASSWORD = os.environ.get("JRDB_PASSWORD")

# ==== ダウンロード先ディレクトリ ====
DOWNLOAD_DIR = "data/jrdb/zip"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# ==== ベースURL ====
BASE_URL = "https://jrdb.com/member/datazip"

def download_file(prefix, year, filename):
    dir_prefix = prefix.capitalize()
    url = f"{BASE_URL}/{dir_prefix}/{year}/{filename}"
    dest_path = os.path.join(DOWNLOAD_DIR, filename)
    os.makedirs(os.path.dirname(dest_path), exist_ok=True)

    if os.path.exists(dest_path):
        print(f"Already exists: {filename}")
        return

    try:
        resp = session.get(url, auth=(JRDB_USER, JRDB_PASSWORD), stream=True, timeout=10)

        # Treat 404 and 403 as "file doesn't exist" → skip silently
        if resp.status_code in (404, 403):
            return

        resp.raise_for_status()  # Raise other HTTP errors

        # Save the file
        with open(dest_path, "wb") as f:
            for chunk in resp.iter_content(1024):
                f.write(chunk)
        print(f"Saved {dest_path}")

    except requests.exceptions.ConnectionError:
        print(f"Skipped (connection error, probably missing): {filename}")
    except requests.exceptions.RequestException as e:
        raise RuntimeError(f"Failed to download existing file {filename}: {e}")


def download_range(prefix, start_date, end_date):
    date = start_date
    delta = timedelta(days=1)

    while date <= end_date:
        year = date.strftime("%Y")
        yymmdd = date.strftime("%y%m%d")
        filename = f"{prefix}{yymmdd}.zip"
        download_file(prefix, year, filename)
        date += delta

if __name__ == "__main__":
    # === 設定 ===
    PREFIX = "HJC"
    START_DATE = datetime(2015, 1, 1)
    END_DATE = datetime(2020, 1, 1)

    download_range(PREFIX, START_DATE, END_DATE)

Saved data/jrdb/zip/HJC150104.zip
Saved data/jrdb/zip/HJC150105.zip
Saved data/jrdb/zip/HJC150110.zip
Saved data/jrdb/zip/HJC150111.zip
Saved data/jrdb/zip/HJC150112.zip
Saved data/jrdb/zip/HJC150117.zip
Saved data/jrdb/zip/HJC150118.zip
Saved data/jrdb/zip/HJC150124.zip
Saved data/jrdb/zip/HJC150125.zip
Saved data/jrdb/zip/HJC150131.zip
Saved data/jrdb/zip/HJC150201.zip
Saved data/jrdb/zip/HJC150207.zip
Saved data/jrdb/zip/HJC150208.zip
Saved data/jrdb/zip/HJC150214.zip
Saved data/jrdb/zip/HJC150215.zip
Saved data/jrdb/zip/HJC150221.zip
Saved data/jrdb/zip/HJC150222.zip
Saved data/jrdb/zip/HJC150228.zip
Saved data/jrdb/zip/HJC150301.zip
Saved data/jrdb/zip/HJC150307.zip
Saved data/jrdb/zip/HJC150308.zip
Saved data/jrdb/zip/HJC150314.zip
Saved data/jrdb/zip/HJC150315.zip
Saved data/jrdb/zip/HJC150321.zip
Saved data/jrdb/zip/HJC150322.zip
Saved data/jrdb/zip/HJC150328.zip
Saved data/jrdb/zip/HJC150329.zip
Saved data/jrdb/zip/HJC150404.zip
Saved data/jrdb/zip/HJC150405.zip
Saved data/jrd

In [32]:
EXTRACT_DIR = "data/jrdb/extracted"

def unzip_all_files(zip_dir, extract_dir):
    os.makedirs(extract_dir, exist_ok=True)
    
    for file in os.listdir(zip_dir):
        if file.endswith(".zip"):
            zip_path = os.path.join(zip_dir, file)
            target_dir = os.path.join(extract_dir, os.path.splitext(file)[0])
            os.makedirs(target_dir, exist_ok=True)
            
            with ZipFile(zip_path, "r") as zip_ref:
                zip_ref.extractall(target_dir)
            
            print(f"Extracted {zip_path} -> {target_dir}")

if __name__ == "__main__":
    unzip_all_files(DOWNLOAD_DIR, EXTRACT_DIR)


Extracted data/jrdb/zip/TYB160228.zip -> data/jrdb/extracted/TYB160228
Extracted data/jrdb/zip/HJC230715.zip -> data/jrdb/extracted/HJC230715
Extracted data/jrdb/zip/TYB200202.zip -> data/jrdb/extracted/TYB200202
Extracted data/jrdb/zip/SED241222.zip -> data/jrdb/extracted/SED241222
Extracted data/jrdb/zip/HJC180708.zip -> data/jrdb/extracted/HJC180708
Extracted data/jrdb/zip/PACI190616.zip -> data/jrdb/extracted/PACI190616
Extracted data/jrdb/zip/PACI190602.zip -> data/jrdb/extracted/PACI190602
Extracted data/jrdb/zip/SED240128.zip -> data/jrdb/extracted/SED240128
Extracted data/jrdb/zip/TYB200216.zip -> data/jrdb/extracted/TYB200216
Extracted data/jrdb/zip/HJC230701.zip -> data/jrdb/extracted/HJC230701
Extracted data/jrdb/zip/PACI150426.zip -> data/jrdb/extracted/PACI150426
Extracted data/jrdb/zip/TYB201108.zip -> data/jrdb/extracted/TYB201108
Extracted data/jrdb/zip/PACI210904.zip -> data/jrdb/extracted/PACI210904
Extracted data/jrdb/zip/HJC230729.zip -> data/jrdb/extracted/HJC23072