In [1]:
import sys
import os
import time

# --- Step 1: Configure the path so the module can be imported ---
# Appends <current working directory>/../src to the import search path, so
# the directory this is launched from determines where the module is found.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..', 'src')))

try:
    from s2orc_processor import S2ORCProcessor
except ImportError:
    # Print a hint (the message says 'src/s2orc_processor.py' was not found),
    # then re-raise so the original traceback is preserved.
    # NOTE(review): ImportError can also come from a failing import inside
    # s2orc_processor itself, in which case the hint is misleading.
    print("❌ Error: 'src/s2orc_processor.py' が見つかりません。")
    raise

# --- Step 2: Configuration ---

# Source data directory handed to the processor's build_database_parallel().
# Relative path, resolved against the current working directory.
S2ORC_DIR = "../data/raw/s2orc/"
# Database file path handed to S2ORCProcessor; its parent directory is
# created by build_database() before the processor is constructed.
DB_PATH = "../data/processed/s2orc_filtered.db"

# --- Step 3: Run the database construction ---

def build_database():
    """Run the whole database construction process and report its duration.

    Prints the configured source directory (``S2ORC_DIR``) and output file
    (``DB_PATH``), makes sure the directory that will hold the database
    exists, creates an ``S2ORCProcessor`` for ``DB_PATH``, calls its
    ``build_database_parallel`` on ``S2ORC_DIR``, and finally prints the
    elapsed time in hours and minutes.

    Any exception raised by the processor propagates to the caller; in that
    case the timing summary is not printed.
    """
    print("--- Starting S2ORC Database Construction ---")
    print(f"Source data directory: {S2ORC_DIR}")
    print(f"Output database file: {DB_PATH}")

    # Create the directory that will hold the database file up front.
    # exist_ok=True keeps this from failing when it already exists.
    # os.path.dirname() returns '' for a bare filename, and os.makedirs('')
    # raises FileNotFoundError, so only create a directory when there is one.
    output_dir = os.path.dirname(DB_PATH)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    print(f"Ensured output directory exists: {output_dir or os.curdir}")
    print("-" * 50)

    # perf_counter() is monotonic, so the measured duration is not distorted
    # if the system clock is adjusted during a long-running build.
    started = time.perf_counter()

    processor = S2ORCProcessor(db_path=DB_PATH)
    processor.build_database_parallel(s2orc_dir=S2ORC_DIR)

    elapsed_time = time.perf_counter() - started
    print("\n--- Database construction finished ---")
    print(f"Total execution time: {elapsed_time / 3600:.2f} hours ({elapsed_time / 60:.2f} minutes)")

# --- Run ---
# Start the build only when this file is executed as the main module.
if __name__ == '__main__':
    build_database()

  from .autonotebook import tqdm as notebook_tqdm


--- Starting S2ORC Database Construction ---
Source data directory: ../data/raw/s2orc/
Output database file: ../data/processed/s2orc_filtered.db
Ensured output directory exists: ../data/processed
--------------------------------------------------
Total files: 297. Processed: 4. Remaining: 293.


Building Filtered Database: 100%|██████████| 293/293 [46:24<00:00,  9.50s/it]  

✅ Database construction complete.

--- Database construction finished ---
Total execution time: 0.77 hours (46.42 minutes)



