In [1]:
import sys
import os
import json
import pandas as pd
from dotenv import load_dotenv
from tqdm.auto import tqdm

# --- Step 1: Prepare modules and configuration ---
# Make the sibling 'src' directory (resolved from the current working
# directory) importable so that scopus_handler can be found.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), os.pardir, "src")))

try:
    from scopus_handler import ScopusSearcher
except ImportError:
    # Print a hint for the user, then let the original error propagate.
    print("❌ Error: 'src/scopus_handler.py' が見つかりません。")
    raise

# Load environment variables (presumably from a local .env file) before
# reading the API key; abort early when the key is missing or empty.
load_dotenv()
SCOPUS_API_KEY = os.environ.get("SCOPUS_API_KEY")
if not SCOPUS_API_KEY:
    raise ValueError("SCOPUS_API_KEYが設定されていません。")

# --- File path definitions ---
# Input: JSON file holding the data-paper records.
DATAPAPERS_FILE = "../data/datapapers/raw/all_datapapers_raw.json"
# Output: directory that receives one JSON file per data paper.
OUTPUT_DIR = "../data/citingpapers/raw/"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# --- Step 2: 引用論文リストの収集 ---

def collect_all_citing_papers():
    """
    Collect the citing papers of every data paper and save one JSON file per paper.

    The data-paper records are loaded from ``DATAPAPERS_FILE``; only records
    with a truthy ``eid`` are processed. For each of them Scopus is queried
    with ``REF(<eid>)`` through ``ScopusSearcher.fetch_all_results`` and a
    non-empty result is stored as ``<OUTPUT_DIR>/<eid>.json``.

    The run is resumable: a paper whose output file already exists is
    skipped. To keep that check trustworthy, each result is first written to
    ``<eid>.json.tmp`` and only renamed to its final name once the write has
    finished, so an interrupted run never leaves a truncated ``.json`` file
    that a later run would mistake for a completed one.

    Returns:
        None. Progress and a final summary are printed; if the input file
        is missing an error message is printed and the function returns
        early.
    """
    print("--- Collecting Citing Papers for All Data Papers ---")

    # --- 2.1: Load the list of Scopus data papers ---
    if not os.path.exists(DATAPAPERS_FILE):
        print(f"❌ Error: Data paper file not found at {DATAPAPERS_FILE}")
        return

    with open(DATAPAPERS_FILE, 'r', encoding='utf-8') as f:
        all_data_papers = json.load(f)

    # Only papers that carry an EID can be queried.
    papers_to_process = [p for p in all_data_papers if p.get('eid')]
    total_papers = len(papers_to_process)
    print(f"Found {total_papers:,} data papers to process.")

    # --- 2.2: Collection loop ---
    scopus = ScopusSearcher(api_key=SCOPUS_API_KEY)

    # tqdm shows the overall progress across all data papers.
    for paper in tqdm(papers_to_process, desc="Overall Progress"):
        # Guaranteed truthy by the filter above.
        eid = paper.get('eid')

        # Resume support: skip papers whose output file already exists.
        output_filename = os.path.join(OUTPUT_DIR, f"{eid}.json")
        if os.path.exists(output_filename):
            continue

        # Fetch every citing paper (no upper bound on the result count).
        citing_papers = scopus.fetch_all_results(f"REF({eid})", max_results=float('inf'))

        # NOTE(review): a falsy result leaves no file behind, so such papers
        # are queried again on every resumed run. Whether an empty result
        # means "no citations" or "request failed" depends on
        # fetch_all_results, which is not visible here -- confirm before
        # persisting empty results.
        if not citing_papers:
            continue

        # Write to a temporary sibling file and publish it with os.replace.
        # The ".tmp" suffix keeps partial files out of both the resume check
        # and the final ".json" count.
        temp_filename = output_filename + ".tmp"
        try:
            with open(temp_filename, 'w', encoding='utf-8') as f:
                json.dump(citing_papers, f, ensure_ascii=False, indent=4)
            os.replace(temp_filename, output_filename)
        except BaseException:
            # Also covers KeyboardInterrupt: drop the partial file, then
            # re-raise so the interruption/error is not hidden.
            try:
                os.remove(temp_filename)
            except OSError:
                # Best-effort cleanup; a stale temp file is overwritten on
                # the next attempt for this paper.
                pass
            raise

    print("\n✨ Collection process complete.")
    # Report how many JSON result files exist in the output directory.
    completed_files = sum(1 for name in os.listdir(OUTPUT_DIR) if name.endswith('.json'))
    print(f"   Total completed files: {completed_files}/{total_papers}")

# --- Run ---
# Start the collection only when this code is executed directly.
if __name__ == "__main__":
    collect_all_citing_papers()

  from .autonotebook import tqdm as notebook_tqdm


--- Collecting Citing Papers for All Data Papers ---
Found 13,339 data papers to process.


Searching Scopus: 100%|██████████| 7/7 [00:01<00:00,  3.97 papers/s]
Searching Scopus: 100%|██████████| 95/95 [00:04<00:00, 20.71 papers/s]s]
Searching Scopus: 100%|██████████| 2/2 [00:01<00:00,  1.66 papers/s]
Searching Scopus: 100%|██████████| 12/12 [00:01<00:00,  9.80 papers/s]
Searching Scopus: 100%|██████████| 8/8 [00:01<00:00,  6.34 papers/s]
Searching Scopus: 100%|██████████| 4/4 [00:01<00:00,  3.44 papers/s]
Searching Scopus: 100%|██████████| 3/3 [00:01<00:00,  2.71 papers/s]
Searching Scopus: 100%|██████████| 9/9 [00:01<00:00,  7.00 papers/s]
Searching Scopus: 100%|██████████| 12/12 [00:01<00:00,  9.14 papers/s]
Searching Scopus: 100%|██████████| 12/12 [00:01<00:00,  8.60 papers/s]
Searching Scopus: 100%|██████████| 8/8 [00:01<00:00,  5.73 papers/s]
Searching Scopus: 100%|██████████| 18/18 [00:01<00:00, 12.42 papers/s]
Searching Scopus: 100%|██████████| 4/4 [00:01<00:00,  3.57 papers/s]t/s]
Searching Scopus: 100%|██████████| 5/5 [00:01<00:00,  4.45 papers/s]/s] 
Searching Scop


✨ Collection process complete.
   Total completed files: 13339/13339



