In [1]:
import pandas as pd
import sqlite3
import json
import os
from tqdm.auto import tqdm

# --- Step 1: 設定 ---

# Scopusから収集したデータ論文リストのファイルパス
SCOPUS_DATAPAPERS_FILE = "../data/datapapers/raw/all_datapapers_raw.json"
# 構築したS2ORCデータベースのファイルパス
S2ORC_DB_PATH = "../data/processed/s2orc_filtered.db"


# --- Step 2: カバレッジの検証 ---

def check_coverage():
    """
    Scopusのデータ論文がS2ORCデータベースにどの程度含まれているかを検証する
    """
    print("--- Checking Coverage of Scopus Data Papers in S2ORC Database ---")

    # --- 2.1: Scopusのデータ論文DOIリストを読み込み ---
    if not os.path.exists(SCOPUS_DATAPAPERS_FILE):
        print(f"❌ Error: Scopus data paper file not found at {SCOPUS_DATAPAPERS_FILE}")
        return
    
    with open(SCOPUS_DATAPAPERS_FILE, 'r', encoding='utf-8') as f:
        scopus_papers = json.load(f)
    
    # DOIを抽出し、比較しやすいように大文字に統一してセットに格納
    scopus_dois = {paper['prism:doi'].upper() for paper in scopus_papers if paper.get('prism:doi')}
    
    if not scopus_dois:
        print("❌ No DOIs found in the Scopus data paper file.")
        return
        
    print(f"Found {len(scopus_dois):,} unique DOIs in Scopus data paper list.")

    # --- 2.2: S2ORCデータベースのDOIリストを読み込み ---
    if not os.path.exists(S2ORC_DB_PATH):
        print(f"❌ Error: S2ORC database file not found at {S2ORC_DB_PATH}")
        return

    print("Fetching all DOIs from S2ORC database... (This may take a minute)")
    with sqlite3.connect(S2ORC_DB_PATH) as conn:
        # DBから全てのDOIを効率的に読み込む
        s2orc_dois = set(row[0] for row in conn.execute("SELECT doi FROM papers"))
    
    if not s2orc_dois:
        print("❌ No DOIs found in the S2ORC database.")
        return
        
    print(f"Found {len(s2orc_dois):,} unique DOIs in S2ORC database.")

    # --- 2.3: 集合の積（共通部分）を計算 ---
    print("\n--- Calculating Coverage ---")
    
    intersection = scopus_dois.intersection(s2orc_dois)
    
    coverage_percentage = (len(intersection) / len(scopus_dois)) * 100 if scopus_dois else 0
    
    # --- 2.4: 結果の報告 ---
    print(f"Total Scopus Data Papers with DOI: {len(scopus_dois):,}")
    print(f"Found in S2ORC Database: {len(intersection):,}")
    print(f"Coverage Rate: {coverage_percentage:.2f}%")
    
    # S2ORCに含まれていなかったDOIのサンプルをいくつか表示
    missing_dois = list(scopus_dois - s2orc_dois)
    if missing_dois:
        print("\n--- Sample of DOIs NOT found in S2ORC database ---")
        for doi in missing_dois[:5]:
            print(doi)

# --- 実行 ---
check_coverage()

  from .autonotebook import tqdm as notebook_tqdm


--- Checking Coverage of Scopus Data Papers in S2ORC Database ---
Found 13,339 unique DOIs in Scopus data paper list.
Fetching all DOIs from S2ORC database... (This may take a minute)
Found 11,619,136 unique DOIs in S2ORC database.

--- Calculating Coverage ---
Total Scopus Data Papers with DOI: 13,339
Found in S2ORC Database: 7,692
Coverage Rate: 57.67%

--- Sample of DOIs NOT found in S2ORC database ---
10.1016/J.DIB.2024.110650
10.1016/J.DIB.2020.105530
10.1177/8755293019891722
10.3389/FMARS.2022.966275
10.1016/J.DIB.2020.106661


In [2]:
import pandas as pd
import json
import os
import math # 小数点以下切り上げ(ceil)のためにインポート

# --- Step 1: 設定 ---

# Scopusから収集したデータ論文リストのファイルパス
SCOPUS_DATAPAPERS_FILE = "../data/datapapers/raw/all_datapapers_raw.json"
# 1リクエストあたりの取得件数
RESULTS_PER_PAGE = 25

# --- Step 2: APIリクエスト数の見積もり ---

def estimate_total_api_requests():
    """
    データ論文の被引用数リストから、全引用論文を取得するために必要な
    APIリクエストの総数を見積もる。
    """
    print("--- Estimating Total API Requests for Approach B ---")

    # --- 2.1: Scopusのデータ論文リストを読み込み ---
    if not os.path.exists(SCOPUS_DATAPAPERS_FILE):
        print(f"❌ Error: Scopus data paper file not found at {SCOPUS_DATAPAPERS_FILE}")
        return
    
    with open(SCOPUS_DATAPAPERS_FILE, 'r', encoding='utf-8') as f:
        scopus_papers = json.load(f)
    
    # citedby-countを抽出してDataFrameを作成
    citation_counts = [int(p.get('citedby-count', 0)) for p in scopus_papers if p.get('citedby-count')]
    df = pd.DataFrame(citation_counts, columns=['citedby_count'])
    
    if df.empty:
        print("❌ No citation count data found.")
        return

    # --- 2.2: リクエスト数を計算 ---
    # 各論文について、必要なリクエスト数を計算
    # (例: 26件の引用があれば、ceil(26/25) = 2回のリクエストが必要)
    df['required_requests'] = df['citedby_count'].apply(lambda x: math.ceil(x / RESULTS_PER_PAGE))
    
    # 全ての数値を合計
    total_data_papers = len(df)
    total_citations_to_fetch = df['citedby_count'].sum()
    total_api_requests = df['required_requests'].sum()
    
    # --- 2.3: 結果の報告 ---
    print("\n" + "="*50)
    print("--- Estimation Results ---")
    print(f"Total Data Papers to Process: {total_data_papers:,}")
    print(f"Total Citing Papers to Fetch: {total_citations_to_fetch:,}")
    print("-" * 50)
    print(f"Estimated Total API Requests Required: {int(total_api_requests):,}")
    print("="*50)

# --- 実行 ---
estimate_total_api_requests()

--- Estimating Total API Requests for Approach B ---

--- Estimation Results ---
Total Data Papers to Process: 13,339
Total Citing Papers to Fetch: 203,547
--------------------------------------------------
Estimated Total API Requests Required: 17,420
