In [1]:
import sys
import os
import pandas as pd
import sqlite3
from dotenv import load_dotenv

# --- Step 1: Set up the import path so project modules can be imported ---
# Append the parent directory (resolved to an absolute path) to sys.path.
# This must run before the `src` import below, which presumably relies on it
# when the notebook lives in a subdirectory of the project — TODO confirm.
sys.path.append(os.path.abspath('..'))
from src.pdf_url_extractor import PdfUrlExtractor

# --- Step 2: Configuration ---
# Both paths are relative, so they resolve against the current working
# directory at run time.
# Passed to PdfUrlExtractor.build_table as `s2orc_dir`.
S2ORC_DIR = "../data/raw/s2orc/"
# SQLite database handed to PdfUrlExtractor and re-opened for verification.
DB_PATH = "../data/processed/s2orc_filtered.db"

# --- Step 3: Build the PDF URL table ---
def build_pdf_url_table():
    """Build the PDF URL table and print a short verification summary.

    Creates a ``PdfUrlExtractor`` for the database at ``DB_PATH`` and calls
    its ``build_table`` with ``S2ORC_DIR``. Afterwards the database is
    re-opened to print the row count of ``used_paper_pdf_links`` and show
    its first five rows as a visual sanity check.

    Returns:
        None. Results are reported via ``print`` / ``display`` only.
    """
    # Function-scope import so this block does not depend on edits to the
    # file-level import list.
    from contextlib import closing

    print("--- Starting PDF URL Table Construction ---")

    extractor = PdfUrlExtractor(db_path=DB_PATH)
    extractor.build_table(s2orc_dir=S2ORC_DIR)

    # `display` is injected by IPython/Jupyter only; fall back to `print`
    # so the function also works when run as a plain Python script.
    try:
        show = display
    except NameError:
        show = print

    print("\n--- Verification ---")
    # sqlite3's own context manager only commits or rolls back the
    # transaction; it does not close the connection. `closing` guarantees
    # the handle is released once verification is done.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        count = pd.read_sql_query(
            "SELECT COUNT(*) FROM used_paper_pdf_links", conn
        ).iloc[0, 0]
        print(f"Total PDF URLs saved: {count:,}")
        show(pd.read_sql_query("SELECT * FROM used_paper_pdf_links LIMIT 5", conn))

# --- Run ---
# Execute the table build when this code runs as the main module.
if __name__ == '__main__':
    build_pdf_url_table()

  from .autonotebook import tqdm as notebook_tqdm


--- Starting PDF URL Table Construction ---
Fetching target DOIs from `positive_candidates` table...
Found 3,993 'Used' candidate DOIs to extract.


Extracting PDF URLs: 100%|██████████| 297/297 [08:26<00:00,  1.71s/it]

✅ PDF URL table construction complete.

--- Verification ---
Total PDF URLs saved: 3,594





Unnamed: 0,doi,pdf_url
0,10.3390/CIMB44090267,https://www.mdpi.com/1467-3045/44/9/267/pdf?ve...
1,10.1016/J.DIB.2021.107641,https://doi.org/10.1016/j.dib.2021.107641
2,10.3390/V15101977,https://www.mdpi.com/1999-4915/15/10/1977/pdf?...
3,10.3390/GENES14122147,https://www.mdpi.com/2073-4425/14/12/2147/pdf?...
4,10.1038/S41597-021-00893-Z,https://www.nature.com/articles/s41597-021-008...
