In [1]:
import pandas as pd
import sqlite3
import json
import os
from tqdm.auto import tqdm

# --- Step 1: Configuration ---

# Full list of the seed data papers collected from Scopus
DATAPAPERS_FILE = "../data/datapapers/raw/all_datapapers_raw.json"
# Path to the S2ORC database
S2ORC_DB_PATH = "../data/processed/s2orc_filtered.db"


# --- Step 2: Identify positive candidates and store them in the DB ---

def identify_and_store_candidates():
    """
    Match S2ORC citation links against the Scopus data-paper list and store
    the resulting (citing DOI, cited data-paper DOI) pairs.

    Reads the JSON list at ``DATAPAPERS_FILE``, upper-cases every non-empty
    ``prism:doi`` value, and copies each distinct ``citations`` row whose
    ``cited_doi`` is in that set into the ``positive_candidates`` table of the
    SQLite database at ``S2ORC_DB_PATH``.  The table is created if it does not
    exist; rows already present are left untouched (``INSERT OR IGNORE``), so
    the function can be re-run safely.

    Progress and a final summary are printed.  If either input file is missing,
    or the JSON contains no DOI, an error is printed and the function returns
    before opening the database.

    Returns:
        None
    """
    print("--- Identifying and Storing Positive Candidates ---")

    # --- 2.1: Load the Scopus data-paper DOI list ---
    if not os.path.exists(DATAPAPERS_FILE):
        print(f"❌ Error: Scopus data paper file not found at {DATAPAPERS_FILE}")
        return

    with open(DATAPAPERS_FILE, 'r', encoding='utf-8') as f:
        scopus_papers = json.load(f)

    # Normalise DOIs to upper case so comparisons are consistent; the set
    # removes duplicates.
    datapaper_dois = {paper['prism:doi'].upper() for paper in scopus_papers if paper.get('prism:doi')}

    if not datapaper_dois:
        print("❌ No DOIs found in the Scopus data paper file.")
        return

    print(f"Loaded {len(datapaper_dois):,} unique data paper DOIs to check against.")

    # --- 2.2: Connect to the database and create the target table ---
    if not os.path.exists(S2ORC_DB_PATH):
        print(f"❌ Error: S2ORC database file not found at {S2ORC_DB_PATH}")
        return

    # `with sqlite3.connect(...)` only commits/rolls back the transaction; it
    # does NOT close the connection.  Close it explicitly so the file handle is
    # released even when a statement fails (uncommitted work is then discarded).
    conn = sqlite3.connect(S2ORC_DB_PATH)
    try:
        cursor = conn.cursor()
        print("Creating `positive_candidates` table if it doesn't exist...")
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS positive_candidates (
                citing_doi TEXT,
                cited_datapaper_doi TEXT,
                annotation_status INTEGER DEFAULT 0,
                annotation_source TEXT,
                PRIMARY KEY (citing_doi, cited_datapaper_doi)
            )
        ''')
        conn.commit()

        # --- 2.3: Find positive candidates in `citations` and bulk-insert ---
        print("Searching for positive candidates in the `citations` table...")

        # Stage the DOI set in a connection-local TEMP table instead of binding
        # one `?` per DOI.  SQLite caps the number of bound parameters per
        # statement (SQLITE_MAX_VARIABLE_NUMBER: 999 by default before 3.32.0,
        # 32766 after), so a giant IN (?, ?, ...) list fails with
        # "too many SQL variables" once the DOI list outgrows that limit.
        cursor.execute("CREATE TEMP TABLE datapaper_doi_filter (doi TEXT PRIMARY KEY)")
        cursor.executemany(
            "INSERT INTO temp.datapaper_doi_filter (doi) VALUES (?)",
            ((doi,) for doi in datapaper_dois),
        )
        doi_filter = "SELECT doi FROM temp.datapaper_doi_filter"

        # Estimate first: this counts raw citation rows, duplicates included.
        total_candidates = cursor.execute(
            f"SELECT COUNT(*) FROM citations WHERE cited_doi IN ({doi_filter})"
        ).fetchone()[0]

        print(f"Found {total_candidates:,} potential candidates. Inserting into the new table...")

        # Remember the size before inserting so the summary can report how many
        # rows this run actually added rather than the total table size.
        count_before = cursor.execute("SELECT COUNT(*) FROM positive_candidates").fetchone()[0]

        # INSERT ... SELECT copies the rows inside the database engine, which
        # avoids round-tripping every pair through Python.
        cursor.execute(f"""
            INSERT OR IGNORE INTO positive_candidates (citing_doi, cited_datapaper_doi)
            SELECT DISTINCT citing_doi, cited_doi
            FROM citations
            WHERE cited_doi IN ({doi_filter})
        """)
        conn.commit()

        total_pairs = cursor.execute("SELECT COUNT(*) FROM positive_candidates").fetchone()[0]
        inserted_count = total_pairs - count_before
    finally:
        conn.close()

    # --- 2.4: Report the result ---
    print("\n" + "="*50)
    print("--- Process Complete ---")
    print(f"Successfully identified and stored {inserted_count:,} new positive candidate pairs.")
    print(f"`positive_candidates` now holds {total_pairs:,} pairs in total.")
    print(f"You can now find these in the `positive_candidates` table in {S2ORC_DB_PATH}")

# --- Run ---
identify_and_store_candidates()

  from .autonotebook import tqdm as notebook_tqdm


--- Identifying and Storing Positive Candidates ---
Loaded 13,339 unique data paper DOIs to check against.
Creating `positive_candidates` table if it doesn't exist...
Searching for positive candidates in the `citations` table...
Found 14,968 potential candidates. Inserting into the new table...

--- Process Complete ---
Successfully identified and stored 12,429 positive candidate pairs.
You can now find these in the `positive_candidates` table in ../data/processed/s2orc_filtered.db


In [3]:
import pandas as pd
import sqlite3
import os
from IPython.display import display

# --- Step 1: Configuration ---

# File path of the database to inspect
DB_PATH = "../data/processed/s2orc_filtered.db"

# --- Step 2: Verify the database ---

def _show_count_and_sample(conn, table, count_label):
    """Print the row count of `table` (prefixed by `count_label`) and display up to five rows."""
    row_count = pd.read_sql_query(f"SELECT COUNT(*) FROM {table}", conn).iloc[0, 0]
    print(f"{count_label}: {row_count:,}")
    if row_count > 0:
        print(f"\n[Sample of `{table}` table]")
        display(pd.read_sql_query(f"SELECT * FROM {table} LIMIT 5", conn))


def verify_database():
    """
    Print row counts and a five-row sample for every table of the database.

    Inspects the ``papers``, ``citations`` and ``positive_candidates`` tables
    of the SQLite file at ``DB_PATH``.  If ``positive_candidates`` does not
    exist yet, a hint to run the candidate-identification notebook is printed
    instead.  Any other failure is caught and reported as a single message so
    the notebook keeps running.

    Returns:
        None
    """
    print("--- Verifying Constructed Database ---")

    if not os.path.exists(DB_PATH):
        print(f"❌ Error: Database file not found at {DB_PATH}")
        return

    try:
        # `with sqlite3.connect(...)` would not close the connection, so manage
        # its lifetime explicitly.
        conn = sqlite3.connect(DB_PATH)
        try:
            # --- `papers` table ---
            print("\n--- `papers` Table Summary ---")
            _show_count_and_sample(conn, "papers", "Total papers saved")

            # --- `citations` table ---
            print("\n" + "="*50)
            print("--- `citations` Table Summary ---")
            _show_count_and_sample(conn, "citations", "Total citation links saved")

            # --- `positive_candidates` table ---
            print("\n" + "="*50)
            print("--- `positive_candidates` Table Summary ---")

            # Ask SQLite's catalogue whether the table exists.  Catching
            # sqlite3.OperationalError around pd.read_sql_query does not work:
            # pandas re-raises driver failures as its own DatabaseError, so the
            # "no such table" branch would never be reached.
            candidates_exist = conn.execute(
                "SELECT 1 FROM sqlite_master WHERE type IN ('table', 'view') AND name = ?",
                ("positive_candidates",),
            ).fetchone() is not None

            if candidates_exist:
                _show_count_and_sample(conn, "positive_candidates", "Total positive candidates found")
            else:
                print("⚠️ `positive_candidates` table not found.")
                print("   Please run the `06_identify_and_store_candidates.ipynb` script first.")
        finally:
            conn.close()

    except Exception as e:
        # Deliberate best-effort: report the problem instead of aborting the notebook.
        print(f"💥 An error occurred while verifying the database: {e}")

# --- Run ---
verify_database()

--- Verifying Constructed Database ---

--- `papers` Table Summary ---
Total papers saved: 11,619,136

[Sample of `papers` table]


Unnamed: 0,corpus_id,doi,title,abstract
0,14,10.4103/0250-474X.59543,Resolving Issues of Content Uniformity and Low...,The aim of present study were to arrest the pr...
1,16,10.3390/FI4020563,The U-City Paradigm: Opportunities and Risks f...,Volunteered Geographic Information (VGI) tools...
2,25,10.1038/CDDISCOVERY.2016.53,Generating kidney tissue from pluripotent stem...,With the isolation of human pluripotent stem c...
3,32,10.1155/2016/6183679,Influence of PEEK Coating on Hip Implant Stres...,Stress shielding is a well-known failure facto...
4,88,10.1216/RMJM/1181071856,Q-Reflexive Banach Spaces,Let E be a Banach space. There are several nat...



--- `citations` Table Summary ---
Total citation links saved: 91,393,156

[Sample of `citations` table]


Unnamed: 0,citing_doi,cited_doi
0,10.3390/DIAGNOSTICS13091662,10.1007/S00590-017-1945-5
1,10.3390/DIAGNOSTICS13091662,10.3390/IJERPH191912484
2,10.3390/DIAGNOSTICS13091662,10.1148/RG.2019190046
3,10.3390/DIAGNOSTICS13091662,10.1007/S00256-021-03806-8
4,10.3390/DIAGNOSTICS13091662,10.3390/S20164481



--- `positive_candidates` Table Summary ---
Total positive candidates found: 12,429

[Sample of `positive_candidates` table]


Unnamed: 0,citing_doi,cited_datapaper_doi,annotation_status,annotation_source
0,10.3390/CIMB44090267,10.1016/J.DIB.2017.08.003,0,
1,10.1016/J.DIB.2021.107641,10.1016/J.DIB.2019.104905,0,
2,10.1016/J.DIB.2021.107641,10.1016/J.DIB.2018.02.038,0,
3,10.1016/J.DIB.2021.107641,10.1016/J.DIB.2017.08.043,0,
4,10.1016/J.DIB.2021.107641,10.1016/J.DIB.2015.09.041,0,


In [1]:
import pandas as pd
import sqlite3
import json
import os
from tqdm.auto import tqdm

# --- Step 1: Configuration ---
DATAPAPERS_FILE = "../data/datapapers/raw/all_datapapers_raw.json"
S2ORC_DB_PATH = "../data/processed/s2orc_filtered.db"

def migrate_candidates_table():
    """
    Add a ``cited_datapaper_title`` column to ``positive_candidates`` while
    keeping the existing annotation data.

    Steps, all executed inside one SQLite transaction:

    1. rename the current ``positive_candidates`` table to
       ``positive_candidates_old`` (skipped if it does not exist);
    2. create ``positive_candidates`` with the new column layout;
    3. copy every old row across, filling the title from the Scopus JSON at
       ``DATAPAPERS_FILE`` (``None`` when the DOI has no known title);
    4. drop ``positive_candidates_old``.

    Because the steps share a transaction, any failure rolls the database back
    to its state before the call, so the migration can simply be re-run.  If
    either input file is missing, an error is printed and nothing is changed.

    Returns:
        None

    Raises:
        sqlite3.Error: if a statement fails; the transaction is rolled back first.
    """
    print("--- Migrating `positive_candidates` table ---")

    # Refuse to run on missing inputs.  In particular sqlite3.connect() would
    # silently create a brand-new empty database at a mistyped path.
    if not os.path.exists(DATAPAPERS_FILE):
        print(f"❌ Error: Scopus data paper file not found at {DATAPAPERS_FILE}")
        return
    if not os.path.exists(S2ORC_DB_PATH):
        print(f"❌ Error: S2ORC database file not found at {S2ORC_DB_PATH}")
        return

    # --- 1. Build the DOI -> title lookup from the Scopus data papers ---
    with open(DATAPAPERS_FILE, 'r', encoding='utf-8') as f:
        scopus_papers = json.load(f)
    doi_to_title_map = {p['prism:doi'].upper(): p['dc:title'] for p in scopus_papers if p.get('prism:doi') and p.get('dc:title')}

    # --- 2. Connect to the database ---
    # The connection is closed explicitly: the sqlite3 context manager only
    # commits or rolls back, it never closes.
    conn = sqlite3.connect(S2ORC_DB_PATH)
    try:
        cursor = conn.cursor()

        # Open the transaction by hand.  Python's sqlite3 module does not start
        # one implicitly for DDL, so without this the RENAME and CREATE would be
        # committed immediately and a later failure would leave both tables
        # behind, blocking any re-run.
        cursor.execute("BEGIN")
        try:
            # --- 2.1. Rename the old table so it serves as the backup ---
            try:
                cursor.execute("ALTER TABLE positive_candidates RENAME TO positive_candidates_old")
                print("   1. Backed up existing table to 'positive_candidates_old'.")
            except sqlite3.OperationalError as e:
                if "no such table" not in str(e):
                    raise
                print("   - No existing table found. Will create a new one.")

            # --- 2.2. Create the table with the new structure ---
            cursor.execute('''
                CREATE TABLE IF NOT EXISTS positive_candidates (
                    citing_doi TEXT,
                    cited_datapaper_doi TEXT,
                    cited_datapaper_title TEXT,
                    annotation_status INTEGER DEFAULT 0,
                    annotation_source TEXT,
                    PRIMARY KEY (citing_doi, cited_datapaper_doi)
                )
            ''')
            print("   2. Created new table 'positive_candidates' with title column.")

            # --- 2.3. Copy the data from the old table into the new one ---
            try:
                old_data = cursor.execute(
                    "SELECT citing_doi, cited_datapaper_doi, annotation_status, annotation_source FROM positive_candidates_old"
                ).fetchall()
            except sqlite3.OperationalError as e:
                if "no such table" not in str(e):
                    raise
                old_data = None
                print("   - No old data to migrate.")

            if old_data is not None:
                # Attach the title to every row.  The map keys are upper-cased,
                # so upper-case the lookup key as well; the stored DOI itself is
                # copied unchanged.
                new_data_to_insert = [
                    (
                        citing_doi,
                        cited_doi,
                        doi_to_title_map.get(cited_doi.upper() if isinstance(cited_doi, str) else cited_doi),
                        status,
                        source,
                    )
                    for citing_doi, cited_doi, status, source in old_data
                ]

                cursor.executemany(
                    'INSERT OR IGNORE INTO positive_candidates VALUES (?,?,?,?,?)',
                    new_data_to_insert
                )
                print(f"   3. Migrated {len(new_data_to_insert)} records from old table to new table.")

                # --- 2.4. Remove the backup table ---
                cursor.execute("DROP TABLE positive_candidates_old")
                print("   4. Removed backup table.")

            conn.commit()
        except Exception:
            # Undo every step, including the rename, then surface the error.
            conn.rollback()
            print("❌ Migration failed; all changes were rolled back.")
            raise
    finally:
        conn.close()

    print("\n✅ Migration complete. Your annotation data is preserved.")

# --- Run ---
migrate_candidates_table()

  from .autonotebook import tqdm as notebook_tqdm


--- Migrating `positive_candidates` table ---
   1. Backed up existing table to 'positive_candidates_old'.
   2. Created new table 'positive_candidates' with title column.
   3. Migrated 12429 records from old table to new table.
   4. Removed backup table.

✅ Migration complete. Your annotation data is preserved.
