In [1]:
import pandas as pd
import sqlite3
import os
from IPython.display import display

# --- Step 1: Configuration ---

# Path to the SQLite database file (a relative path, resolved against the current working directory)
DB_PATH = "../data/processed/s2orc_filtered.db"

# --- Step 2: Database analysis ---

def verify_annotation_results(db_path=None):
    """
    Verify the annotation results in the `positive_candidates` table and show a summary.

    Three things are displayed, in order:
      1. the distribution of `annotation_status` (1 = 'Used', -1 = 'Not Used',
         0 = 'Unprocessed', anything else including NULL = 'Unknown') with
         counts and percentages,
      2. up to five candidate pairs annotated as 'Used',
      3. up to five candidate pairs annotated as 'Not Used'.

    The samples inner-join `papers` on both DOIs, so pairs whose cited or
    citing DOI is missing from `papers` never appear in them.

    Args:
        db_path: Path to the SQLite database. Defaults to the module-level
            DB_PATH when omitted.

    Returns:
        None. Results are printed / displayed. Any exception raised while
        reading the database is caught and reported with a printed message.
    """
    # Local import so the cell stays self-contained.
    from contextlib import closing

    if db_path is None:
        db_path = DB_PATH

    print("--- Verifying Annotation Results ---")

    if not os.path.exists(db_path):
        print(f"❌ Error: Database file not found at {db_path}")
        return

    # Labelling happens in the inner query and grouping on the label in the
    # outer one, so every unexpected status value (NULL, 2, ...) is folded
    # into a single 'Unknown' row instead of one row per raw value.
    query_dist = """
        SELECT
            status,
            COUNT(*) AS count
        FROM (
            SELECT
                CASE annotation_status
                    WHEN 1 THEN 'Used'
                    WHEN -1 THEN 'Not Used'
                    WHEN 0 THEN 'Unprocessed'
                    ELSE 'Unknown'
                END AS status
            FROM
                positive_candidates
        ) AS labelled
        GROUP BY
            status
        ORDER BY
            status;
    """

    # One query serves both samples; the status is bound as a parameter
    # rather than patched into the SQL text.
    query_sample = """
        SELECT
            pc.cited_datapaper_doi,
            p_cited.title AS data_paper_title,
            pc.citing_doi,
            p_citing.title AS citing_paper_title
        FROM
            positive_candidates pc
        JOIN
            papers p_cited ON pc.cited_datapaper_doi = p_cited.doi
        JOIN
            papers p_citing ON pc.citing_doi = p_citing.doi
        WHERE
            pc.annotation_status = ?
        LIMIT 5;
    """

    try:
        # sqlite3's own context manager only commits/rolls back; `closing`
        # is what actually releases the connection when the block exits.
        with closing(sqlite3.connect(db_path)) as conn:

            # Make sure the table exists before querying it
            cursor = conn.cursor()
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='positive_candidates'")
            if cursor.fetchone() is None:
                print("⚠️ `positive_candidates` table not found.")
                return

            # --- 2.1: Distribution of annotation statuses ---
            print("\n--- Annotation Status Distribution ---")

            df_dist = pd.read_sql_query(query_dist, conn)

            if df_dist.empty:
                print("No annotation data found.")
                return

            # Add each status's share of the total as a formatted percentage
            total_count = df_dist['count'].sum()
            df_dist['percentage'] = (df_dist['count'] / total_count * 100).map('{:.2f}%'.format)
            display(df_dist)

            # --- 2.2: Sample of pairs judged 'Used' ---
            print("\n" + "="*50)
            print("--- Sample of papers annotated as 'Used' (status=1) ---")

            df_used_sample = pd.read_sql_query(query_sample, conn, params=(1,))

            if df_used_sample.empty:
                print("No papers were annotated as 'Used'.")
            else:
                display(df_used_sample)

            # --- 2.3: Sample of pairs judged 'Not Used' ---
            print("\n" + "="*50)
            print("--- Sample of papers annotated as 'Not Used' (status=-1) ---")

            df_not_used_sample = pd.read_sql_query(query_sample, conn, params=(-1,))

            if df_not_used_sample.empty:
                print("No papers were annotated as 'Not Used'.")
            else:
                display(df_not_used_sample)

    except Exception as e:
        # Diagnostic cell: report the problem instead of raising a traceback.
        print(f"💥 An error occurred while verifying the database: {e}")

# --- Run ---
verify_annotation_results()

--- Verifying Annotation Results ---

--- Annotation Status Distribution ---


Unnamed: 0,status,count,percentage
0,Not Used,6065,48.80%
1,Unprocessed,2167,17.44%
2,Used,4197,33.77%



--- Sample of papers annotated as 'Used' (status=1) ---


Unnamed: 0,cited_datapaper_doi,data_paper_title,citing_doi,citing_paper_title
0,10.1016/J.DIB.2017.08.003,Dataset on antitumor properties of silver nano...,10.3390/CIMB44090267,
1,10.1016/J.DIB.2019.104905,Environmental and economic data on energy effi...,10.1016/J.DIB.2021.107641,Data on nearly zero energy buildings (NZEBs) p...
2,10.1038/S41597-020-00609-9,a structured open dataset of government interv...,10.1016/J.CITIES.2022.103770,A systematic review of the impacts of the coro...
3,10.1038/S41597-022-01307-4,"Dynamic World, Near real-time global 10 m land...",10.1038/S41597-022-01307-4,"Dynamic World, Near real-time global 10 m land..."
4,10.1038/S41597-020-00688-8,a cross-country database of COVID-19 testing,10.1128/MSYSTEMS.00035-22,Mapping Data to Deep Understanding: Making the...



--- Sample of papers annotated as 'Not Used' (status=-1) ---


Unnamed: 0,cited_datapaper_doi,data_paper_title,citing_doi,citing_paper_title
0,10.1016/J.DIB.2018.02.038,Data on cost-optimal Nearly Zero Energy Buildi...,10.1016/J.DIB.2021.107641,Data on nearly zero energy buildings (NZEBs) p...
1,10.1016/J.DIB.2017.08.043,Data on European non-residential buildings,10.1016/J.DIB.2021.107641,Data on nearly zero energy buildings (NZEBs) p...
2,10.1016/J.DIB.2015.09.041,High performance solutions and data for nZEBs ...,10.1016/J.DIB.2021.107641,Data on nearly zero energy buildings (NZEBs) p...
3,10.1038/S41597-019-0178-3,Discharge performance and dynamic behavior of ...,10.3390/IJMS21093113,Binder-Free α-MnO 2 Nanowires on Carbon Cloth ...
4,10.1038/S41597-020-0478-7,Estimating nitrogen and phosphorus concentrati...,10.1371/JOURNAL.PONE.0271458,Machine learning-based estimation of riverine ...


In [3]:
import pandas as pd
import sqlite3
import os
from IPython.display import display

# --- Step 1: Configuration ---
DB_PATH = "../data/processed/s2orc_filtered.db"

# --- Step 2: Database analysis ---
def debug_unannotated_candidates(db_path=None):
    """
    Report how many unannotated candidate pairs can be resolved against `papers`.

    Reads every `positive_candidates` row with annotation_status = 0 and
    prints how many of those pairs have their cited DOI, their citing DOI,
    and both DOIs present in the `papers` table.

    Args:
        db_path: Path to the SQLite database. Defaults to the module-level
            DB_PATH when omitted.

    Returns:
        None. All results are printed. Database errors are not caught here
        and propagate to the caller.
    """
    # Local import so the cell stays self-contained.
    from contextlib import closing

    if db_path is None:
        db_path = DB_PATH

    print("--- Debugging Unannotated Candidates ---")
    if not os.path.exists(db_path):
        print(f"❌ Error: Database file not found at {db_path}")
        return

    # sqlite3's own context manager only commits/rolls back; `closing` is
    # what actually releases the connection when the block exits.
    with closing(sqlite3.connect(db_path)) as conn:
        # --- 2.1: Fetch the unprocessed candidate pairs ---
        query_unannotated = "SELECT citing_doi, cited_datapaper_doi FROM positive_candidates WHERE annotation_status = 0"
        df_unannotated = pd.read_sql_query(query_unannotated, conn)

        if df_unannotated.empty:
            print("No unannotated candidates found.")
            return

        print(f"Total unannotated candidate pairs: {len(df_unannotated):,}")

        # --- 2.2: Collect the DOIs that exist in the `papers` table ---
        df_papers_doi = pd.read_sql_query("SELECT doi FROM papers", conn)

    # Everything below works on in-memory data, so the connection is already closed.
    papers_doi_set = set(df_papers_doi['doi'])

    # --- 2.3: Check each membership condition (each mask is computed once) ---
    # 1. Is the cited data paper present in `papers`?
    cited_in_papers = df_unannotated['cited_datapaper_doi'].isin(papers_doi_set)
    # 2. Is the citing paper present in `papers`?
    citing_in_papers = df_unannotated['citing_doi'].isin(papers_doi_set)

    n_with_cited = int(cited_in_papers.sum())
    n_with_citing = int(citing_in_papers.sum())
    # 3. Are both present in `papers`?
    n_with_both = int((cited_in_papers & citing_in_papers).sum())

    print("\n--- Breakdown of Unannotated Candidates ---")
    print(f"Candidates where 'cited_datapaper_doi' is in `papers` table: {n_with_cited:,}")
    print(f"Candidates where 'citing_doi' is in `papers` table: {n_with_citing:,}")
    print("-" * 50)
    print(f"Candidates where BOTH DOIs are in `papers` table: {n_with_both:,}")

    if n_with_both > 0:
        print("\n✅ These are the papers that can be processed for annotation.")
    else:
        print("\n⚠️ It appears no unannotated candidates have all the required info in the `papers` table.")
        print("This means the remaining unannotated papers were filtered out during the DB construction.")

# --- Run ---
debug_unannotated_candidates()

--- Debugging Unannotated Candidates ---
Total unannotated candidate pairs: 3,618

--- Breakdown of Unannotated Candidates ---
Candidates where 'cited_datapaper_doi' is in `papers` table: 99
Candidates where 'citing_doi' is in `papers` table: 3,618
--------------------------------------------------
Candidates where BOTH DOIs are in `papers` table: 99

✅ These are the papers that can be processed for annotation.


In [None]:
import pandas as pd
import sqlite3
import os
import gzip
from tqdm.auto import tqdm
from multiprocessing import Pool, cpu_count

# --- Step 1: Configuration ---
S2ORC_DIR = "../data/raw/s2orc/"
DB_PATH = "../data/processed/s2orc_filtered.db"

# --- Step 2: Functions that gather each count ---

def count_lines_in_file(filepath):
    """Count the lines in a single gzip-compressed UTF-8 text file.

    Args:
        filepath: Path to the .gz file.

    Returns:
        The number of lines in the file. If the file cannot be opened,
        decompressed or decoded, 0 is returned (best-effort, so one bad file
        does not abort a whole batch) and a warning naming the file and the
        error is printed, so the resulting undercount is visible.
    """
    try:
        with gzip.open(filepath, 'rt', encoding='utf-8') as f:
            return sum(1 for _ in f)
    except Exception as e:
        # A partially read file still contributes 0, as before; the only
        # difference is that the failure is no longer silent.
        print(f"⚠️ Could not count lines in {filepath} (counted as 0): {type(e).__name__}: {e}")
        return 0

def get_s2orc_raw_count():
    """Count the papers in the raw S2ORC dump using a pool of worker processes.

    Every `.gz` file directly inside S2ORC_DIR is handed to
    `count_lines_in_file`, and the per-file line counts are summed (one line
    is treated as one paper).

    Returns:
        The summed count as an int, 0 when the directory contains no `.gz`
        files, or the string "Error" when anything raises along the way.
    """
    print("--- 1. Counting total papers in raw S2ORC files... (This may take several minutes) ---")
    try:
        gz_paths = []
        for entry in os.listdir(S2ORC_DIR):
            if entry.endswith('.gz'):
                gz_paths.append(os.path.join(S2ORC_DIR, entry))

        if len(gz_paths) == 0:
            print(f"No .gz files found in {S2ORC_DIR}")
            return 0

        with Pool(processes=cpu_count()) as workers:
            # Results arrive in completion order; tqdm advances the bar once per finished file.
            finished = workers.imap_unordered(count_lines_in_file, gz_paths)
            progress = tqdm(finished, total=len(gz_paths), desc="Counting raw files")
            per_file_counts = [n_lines for n_lines in progress]

        grand_total = sum(per_file_counts)
        print(f"Total raw paper count: {grand_total:,}")
        return grand_total
    except Exception as err:
        print(f"💥 Error counting raw files: {err}")
        return "Error"


def get_filtered_db_counts(db_path=None):
    """Count the papers and the positive candidates in the filtered database.

    Args:
        db_path: Path to the SQLite database. Defaults to the module-level
            DB_PATH when omitted.

    Returns:
        A `(filtered_count, candidate_count)` tuple:
          * `filtered_count` - number of rows in `papers` (plain `int`),
          * `candidate_count` - number of distinct `citing_doi` values in
            `positive_candidates` (plain `int`), or the string
            "Table not found" when that table does not exist.
        `("Not found", "Not found")` is returned when the database file is
        missing, and `("Error", "Error")` when reading the database raises.
    """
    # Local import so the cell stays self-contained.
    from contextlib import closing

    if db_path is None:
        db_path = DB_PATH

    print("\n--- 2. Counting papers in the filtered database ---")
    if not os.path.exists(db_path):
        print(f"❌ Error: Database file not found at {db_path}")
        return "Not found", "Not found"

    try:
        # sqlite3's own context manager only commits/rolls back; `closing`
        # is what actually releases the connection when the block exits.
        with closing(sqlite3.connect(db_path)) as conn:
            # Total number of papers after filtering. Querying through the
            # cursor yields a built-in int (pandas' .iloc would give a
            # numpy.int64, which fails `isinstance(..., int)` downstream).
            filtered_count = int(conn.execute("SELECT COUNT(*) FROM papers").fetchone()[0])
            print(f"Total filtered paper count: {filtered_count:,}")

            # Check for the table explicitly so that only a genuinely missing
            # table is reported as "Table not found"; any other SQL failure
            # falls through to the generic error handler below.
            has_candidates = conn.execute(
                "SELECT 1 FROM sqlite_master "
                "WHERE type IN ('table', 'view') AND name = 'positive_candidates'"
            ).fetchone() is not None

            if has_candidates:
                # Unique papers that cite a data paper (positive candidates)
                candidate_count = int(
                    conn.execute("SELECT COUNT(DISTINCT citing_doi) FROM positive_candidates").fetchone()[0]
                )
                print(f"Positive candidate count: {candidate_count:,}")
            else:
                # The positive_candidates table has not been created yet
                candidate_count = "Table not found"
                print("`positive_candidates` table not found.")

            return filtered_count, candidate_count
    except Exception as e:
        print(f"💥 Error reading database: {e}")
        return "Error", "Error"

# --- Step 3: メイン処理と結果の表示 ---

def _format_count(value):
    """Render an integer count with thousands separators; return anything else unchanged."""
    # `numbers.Integral` also matches NumPy integer scalars (e.g. numpy.int64
    # coming out of pandas), which a plain `isinstance(value, int)` rejects.
    import numbers

    if isinstance(value, numbers.Integral):
        return f"{int(value):,}"
    return value


def display_summary():
    """Collect every dataset statistic and display them as one summary table.

    Calls `get_s2orc_raw_count()` and `get_filtered_db_counts()`, formats
    integer results with thousands separators, and passes status strings such
    as "Error" or "Table not found" through unchanged.

    Returns:
        None. The summary DataFrame is shown with `display`.
    """
    raw_count = get_s2orc_raw_count()
    filtered_count, candidate_count = get_filtered_db_counts()

    # Gather the results into a DataFrame for display
    summary_data = {
        "Description": [
            "S2ORCの総論文数（フィルタリング前）",
            "S2ORCの論文数（品質フィルタリング後）",
            "データ論文を引用している論文数（正例候補）"
        ],
        "Count": [
            _format_count(raw_count),
            _format_count(filtered_count),
            _format_count(candidate_count)
        ]
    }

    df_summary = pd.DataFrame(summary_data)

    print("\n\n" + "="*50)
    print("📋 Dataset Construction Summary")
    print("="*50)
    display(df_summary)

# --- Run ---
if __name__ == '__main__':
    display_summary()

  from .autonotebook import tqdm as notebook_tqdm


--- 1. Counting total papers in raw S2ORC files... (This may take several minutes) ---


Counting raw files:   0%|          | 0/297 [00:00<?, ?it/s]