In [None]:
import pandas as pd
import sqlite3
import os
from IPython.display import display, HTML

# --- Step 1: Configuration ---
# Path to the SQLite database that check_evaluation_progress() opens with sqlite3.
DB_PATH = "../data/processed/s2orc_filtered.db"
# CSV of the evaluation data papers; its 'cited_datapaper_doi' column is read
# by check_evaluation_progress() to select the rows to report on.
EVALUATION_DATAPAPERS_FILE = "../data/datapapers/sampled/evaluation_data_papers.csv"

def check_evaluation_progress():
    """
    Report manual-annotation progress for the evaluation data papers.

    Reads the evaluation DOIs from the 'cited_datapaper_doi' column of
    EVALUATION_DATAPAPERS_FILE, then queries the `positive_candidates` table
    in DB_PATH for rows that cite one of those DOIs and have
    `llm_annotation_status = 1` (reported below as "judged 'Used' by the LLM").
    A row counts as annotated when `human_annotation_status != 0`
    (NULL therefore counts as not annotated).

    Prints an overall summary and displays a per-data-paper table
    (title, annotated_count, total_candidates, progress_%) sorted by
    annotated_count ascending. Titles come from the `papers` table; when a
    data paper has no title there, its DOI is shown instead.

    Returns:
        None. All results are printed / displayed. Any exception raised while
        reading the CSV or querying the database is caught and reported as a
        printed message rather than propagated.
    """
    print("--- Checking Manual Annotation Progress for Evaluation Set ---")

    if not os.path.exists(DB_PATH) or not os.path.exists(EVALUATION_DATAPAPERS_FILE):
        print("❌ Error: Database or evaluation data paper file not found.")
        return

    conn = None
    try:
        # --- 1. Load the list of evaluation data papers ---
        df_eval_papers = pd.read_csv(EVALUATION_DATAPAPERS_FILE)
        # Drop missing DOIs: a NaN would be bound as SQL NULL, match nothing,
        # and still be counted as a data paper in the summary.
        eval_datapaper_dois = tuple(
            df_eval_papers['cited_datapaper_doi'].dropna().unique()
        )

        # One '?' per DOI so the values are bound, never interpolated.
        # NOTE: SQLite caps the number of bound parameters per statement
        # (999 by default before SQLite 3.32.0); a much larger evaluation set
        # would need chunking.
        placeholders = ','.join('?' for _ in eval_datapaper_dois)

        # `with sqlite3.connect(...)` only commits/rolls back; it does not
        # close the connection. Close it explicitly in `finally` instead.
        conn = sqlite3.connect(DB_PATH)

        # --- 2. Overall totals (single scan instead of two COUNT queries) ---
        query_totals = f"""
            SELECT
                COUNT(*),
                COALESCE(SUM(CASE WHEN human_annotation_status != 0 THEN 1 ELSE 0 END), 0)
            FROM positive_candidates
            WHERE cited_datapaper_doi IN ({placeholders}) AND llm_annotation_status = 1
        """
        total_to_annotate, total_annotated = conn.execute(
            query_totals, eval_datapaper_dois
        ).fetchone()

        # --- 3. Print the overall progress summary ---
        progress_percent = (total_annotated / total_to_annotate * 100) if total_to_annotate > 0 else 0

        print("\n" + "="*50)
        print("--- Overall Progress Summary ---")
        print(f"評価対象のデータ論文数: {len(eval_datapaper_dois):,}")
        print(f"目視確認対象の候補論文総数 (LLMが'Used'と判定): {total_to_annotate:,}")
        print("-" * 50)
        print(f"アノテーション完了数: {total_annotated:,}")
        print(f"進捗率: {progress_percent:.2f}%")
        print("="*50)

        # --- 4. Per-data-paper progress table ---
        print("\n--- Progress by Data Paper ---")

        query_details = f"""
            SELECT
                cited_datapaper_doi,
                COUNT(citing_doi) AS total_candidates,
                SUM(CASE WHEN human_annotation_status != 0 THEN 1 ELSE 0 END) AS annotated_count
            FROM
                positive_candidates
            WHERE
                cited_datapaper_doi IN ({placeholders}) AND llm_annotation_status = 1
            GROUP BY
                cited_datapaper_doi
        """
        df_details = pd.read_sql_query(query_details, conn, params=eval_datapaper_dois)

        # fillna(0) covers the 0/0 case (a group whose citing_doi values are all NULL).
        df_details['progress_%'] = (
            df_details['annotated_count'] / df_details['total_candidates'] * 100
        ).fillna(0)

        # Fetch titles only for the evaluation DOIs rather than loading the
        # entire `papers` table into memory.
        query_titles = f"SELECT doi, title FROM papers WHERE doi IN ({placeholders})"
        df_papers_info = pd.read_sql_query(query_titles, conn, params=eval_datapaper_dois)
        # Guard against repeated DOIs in `papers`, which would otherwise
        # duplicate rows of the progress table in the left merge.
        df_papers_info = df_papers_info.drop_duplicates(subset='doi')

        df_merged = pd.merge(
            df_details,
            df_papers_info,
            left_on='cited_datapaper_doi',
            right_on='doi',
            how='left',
        )
        # Keep rows identifiable when a data paper has no title in `papers`.
        df_merged['title'] = df_merged['title'].fillna(df_merged['cited_datapaper_doi'])

        # .copy() so the column assignment below operates on an independent
        # frame (avoids pandas' SettingWithCopyWarning).
        df_final_details = df_merged[
            ['title', 'annotated_count', 'total_candidates', 'progress_%']
        ].copy()
        df_final_details['progress_%'] = df_final_details['progress_%'].map('{:.1f}%'.format)

        display(df_final_details.sort_values(by='annotated_count', ascending=True))

    except Exception as e:
        # Notebook-level boundary: report the problem instead of a traceback.
        print(f"💥 An error occurred: {e}")
    finally:
        if conn is not None:
            conn.close()

# --- Run ---
# Execute the progress report when this cell is run.
check_evaluation_progress()

--- Checking Manual Annotation Progress for Evaluation Set ---

--- Overall Progress Summary ---
評価対象のデータ論文数: 100
目視確認対象の候補論文総数 (LLMが'Used'と判定): 587
--------------------------------------------------
アノテーション完了数: 2
進捗率: 0.34%

--- Progress by Data Paper ---
