In [1]:
import pandas as pd
import sqlite3
import os

# --- 1. Configuration ---
# CSV listing the evaluation data papers, and the SQLite database that the
# following step checks them against.
EVAL_PAPERS_FILE = "data/datapapers/sampled/evaluation_data_papers_50.csv"
DB_PATH = "data/processed/s2orc_filtered.db"

print("--- Checking Evaluation Ground Truth ---")

# --- 2. Load the list of evaluation data papers ---
# The distinct values of the `cited_datapaper_doi` column are kept as a tuple.
# A failure to read the CSV is reported and then re-raised, so the cell stops.
try:
    df_eval_papers = pd.read_csv(EVAL_PAPERS_FILE)
    eval_data_paper_dois = tuple(pd.unique(df_eval_papers['cited_datapaper_doi']))
    print(f"Loaded {len(eval_data_paper_dois)} data papers from {EVAL_PAPERS_FILE}")
except Exception as load_error:
    print(f"Error loading CSV: {load_error}")
    raise

# --- 3. Inspect the database ---
# For every evaluation data paper DOI, count the rows of `positive_candidates`
# that cite it and have human_annotation_status = 1, then report the DOIs
# for which that count is zero.
results = []
try:
    # One parameterized query, reused for every DOI.
    query_gt = "SELECT COUNT(citing_doi) FROM positive_candidates WHERE cited_datapaper_doi = ? AND human_annotation_status = 1"

    conn = sqlite3.connect(DB_PATH)
    try:
        for data_paper_doi in eval_data_paper_dois:
            count = conn.execute(query_gt, (data_paper_doi,)).fetchone()[0]
            results.append({
                'data_paper_doi': data_paper_doi,
                'human_used_count': count
            })
    finally:
        # The sqlite3 connection context manager only commits / rolls back;
        # it does not close the connection, so close it explicitly.
        conn.close()

    # --- 4. Report the results ---
    # Explicit columns keep the frame well-formed (and the filter below valid)
    # even when the DOI list is empty.
    df_results = pd.DataFrame(results, columns=['data_paper_doi', 'human_used_count'])

    # Data papers that have no human-confirmed ("Used") citing paper at all.
    df_missing = df_results[df_results['human_used_count'] == 0]

    print("\n" + "="*50)
    if not df_missing.empty:
        print(f"Found {len(df_missing)} data papers with ZERO 'Human: Used' (human_status=1) entries:")
        print(df_missing)
    else:
        # Report the number actually checked rather than a hard-coded 50.
        print(f"All {len(df_results)} evaluation data papers have at least 1 'Human: Used' entry.")
    print("="*50)

except Exception as e:
    print(f"An error occurred: {e}")
    # Re-raise (as the CSV loading step does) so a failed check is not
    # mistaken for a clean run.
    raise

--- Checking Evaluation Ground Truth ---
Loaded 50 data papers from data/datapapers/sampled/evaluation_data_papers_50.csv

Found 2 data papers with ZERO 'Human: Used' (human_status=1) entries:
               data_paper_doi  human_used_count
15  10.1016/J.DIB.2016.05.025                 0
33  10.1016/J.DIB.2018.11.111                 0


In [3]:
import pandas as pd
import sqlite3
import os
import numpy as np
# Import `display`, which (unlike DataFrame.to_markdown()) does not require the `tabulate` package.
from IPython.display import display

# --- 1. Configuration ---
# Sampled data-paper lists (evaluation and training) and the SQLite database.
EVAL_PAPERS_FILE = "data/datapapers/sampled/evaluation_data_papers_50.csv"
TRAINING_PAPERS_FILE = "data/datapapers/sampled/training_data_papers_50.csv"
DB_PATH = "data/processed/s2orc_filtered.db"

# Number of replacement evaluation papers to pick (two are currently missing).
N_REPLACEMENTS_NEEDED = 2

print("--- Finding Replacement Evaluation Papers (Top-K Cited) ---")

# Pick N_REPLACEMENTS_NEEDED replacement data papers and append them to the
# evaluation CSV.
#
# Steps: (1) query every data paper with at least two LLM-"Used" citing rows,
# (2) load the DOIs already present in the training / evaluation CSVs,
# (3) drop those from the pool, (4) take the top entries by llm_used_count,
# (5) append them to EVAL_PAPERS_FILE.
#
# NOTE: this cell is not idempotent. Every successful run appends another
# N_REPLACEMENTS_NEEDED rows to EVAL_PAPERS_FILE.
try:
    # 1. Full eligible population (LLM "Used" count >= 2 per data paper).
    query = """
        SELECT
            cited_datapaper_doi,
            COUNT(citing_doi) AS llm_used_count
        FROM
            positive_candidates
        WHERE
            llm_annotation_status = 1
        GROUP BY
            cited_datapaper_doi
        HAVING
            COUNT(citing_doi) >= 2;
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        df_all_eligible = pd.read_sql_query(query, conn)
    finally:
        # The sqlite3 connection context manager does not close the
        # connection, so close it explicitly.
        conn.close()
    all_eligible_dois = set(df_all_eligible['cited_datapaper_doi'])
    print(f"Total eligible pool size: {len(all_eligible_dois)}")

    # 2. DOIs that are already part of the training or evaluation sets.
    df_eval = pd.read_csv(EVAL_PAPERS_FILE)
    eval_dois = set(df_eval['cited_datapaper_doi'])

    df_train = pd.read_csv(TRAINING_PAPERS_FILE)
    train_dois = set(df_train['cited_datapaper_doi'])

    used_dois = eval_dois.union(train_dois)
    print(f"Already used DOIs (Train + Eval): {len(used_dois)}")

    # 3. Pool of not-yet-used DOIs.
    # DOIs are compared case-insensitively so that a casing difference between
    # the CSV files and the database cannot let an already-used paper back in.
    used_dois_upper = {doi.upper() for doi in used_dois if isinstance(doi, str)}
    already_used = df_all_eligible['cited_datapaper_doi'].str.upper().isin(used_dois_upper)
    df_available = df_all_eligible[~already_used]
    available_dois = df_available['cited_datapaper_doi'].tolist()

    if len(df_available) < N_REPLACEMENTS_NEEDED:
        print(f"‚ùå Error: Not enough available papers in the pool to find {N_REPLACEMENTS_NEEDED} replacements.")
    else:
        # 4. Highest llm_used_count first. Ties are broken by DOI so that the
        #    selection is reproducible from run to run.
        replacements_df = df_available.sort_values(
            by=['llm_used_count', 'cited_datapaper_doi'],
            ascending=[False, True],
        ).head(N_REPLACEMENTS_NEEDED)

        print("\n" + "="*50)
        print(f"‚úÖ Found {N_REPLACEMENTS_NEEDED} replacement data papers (Top {N_REPLACEMENTS_NEEDED} by LLM 'Used' count):")

        # display() renders the frame without needing `tabulate`.
        display(replacements_df)
        print("="*50)

        # 5. Append to the existing CSV.
        # The rows are written without a header, so they must line up with the
        # header already in the file. When the file has the same number of
        # columns and the DOI column comes first, the positional layout matches
        # and the rows are written as they are. Otherwise the rows are aligned
        # to the file's header by column name (columns the selection does not
        # have are left empty), so the DOI can never land in the wrong column.
        eval_columns = list(df_eval.columns)
        positional_layout_matches = (
            len(eval_columns) == replacements_df.shape[1]
            and eval_columns[0] == replacements_df.columns[0]
        )
        if positional_layout_matches:
            rows_to_append = replacements_df
        else:
            rows_to_append = replacements_df.reindex(columns=eval_columns)

        print(f"Appending these {N_REPLACEMENTS_NEEDED} papers to {EVAL_PAPERS_FILE}...")
        rows_to_append.to_csv(EVAL_PAPERS_FILE, mode='a', header=False, index=False)

        df_new_eval = pd.read_csv(EVAL_PAPERS_FILE)
        print(f"Successfully updated. New evaluation set size: {len(df_new_eval)} papers.")

        # Next-step instructions for the operator (the counts inside these
        # messages are fixed text, not computed values).
        print("\n--- ‚ùó Ê¨°„ÅÆ„Çπ„ÉÜ„ÉÉ„Éó ‚ùó ---")
        print("1. `evaluation_data_papers_50.csv` „Åå 52‰ª∂ „Å´Êõ¥Êñ∞„Åï„Çå„Åæ„Åó„Åü„ÄÇ")
        print("2. `annotator_app` „Çí `ANNOTATION_MODE = \"evaluation\"` „Å´Ë®≠ÂÆö„Åó„Å¶Ëµ∑Âãï„Åó„ÄÅ")
        print("   ‰ªäËøΩÂä†„Åï„Çå„Åü 2‰ª∂ „ÅÆË´ñÊñá„Å´Á¥ê„Å•„ÅèÂÄôË£ú„Çí**ÁõÆË¶ñ„Åß„Ç¢„Éé„ÉÜ„Éº„Ç∑„Éß„É≥**„Åó„Å¶„Åè„Å†„Åï„ÅÑ„ÄÇ")
        print("3. „Ç¢„Éé„ÉÜ„Éº„Ç∑„Éß„É≥„ÅåÂÆå‰∫Ü„Åó„ÄÅÊñ∞„Åó„ÅÑ2‰ª∂„Å´ `human_status=1` „ÅÆÊ≠£Ëß£„Åå‰ΩúÊàê„Åï„Çå„Åü„Çâ„ÄÅ")
        print("   Ë©ï‰æ°„Çπ„ÇØ„É™„Éó„Éà (`22b_...ipynb`) „ÇíÂÜçÂÆüË°å„Åó„Å¶„Åè„Å†„Åï„ÅÑ„ÄÇ")
        print("   (‰ªäÂ∫¶„ÅØ 50‰ª∂‰∏≠50‰ª∂ „ÅåÂá¶ÁêÜ„Åï„Çå„Çã„ÅØ„Åö„Åß„Åô)")

except Exception as e:
    print(f"üí• An error occurred: {e}")
    # Re-raise: this cell writes to disk, so a failure must stop the run
    # instead of being reduced to a single printed line.
    raise

--- Finding Replacement Evaluation Papers (Top-K Cited) ---
Total eligible pool size: 721
Already used DOIs (Train + Eval): 200

‚úÖ Found 2 replacement data papers (Top 2 by LLM 'Used' count):


Unnamed: 0,cited_datapaper_doi,llm_used_count
0,10.1002/ECY.2663,3
364,10.1038/S41597-019-0217-0,3


Appending these 2 papers to data/datapapers/sampled/evaluation_data_papers_50.csv...
Successfully updated. New evaluation set size: 52 papers.

--- ‚ùó Ê¨°„ÅÆ„Çπ„ÉÜ„ÉÉ„Éó ‚ùó ---
1. `evaluation_data_papers_50.csv` „Åå 52‰ª∂ „Å´Êõ¥Êñ∞„Åï„Çå„Åæ„Åó„Åü„ÄÇ
2. `annotator_app` „Çí `ANNOTATION_MODE = "evaluation"` „Å´Ë®≠ÂÆö„Åó„Å¶Ëµ∑Âãï„Åó„ÄÅ
   ‰ªäËøΩÂä†„Åï„Çå„Åü 2‰ª∂ „ÅÆË´ñÊñá„Å´Á¥ê„Å•„ÅèÂÄôË£ú„Çí**ÁõÆË¶ñ„Åß„Ç¢„Éé„ÉÜ„Éº„Ç∑„Éß„É≥**„Åó„Å¶„Åè„Å†„Åï„ÅÑ„ÄÇ
3. „Ç¢„Éé„ÉÜ„Éº„Ç∑„Éß„É≥„ÅåÂÆå‰∫Ü„Åó„ÄÅÊñ∞„Åó„ÅÑ2‰ª∂„Å´ `human_status=1` „ÅÆÊ≠£Ëß£„Åå‰ΩúÊàê„Åï„Çå„Åü„Çâ„ÄÅ
   Ë©ï‰æ°„Çπ„ÇØ„É™„Éó„Éà (`22b_...ipynb`) „ÇíÂÜçÂÆüË°å„Åó„Å¶„Åè„Å†„Åï„ÅÑ„ÄÇ
   (‰ªäÂ∫¶„ÅØ 50‰ª∂‰∏≠50‰ª∂ „ÅåÂá¶ÁêÜ„Åï„Çå„Çã„ÅØ„Åö„Åß„Åô)


In [None]:
import pandas as pd
import sqlite3
import os
from IPython.display import display

# --- 1. Configuration ---
# CSV files holding the data-paper DOI lists to annotate.
EVAL_PAPERS_FILE = "data/datapapers/sampled/evaluation_data_papers_50.csv"
TRAINING_PAPERS_FILE = "data/datapapers/sampled/training_data_papers_50.csv"

# SQLite database whose rows are bulk-approved.
DB_PATH = "data/processed/s2orc_filtered.db"

# Which set of data papers to bulk-approve:
#   'evaluation'        : unprocessed rows of the evaluation data (about 50 groups)
#   'training'          : unprocessed rows of the training data (150 groups)
#   'training_advanced' : unprocessed rows of the top 20 training data papers
UPDATE_MODE = "evaluation"

print(f"Target mode set to: {UPDATE_MODE}")

In [None]:
# --- 2. Load the target DOI list ---
# TARGET_DOI_LIST holds the upper-cased `cited_datapaper_doi` values for the
# selected UPDATE_MODE. If anything fails, the error is printed and the tuple
# stays empty.
TARGET_DOI_LIST = ()
try:
    if UPDATE_MODE not in ("evaluation", "training", "training_advanced"):
        raise ValueError(f"Unknown UPDATE_MODE: {UPDATE_MODE}")

    # Only the evaluation mode reads the evaluation list; both training modes
    # read the training list.
    if UPDATE_MODE == "evaluation":
        df_papers = pd.read_csv(EVAL_PAPERS_FILE)
    else:
        df_papers = pd.read_csv(TRAINING_PAPERS_FILE)

    if UPDATE_MODE == "training_advanced":
        # Restrict to the 20 rows with the largest `used_paper_count`.
        df_top_20 = df_papers.nlargest(20, 'used_paper_count')
        doi_series = df_top_20['cited_datapaper_doi']
    else:
        doi_series = df_papers['cited_datapaper_doi']

    TARGET_DOI_LIST = tuple(doi_series.str.upper().tolist())

    print(f"Loaded {len(TARGET_DOI_LIST)} data paper DOIs for mode '{UPDATE_MODE}'.")

except Exception as load_error:
    print(f"An error occurred loading DOI lists: {load_error}")

In [None]:
# --- 3. Bulk update of the database ---
# For the DOIs in TARGET_DOI_LIST, every `positive_candidates` row with
# llm_annotation_status = 1 and human_annotation_status = 0 gets
# human_annotation_status set to 1. The number of matching rows is reported
# first; nothing is written when there are none.
if TARGET_DOI_LIST:
    print("Connecting to database...")
    conn = None
    try:
        conn = sqlite3.connect(DB_PATH)
        # `with conn` commits on success and rolls back on an exception;
        # it does NOT close the connection (see the `finally` below).
        with conn:
            cursor = conn.cursor()

            # One "?" placeholder per DOI. The same WHERE clause is shared by
            # the COUNT and the UPDATE so the two can never drift apart.
            placeholders = ','.join('?' for _ in TARGET_DOI_LIST)
            where_clause = f"""
                WHERE cited_datapaper_doi IN ({placeholders})
                  AND llm_annotation_status = 1
                  AND human_annotation_status = 0
            """
            # Each statement contains the placeholder list once, so the DOI
            # tuple is passed once per statement.
            params = TARGET_DOI_LIST

            # 1. Count the rows that are about to be updated.
            count_query = f"""
                SELECT COUNT(*)
                FROM positive_candidates
                {where_clause}
            """
            count_before = cursor.execute(count_query, params).fetchone()[0]

            if count_before == 0:
                print("No unprocessed items found. Database is already up to date.")
            else:
                print(f"Found {count_before:,} items to approve (set human_status=1)...")

                # 2. Run the bulk UPDATE.
                update_query = f"""
                    UPDATE positive_candidates
                    SET human_annotation_status = 1
                    {where_clause}
                """

                cursor.execute(update_query, params)
                updated_rows = cursor.rowcount  # number of rows actually changed
                conn.commit()

                print("\n" + "="*50)
                print(f"‚úÖ Successfully updated {updated_rows:,} rows.")
                print(f"Mode '{UPDATE_MODE}' is now fully annotated.")
                print("="*50)

    except Exception as e:
        print(f"üí• An error occurred during database update: {e}")
        # Re-raise so a failed write is visible with its traceback.
        raise
    finally:
        if conn is not None:
            conn.close()
else:
    print("No target DOIs loaded. Script stopped.")