In [1]:
import os
import sqlite3
from contextlib import closing

import numpy as np
import pandas as pd
from IPython.display import display

# Rebuild the evaluation set of data papers.
#
# Steps: keep the papers from the current evaluation CSV that have enough
# human-confirmed citing rows, top the set back up to TARGET_EVAL_SIZE with
# papers not present in any known sample file, mark the new papers' LLM-positive
# rows as human-positive in the database, and write the resulting CSV.

# --- 1. Configuration ---
DB_PATH = "data/processed/s2orc_filtered.db"

# Every known sample file; the DOIs in all of them count as "already used".
ORIGINAL_EVAL_FILE = "data/datapapers/sampled/evaluation_data_papers_50.csv"
CURRENT_EVAL_FILE = "data/datapapers/sampled/evaluation_data_papers_50_v2.csv"
TRAINING_PAPERS_FILE = "data/datapapers/sampled/training_data_papers_50.csv"

# Output file.
# NOTE(review): this is the same path as CURRENT_EVAL_FILE, so the input CSV is
# overwritten in place even though the log messages call it a "NEW FILE" --
# confirm that this is intended.
FINAL_EVAL_PAPERS_FILE = "data/datapapers/sampled/evaluation_data_papers_50_v2.csv"

TARGET_EVAL_SIZE = 50

# A data paper is considered valid when at least this many of its rows in
# positive_candidates have human_annotation_status = 1.
MIN_HUMAN_USED = 2

HUMAN_USED_COUNT_QUERY = (
    "SELECT COUNT(citing_doi) FROM positive_candidates "
    "WHERE cited_datapaper_doi = ? AND human_annotation_status = 1"
)


def count_human_used(conn, doi):
    """Return how many rows for *doi* have human_annotation_status = 1.

    Args:
        conn: Open sqlite3 connection to the database holding
            ``positive_candidates``.
        doi: Value matched against ``cited_datapaper_doi``.

    Returns:
        The ``COUNT(citing_doi)`` of the matching rows.
    """
    return conn.execute(HUMAN_USED_COUNT_QUERY, (doi,)).fetchone()[0]


print("--- Fixing Evaluation Set (FINAL ATTEMPT) ---")
print(f"Output file: {FINAL_EVAL_PAPERS_FILE}")

# --- 2. Keep only the valid papers from the current evaluation set (_v2.csv) ---
print(f"\nLoading current evaluation set: {CURRENT_EVAL_FILE}")
df_eval_current = pd.read_csv(CURRENT_EVAL_FILE)
current_eval_dois = tuple(df_eval_current['cited_datapaper_doi'].unique())

# sqlite3's own context manager only commits/rolls back; closing() is what
# actually releases the connection.
with closing(sqlite3.connect(DB_PATH)) as conn:
    valid_eval_dois = [
        doi for doi in current_eval_dois
        if count_human_used(conn, doi) >= MIN_HUMAN_USED
    ]

df_eval_valid = df_eval_current[df_eval_current['cited_datapaper_doi'].isin(valid_eval_dois)].copy()
n_needed = TARGET_EVAL_SIZE - len(df_eval_valid)
print(f"Found {len(df_eval_valid)} valid papers (Human Used >= 2). Need {n_needed} more.")

# --- 3. Build the complete pool of already-used DOIs ---
df_train = pd.read_csv(TRAINING_PAPERS_FILE)
df_orig_eval = pd.read_csv(ORIGINAL_EVAL_FILE)

# Union of the DOIs in the training file, the current (_v2) evaluation file
# and the original evaluation file.
used_dois = (
    set(df_train['cited_datapaper_doi'])
    | set(df_eval_current['cited_datapaper_doi'])
    | set(df_orig_eval['cited_datapaper_doi'])
)
print(f"Total DOIs already in use (all known files): {len(used_dois)}")

# --- 4. Select candidates to fill the shortfall ---
if n_needed > 0:
    with closing(sqlite3.connect(DB_PATH)) as conn:
        # Full eligible population: data papers with at least two rows
        # flagged llm_annotation_status = 1.
        query = """
            SELECT
                cited_datapaper_doi,
                COUNT(citing_doi) AS llm_used_count
            FROM positive_candidates
            WHERE llm_annotation_status = 1
            GROUP BY cited_datapaper_doi
            HAVING COUNT(citing_doi) >= 2;
        """
        df_all_eligible = pd.read_sql_query(query, conn)

        # Unused pool: eligible papers that appear in none of the known files.
        df_available = df_all_eligible[~df_all_eligible['cited_datapaper_doi'].isin(used_dois)]
        print(f"Available 'FRESH' pool size: {len(df_available)}")

        if len(df_available) < n_needed:
            # Nothing is updated or saved in this case.
            print("❌ Error: Not enough 'FRESH' papers in the pool.")
        else:
            # Take the n_needed papers with the highest llm_used_count.
            df_replacements = df_available.sort_values(by='llm_used_count', ascending=False).head(n_needed)

            print("\n" + "="*50)
            print(f"✅ Selected {n_needed} new 'FRESH' data papers:")
            print(df_replacements.to_markdown(index=False))
            print("="*50)

            # --- 5. Bulk DB update ---
            new_dois_to_update = tuple(df_replacements['cited_datapaper_doi'].unique())
            print(f"\nUpdating DB for {len(new_dois_to_update)} new papers...")

            cursor = conn.cursor()
            # Promote every LLM-positive row of the new papers to human-positive.
            # NOTE(review): the original author states these "fresh" papers
            # start with human_annotation_status = 0; nothing in this cell
            # checks that.
            placeholders = ','.join('?' * len(new_dois_to_update))
            update_query = f"""
                UPDATE positive_candidates
                SET human_annotation_status = 1
                WHERE cited_datapaper_doi IN ({placeholders})
                  AND llm_annotation_status = 1
            """
            cursor.execute(update_query, new_dois_to_update)
            updated_rows = cursor.rowcount
            conn.commit()
            print(f"✅ DB Update Complete. {updated_rows:,} rows affected.")

            # --- 6. Verify the newly added papers ---
            print("\nVerifying new papers (post-update)...")
            final_valid_replacements = []
            for doi in new_dois_to_update:
                human_used = count_human_used(conn, doi)
                if human_used >= MIN_HUMAN_USED:
                    final_valid_replacements.append(doi)
                    print(f"  -> {doi} is VALID (Count: {human_used})")
                else:
                    print(f"  -> ⚠️ WARNING: Replacement paper {doi} still has < 2 Human_Used count ({human_used}).")

            if len(final_valid_replacements) == n_needed:
                print("✅ All new replacements are valid.")
            else:
                # Surface the failure instead of reporting only success below;
                # the CSV is still written, as before.
                print(
                    f"⚠️ WARNING: only {len(final_valid_replacements)} of {n_needed} "
                    "replacements passed verification; the saved set includes unverified papers."
                )

            # --- 7. Build and save the final CSV ---
            # NOTE(review): df_replacements carries an llm_used_count column;
            # any column present in only one of the two frames is filled with
            # NaN by concat -- confirm the CSV schema this should produce.
            df_final_eval_set = pd.concat([df_eval_valid, df_replacements], ignore_index=True)
            print(f"\nSaving final evaluation set with {len(df_final_eval_set)} papers to NEW FILE: {FINAL_EVAL_PAPERS_FILE}...")
            df_final_eval_set.to_csv(FINAL_EVAL_PAPERS_FILE, index=False)
            print("✅ New CSV file created successfully.")
            print("\n--- Process Complete ---")

else:
    print(f"\nEvaluation set already has {len(df_eval_valid)} valid papers. Saving as new file...")
    df_eval_valid.to_csv(FINAL_EVAL_PAPERS_FILE, index=False)
    print("✅ New CSV file created successfully.")

--- Fixing Evaluation Set (FINAL ATTEMPT) ---
Output file: data/datapapers/sampled/evaluation_data_papers_50_v2.csv

Loading current evaluation set: data/datapapers/sampled/evaluation_data_papers_50_v2.csv
Found 48 valid papers (Human Used >= 2). Need 2 more.
Total DOIs already in use (all known files): 203
Available 'FRESH' pool size: 518

✅ Selected 2 new 'FRESH' data papers:
| cited_datapaper_doi       |   llm_used_count |
|:--------------------------|-----------------:|
| 10.1016/J.DIB.2018.11.111 |                5 |
| 10.1016/J.DIB.2020.106438 |                3 |

Updating DB for 2 new papers...
✅ DB Update Complete. 8 rows affected.

Verifying new papers (post-update)...
  -> 10.1016/J.DIB.2018.11.111 is VALID (Count: 5)
  -> 10.1016/J.DIB.2020.106438 is VALID (Count: 3)
✅ All new replacements are valid.

Saving final evaluation set with 50 papers to NEW FILE: data/datapapers/sampled/evaluation_data_papers_50_v2.csv...
✅ New CSV file created successfully.

--- Process Complete ---