In [1]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel # ★ AutoModelのみでOK
import accelerate
import sqlite3
import faiss # ★ Faiss をインポート
import json
from tqdm.auto import tqdm
import time

# --- 1. GPUの確認 ---
if torch.cuda.is_available():
    print(f"✅ GPU is available. Device: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda")
else:
    print("⚠️ GPU not found. Running on CPU.")
    device = torch.device("cpu")

✅ GPU is available. Device: NVIDIA RTX A6000


In [2]:
# --- 2. 設定とリソースのロード ---
DB_PATH = "data/processed/s2orc_filtered.db"

# ▼▼▼ 修正点: 事前学習済みモデルを使用 ▼▼▼
MODEL_CHECKPOINT = "allenai/scibert_scivocab_uncased" 

# ▼▼▼ 修正点: 事前学習済みモデルのインデックスを使用 ▼▼▼
FAISS_INDEX_FILE = "data/processed/pretrained_scibert.faiss"
DOI_MAP_FILE = "data/processed/pretrained_doi_map.json"

EVAL_PAPERS_FILE = "data/datapapers/sampled/evaluation_data_papers_50_v2.csv"
TRAIN_FILE = "data/processed/training_dataset_hard_negatives_1to3.csv"

RICH_RESULTS_FILE = "data/processed/pretrained_scibert_evaluation_results.json"

MAX_LENGTH = 512

# 評価するkの値
# ランキング上位の精度を見るための小さいk
small_k_values = [1, 5, 10, 30, 50, 100, 200, 300, 500]
# 網羅性を見るための大きいk (1000刻みで30000まで)
large_k_values = list(range(1000, 30001, 1000))

EVAL_K_VALUES = sorted(list(set(small_k_values + large_k_values)))

# SAVE_TOP_K も評価の最大値に合わせる
SAVE_TOP_K = 30000 

print(f"Loading tokenizer & PRE-TRAINED model from: {MODEL_CHECKPOINT}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
# ▼▼▼ 修正点: AutoModel (エンコーダーのみ) をロード ▼▼▼
model = AutoModel.from_pretrained(MODEL_CHECKPOINT).to(device)
model.eval()

print(f"Loading DOI map from {DOI_MAP_FILE}...")
with open(DOI_MAP_FILE, 'r') as f:
    doi_to_index = json.load(f)
id_to_doi = {v: k for k, v in doi_to_index.items()}

# ▼▼▼ 修正点: Faissインデックスをロード ▼▼▼
print(f"Loading Faiss index: {FAISS_INDEX_FILE}")
index = faiss.read_index(FAISS_INDEX_FILE)
total_vectors = index.ntotal
print(f"Faiss index loaded. Total vectors: {total_vectors:,}")

Loading tokenizer & PRE-TRAINED model from: allenai/scibert_scivocab_uncased




Loading DOI map from data/processed/pretrained_doi_map.json...
Loading Faiss index: data/processed/pretrained_scibert.faiss
Faiss index loaded. Total vectors: 11,619,136


In [3]:
# --- 3. 評価データ（クエリと正解）の準備 ---
print("Preparing evaluation queries and ground truth...")

# 1. 訓練データ（除外リスト）のDOIをロード
df_train = pd.read_csv(TRAIN_FILE)
df_train = df_train.dropna(subset=['abstract_a', 'abstract_b'])
train_dois = set(df_train['abstract_a']) | set(df_train['abstract_b'])
print(f"Loaded {len(train_dois):,} training DOIs to exclude.")

# 2. 評価用データ論文（50件）のリストをロード
df_eval_papers = pd.read_csv(EVAL_PAPERS_FILE)
eval_data_paper_dois = tuple(df_eval_papers['cited_datapaper_doi'].unique())

# 3. 「クエリ論文」と「正解DOIリスト」を作成
evaluation_queries = [] 
with sqlite3.connect(DB_PATH) as conn:
    for data_paper_doi in tqdm(eval_data_paper_dois, desc="Building Ground Truth"):
        query_gt = "SELECT citing_doi FROM positive_candidates WHERE cited_datapaper_doi = ? AND human_annotation_status = 1"
        gt_rows = conn.execute(query_gt, (data_paper_doi,)).fetchall()
        ground_truth_dois = {row[0] for row in gt_rows}
        
        if len(ground_truth_dois) >= 2:
            query_doi = ground_truth_dois.pop()
            query_text = conn.execute("SELECT abstract FROM papers WHERE doi = ?", (query_doi,)).fetchone()
            if query_text:
                evaluation_queries.append({
                    "query_doi": query_doi,
                    "query_abstract": query_text[0],
                    "ground_truth_dois": list(ground_truth_dois)
                })

print(f"Prepared {len(evaluation_queries)} valid evaluation queries.")

Preparing evaluation queries and ground truth...
Loaded 2,338 training DOIs to exclude.


Building Ground Truth:   0%|          | 0/50 [00:00<?, ?it/s]

Prepared 50 valid evaluation queries.


In [4]:
# --- 4. 評価の実行 (事前学習モデル / Faiss L2距離) ---
print(f"Starting evaluation... Scoring ALL {total_vectors:,} candidates using Faiss (CPU).")

all_first_hit_ranks = [] 
all_recalls_at_k = {k: [] for k in EVAL_K_VALUES}
all_rich_results = [] 

# ▼▼▼ 修正点: GPUインデックスの作成をコメントアウト ▼▼▼
# res = faiss.StandardGpuResources()
# gpu_index = faiss.index_cpu_to_gpu(res, 0, index)
# print("Faiss index transferred to GPU.")
print("Using CPU Faiss index (GPU index k limit is 2048).")
# ▲▲▲ ------------------------------------------ ▲▲▲

with torch.no_grad():
    for query_data in tqdm(evaluation_queries, desc="Evaluating Queries (Total)"):
        
        query_abstract = query_data["query_abstract"]
        ground_truth = set(query_data["ground_truth_dois"])

        # 1. クエリをベクトル化
        inputs = tokenizer(query_abstract, padding="max_length", truncation=True, max_length=MAX_LENGTH, return_tensors="pt").to(device)
        outputs = model(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        query_vector = outputs.pooler_output.cpu().numpy().astype(np.float32)

        # --- 2. ★Faissで全DB（1160万件）を検索★ ---
        k_to_search = max(max(EVAL_K_VALUES), SAVE_TOP_K)
        
        # ▼▼▼ 修正点: 'gpu_index' ではなく 'index' (CPU) を使用 ▼▼▼
        distances, sorted_indices = index.search(query_vector, k_to_search)
        
        distances = distances[0]
        sorted_indices = sorted_indices[0]
        
        # 3. 採点 と 4. リッチ結果の保存
        first_hit_rank = 0 
        hits_count = 0
        hits_at_k = {k: 0 for k in EVAL_K_VALUES}
        top_k_results_list = []
        ranks_of_all_hits = []
        current_rank = 1
        
        for i, idx in enumerate(sorted_indices):
            if idx == -1: continue
            if idx not in id_to_doi: continue
            
            doi = id_to_doi[idx]
            if doi in train_dois: continue
            
            is_correct = (doi in ground_truth)
            
            if is_correct:
                hits_count += 1
                ranks_of_all_hits.append(current_rank)
                if first_hit_rank == 0:
                    first_hit_rank = current_rank
            
            for k in EVAL_K_VALUES:
                if current_rank <= k and is_correct:
                    hits_at_k[k] += 1
            
            top_k_results_list.append({
                "rank": current_rank,
                "doi": doi,
                "score": float(distances[i]),
                "is_correct": is_correct
            })
            
            current_rank += 1 

        # --- 5. 最終メトリクスの計算 ---
        all_first_hit_ranks.append(first_hit_rank)
            
        for k in EVAL_K_VALUES:
            recall = hits_at_k[k] / len(ground_truth) if ground_truth else 0
            all_recalls_at_k[k].append(recall)
            
        all_rich_results.append({
            "query_doi": query_data["query_doi"],
            "total_ground_truth": len(ground_truth),
            "ground_truth_dois": list(ground_truth),
            "first_hit_rank": first_hit_rank,
            "ranks_of_all_hits": ranks_of_all_hits,
            "top_k_results": top_k_results_list
        })

print("Evaluation complete.")

Starting evaluation... Scoring ALL 11,619,136 candidates using Faiss (CPU).
Using CPU Faiss index (GPU index k limit is 2048).


Evaluating Queries (Total):   0%|          | 0/50 [00:00<?, ?it/s]

Evaluation complete.


In [5]:
# --- 5. 最終結果の集計 ---

# 5-1. 豊富な結果をJSONファイルに保存
print(f"Saving rich evaluation results to {RICH_RESULTS_FILE}...")
with open(RICH_RESULTS_FILE, 'w') as f:
    json.dump(all_rich_results, f, indent=2)
print("Rich results saved.")

# 5-2. 平均スコアのサマリーを表示
print("\n" + "="*50)
print(f"--- Final Evaluation Results: PRE-TRAINED SciBERT (Faiss L2 Distance) ---")
print(f"(Based on {len(evaluation_queries)} queries, searching ALL {total_vectors:,} documents)")
print("="*50)

# MRR
mrr_scores = [1.0 / r for r in all_first_hit_ranks if r > 0]
mrr = np.mean(mrr_scores) if mrr_scores else 0.0
print(f"MRR (Mean Reciprocal Rank): {mrr:.4f}")

print(f"Ranks of First Hit (0 = Not Found in Top {SAVE_TOP_K}):")
print(all_first_hit_ranks)

print("\n--- Recall@K ---")
for k in EVAL_K_VALUES:
    recall_k = np.mean(all_recalls_at_k[k])
    print(f"Recall@{k:<4}: {recall_k:.4f} ({(recall_k * 100):.2f}%)")
print("="*50)

not_found_count = len([r for r in all_first_hit_ranks if r == 0])
print(f"Queries where first hit was NOT found (in Top {SAVE_TOP_K}): {not_found_count} / {len(evaluation_queries)}")

Saving rich evaluation results to data/processed/pretrained_scibert_evaluation_results.json...
Rich results saved.

--- Final Evaluation Results: PRE-TRAINED SciBERT (Faiss L2 Distance) ---
(Based on 50 queries, searching ALL 11,619,136 documents)
MRR (Mean Reciprocal Rank): 0.0310
Ranks of First Hit (0 = Not Found in Top 30000):
[0, 37, 0, 0, 0, 0, 0, 0, 24224, 995, 0, 0, 0, 4, 0, 0, 0, 0, 14438, 0, 0, 0, 6871, 0, 0, 1847, 0, 0, 0, 0, 0, 5265, 0, 4713, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

--- Recall@K ---
Recall@1   : 0.0000 (0.00%)
Recall@5   : 0.0033 (0.33%)
Recall@10  : 0.0033 (0.33%)
Recall@30  : 0.0033 (0.33%)
Recall@50  : 0.0041 (0.41%)
Recall@100 : 0.0041 (0.41%)
Recall@200 : 0.0041 (0.41%)
Recall@300 : 0.0041 (0.41%)
Recall@500 : 0.0041 (0.41%)
Recall@1000: 0.0122 (1.22%)
Recall@2000: 0.0180 (1.80%)
Recall@3000: 0.0194 (1.94%)
Recall@4000: 0.0194 (1.94%)
Recall@5000: 0.0261 (2.61%)
Recall@6000: 0.0328 (3.28%)
Recall@7000: 0.0385 (3.85%)
Recall@8000: 0.0385 (3.85%)
