In [1]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, BertPreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput
import accelerate
import sqlite3
import faiss # （Faissは使わないが、ライブラリ確認のため残置）
import json
from tqdm.auto import tqdm

# --- 1. GPUの確認 ---
if torch.cuda.is_available():
    print(f"✅ GPU is available. Device: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda")
else:
    print("⚠️ GPU not found. Running on CPU.")
    device = torch.device("cpu")

✅ GPU is available. Device: NVIDIA RTX A6000


In [2]:
# --- 2. カスタムモデルクラスの定義 (CLS Pooling) ---
# (notebooks/18d... と同一の定義)

class SiameseRankNetModel(BertPreTrainedModel):
    def __init__(self, config):
        super(SiameseRankNetModel, self).__init__(config)
        self.bert = AutoModel.from_config(config)
        self.classifier_head = nn.Sequential(
            nn.Linear(config.hidden_size * 4, config.hidden_size),
            nn.ReLU(),
            nn.Linear(config.hidden_size, 1)
        )
        self.init_weights()
    
    def _get_vector(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return output.pooler_output 

    def _calculate_score(self, vec_a, vec_b):
        diff = torch.abs(vec_a - vec_b)
        prod = vec_a * vec_b
        features = torch.cat([vec_a, vec_b, diff, prod], dim=1)
        return self.classifier_head(features)

    def forward(self, input_ids=None, **kwargs):
        pass # 推論時には不要

print("Custom model class 'SiameseRankNetModel' defined.")

Custom model class 'SiameseRankNetModel' defined.


In [3]:
# --- 3. 設定とリソースのロード ---
DB_PATH = "data/processed/s2orc_filtered.db"
TRAINED_MODEL_PATH = "models/sbert_ranknet_v3/best_model"

EMBEDDINGS_FILE = "data/processed/ranknet_scibert_cls_embeddings.npy"
DOI_MAP_FILE = "data/processed/ranknet_doi_map.json"

EVAL_PAPERS_FILE = "data/datapapers/sampled/evaluation_data_papers_50.csv"
TRAIN_FILE = "data/processed/training_dataset_abstract_cleaned_v3.csv"

# 評価設定
EVAL_K_VALUES = [1, 5, 10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000]
SCORE_BATCH_SIZE = 1000000
MAX_LENGTH = 512

# ▼▼▼ 修正点: 保存するTop-K件数と、リッチな結果の保存先を定義 ▼▼▼
SAVE_TOP_K = 1000
RICH_RESULTS_FILE = "data/processed/ranknet_evaluation_results.json"

print(f"Loading tokenizer & model from: {TRAINED_MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(TRAINED_MODEL_PATH)
model = SiameseRankNetModel.from_pretrained(TRAINED_MODEL_PATH).to(device)
model.eval()

print(f"Loading DOI map: {DOI_MAP_FILE}")
with open(DOI_MAP_FILE, 'r') as f:
    doi_to_index = json.load(f)
id_to_doi = {v: k for k, v in doi_to_index.items()}

print(f"Loading Embeddings (mmap_mode): {EMBEDDINGS_FILE}")
d = 768
file_size = os.path.getsize(EMBEDDINGS_FILE)
dtype_size = np.dtype(np.float32).itemsize
total_vectors = file_size // (d * dtype_size)

all_db_embeddings = np.memmap(
    EMBEDDINGS_FILE, dtype=np.float32, mode='r', shape=(total_vectors, d)
)
print(f"Loaded {total_vectors:,} embeddings.")

Loading tokenizer & model from: models/sbert_ranknet_v3/best_model
Loading DOI map: data/processed/ranknet_doi_map.json
Loading Embeddings (mmap_mode): data/processed/ranknet_scibert_cls_embeddings.npy
Loaded 11,619,136 embeddings.


In [4]:
# --- 4. 評価データ（クエリと正解）の準備 ---
print("Preparing evaluation queries and ground truth...")

# 1. 訓練データ（除外リスト）のDOIをロード
df_train = pd.read_csv(TRAIN_FILE)
df_train = df_train.dropna(subset=['abstract_a', 'abstract_b'])
train_dois = set(df_train['abstract_a']) | set(df_train['abstract_b'])
print(f"Loaded {len(train_dois):,} training DOIs to exclude.")

# 2. 評価用データ論文（54件）のリストをロード
df_eval_papers = pd.read_csv(EVAL_PAPERS_FILE)
eval_data_paper_dois = tuple(df_eval_papers['cited_datapaper_doi'].unique())

# 3. 「クエリ論文」と「正解DOIリスト」を作成
evaluation_queries = [] 

with sqlite3.connect(DB_PATH) as conn:
    for data_paper_doi in tqdm(eval_data_paper_dois, desc="Building Ground Truth"):
        # 正解DOI (Human=1) を取得
        query_gt = "SELECT citing_doi FROM positive_candidates WHERE cited_datapaper_doi = ? AND human_annotation_status = 1"
        gt_rows = conn.execute(query_gt, (data_paper_doi,)).fetchall()
        ground_truth_dois = {row[0] for row in gt_rows}
        
        # ▼▼▼ 修正点: 正解が2件以上ある場合のみを評価対象とする ▼▼▼
        if len(ground_truth_dois) >= 2:
            # 1件をクエリとして使用
            query_doi = ground_truth_dois.pop() 
            
            query_text = conn.execute("SELECT abstract FROM papers WHERE doi = ?", (query_doi,)).fetchone()
            if query_text:
                evaluation_queries.append({
                    "query_doi": query_doi,
                    "query_abstract": query_text[0],
                    "ground_truth_dois": ground_truth_dois # ★残りの正解 (1件以上)
                })
        # ▲▲▲ ------------------------------------------ ▲▲▲

print(f"Prepared {len(evaluation_queries)} valid evaluation queries (must have >= 2 ground truths).")

Preparing evaluation queries and ground truth...
Loaded 28,824 training DOIs to exclude.


Building Ground Truth:   0%|          | 0/54 [00:00<?, ?it/s]

Prepared 46 valid evaluation queries (must have >= 2 ground truths).


In [5]:
# --- 5. 評価の実行 (全DBスコア計算 + リッチ結果保存) ---
print(f"Starting evaluation... Scoring ALL {total_vectors:,} candidates.")

all_first_hit_ranks = [] 
all_recalls_at_k = {k: [] for k in EVAL_K_VALUES}
all_rich_results = [] # ★すべての結果を保存するリスト

with torch.no_grad():
    for query_data in tqdm(evaluation_queries, desc="Evaluating Queries (Total)"):
        
        query_abstract = query_data["query_abstract"]
        ground_truth = query_data["ground_truth_dois"]

        # 1. クエリをベクトル化
        inputs = tokenizer(query_abstract, padding="max_length", truncation=True, max_length=MAX_LENGTH, return_tensors="pt").to(device)
        query_vector = model._get_vector(inputs['input_ids'], inputs['attention_mask'])

        # 2. 全DB（1160万件）のスコアをバッチ処理で計算
        all_scores = []
        for i in tqdm(range(0, total_vectors, SCORE_BATCH_SIZE), desc="Scoring DB", leave=False):
            candidate_vectors_batch = torch.tensor(
                all_db_embeddings[i : i + SCORE_BATCH_SIZE]
            ).to(device)
            query_vector_tiled = query_vector.repeat(len(candidate_vectors_batch), 1)
            scores_batch = model._calculate_score(query_vector_tiled, candidate_vectors_batch)
            all_scores.append(scores_batch.cpu().numpy())
        
        final_scores = np.vstack(all_scores).flatten()
        
        # 3. スコア順に並び替え
        sorted_indices = np.argsort(final_scores)[::-1]
        
        # --- 4. 採点 と 5. リッチ結果の保存 (Top-K) ---
        
        first_hit_rank = 0 # 0は「ヒットなし」
        hits_count = 0
        hits_at_k = {k: 0 for k in EVAL_K_VALUES}
        top_k_results_list = [] # ★バグ修正: このリストを作成する
        ranks_of_all_hits = []  # ★バグ修正: 正解の順位を記録
        
        current_rank = 1 # 1位からスタート
        
        for idx in sorted_indices:
            # ランクがSAVE_TOP_K (1000) を超え、かつ正解をすべて見つけたら探索終了
            if current_rank > SAVE_TOP_K and hits_count == len(ground_truth):
                break
                
            if idx not in id_to_doi:
                continue
                
            doi = id_to_doi[idx]
            
            # 5a. 訓練データを除外
            if doi in train_dois:
                continue
            
            # 5b. ランク付けと採点
            is_correct = (doi in ground_truth)
            
            if is_correct:
                hits_count += 1
                ranks_of_all_hits.append(current_rank) # ★バグ修正: 正解の順位を保存
                if first_hit_rank == 0:
                    first_hit_rank = current_rank
            
            # 5c. Recall@k のカウント
            for k in EVAL_K_VALUES:
                if current_rank <= k and is_correct:
                    hits_at_k[k] += 1
            
            # 5d. Top-K の結果を保存
            if current_rank <= SAVE_TOP_K:
                top_k_results_list.append({
                    "rank": current_rank,
                    "doi": doi,
                    "score": float(final_scores[idx]),
                    "is_correct": is_correct
                })
            
            current_rank += 1 # ランクを更新

        # --- 6. 最終メトリクスの計算 (1クエリ分) ---
        all_first_hit_ranks.append(first_hit_rank)
            
        for k in EVAL_K_VALUES:
            recall = hits_at_k[k] / len(ground_truth) if ground_truth else 0
            all_recalls_at_k[k].append(recall)
            
        # 7. ★バグ修正: 豊富な結果をリストに追加する★
        all_rich_results.append({
            "query_doi": query_data["query_doi"],
            "total_ground_truth": len(ground_truth),
            "ground_truth_dois": list(ground_truth),
            "first_hit_rank": first_hit_rank,
            "ranks_of_all_hits": ranks_of_all_hits,
            "top_k_results": top_k_results_list
        })

print("Evaluation complete.")

Starting evaluation... Scoring ALL 11,619,136 candidates.


Evaluating Queries (Total):   0%|          | 0/46 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/12 [00:00<?, ?it/s]

Evaluation complete.


In [7]:
# --- 6. 最終結果の集計 ---

# 6-1. 豊富な結果をJSONファイルに保存
# (セル 5 で 'all_rich_results' にデータが追加されているはず)
print(f"Saving rich evaluation results to {RICH_RESULTS_FILE}...")
with open(RICH_RESULTS_FILE, 'w') as f:
    json.dump(all_rich_results, f, indent=2)
print("Rich results saved.")

# 6-2. 平均スコアのサマリーを表示
print("\n" + "="*50)
print(f"--- Final Evaluation Results: S-BERT (RankNet) ---")
print(f"(Based on {len(evaluation_queries)} queries, scoring ALL {total_vectors:,} documents)")
print(f"(Rich results for Top {SAVE_TOP_K} saved to {RICH_RESULTS_FILE})")
print("="*50)

# ▼▼▼ 修正点: 'all_reciprocal_ranks' ではなく 'all_first_hit_ranks' を使用 ▼▼▼
# (all_first_hit_ranks は セル 5 で定義されています)
mrr_scores = [1.0 / r for r in all_first_hit_ranks if r > 0]
mrr = np.mean(mrr_scores) if mrr_scores else 0.0
print(f"MRR (Mean Reciprocal Rank): {mrr:.4f}")

# (参考) 各クエリの最初の正解順位を表示
print(f"Ranks of First Hit (0 = Not Found):")
print(all_first_hit_ranks)

print("\n--- Recall@K ---")
for k in EVAL_K_VALUES:
    recall_k = np.mean(all_recalls_at_k[k])
    print(f"Recall@{k:<4}: {recall_k:.4f} ({(recall_k * 100):.2f}%)")
print("="*50)

# (参考) 最初の正解が見つからなかったクエリの数
# (全件検索しているので、0は「ヒットなし」を意味する)
not_found_count = len([r for r in all_first_hit_ranks if r == 0])
print(f"Queries where first hit was NOT found (in all 11.6M docs): {not_found_count} / {len(evaluation_queries)}")

Saving rich evaluation results to data/processed/ranknet_evaluation_results.json...
Rich results saved.

--- Final Evaluation Results: S-BERT (RankNet) ---
(Based on 46 queries, scoring ALL 11,619,136 documents)
(Rich results for Top 1000 saved to data/processed/ranknet_evaluation_results.json)
MRR (Mean Reciprocal Rank): 0.0001
Ranks of First Hit (0 = Not Found):
[3677, 18868, 3031, 159495, 368261, 1159, 217725, 28994, 49391, 661900, 7647, 29723, 75734, 375725, 406129, 156158, 251680, 414, 8960, 2046083, 843, 73822, 413223, 13872, 746109, 15052, 919631, 698075, 1740850, 21819, 7950529, 193814, 39213, 471266, 35689, 693699, 379184, 714220, 34420, 235317, 977123, 972345, 134061, 1418135, 886911, 297192]

--- Recall@K ---
Recall@1   : 0.0000 (0.00%)
Recall@5   : 0.0000 (0.00%)
Recall@10  : 0.0000 (0.00%)
Recall@50  : 0.0000 (0.00%)
Recall@100 : 0.0000 (0.00%)
Recall@150 : 0.0000 (0.00%)
Recall@200 : 0.0000 (0.00%)
Recall@250 : 0.0000 (0.00%)
Recall@300 : 0.0000 (0.00%)
Recall@350 : 0.000