In [1]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, BertPreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput
import accelerate
import sqlite3
import faiss # （Faissは使わないが、ライブラリ確認のため残置）
import json
from tqdm.auto import tqdm

# --- 1. GPUの確認 ---
if torch.cuda.is_available():
    print(f"✅ GPU is available. Device: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda")
else:
    print("⚠️ GPU not found. Running on CPU.")
    device = torch.device("cpu")

✅ GPU is available. Device: NVIDIA RTX A6000


In [2]:
# --- 2. カスタムモデルクラスの定義 (CLS Pooling) ---
# (notebooks/18d... と同一の定義)

class SiameseRankNetModel(BertPreTrainedModel):
    def __init__(self, config):
        super(SiameseRankNetModel, self).__init__(config)
        self.bert = AutoModel.from_config(config)
        self.classifier_head = nn.Sequential(
            nn.Linear(config.hidden_size * 4, config.hidden_size),
            nn.ReLU(),
            nn.Linear(config.hidden_size, 1)
        )
        self.init_weights()
    
    def _get_vector(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return output.pooler_output 

    def _calculate_score(self, vec_a, vec_b):
        diff = torch.abs(vec_a - vec_b)
        prod = vec_a * vec_b
        features = torch.cat([vec_a, vec_b, diff, prod], dim=1)
        return self.classifier_head(features)

    def forward(self, input_ids=None, **kwargs):
        pass # 推論時には不要

print("Custom model class 'SiameseRankNetModel' defined.")

Custom model class 'SiameseRankNetModel' defined.


In [None]:
# --- 3. 設定とリソースのロード ---
DB_PATH = "data/processed/s2orc_filtered.db"
TRAINED_MODEL_PATH = "models/sbert_ranknet_v3/best_model"

# ▼▼▼ 訓練済みRankNetモデルの「エンコーダー」で作成したベクトルを使用 ▼▼▼
EMBEDDINGS_FILE = "data/processed/ranknet_scibert_cls_embeddings.npy"
DOI_MAP_FILE = "data/processed/ranknet_doi_map.json"

# 評価クエリと正解
EVAL_PAPERS_FILE = "data/datapapers/sampled/evaluation_data_papers_50.csv"
# 訓練データ (除外リスト作成用)
TRAIN_FILE = "data/processed/training_dataset_abstract_cleaned_v3.csv"

# 評価設定
EVAL_K_VALUES = [1, 5, 10, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000]
# スコア計算時のバッチサイズ（GPUメモリに応じて調整）
SCORE_BATCH_SIZE = 8192 
MAX_LENGTH = 512

print(f"Loading tokenizer & model from: {TRAINED_MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(TRAINED_MODEL_PATH)
model = SiameseRankNetModel.from_pretrained(TRAINED_MODEL_PATH).to(device)
model.eval()

print(f"Loading DOI map: {DOI_MAP_FILE}")
with open(DOI_MAP_FILE, 'r') as f:
    doi_to_index = json.load(f)
id_to_doi = {v: k for k, v in doi_to_index.items()}

print(f"Loading Embeddings (mmap_mode): {EMBEDDINGS_FILE}")
d = 768
file_size = os.path.getsize(EMBEDDINGS_FILE)
dtype_size = np.dtype(np.float32).itemsize
total_vectors = file_size // (d * dtype_size)

# ▼▼▼ 全DB（1160万件）のベクトルをメモリマップでロード ▼▼▼
all_db_embeddings = np.memmap(
    EMBEDDINGS_FILE, dtype=np.float32, mode='r', shape=(total_vectors, d)
)
print(f"Loaded {total_vectors:,} embeddings.")

Loading tokenizer & model from: models/sbert_ranknet_v3/best_model
Loading DOI map: data/processed/ranknet_doi_map.json
Loading Embeddings (mmap_mode): data/processed/ranknet_scibert_cls_embeddings.npy
Loaded 11,619,136 embeddings.


In [4]:
# --- 4. 評価データ（クエリと正解）の準備 ---
print("Preparing evaluation queries and ground truth...")

# 1. 訓練データ（除外リスト）のDOIをロード
df_train = pd.read_csv(TRAIN_FILE)
df_train = df_train.dropna(subset=['abstract_a', 'abstract_b'])
train_dois = set(df_train['abstract_a']) | set(df_train['abstract_b'])
print(f"Loaded {len(train_dois):,} training DOIs to exclude.")

# 2. 評価用データ論文（50件）のリストをロード
df_eval_papers = pd.read_csv(EVAL_PAPERS_FILE)
eval_data_paper_dois = tuple(df_eval_papers['cited_datapaper_doi'].unique())

# 3. 50件の「クエリ論文」と「正解DOIリスト」を作成
evaluation_queries = [] 

with sqlite3.connect(DB_PATH) as conn:
    for data_paper_doi in tqdm(eval_data_paper_dois, desc="Building Ground Truth"):
        # 正解DOI (Human=1) を取得
        query_gt = "SELECT citing_doi FROM positive_candidates WHERE cited_datapaper_doi = ? AND human_annotation_status = 1"
        gt_rows = conn.execute(query_gt, (data_paper_doi,)).fetchall()
        ground_truth_dois = {row[0] for row in gt_rows}
        
        if not ground_truth_dois:
            continue

        # クエリ論文（正解の中から1件）のテキストを取得
        query_doi = ground_truth_dois.pop()
        
        query_text = conn.execute("SELECT abstract FROM papers WHERE doi = ?", (query_doi,)).fetchone()
        if query_text:
            evaluation_queries.append({
                "query_doi": query_doi,
                "query_abstract": query_text[0],
                "ground_truth_dois": ground_truth_dois 
            })

print(f"Prepared {len(evaluation_queries)} valid evaluation queries.")

Preparing evaluation queries and ground truth...
Loaded 28,824 training DOIs to exclude.


Building Ground Truth:   0%|          | 0/50 [00:00<?, ?it/s]

Prepared 48 valid evaluation queries.


In [None]:
# --- 5. 評価の実行 (全DBスコア計算) ---
print(f"Starting evaluation... Scoring ALL {total_vectors:,} candidates.")

# ▼▼▼ 修正点: 逆順位(1/R)ではなく、順位(R)を保存するリストに変更 ▼▼▼
all_first_hit_ranks = [] 
all_recalls_at_k = {k: [] for k in EVAL_K_VALUES}

with torch.no_grad():
    for query_data in tqdm(evaluation_queries, desc="Evaluating Queries (Total)"):
        
        query_abstract = query_data["query_abstract"]
        ground_truth = query_data["ground_truth_dois"]

        # 1. クエリをベクトル化
        inputs = tokenizer(query_abstract, padding="max_length", truncation=True, max_length=MAX_LENGTH, return_tensors="pt").to(device)
        query_vector = model._get_vector(inputs['input_ids'], inputs['attention_mask'])

        # 2. 全DB（1160万件）のスコアをバッチ処理で計算
        all_scores = []
        for i in range(0, total_vectors, SCORE_BATCH_SIZE): # tqdmを内側から削除（重複表示を避けるため）
            candidate_vectors_batch = torch.tensor(
                all_db_embeddings[i : i + SCORE_BATCH_SIZE]
            ).to(device)
            query_vector_tiled = query_vector.repeat(len(candidate_vectors_batch), 1)
            scores_batch = model._calculate_score(query_vector_tiled, candidate_vectors_batch)
            all_scores.append(scores_batch.cpu().numpy())
        
        final_scores = np.vstack(all_scores).flatten()
        
        # 3. スコア順に並び替え
        sorted_indices = np.argsort(final_scores)[::-1]
        
        # 4. 最終ランキングリストを作成 (DOI)
        final_ranking_dois = [id_to_doi[idx] for idx in sorted_indices if idx in id_to_doi]

        # 5. 訓練データを除外
        filtered_ranking = [doi for doi in final_ranking_dois if doi not in train_dois]
        
        # --- 6. 採点 ---
        
        # MRR
        first_hit_rank = 0 # 0は「ヒットなし」を意味する
        for i, doi in enumerate(filtered_ranking):
            if doi in ground_truth:
                first_hit_rank = i + 1 # ランクは1始まり
                break
        
        # ▼▼▼ 修正点: 順位(R)そのものを保存 ▼▼▼
        all_first_hit_ranks.append(first_hit_rank)
            
        # Recall@k
        for k in EVAL_K_VALUES:
            hits_at_k = ground_truth.intersection(set(filtered_ranking[:k]))
            recall = len(hits_at_k) / len(ground_truth) if ground_truth else 0
            all_recalls_at_k[k].append(recall)

print("Evaluation complete.")

Starting evaluation... Scoring ALL 11,619,136 candidates.


Evaluating Queries (Total):   0%|          | 0/48 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Scoring DB:   0%|          | 0/2837 [00:00<?, ?it/s]

Evaluation complete.


In [None]:
# --- 6. 最終結果の集計 ---
print("\n" + "="*50)
print(f"--- Final Evaluation Results: S-BERT (RankNet) ---")
print(f"(Based on {len(evaluation_queries)} queries, scoring ALL {total_vectors:,} documents)")
print("="*50)

# ▼▼▼ 修正点: MRRを順位リストから計算 ▼▼▼
mrr_scores = [1.0 / r for r in all_first_hit_ranks if r > 0]
mrr = np.mean(mrr_scores) if mrr_scores else 0.0
print(f"MRR (Mean Reciprocal Rank): {mrr:.4f}")

# ▼▼▼ 追加: 各クエリの最初の正解順位を表示 ▼▼▼
print(f"Ranks of First Hit (0 = Not Found in {len(filtered_ranking)} docs):")
print(all_first_hit_ranks)

print("\n--- Recall@K ---")
for k in EVAL_K_VALUES:
    recall_k = np.mean(all_recalls_at_k[k])
    print(f"Recall@{k:<4}: {recall_k:.4f} ({(recall_k * 100):.2f}%)")
print("="*50)


--- Final Evaluation Results: S-BERT (RankNet) ---
(Based on 48 queries, scoring ALL 11,619,136 documents)
MRR (Mean Reciprocal Rank): 0.0004

--- Recall@K ---
Recall@1   : 0.0000 (0.00%)
Recall@5   : 0.0000 (0.00%)
Recall@10  : 0.0000 (0.00%)
Recall@50  : 0.0000 (0.00%)
Recall@100 : 0.0021 (0.21%)
Recall@150 : 0.0021 (0.21%)
Recall@200 : 0.0021 (0.21%)
Recall@250 : 0.0021 (0.21%)
Recall@300 : 0.0090 (0.90%)
Recall@350 : 0.0090 (0.90%)
Recall@400 : 0.0090 (0.90%)
Recall@450 : 0.0090 (0.90%)
Recall@500 : 0.0142 (1.42%)
Recall@550 : 0.0142 (1.42%)
Recall@600 : 0.0142 (1.42%)
Recall@650 : 0.0142 (1.42%)
Recall@700 : 0.0142 (1.42%)
Recall@750 : 0.0142 (1.42%)
Recall@800 : 0.0142 (1.42%)
Recall@850 : 0.0142 (1.42%)
Recall@900 : 0.0142 (1.42%)
Recall@950 : 0.0142 (1.42%)
Recall@1000: 0.0142 (1.42%)
