In [1]:
import pandas as pd
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel, BertPreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput
import accelerate
import sqlite3
import json
from tqdm.auto import tqdm

# --- 1. GPUの確認 ---
if torch.cuda.is_available():
    print(f"✅ GPU is available: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda")
else:
    print("⚠️ GPU not found. Running on CPU.")
    device = torch.device("cpu")

✅ GPU is available: NVIDIA RTX A6000


In [2]:
# --- 2. カスタムモデルクラスの定義 ---
class SiameseRankNetModel(BertPreTrainedModel):
    """
    S-BERT (RankNet) モデル
    """
    def __init__(self, config):
        super(SiameseRankNetModel, self).__init__(config)
        self.bert = AutoModel.from_config(config)
        self.classifier_head = nn.Sequential(
            nn.Linear(config.hidden_size * 4, config.hidden_size),
            nn.ReLU(),
            nn.Linear(config.hidden_size, 1) # 最終的に1つのスコアを出力
        )
        self.init_weights()
    
    def _get_vector(self, input_ids, attention_mask):
        """SciBERTとCLSプーリングを実行するヘルパー関数"""
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return output.pooler_output # CLSトークンのベクトル

    def _calculate_score(self, vec_a, vec_b):
        """2つのベクトルから1つのスコアを計算するヘルパー関数"""
        diff = torch.abs(vec_a - vec_b)
        prod = vec_a * vec_b
        features = torch.cat([vec_a, vec_b, diff, prod], dim=1)
        return self.classifier_head(features)

    def forward(self, input_ids=None, **kwargs):
        pass # 推論時には不要

print("Custom model class 'SiameseRankNetModel' defined.")

Custom model class 'SiameseRankNetModel' defined.


In [3]:
# --- 3. 設定とリソースのロード ---
DB_PATH = "data/processed/s2orc_filtered.db"

# 訓練済みモデル
TRAINED_MODEL_PATH = "models/sbert_ranknet_v4/best_model"

# ベクトルDBとマップファイル (v4モデルで作成したもの)
EMBEDDINGS_FILE = "data/processed/ranknet_v4_scibert_cls_embeddings.npy"
DOI_MAP_FILE = "data/processed/ranknet_v4_doi_map.json"

# 評価データ
EVAL_PAPERS_FILE = "data/datapapers/sampled/evaluation_data_papers_50_v2.csv"
TRAIN_FILE = "data/processed/training_dataset_hard_negatives_1to3.csv"

# 出力ファイル
RICH_RESULTS_FILE = "data/processed/ranknet_v4_evaluation_results.json"

# 評価するkの値
# ランキング上位の精度を見るための小さいk
small_k_values = [1, 5, 10, 30, 50, 100, 200, 300, 500]
# 網羅性を見るための大きいk (1000刻みで30000まで)
large_k_values = list(range(1000, 30001, 1000))

EVAL_K_VALUES = sorted(list(set(small_k_values + large_k_values)))

# SAVE_TOP_K も評価の最大値に合わせる
SAVE_TOP_K = 30000 

MAX_LENGTH = 512

print(f"Loading tokenizer & model from: {TRAINED_MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(TRAINED_MODEL_PATH)
model = SiameseRankNetModel.from_pretrained(TRAINED_MODEL_PATH).to(device)
model.eval()

print(f"Loading DOI map from {DOI_MAP_FILE}...")
with open(DOI_MAP_FILE, 'r') as f:
    doi_to_index = json.load(f)
id_to_doi = {v: k for k, v in doi_to_index.items()}

print(f"Loading Embeddings (mmap_mode): {EMBEDDINGS_FILE}")
d = 768
file_size = os.path.getsize(EMBEDDINGS_FILE)
dtype_size = np.dtype(np.float32).itemsize
total_vectors = file_size // (d * dtype_size)

# メモリマップでロード
all_db_embeddings = np.memmap(
    EMBEDDINGS_FILE, dtype=np.float32, mode='r', shape=(total_vectors, d)
)
print(f"Loaded {total_vectors:,} embeddings.")

Loading tokenizer & model from: models/sbert_ranknet_v4/best_model
Loading DOI map from data/processed/ranknet_v4_doi_map.json...
Loading Embeddings (mmap_mode): data/processed/ranknet_v4_scibert_cls_embeddings.npy
Loaded 11,619,136 embeddings.


In [4]:
# --- 4. 評価データ（クエリと正解）の準備 ---
print("Preparing evaluation queries and ground truth...")

# 1. 訓練データ（除外リスト）のDOIをロード
df_train = pd.read_csv(TRAIN_FILE)
df_train = df_train.dropna(subset=['abstract_a', 'abstract_b'])
train_dois = set(df_train['abstract_a']) | set(df_train['abstract_b'])
print(f"Loaded {len(train_dois):,} training DOIs to exclude.")

# 2. 評価用データ論文（50件）のリストをロード
df_eval_papers = pd.read_csv(EVAL_PAPERS_FILE)
eval_data_paper_dois = tuple(df_eval_papers['cited_datapaper_doi'].unique())

# 3. 「クエリ論文」と「正解DOIリスト」を作成
evaluation_queries = [] 

with sqlite3.connect(DB_PATH) as conn:
    for data_paper_doi in tqdm(eval_data_paper_dois, desc="Building Ground Truth"):
        query_gt = "SELECT citing_doi FROM positive_candidates WHERE cited_datapaper_doi = ? AND human_annotation_status = 1"
        gt_rows = conn.execute(query_gt, (data_paper_doi,)).fetchall()
        ground_truth_dois = {row[0] for row in gt_rows}
        
        if len(ground_truth_dois) >= 2:
            query_doi = ground_truth_dois.pop() # 1件をクエリとして使用
            query_text = conn.execute("SELECT abstract FROM papers WHERE doi = ?", (query_doi,)).fetchone()
            
            if query_text:
                evaluation_queries.append({
                    "query_doi": query_doi,
                    "query_abstract": query_text[0],
                    "ground_truth_dois": list(ground_truth_dois)
                })

print(f"Prepared {len(evaluation_queries)} valid evaluation queries.")

Preparing evaluation queries and ground truth...
Loaded 2,338 training DOIs to exclude.


Building Ground Truth:   0%|          | 0/50 [00:00<?, ?it/s]

Prepared 50 valid evaluation queries.


In [5]:
# --- 5. 評価の実行 (全DBスコア計算) ---
print(f"Starting evaluation... Scoring ALL {total_vectors:,} candidates.")

# 1. 全DBのベクトルをGPUにロード (33GB)
# (これは成功しているのでそのまま維持)
print("Loading ALL embeddings into GPU memory...")
all_db_embeddings_gpu = torch.tensor(all_db_embeddings).to(device)
print(f"Successfully loaded {all_db_embeddings_gpu.shape} tensor onto GPU.")

all_first_hit_ranks = [] 
all_recalls_at_k = {k: [] for k in EVAL_K_VALUES}
all_rich_results = [] 

# ▼▼▼ 修正点: 計算時のバッチサイズを安全な値に設定 ▼▼▼
# (100万件なら一時メモリは約3GBで済み、残り14GBの空きに十分収まる)
CALC_BATCH_SIZE = 100000 

with torch.no_grad():
    for query_data in tqdm(evaluation_queries, desc="Evaluating Queries (Total)"):
        
        query_abstract = query_data["query_abstract"]
        ground_truth = set(query_data["ground_truth_dois"])

        # 1. クエリをベクトル化
        inputs = tokenizer(query_abstract, padding="max_length", truncation=True, max_length=MAX_LENGTH, return_tensors="pt").to(device)
        query_vector = model._get_vector(inputs['input_ids'], inputs['attention_mask'])

        # --- 2. 全DB（1160万件）のスコアをバッチ処理で計算 ---
        all_scores = []
        
        # ▼▼▼ 修正点: 固定バッチサイズでループ ▼▼▼
        for i in range(0, total_vectors, CALC_BATCH_SIZE):
            end = min(i + CALC_BATCH_SIZE, total_vectors)
            
            # DBベクトルからバッチ分をスライス (これはViewなのでメモリを食わない)
            candidates_chunk = all_db_embeddings_gpu[i:end]
            
            # クエリベクトルを複製 (ここでVRAMを使用)
            query_vector_tiled = query_vector.repeat(len(candidates_chunk), 1)

            # スコア計算
            scores_chunk = model._calculate_score(query_vector_tiled, candidates_chunk)
            
            # 結果をCPUに移してリストに追加 (VRAM解放)
            all_scores.append(scores_chunk.cpu().numpy())
            
            # 明示的に削除してVRAM確保
            del query_vector_tiled
            del scores_chunk
        
        # 全スコアを結合
        final_scores = np.vstack(all_scores).flatten()
        
        # 3. スコア順に並び替え
        sorted_indices = np.argsort(final_scores)[::-1] # 降順
        
        # 4. 採点 と 結果保存
        first_hit_rank = 0 
        hits_count = 0
        hits_at_k = {k: 0 for k in EVAL_K_VALUES}
        top_k_results_list = []
        ranks_of_all_hits = []
        current_rank = 1
        
        for idx in sorted_indices:
            if current_rank > SAVE_TOP_K and hits_count == len(ground_truth):
                break
                
            if idx not in id_to_doi: continue
            doi = id_to_doi[idx]
            if doi in train_dois: continue
            
            is_correct = (doi in ground_truth)
            
            if is_correct:
                hits_count += 1
                ranks_of_all_hits.append(current_rank)
                if first_hit_rank == 0:
                    first_hit_rank = current_rank
            
            for k in EVAL_K_VALUES:
                if current_rank <= k and is_correct:
                    hits_at_k[k] += 1
            
            if current_rank <= SAVE_TOP_K:
                top_k_results_list.append({
                    "rank": current_rank,
                    "doi": doi,
                    "score": float(final_scores[idx]),
                    "is_correct": is_correct
                })
            
            current_rank += 1 

        # 5. 集計
        all_first_hit_ranks.append(first_hit_rank)
            
        for k in EVAL_K_VALUES:
            recall = hits_at_k[k] / len(ground_truth) if ground_truth else 0
            all_recalls_at_k[k].append(recall)
            
        all_rich_results.append({
            "query_doi": query_data["query_doi"],
            "total_ground_truth": len(ground_truth),
            "ground_truth_dois": list(ground_truth),
            "first_hit_rank": first_hit_rank,
            "ranks_of_all_hits": ranks_of_all_hits,
            "top_k_results": top_k_results_list
        })

print("Evaluation complete.")

Starting evaluation... Scoring ALL 11,619,136 candidates.
Loading ALL embeddings into GPU memory...
Successfully loaded torch.Size([11619136, 768]) tensor onto GPU.


Evaluating Queries (Total):   0%|          | 0/50 [00:00<?, ?it/s]

Evaluation complete.


In [6]:
# --- 6. 最終結果の集計 ---

print(f"Saving rich evaluation results to {RICH_RESULTS_FILE}...")
with open(RICH_RESULTS_FILE, 'w') as f:
    json.dump(all_rich_results, f, indent=2)
print("Rich results saved.")

print("\n" + "="*50)
print(f"--- Final Evaluation Results: S-BERT (RankNet v4) ---")
print(f"(Based on {len(evaluation_queries)} queries, scoring ALL {total_vectors:,} documents)")
print("="*50)

# MRR
mrr_scores = [1.0 / r for r in all_first_hit_ranks if r > 0]
mrr = np.mean(mrr_scores) if mrr_scores else 0.0
print(f"MRR (Mean Reciprocal Rank): {mrr:.4f}")

print(f"Ranks of First Hit (0 = Not Found):")
print(all_first_hit_ranks)

print("\n--- Recall@K ---")
for k in EVAL_K_VALUES:
    recall_k = np.mean(all_recalls_at_k[k])
    print(f"Recall@{k:<4}: {recall_k:.4f} ({(recall_k * 100):.2f}%)")
print("="*50)

not_found_count = len([r for r in all_first_hit_ranks if r == 0])
print(f"Queries where first hit was NOT found: {not_found_count} / {len(evaluation_queries)}")

Saving rich evaluation results to data/processed/ranknet_v4_evaluation_results.json...
Rich results saved.

--- Final Evaluation Results: S-BERT (RankNet v4) ---
(Based on 50 queries, scoring ALL 11,619,136 documents)
MRR (Mean Reciprocal Rank): 0.0003
Ranks of First Hit (0 = Not Found):
[1122, 1629, 496, 6816, 16003, 403, 84983, 83030, 3072, 47930, 414548, 202572, 7744, 113385, 146150, 558062, 213067, 3522, 17964, 14237, 660, 2013739, 414, 2485, 8723, 102664, 825798, 228892, 225148, 9235003, 10853678, 205815, 3436, 744877, 162885, 3253, 91634, 605134, 60003, 258781, 234485, 5317290, 122220, 74552, 293573, 114330, 442, 555053, 21202, 407755]

--- Recall@K ---
Recall@1   : 0.0000 (0.00%)
Recall@5   : 0.0000 (0.00%)
Recall@10  : 0.0000 (0.00%)
Recall@30  : 0.0000 (0.00%)
Recall@50  : 0.0000 (0.00%)
Recall@100 : 0.0000 (0.00%)
Recall@200 : 0.0000 (0.00%)
Recall@300 : 0.0000 (0.00%)
Recall@500 : 0.0178 (1.78%)
Recall@1000: 0.0305 (3.05%)
Recall@2000: 0.0324 (3.24%)
Recall@3000: 0.0391 (3.9