In [1]:
# --- 1. ライブラリのインポート ---
import pandas as pd
import os
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, BertPreTrainedModel
from transformers.modeling_outputs import SequenceClassifierOutput
import sqlite3
import json
from tqdm.auto import tqdm
import time

# --- 2. GPUの確認 ---
if torch.cuda.is_available():
    print(f"✅ GPU is available. Device: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda")
else:
    print("⚠️ GPU not found. Running on CPU.")
    device = torch.device("cpu")

# --- 3. 訓練時と同じモデルクラスの定義 ---
class SiameseRankNetModel(BertPreTrainedModel):
    def __init__(self, config):
        super(SiameseRankNetModel, self).__init__(config)
        self.bert = AutoModel.from_config(config)
        self.classifier_head = nn.Sequential(
            nn.Linear(config.hidden_size * 4, config.hidden_size),
            nn.ReLU(),
            nn.Linear(config.hidden_size, 1)
        )
        # self.init_weights() # from_pretrainedを使うので不要

    def _get_vector(self, input_ids, attention_mask):
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return output.pooler_output 

    def _calculate_score(self, vec_a, vec_b):
        diff = torch.abs(vec_a - vec_b)
        prod = vec_a * vec_b
        features = torch.cat([vec_a, vec_b, diff, prod], dim=1)
        return self.classifier_head(features)

    def forward(self, input_ids, attention_mask=None, **kwargs):
        # 推論用に、単一のベクトルを返すシンプルなforwardに修正
        return self._get_vector(input_ids, attention_mask)

print("Custom model class 'SiameseRankNetModel' defined.")

✅ GPU is available. Device: NVIDIA RTX A6000
Custom model class 'SiameseRankNetModel' defined.


In [2]:
# --- 4. パスと設定 ---

# --- 入力ファイルパス ---
DB_PATH = "data/processed/s2orc_filtered.db"
TRAINED_MODEL_PATH = "models/sbert_ranknet_v3/best_model"
EMBEDDINGS_FILE = "data/processed/ranknet_scibert_cls_embeddings.npy"
DOI_MAP_FILE = "data/processed/ranknet_doi_map.json"
EVAL_PAPERS_FILE = "data/datapapers/sampled/evaluation_data_papers_50.csv"
TRAIN_FILE = "data/processed/training_dataset_hard_negatives_1to3.csv"

# --- 出力ファイルパス ---
RICH_RESULTS_FILE = "data/processed/ranknet_evaluation_results_v2.json"

# --- モデルと評価のハイパーパラメータ ---
MAX_LENGTH = 512
EMBEDDING_DIM = 768 # SciBERT-baseの隠れ層サイズ
EVAL_BATCH_SIZE = 4096  #  vektor化時のバッチサイズ
SCORE_BATCH_SIZE = 131072 # スコア計算時のバッチサイズ (GPUメモリに応じて調整)

# 評価するkの値
# ランキング上位の精度を見るための小さいk
small_k_values = [1, 5, 10, 30, 50, 100, 200, 300, 500]
# 網羅性を見るための大きいk (1000刻みで30000まで)
large_k_values = list(range(1000, 30001, 1000))

EVAL_K_VALUES = sorted(list(set(small_k_values + large_k_values)))

# SAVE_TOP_K も評価の最大値に合わせる
SAVE_TOP_K = 30000 

print(f"k values for evaluation: {EVAL_K_VALUES}")
print(f"Top k results to save: {SAVE_TOP_K}")
print("Paths and settings are configured.")

k values for evaluation: [1, 5, 10, 30, 50, 100, 200, 300, 500, 1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000, 11000, 12000, 13000, 14000, 15000, 16000, 17000, 18000, 19000, 20000, 21000, 22000, 23000, 24000, 25000, 26000, 27000, 28000, 29000, 30000]
Top k results to save: 30000
Paths and settings are configured.


In [None]:
# --- 5. リソースのロード ---
print(f"Loading tokenizer & model from: {TRAINED_MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(TRAINED_MODEL_PATH)
model = SiameseRankNetModel.from_pretrained(TRAINED_MODEL_PATH).to(device)
model.eval()

print(f"Loading DOI map from {DOI_MAP_FILE}...")
with open(DOI_MAP_FILE, 'r') as f:
    doi_to_index = json.load(f)
index_to_doi = {v: k for k, v in doi_to_index.items()}

print(f"Loading embeddings with mmap_mode from {EMBEDDINGS_FILE}...")
# memmapで1次元配列として読み込む
raw_embeddings = np.memmap(EMBEDDINGS_FILE, dtype=np.float32, mode='r')

# 論文の総数を正しく計算
total_papers = raw_embeddings.shape[0] // EMBEDDING_DIM
print(f"Total elements: {raw_embeddings.shape[0]:,}, Total papers: {total_papers:,}")

# 正しい形状にreshape
corpus_embeddings = raw_embeddings.reshape(total_papers, EMBEDDING_DIM)
print(f"Embeddings reshaped to: {corpus_embeddings.shape}")

# これ以降のコードで使うのは total_papers
total_vectors = total_papers 
print(f"Loaded {total_vectors:,} embeddings.")

Loading tokenizer & model from: models/sbert_ranknet_v3/best_model
Loading DOI map from data/processed/ranknet_doi_map.json...
Loading embeddings with mmap_mode from data/processed/ranknet_scibert_cls_embeddings.npy...


ValueError: cannot reshape array of size 8923496448 into shape (8923496448,768)

In [None]:
# --- 6. 評価データ（クエリと正解）の準備 ---
print("Preparing evaluation queries and ground truth...")

df_train = pd.read_csv(TRAIN_FILE)
train_dois = set(df_train['abstract_a']) | set(df_train['abstract_b'])
# 訓練データDOIのインデックスセットを事前に作成
train_indices = {doi_to_index[doi] for doi in train_dois if doi in doi_to_index}
print(f"Loaded {len(train_dois):,} training DOIs to exclude.")

df_eval_papers = pd.read_csv(EVAL_PAPERS_FILE)
eval_data_paper_dois = tuple(df_eval_papers['cited_datapaper_doi'].unique())

evaluation_queries = [] 
with sqlite3.connect(DB_PATH) as conn:
    for data_paper_doi in tqdm(eval_data_paper_dois, desc="Building Ground Truth"):
        query_gt = "SELECT citing_doi FROM positive_candidates WHERE cited_datapaper_doi = ? AND human_annotation_status = 1"
        gt_rows = conn.execute(query_gt, (data_paper_doi,)).fetchall()
        ground_truth_dois = {row[0] for row in gt_rows}
        
        if len(ground_truth_dois) >= 2:
            query_doi = ground_truth_dois.pop() 
            query_text_res = conn.execute("SELECT abstract FROM papers WHERE doi = ?", (query_doi,)).fetchone()
            if query_text_res and query_text_res[0]:
                evaluation_queries.append({
                    "query_doi": query_doi,
                    "query_abstract": query_text_res[0],
                    "ground_truth_dois": list(ground_truth_dois)
                })

print(f"Prepared {len(evaluation_queries)} valid evaluation queries.")

In [None]:
# --- 7. 評価の実行 ---
print(f"Starting evaluation for {len(evaluation_queries)} queries...")

all_query_results = []
total_start_time = time.time()
max_rank_to_check = max(max(EVAL_K_VALUES), SAVE_TOP_K)

with torch.no_grad():
    for query_data in tqdm(evaluation_queries, desc="Evaluating Queries"):
        
        # 1. クエリをベクトル化
        inputs = tokenizer(query_data["query_abstract"], padding=True, truncation=True, max_length=MAX_LENGTH, return_tensors="pt").to(device)
        query_vector = model(**inputs)

        # 2. 全候補論文とのスコアをバッチ計算
        all_scores = []
        for i in range(0, total_vectors, SCORE_BATCH_SIZE):
            candidate_vectors_batch = torch.from_numpy(corpus_embeddings[i : i + SCORE_BATCH_SIZE]).to(device)
            query_vector_tiled = query_vector.expand(len(candidate_vectors_batch), -1)
            scores_batch = model._calculate_score(query_vector_tiled, candidate_vectors_batch)
            all_scores.append(scores_batch.cpu())
        
        final_scores = torch.cat(all_scores, dim=0).flatten().numpy()

        # 3. スコア順に並び替え
        sorted_indices = np.argsort(final_scores)[::-1]
        
        # 4. 評価とリッチな結果の保存
        ground_truth_indices = {doi_to_index[doi] for doi in query_data["ground_truth_dois"] if doi in doi_to_index}
        
        ranks_of_hits = []
        top_k_results = []
        rank_counter = 0

        for doc_index in sorted_indices:
            if doc_index in train_indices:
                continue
            
            rank_counter += 1
            is_correct = doc_index in ground_truth_indices
            
            if is_correct:
                ranks_of_hits.append(rank_counter)
            
            if rank_counter <= SAVE_TOP_K:
                top_k_results.append({
                    "rank": rank_counter,
                    "doi": index_to_doi.get(int(doc_index), "N/A"),
                    "score": float(final_scores[doc_index]),
                    "is_correct": is_correct
                })
            
            # 探索打ち切り判定
            if rank_counter >= max_rank_to_check:
                # 全ての正解を見つけた場合、さらに早く打ち切る
                if len(ranks_of_hits) == len(ground_truth_indices):
                    break
        
        all_query_results.append({
            "query_doi": query_data["query_doi"],
            "ground_truth_dois": query_data["ground_truth_dois"],
            "ranks_of_hits": sorted(ranks_of_hits),
            "top_k_results": top_k_results
        })

print(f"\nEvaluation finished. Total time: {(time.time() - total_start_time)/60:.2f} minutes.")

In [None]:
# --- 8. 最終結果の集計と保存 ---

# 8-1. リッチな結果をJSONファイルに保存
print(f"Saving rich evaluation results to {RICH_RESULTS_FILE}...")
with open(RICH_RESULTS_FILE, 'w') as f:
    json.dump(all_query_results, f, indent=2)
print("Rich results saved.")

# 8-2. 平均スコアのサマリーを計算
mrr_scores = []
recall_at_k_scores = {k: [] for k in EVAL_K_VALUES}

for res in all_query_results:
    ranks = res["ranks_of_hits"]
    
    # MRRの計算 (最初のヒットの順位)
    if ranks:
        mrr_scores.append(1.0 / ranks[0])
    else:
        mrr_scores.append(0.0)
    
    # Recall@kの計算
    for k in EVAL_K_VALUES:
        # 上位kに1つでも正解があればヒット (Recall=1)
        if ranks and any(r <= k for r in ranks):
            recall_at_k_scores[k].append(1.0)
        else:
            recall_at_k_scores[k].append(0.0)

# 平均を計算
final_mrr = np.mean(mrr_scores)
final_recall_at_k = {k: np.mean(scores) for k, scores in recall_at_k_scores.items()}

# --- 9. 結果の表示 ---
print("\n" + "="*50)
print(f"--- Final Evaluation Results: S-BERT (RankNet) ---")
print(f"(Based on {len(evaluation_queries)} queries)")
print("="*50)

print(f"MRR (Mean Reciprocal Rank): {final_mrr:.4f}\n")
print("--- Recall@K ---")
df_results = pd.DataFrame(columns=['Metric', 'Value'])
for k, recall in final_recall_at_k.items():
    metric_name = f"Recall@{k}"
    print(f"{metric_name:<10}: {recall:.4f} ({(recall * 100):.2f}%)")
    df_results.loc[len(df_results)] = [metric_name, recall]

df_results.loc[len(df_results)] = ['MRR', final_mrr]
print("="*50)

# (参考) 正解が1つも見つからなかったクエリの数
not_found_count = len([res for res in all_query_results if not res["ranks_of_hits"]])
print(f"Queries where NO hit was found: {not_found_count} / {len(evaluation_queries)}")

print("\n--- Results Table ---")
display(df_results)