In [1]:
import json
import os
import sqlite3

import accelerate
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from datasets import Dataset, DatasetDict
from tqdm.auto import tqdm
from transformers import (
    AutoModel,
    AutoTokenizer,
    BertPreTrainedModel,
    Trainer,
    TrainingArguments,
)
from transformers.modeling_outputs import SequenceClassifierOutput

# --- 1. Select the compute device ---
# Use CUDA when PyTorch reports a GPU, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"✅ GPU is available. Device: {torch.cuda.get_device_name(0)}")
else:
    print("⚠️ GPU not found. Running on CPU.")

✅ GPU is available. Device: NVIDIA RTX A6000


In [2]:
# --- 2. Configuration ---

# Inputs: the SQLite database that is read, the directory of the trained
# S-BERT (contrastive) model, and the base checkpoint name.
DB_PATH = "data/processed/s2orc_filtered.db"
TRAINED_MODEL_PATH = "models/sbert_contrastive_v1/best_model"
MODEL_CHECKPOINT = "allenai/scibert_scivocab_uncased"

# Outputs: the embedding matrix and the DOI -> row-index map.
EMBEDDINGS_OUTPUT_FILE = "data/processed/embeddings_sbert_contrastive.npy"
DOI_MAP_OUTPUT_FILE = "data/processed/embeddings_doi_map.json"

# Model hyperparameters.
MAX_LENGTH = 512
# This is inference, so a larger batch size than during training can be used.
BATCH_SIZE = 128

print("Configuration set for embedding generation.")

Configuration set for embedding generation.


In [3]:
# --- 3. Custom model class (CLS pooling) ---
# (Stated to be the same definition as in notebooks/18c...)

class SiameseContrastiveModel(BertPreTrainedModel):
    """
    S-BERT (contrastive) siamese model using CLS pooling.

    A single encoder, stored as `self.bert`, is shared by both sides of a
    text pair. Each text is represented by the encoder's `pooler_output`.
    """

    def __init__(self, config):
        """Build the shared encoder from `config` and initialise its weights."""
        super().__init__(config)
        self.bert = AutoModel.from_config(config)
        self.init_weights()

    def get_embedding(self, input_ids, attention_mask):
        """
        Encode one batch of abstracts and return the encoder's `pooler_output`.
        """
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return encoded.pooler_output

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        input_ids_b=None,
        attention_mask_b=None,
        labels=None,
        **kwargs
    ):
        """
        Training-style forward pass over a pair of inputs (not used directly
        for evaluation).

        Both sides go through the same encoder. `labels` and any extra keyword
        arguments are accepted but ignored. The result carries `loss=None` and
        the two pooled vectors as a tuple in `logits`.
        """
        vec_x = self.get_embedding(input_ids, attention_mask)
        vec_y = self.get_embedding(input_ids_b, attention_mask_b)
        return SequenceClassifierOutput(loss=None, logits=(vec_x, vec_y))

print("Custom model class 'SiameseContrastiveModel' defined.")

Custom model class 'SiameseContrastiveModel' defined.


In [4]:
# --- 4. Load the tokenizer and the trained model ---
print(f"Loading tokenizer from: {MODEL_CHECKPOINT}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

print(f"Loading trained model from: {TRAINED_MODEL_PATH}")
# Restore the trained weights, then place the model on the selected device.
model = SiameseContrastiveModel.from_pretrained(TRAINED_MODEL_PATH)
model = model.to(device)
# Required: switch the model to evaluation mode before running inference.
model.eval()
print("Model and tokenizer loaded successfully.")

Loading tokenizer from: allenai/scibert_scivocab_uncased




Loading trained model from: models/sbert_contrastive_v1/best_model
Model and tokenizer loaded successfully.


In [5]:
# --- 5. Reading data from the database (generator) ---

def get_abstract_batches(db_path, batch_size=1000):
    """
    Yield (doi, abstract) rows from the `papers` table in batches.

    Only rows whose abstract is not NULL are read. Progress is shown with a
    tqdm bar that advances once per row.

    Args:
        db_path: Path of the SQLite database to read.
        batch_size: Number of rows per yielded batch.

    Yields:
        A list of (doi, abstract) tuples. The final batch may be shorter than
        `batch_size`.
    """
    print(f"Opening database connection: {db_path}")
    # `with sqlite3.connect(...)` only commits or rolls back the transaction and
    # leaves the connection open. Close it explicitly instead, which also covers
    # the case where the consumer stops iterating early (e.g. an interrupted run).
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()

        # Row count for the progress bar. COUNT(*) is used so the total matches
        # exactly the rows returned by the query below, including NULL DOIs.
        total_rows = cursor.execute(
            "SELECT COUNT(*) FROM papers WHERE abstract IS NOT NULL"
        ).fetchone()[0]
        print(f"Total abstracts to process: {total_rows:,}")

        cursor.execute("SELECT doi, abstract FROM papers WHERE abstract IS NOT NULL")

        batch = []
        for row in tqdm(cursor, total=total_rows, desc="Reading Abstracts"):
            batch.append(row)
            if len(batch) >= batch_size:
                yield batch
                batch = []
        # Emit whatever is left after the last full batch.
        if batch:
            yield batch
    finally:
        conn.close()

print("Database generator defined.")

Database generator defined.


In [6]:
# --- 6. Embed every abstract ---
all_embeddings = []
all_dois = []

# Inference only: torch.no_grad() disables gradient tracking, which saves
# memory and time.
with torch.no_grad():
    # Rows are read from the database 1000 at a time.
    for batch in get_abstract_batches(DB_PATH, batch_size=1000):

        dois, abstracts = zip(*batch)

        # Run the encoder on BATCH_SIZE texts at a time instead of pushing the
        # whole 1000-row chunk through one forward pass.
        for start in range(0, len(abstracts), BATCH_SIZE):
            texts = list(abstracts[start:start + BATCH_SIZE])

            # padding=True pads only to the longest sequence in this mini-batch;
            # truncation still caps every sequence at MAX_LENGTH tokens.
            inputs = tokenizer(
                texts,
                padding=True,
                truncation=True,
                max_length=MAX_LENGTH,
                return_tensors="pt"
            ).to(device)

            # The model's own helper returns the encoder's pooler_output.
            embeddings = model.get_embedding(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask']
            )

            # Move the result back to the CPU as a NumPy array. DOIs are
            # appended per mini-batch so both lists stay aligned even if the
            # run is interrupted.
            all_embeddings.append(embeddings.cpu().numpy())
            all_dois.extend(dois[start:start + BATCH_SIZE])

# Merge the per-batch arrays into a single matrix.
print("Concatenating all embeddings...")
if all_embeddings:
    final_embeddings = np.vstack(all_embeddings)
else:
    # np.vstack raises on an empty list; produce an empty matrix instead.
    final_embeddings = np.empty((0, model.config.hidden_size), dtype=np.float32)

print(f"Embedding generation complete. Shape: {final_embeddings.shape}")

Opening database connection: data/processed/s2orc_filtered.db
Total abstracts to process: 11,619,136


Reading Abstracts:   0%|          | 0/11619136 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# --- 7. Save the embedding matrix and the DOI map ---

# Each embedding row needs exactly one DOI, otherwise the index map is wrong.
if len(all_dois) != final_embeddings.shape[0]:
    raise ValueError(
        f"DOI count ({len(all_dois)}) does not match embedding rows "
        f"({final_embeddings.shape[0]})."
    )

# Create the output directories first so the results of a long embedding run
# are not lost to a missing folder.
for output_path in (EMBEDDINGS_OUTPUT_FILE, DOI_MAP_OUTPUT_FILE):
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

# Save the vectors as a NumPy file.
print(f"Saving embeddings to {EMBEDDINGS_OUTPUT_FILE}...")
np.save(EMBEDDINGS_OUTPUT_FILE, final_embeddings)

# Map each DOI to its row index in the saved array. If a DOI occurs more than
# once, the last occurrence wins, so report how many rows are not addressable.
doi_to_index_map = {doi: i for i, doi in enumerate(all_dois)}
duplicate_count = len(all_dois) - len(doi_to_index_map)
if duplicate_count:
    print(f"⚠️ {duplicate_count} rows share a DOI with another row; the map keeps the last index for each DOI.")

print(f"Saving DOI-to-Index map to {DOI_MAP_OUTPUT_FILE}...")
with open(DOI_MAP_OUTPUT_FILE, 'w', encoding="utf-8") as f:
    json.dump(doi_to_index_map, f)

print("\n--- Step 1 Complete ---")
# Report the number of rows actually written, not the number of unique DOIs.
print(f"Total embeddings saved: {final_embeddings.shape[0]}")
print(f"Unique DOIs in map: {len(doi_to_index_map)}")