In [1]:
# Cell 1: imports and device selection
import pandas as pd
import os
import numpy as np
import torch
import torch.nn as nn
from transformers import AutoTokenizer, AutoModel, BertPreTrainedModel
import sqlite3
from tqdm.auto import tqdm
import json
import shutil

# Run on the first CUDA GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cuda":
    print(f"✅ GPU is available: {torch.cuda.get_device_name(0)}")

# Cell 2: model definition (for loading)
class SiameseRankNetModel(BertPreTrainedModel):
    """Encoder plus scoring head, declared so the trained checkpoint can be loaded.

    Only the encoder is exercised in this script (through `_get_vector`);
    `classifier_head` is constructed but never called here.
    """

    def __init__(self, config):
        """Build the encoder and the scoring head from `config`."""
        super().__init__(config)
        hidden = config.hidden_size
        self.bert = AutoModel.from_config(config)
        # The head takes an input 4x the hidden size; how those four parts are
        # assembled is not shown in this file (presumably during training).
        self.classifier_head = nn.Sequential(
            nn.Linear(4 * hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, 1),
        )
        self.init_weights()

    def _get_vector(self, input_ids, attention_mask):
        """Return the encoder's `pooler_output` for the given token batch."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return encoded.pooler_output

    def forward(self, input_ids=None, **kwargs):
        """No-op placeholder; this script only calls `_get_vector`."""
        return None

# Cell 3: configuration
# SQLite database passed to get_abstract_batches() as the source of abstracts.
DB_PATH = "data/processed/s2orc_filtered.db"
# ▼▼▼ Path of the model trained this time ▼▼▼
# Used for both AutoTokenizer.from_pretrained and SiameseRankNetModel.from_pretrained.
TRAINED_MODEL_PATH = "models/sbert_ranknet_v4/best_model" 

# Final outputs: the merged float32 embedding matrix and the JSON DOI -> row-index map.
EMBEDDINGS_OUTPUT_FILE = "data/processed/ranknet_v4_scibert_cls_embeddings.npy"
DOI_MAP_OUTPUT_FILE = "data/processed/ranknet_v4_doi_map.json"
# Scratch directories holding one file per database batch; removed after merging.
TEMP_EMBED_DIR = "data/processed/embeddings_tmp_ranknet_v4"
TEMP_DOI_DIR = "data/processed/dois_tmp_ranknet_v4"

# Tokenizer max_length; abstracts are truncated to this many tokens.
MAX_LENGTH = 512
# Number of abstracts tokenized and encoded per model call.
INFERENCE_BATCH_SIZE = 512

# Load the tokenizer and the trained weights, move the model to the selected
# device, and switch it to evaluation mode for inference.
print(f"Loading tokenizer & model from: {TRAINED_MODEL_PATH}")
tokenizer = AutoTokenizer.from_pretrained(TRAINED_MODEL_PATH)
model = SiameseRankNetModel.from_pretrained(TRAINED_MODEL_PATH)
model.to(device)  # nn.Module.to moves parameters in place and returns the module
model.eval()

# Cell 4: database-reading generator
def get_abstract_batches(db_path, batch_size=1000):
    """Stream (doi, abstract) rows from the `papers` table in batches.

    Only rows whose abstract is neither NULL nor the empty string are read.

    Args:
        db_path: Path to the SQLite database file.
        batch_size: Number of rows collected before a batch is yielded.

    Yields:
        First, a single int: the result of COUNT(doi) over the filtered rows
        (callers use it to size a progress bar). After that, lists of
        (doi, abstract) tuples; every list has `batch_size` rows except
        possibly the last one.
    """
    from contextlib import closing

    where_clause = "WHERE abstract IS NOT NULL AND abstract != ''"

    # A sqlite3 connection used directly as a context manager only commits or
    # rolls back; it does not close. closing() releases the connection when the
    # generator finishes or is discarded.
    with closing(sqlite3.connect(db_path)) as conn:
        cursor = conn.cursor()
        total_rows = cursor.execute(
            f"SELECT COUNT(doi) FROM papers {where_clause}"
        ).fetchone()[0]
        yield total_rows

        cursor.execute(f"SELECT doi, abstract FROM papers {where_clause}")
        batch = []
        for row in cursor:
            batch.append(row)
            if len(batch) >= batch_size:
                yield batch
                batch = []
        # Emit the trailing partial batch, if any.
        if batch:
            yield batch

# Cell 5: run vectorization
# Rows fetched from SQLite per batch; each batch becomes one temp .npy/.json pair.
DB_BATCH_SIZE = 1000

# Start from empty scratch directories. Batch files left behind by an earlier,
# interrupted run would otherwise be picked up and merged by the next cell.
for tmp_dir in (TEMP_EMBED_DIR, TEMP_DOI_DIR):
    if os.path.isdir(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.makedirs(tmp_dir, exist_ok=True)

with torch.no_grad():
    batch_generator = get_abstract_batches(DB_PATH, batch_size=DB_BATCH_SIZE)
    # The generator's first item is the row count, not a batch.
    total_rows = next(batch_generator)
    total_batches = (total_rows + DB_BATCH_SIZE - 1) // DB_BATCH_SIZE

    for i, batch in tqdm(enumerate(batch_generator), total=total_batches, desc="Vectorizing"):
        dois, abstracts = zip(*batch)
        batch_embeddings = []

        # Encode the database batch in sub-batches of INFERENCE_BATCH_SIZE abstracts.
        for j in range(0, len(abstracts), INFERENCE_BATCH_SIZE):
            sub_abstracts = abstracts[j : j + INFERENCE_BATCH_SIZE]
            # padding=True pads only to the longest sequence in this sub-batch
            # instead of always to MAX_LENGTH; padded positions are masked out
            # by attention_mask, so this avoids encoding needless padding.
            inputs = tokenizer(
                list(sub_abstracts),
                padding=True,
                truncation=True,
                max_length=MAX_LENGTH,
                return_tensors="pt",
            ).to(device)
            embeddings = model._get_vector(inputs['input_ids'], inputs['attention_mask'])
            batch_embeddings.append(embeddings.cpu().numpy())

        # Persist embeddings and DOIs under the same zero-padded index so the
        # merge step can pair them by file name.
        np.save(
            os.path.join(TEMP_EMBED_DIR, f"batch_{i:05d}.npy"),
            np.vstack(batch_embeddings).astype(np.float32),
        )
        with open(os.path.join(TEMP_DOI_DIR, f"batch_{i:05d}.json"), 'w') as f:
            json.dump(dois, f)

# Cell 6: merge and save
all_dois = []
doi_files = sorted(f for f in os.listdir(TEMP_DOI_DIR) if f.endswith('.json'))
if not doi_files:
    raise RuntimeError(f"No DOI batch files found in {TEMP_DOI_DIR}; nothing to merge.")

for f in tqdm(doi_files, desc="Reading DOIs"):
    with open(os.path.join(TEMP_DOI_DIR, f), 'r') as fp:
        all_dois.extend(json.load(fp))

# Take the embedding width from the saved data rather than hard-coding it, so
# the merge also works for encoders whose hidden size is not 768.
first_batch = np.load(os.path.join(TEMP_EMBED_DIR, doi_files[0].replace('.json', '.npy')))
embedding_dim = first_batch.shape[1]
del first_batch

# NOTE: np.memmap writes raw float32 values with no .npy header. Despite the
# file extension, readers must open this file with np.memmap using the same
# dtype and shape; np.load will not parse it.
final_embeddings = np.memmap(
    EMBEDDINGS_OUTPUT_FILE,
    dtype=np.float32,
    mode='w+',
    shape=(len(all_dois), embedding_dim),
)
idx = 0
for f in tqdm(doi_files, desc="Merging Embeddings"):
    data = np.load(os.path.join(TEMP_EMBED_DIR, f.replace('.json', '.npy')))
    final_embeddings[idx : idx + len(data)] = data
    idx += len(data)
final_embeddings.flush()

# Fail before the temp data is deleted if the rows written do not line up with
# the DOI list; otherwise trailing rows would silently stay zero-filled.
if idx != len(all_dois):
    raise RuntimeError(
        f"Merged {idx} embedding rows but read {len(all_dois)} DOIs; "
        "temporary files were kept for inspection."
    )

# Map each DOI to its row in the embedding matrix (a DOI that occurs more than
# once keeps the index of its last occurrence).
with open(DOI_MAP_OUTPUT_FILE, 'w') as f:
    json.dump({doi: i for i, doi in enumerate(all_dois)}, f)
shutil.rmtree(TEMP_EMBED_DIR)
shutil.rmtree(TEMP_DOI_DIR)
print("Done.")

✅ GPU is available: NVIDIA RTX A6000
Loading tokenizer & model from: models/sbert_ranknet_v4/best_model


Vectorizing:   0%|          | 0/11620 [00:00<?, ?it/s]

Reading DOIs:   0%|          | 0/11620 [00:00<?, ?it/s]

Merging Embeddings:   0%|          | 0/11620 [00:00<?, ?it/s]

Done.
