In [1]:
import pandas as pd
import os
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel, AutoConfig, BertPreTrainedModel
from tqdm import tqdm

# --- 1. 設定 ---
CORPUS_FILE = "data/raw/all_papers_corpus.csv"
ABSTRACT_COLUMN = "abstract"
PAPER_ID_COLUMN = "paper_id"
MODEL_PATH = "models/sbert_contrastive_with_head_v1/best_model" 
OUTPUT_EMBEDDINGS_FILE = "data/processed/corpus_embeddings_sbert_contrastive_head_hard_neg.npy"
PROGRESS_FILE = "data/processed/encoding_progress.log"
MAX_LENGTH = 512
BATCH_SIZE = 256
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

print(f"--- Configuration ---")
print(f"Corpus File: {CORPUS_FILE}")
print(f"Model Path: {MODEL_PATH}")
print(f"Output File: {OUTPUT_EMBEDDINGS_FILE}")
print(f"Batch Size: {BATCH_SIZE}")
print(f"Device: {DEVICE}")
print("---------------------\n")

# --- 2. 必要なクラス定義 ---
# ▼▼▼ ここに訓練スクリプトからクラス定義をコピー ▼▼▼
class SiameseContrastiveWithHeadModel(BertPreTrainedModel):
    def __init__(self, config):
        super(SiameseContrastiveWithHeadModel, self).__init__(config)
        self.bert = AutoModel.from_config(config)
        
        self.classifier_head = nn.Sequential(
            nn.Linear(config.hidden_size * 4, config.hidden_size),
            nn.ReLU(),
            nn.Linear(config.hidden_size, 1)
        )
        self.init_weights()

    def _get_vector(self, input_ids, attention_mask):
        """この関数をエンコード時に直接呼び出す"""
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        return output.pooler_output

    def forward(self, input_ids=None, attention_mask=None, **kwargs):
        # 推論時にはベクトルを返すだけで良いので、_get_vectorを直接使う
        # このforwardはTrainer用なので、直接は呼ばない
        return self._get_vector(input_ids, attention_mask)

class CorpusDataset(Dataset):
    def __init__(self, texts, tokenizer, max_len):
        self.texts = texts
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_token_type_ids=False,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten()
        }

# --- 3. エンコード関数 ---
def encode_corpus(model, dataloader, device, output_file, progress_file):
    model.eval()
    
    start_index = 0
    if os.path.exists(progress_file):
        with open(progress_file, 'r') as f:
            try:
                start_index = int(f.read().strip())
                print(f"Resuming from index {start_index}...")
            except ValueError:
                start_index = 0
    
    total_samples = len(dataloader.dataset)
    embedding_dim = model.config.hidden_size
    embeddings_mmap = np.memmap(output_file, dtype=np.float32, mode='w+', shape=(total_samples, embedding_dim))
    
    processed_batches = start_index // dataloader.batch_size
    
    try:
        with torch.no_grad():
            pbar = tqdm(enumerate(dataloader), total=len(dataloader), initial=processed_batches)
            for i, batch in pbar:
                if i < processed_batches:
                    continue

                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                
                with torch.cuda.amp.autocast():
                    # ▼▼▼ _get_vectorを直接呼び出すように変更 ▼▼▼
                    embeddings = model._get_vector(input_ids=input_ids, attention_mask=attention_mask)

                start = i * dataloader.batch_size
                end = start + embeddings.shape[0]
                embeddings_mmap[start:end] = embeddings.cpu().numpy()
                
                if (i + 1) % 100 == 0:
                    embeddings_mmap.flush()
                    with open(progress_file, 'w') as f:
                        f.write(str(end))

    except Exception as e:
        print(f"\nAn error occurred: {e}")
    finally:
        print("\nFlushing final embeddings to disk...")
        embeddings_mmap.flush()
        with open(progress_file, 'w') as f:
            f.write(str(total_samples))
        print("Encoding process finished.")

# --- 4. メイン処理 ---
if __name__ == '__main__':
    print("Loading model and tokenizer...")
    model = SiameseContrastiveWithHeadModel.from_pretrained(MODEL_PATH).to(DEVICE)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
    print("Model and tokenizer loaded.")
    
    print(f"Loading corpus from {CORPUS_FILE}...")
    df_corpus = pd.read_csv(CORPUS_FILE)
    corpus_texts = df_corpus[ABSTRACT_COLUMN].tolist()
    print(f"Loaded {len(corpus_texts):,} abstracts.")

    corpus_dataset = CorpusDataset(corpus_texts, tokenizer, MAX_LENGTH)
    corpus_dataloader = DataLoader(corpus_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4, pin_memory=True) # pin_memory=Trueを追加
    
    encode_corpus(model, corpus_dataloader, DEVICE, OUTPUT_EMBEDDINGS_FILE, PROGRESS_FILE)

--- Configuration ---
Corpus File: data/raw/all_papers_corpus.csv
Model Path: models/sbert_contrastive_with_head_v1/best_model
Output File: data/processed/corpus_embeddings_sbert_contrastive_head_hard_neg.npy
Batch Size: 256
Device: cuda
---------------------

Loading model and tokenizer...


OSError: Can't load tokenizer for 'models/sbert_contrastive_with_head_v1/best_model'. If you were trying to load it from 'https://huggingface.co/models', make sure you don't have a local directory with the same name. Otherwise, make sure 'models/sbert_contrastive_with_head_v1/best_model' is the correct path to a directory containing all relevant files for a BertTokenizerFast tokenizer.