In [1]:
import pandas as pd
import numpy as np
import os
import torch
import faiss
import json
import sqlite3
import re
from transformers import AutoTokenizer, AutoModel
from tqdm.auto import tqdm

# --- 1. GPUの確認 ---
if torch.cuda.is_available():
    print(f"✅ GPU is available. Device: {torch.cuda.get_device_name(0)}")
    device = torch.device("cuda")
else:
    print("⚠️ GPU not found. Running on CPU.")
    device = torch.device("cpu")

✅ GPU is available. Device: NVIDIA RTX A6000


In [2]:
# --- 2. 設定 ---

# 入力: 既存の訓練データ (v3) - ここから正例のみを抽出します
INPUT_TRAIN_FILE = "data/processed/training_dataset_abstract_cleaned_v3.csv"

# 参照: 構築したFaissインデックスとDOIマップ
FAISS_INDEX_FILE = "data/processed/pretrained_scibert.faiss"
DOI_MAP_FILE = "data/processed/pretrained_doi_map.json"
DB_PATH = "data/processed/s2orc_filtered.db"

# 出力: Hard Negativeのみで構成された新しい訓練データ
OUTPUT_FILE = "data/processed/training_dataset_hard_negatives_1to3.csv"

# モデル (クエリのベクトル化用)
MODEL_CHECKPOINT = "allenai/scibert_scivocab_uncased"
MAX_LENGTH = 512
BATCH_SIZE = 256

# マイニング設定
# 正例1に対して負例3を作るため、フィルタリング(正解の除外)余裕を見て多めに検索
SEARCH_TOP_K = 50 
NEG_RATIO = 3 # 正例 : 負例 = 1 : 3

print(f"Settings defined. Ratio 1:{NEG_RATIO} (Hard Negatives only).")

Settings defined. Ratio 1:3 (Hard Negatives only).


In [3]:
# --- 3. データの読み込み ---
print(f"Loading training data: {INPUT_TRAIN_FILE}")
df_train = pd.read_csv(INPUT_TRAIN_FILE)
df_train = df_train.dropna(subset=['abstract_a', 'abstract_b'])

# 正例ペアのみを抽出
df_positives = df_train[df_train['label'] == 1].copy().reset_index(drop=True)
print(f"Loaded {len(df_positives):,} positive pairs.")

# ユニークなアンカー(abstract_a)のリストを作成（検索回数を減らすため）
unique_anchors = df_positives['abstract_a'].unique()
print(f"Unique anchors to query: {len(unique_anchors):,}")

# アンカー -> 正解アブストラクト(abstract_b) のマッピングを作成
# (検索結果から「正解」を除外してHard Negativeにするため)
# 1つのアンカーに対して複数の正解がある場合に対応するため set で保持
anchor_to_positives = df_positives.groupby('abstract_a')['abstract_b'].apply(set).to_dict()
print("Anchor-to-Positives map created.")

Loading training data: data/processed/training_dataset_abstract_cleaned_v3.csv
Loaded 7,013 positive pairs.
Unique anchors to query: 369
Anchor-to-Positives map created.


In [4]:
# --- 4. リソースのロード ---

# モデルとトークナイザ
print(f"Loading model: {MODEL_CHECKPOINT}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)
model = AutoModel.from_pretrained(MODEL_CHECKPOINT).to(device)
model.eval()

# Faissインデックス
print(f"Loading Faiss index: {FAISS_INDEX_FILE}")
index = faiss.read_index(FAISS_INDEX_FILE)
print(f"Index loaded. Total vectors: {index.ntotal:,}")

# DOIマップ (Index ID -> DOI)
print(f"Loading DOI map: {DOI_MAP_FILE}")
with open(DOI_MAP_FILE, 'r') as f:
    doi_map = json.load(f)
# 検索結果のIDからDOIを引けるように逆転させる
id_to_doi = {v: k for k, v in doi_map.items()}
print("DOI map loaded and inverted.")

Loading model: allenai/scibert_scivocab_uncased




Loading Faiss index: data/processed/pretrained_scibert.faiss
Index loaded. Total vectors: 11,619,136
Loading DOI map: data/processed/pretrained_doi_map.json
DOI map loaded and inverted.


In [5]:
# --- 5. アンカーのベクトル化と検索 ---
# アンカーごとに「Hard Negative候補のDOIリスト」を保持する辞書
anchor_to_hard_neg_dois = {}

print("Starting search...")

all_anchors = list(unique_anchors)
# バッチ処理で検索
for i in tqdm(range(0, len(all_anchors), BATCH_SIZE), desc="Mining Hard Negatives"):
    batch_anchors = all_anchors[i : i + BATCH_SIZE]
    
    # 1. ベクトル化
    inputs = tokenizer(
        batch_anchors, 
        padding="max_length", 
        truncation=True, 
        max_length=MAX_LENGTH, 
        return_tensors="pt"
    ).to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)
        embeddings = outputs.pooler_output.cpu().numpy().astype(np.float32)
    
    # 2. Faissで検索
    distances, indices = index.search(embeddings, SEARCH_TOP_K)
    
    # 3. 結果の保存
    for j, anchor_text in enumerate(batch_anchors):
        search_results_ids = indices[j]
        
        # IDをDOIに変換してリスト化
        candidate_dois = []
        for result_id in search_results_ids:
            if result_id != -1 and result_id in id_to_doi:
                candidate_dois.append(id_to_doi[result_id])
        
        anchor_to_hard_neg_dois[anchor_text] = candidate_dois

print("Search complete. Candidates collected.")

Starting search...


Mining Hard Negatives:   0%|          | 0/2 [00:00<?, ?it/s]

Search complete. Candidates collected.


In [6]:
# --- 6. DBからHard Negativeのアブストラクトを取得 ---

# 必要な全DOIのユニークリストを作成
all_needed_dois = set()
for dois in anchor_to_hard_neg_dois.values():
    all_needed_dois.update(dois)

needed_dois_list = list(all_needed_dois)
print(f"Fetching text for {len(needed_dois_list):,} unique DOIs from DB...")

doi_to_abstract_text = {}

def get_abstracts_from_db(dois):
    with sqlite3.connect(DB_PATH) as conn:
        chunk_size = 1000
        for i in tqdm(range(0, len(dois), chunk_size), desc="Querying DB"):
            chunk = dois[i:i+chunk_size]
            placeholders = ','.join('?' for _ in chunk)
            query = f"SELECT doi, abstract FROM papers WHERE doi IN ({placeholders})"
            cursor = conn.execute(query, chunk)
            for row in cursor:
                doi_to_abstract_text[row[0]] = row[1]

get_abstracts_from_db(needed_dois_list)
print(f"Retrieved {len(doi_to_abstract_text):,} abstracts.")

Fetching text for 17,630 unique DOIs from DB...


Querying DB:   0%|          | 0/18 [00:00<?, ?it/s]

Retrieved 17,630 abstracts.


In [7]:
# --- 7. テキストのクリーニング ---
# (notebooks/17c と同じロジック)

STOP_WORDS = [
    'introduction', 'keywords', 'key words', 'references', 'acknowledgments',
    'acknowledgements', 'bibliography', 'pubmed abstract', 'publisher full text', 'full text'
]
STOP_PATTERN = re.compile(r'\b(' + '|'.join(STOP_WORDS) + r')\b', re.IGNORECASE)
URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
EMAIL_PATTERN = re.compile(r'\S*@\S*\s?')
NON_ASCII_PATTERN = re.compile(r'[^\x00-\x7F]+')
SPECIAL_CHARS_PATTERN = re.compile(r'[^\w\s\.\,\!\?\-\'\(\)\[\]\{\}\<\>\/\=\+\*\%]')
MAX_CHAR_LIMIT = 3000

def clean_retrieved_text(text):
    if not isinstance(text, str): return ""
    # 1. 本文切り捨て
    match = STOP_PATTERN.search(text)
    if match: text = text[:match.start()]
    # 2. ノイズ削除
    text = URL_PATTERN.sub('', text)
    text = EMAIL_PATTERN.sub('', text)
    text = NON_ASCII_PATTERN.sub('', text)
    text = SPECIAL_CHARS_PATTERN.sub('', text)
    text = re.sub(r'\s+', ' ', text).strip()
    # 3. 文字数制限
    return text[:MAX_CHAR_LIMIT]

print("Cleaning function defined.")

Cleaning function defined.


In [8]:
# --- 8. 最終データセットの構築 ---
final_rows = []

print(f"Constructing dataset with 1:{NEG_RATIO} ratio...")

# 全正例ペアに対してループ
for idx, row in tqdm(df_positives.iterrows(), total=len(df_positives), desc="Building Pairs"):
    anchor = row['abstract_a']
    positive = row['abstract_b']
    data_paper_doi = row['data_paper_doi']
    
    # 1. 正例ペアを追加
    final_rows.append({
        'abstract_a': anchor,
        'abstract_b': positive,
        'label': 1,
        'data_paper_doi': data_paper_doi
    })
    
    # 2. 負例ペア (Hard Negative) を3つ追加
    # このアンカーに対する候補DOIリスト
    candidate_dois = anchor_to_hard_neg_dois.get(anchor, [])
    
    # このアンカーに対する「正解セット」 (フィルタリング用)
    true_positives = anchor_to_positives.get(anchor, set())
    
    added_negatives = 0
    for neg_doi in candidate_dois:
        if added_negatives >= NEG_RATIO:
            break
            
        # テキスト取得
        raw_text = doi_to_abstract_text.get(neg_doi, "")
        if not raw_text: continue
        
        # クリーニング
        neg_text = clean_retrieved_text(raw_text)
        if len(neg_text) < 50: continue # 短すぎるものは除外
        
        # --- フィルタリング (自分自身 or 正解 との一致チェック) ---
        if neg_text == anchor: continue
        if neg_text in true_positives: continue
        
        # 合格 -> 負例ペアとして追加
        final_rows.append({
            'abstract_a': anchor,
            'abstract_b': neg_text,
            'label': 0,
            'data_paper_doi': None
        })
        added_negatives += 1
    
    # もしHard Negativeが足りない場合（稀ですが）、ランダム負例で埋める
    # (ここでは簡易的に、直前の負例を複製するか、スキップする)
    # 今回はスキップ（比率が厳密でなくても、HardNegがあることが重要）

df_final = pd.DataFrame(final_rows)

print("\n--- Dataset Construction Complete ---")
print(f"Total rows: {len(df_final):,}")
print("Label distribution:")
print(df_final['label'].value_counts())

# 保存
print(f"Saving to {OUTPUT_FILE}...")
df_final.to_csv(OUTPUT_FILE, index=False)
print("Done.")

Constructing dataset with 1:3 ratio...


Building Pairs:   0%|          | 0/7013 [00:00<?, ?it/s]


--- Dataset Construction Complete ---
Total rows: 28,052
Label distribution:
label
0    21039
1     7013
Name: count, dtype: int64
Saving to data/processed/training_dataset_hard_negatives_1to3.csv...
Done.
