In [1]:
import pandas as pd
import os
import numpy as np
from datasets import Dataset
from tqdm.auto import tqdm

tqdm.pandas()

# --- 1. Configuration ---

# CSV that is read as the input of this notebook.
INPUT_FILE = "data/processed/training_dataset_abstract_cleaned_v3.csv"

# CSV that this notebook writes: the master triplet dataset.
OUTPUT_FILE = "data/processed/triplet_dataset.csv"

print("Settings defined.")

Settings defined.


In [2]:
# --- 2. Load the data and convert it to triplets ---
# Reads the labelled pair dataset (columns: abstract_a, abstract_b, label) and
# emits one (anchor, positive, negative) triplet per positive pair. The
# negative is drawn at random from the abstracts that appear in column B of
# the negative pairs.

# Fixed seed so that Restart & Run All regenerates the exact same triplets.
RANDOM_SEED = 42
rng = np.random.default_rng(RANDOM_SEED)

print(f"Loading full dataset: {INPUT_FILE}")
df_full = (
    pd.read_csv(INPUT_FILE)
    .dropna(subset=['abstract_a', 'abstract_b', 'label'])
    .assign(label=lambda frame: frame['label'].astype(int))
)
print(f"Full dataset size: {len(df_full)}")

# Split the full dataset into positive pairs and negative pairs.
pos_df = df_full[df_full['label'] == 1]
neg_df = df_full[df_full['label'] == 0]

if pos_df.empty or neg_df.empty:
    raise ValueError("Full dataset must contain both positive and negative samples.")

# --- Build the triplets ---
print("Creating triplets from full dataset...")

# Pool of candidate negatives: the distinct abstracts in column B of the
# negative pairs (column A is mostly used as the anchor, so only B is used).
# neg_df is non-empty and abstract_b has no NaN here, so the pool is never empty.
negative_abstracts = neg_df['abstract_b'].unique().tolist()
pool_size = len(negative_abstracts)

# Every abstract labelled positive for a given anchor. A candidate negative
# that is in this set would be a false negative for that anchor.
positives_by_anchor = {}
for anchor, positive in zip(pos_df['abstract_a'], pos_df['abstract_b']):
    positives_by_anchor.setdefault(anchor, set()).add(positive)

triplets = []
pair_iter = zip(pos_df['abstract_a'], pos_df['abstract_b'])
for anchor, positive in tqdm(pair_iter, total=len(pos_df), desc="Creating Triplets"):
    forbidden = positives_by_anchor[anchor]

    # Draw a random index into the pool (no per-iteration array conversion of
    # the whole pool, unlike np.random.choice on a list). If the candidate is
    # the anchor itself or a known positive of the anchor, walk forward
    # (wrapping around) to the next usable pool entry.
    start = int(rng.integers(pool_size))
    for offset in range(pool_size):
        negative = negative_abstracts[(start + offset) % pool_size]
        if negative != anchor and negative not in forbidden:
            break
    else:
        # Every pool entry collides with this anchor or one of its positives.
        raise ValueError(
            "Negative pool contains no abstract that differs from the anchor "
            "and its positives; cannot build a valid triplet."
        )

    triplets.append({
        'anchor': anchor,
        'positive': positive,
        'negative': negative
    })

df_triplets = pd.DataFrame(triplets)
print(f"Created {len(df_triplets)} triplets.")

Loading full dataset: data/processed/training_dataset_abstract_cleaned_v3.csv
Full dataset size: 34624
Creating triplets from full dataset...


Creating Triplets:   0%|          | 0/7013 [00:00<?, ?it/s]

Created 7013 triplets.


In [3]:
# --- 3. Save ---
# Writes the triplet DataFrame to OUTPUT_FILE as CSV, without the index column.

# to_csv does not create missing parent directories, so make sure the target
# directory exists first. A bare filename has no directory part to create.
output_dir = os.path.dirname(OUTPUT_FILE)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

print(f"Saving master triplet dataset to {OUTPUT_FILE}...")
df_triplets.to_csv(OUTPUT_FILE, index=False)
print("Save complete.")

Saving master triplet dataset to data/processed/triplet_dataset.csv...
Save complete.
