In [1]:
import pandas as pd
import os
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
import accelerate
from IPython.display import display

# --- 1. Settings ---
# Pretrained checkpoint used for tokenization, and the fixed sequence length
# every text is padded/truncated to.
MODEL_CHECKPOINT = "allenai/scibert_scivocab_uncased"
MAX_LENGTH = 512

# Input: master CSV of triplets. Output: directory for the tokenized
# dataset used by Group A (Siamese).
MASTER_TRIPLET_FILE = "data/processed/triplet_dataset.csv"
TOKENIZED_OUTPUT_DIR = "data/processed/tokenized_siamese_dataset"

print("Settings defined for Group A (Siamese) tokenization.")

Settings defined for Group A (Siamese) tokenization.


In [2]:
# --- 2. Load the data ---
print(f"Loading master triplet dataset from: {MASTER_TRIPLET_FILE}")
df_triplets = pd.read_csv(MASTER_TRIPLET_FILE)

# Fail early with a clear message if the CSV lacks a text column that the
# tokenization step reads, instead of failing later inside the worker processes.
missing_columns = sorted({"anchor", "positive", "negative"} - set(df_triplets.columns))
if missing_columns:
    raise ValueError(
        f"{MASTER_TRIPLET_FILE} is missing required column(s): {missing_columns}"
    )

# Drop incomplete rows, then rebuild a contiguous index. Without the reset,
# any dropped row leaves gaps in the index, and Dataset.from_pandas would then
# store the index as an extra `__index_level_0__` column in the dataset.
df_triplets = df_triplets.dropna().reset_index(drop=True)

# Add a dummy `labels` column so that the Trainer evaluation loop runs.
df_triplets['labels'] = 0

print(f"Loaded {len(df_triplets)} triplets.")
# preserve_index=False ensures the pandas index never becomes a dataset column.
raw_dataset = Dataset.from_pandas(df_triplets, preserve_index=False)

# --- Tokenize ---
print("Initializing tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def tokenize_triplet_function(examples):
    """Tokenize the anchor, positive and negative texts of a batch separately.

    Args:
        examples: Batch mapping with "anchor", "positive" and "negative"
            entries, each passed as-is to the module-level ``tokenizer``.

    Returns:
        dict with the ``input_ids`` / ``attention_mask`` of each text. The
        anchor's entries use the bare names "input_ids" and "attention_mask"
        (the Trainer requires an 'input_ids' field); the other two carry a
        "_positive" / "_negative" suffix.
    """
    # Every text is padded/truncated to the same fixed length.
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LENGTH,
    }
    # Source column -> suffix appended to its output keys. The anchor gets no
    # suffix so that it lands under the plain 'input_ids' name.
    suffix_by_column = {"anchor": "", "positive": "_positive", "negative": "_negative"}

    features = {}
    for column, suffix in suffix_by_column.items():
        encoded = tokenizer(examples[column], **encode_options)
        features[f"input_ids{suffix}"] = encoded["input_ids"]
        features[f"attention_mask{suffix}"] = encoded["attention_mask"]
    return features

print("Tokenizing triplet dataset (this will take a while)...")

# Options for the batched, multi-process map. The raw text columns are
# removed from the result; only the tokenized fields and `labels` remain.
map_kwargs = {
    "batched": True,
    "num_proc": 4,  # adjust to the number of CPU cores in the Docker environment
    "remove_columns": ["anchor", "positive", "negative"],
}
tokenized_datasets = raw_dataset.map(tokenize_triplet_function, **map_kwargs)

# Have the dataset return torch tensors when indexed.
tokenized_datasets.set_format("torch")
print("Tokenization complete.")
display(tokenized_datasets)

Loading master triplet dataset from: data/processed/triplet_dataset.csv
Loaded 7013 triplets.
Initializing tokenizer...




Tokenizing triplet dataset (this will take a while)...


Map (num_proc=4):   0%|          | 0/7013 [00:00<?, ? examples/s]

Tokenization complete.


Dataset({
    features: ['labels', 'input_ids', 'attention_mask', 'input_ids_positive', 'attention_mask_positive', 'input_ids_negative', 'attention_mask_negative'],
    num_rows: 7013
})

In [3]:
# --- 3. Save ---
print(f"Saving tokenized dataset to disk at: {TOKENIZED_OUTPUT_DIR}")
tokenized_datasets.save_to_disk(TOKENIZED_OUTPUT_DIR)

# Closing status lines, printed in order once the save call has returned.
for status_line in (
    "Save complete.",
    "\n--- Group A (Siamese) is ready for training. ---",
):
    print(status_line)

Saving tokenized dataset to disk at: data/processed/tokenized_siamese_dataset


Saving the dataset (0/1 shards):   0%|          | 0/7013 [00:00<?, ? examples/s]

Save complete.

--- Group A (Siamese) is ready for training. ---
