In [1]:
import pandas as pd
import os
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer
import accelerate
from IPython.display import display

# --- 1. Settings ---
# Model: Group B (the cross-encoder) is built on Longformer.
MODEL_CHECKPOINT = "allenai/longformer-base-4096"
# Length every tokenized (anchor, candidate) pair is padded/truncated to.
# NOTE: this is deliberately a fixed budget of 2048 tokens, i.e. below the
# 4096 positions suggested by the checkpoint name -- it is not the model maximum.
MAX_LENGTH = 2048

# Paths: master triplet CSV (input) and the Group B tokenized dataset (output).
MASTER_TRIPLET_FILE = "data/processed/triplet_dataset.csv"
TOKENIZED_OUTPUT_DIR = "data/processed/tokenized_cencoder_dataset"

print("Settings defined for Group B (Cross-Encoder) tokenization.")

Settings defined for Group B (Cross-Encoder) tokenization.


In [2]:
# --- 2. Load data ---
print(f"Loading master triplet dataset from: {MASTER_TRIPLET_FILE}")
# Build the frame in one chain so that re-running this cell always starts
# from the CSV on disk instead of mutating a previously loaded frame:
#   * dropna() removes every row that has a missing value in any column;
#   * `labels` is a dummy constant column, added so that the Trainer
#     evaluation loop can run.
df_triplets = (
    pd.read_csv(MASTER_TRIPLET_FILE)
    .dropna()
    .assign(labels=0)
)

print(f"Loaded {len(df_triplets)} triplets.")
# preserve_index=False: as soon as dropna() removes a row, the frame no longer
# has a plain RangeIndex, and from_pandas would otherwise store that index as
# an extra `__index_level_0__` column that ends up in the saved dataset.
raw_dataset = Dataset.from_pandas(df_triplets, preserve_index=False)

# --- Tokenizer ---
print(f"Initializing tokenizer: {MODEL_CHECKPOINT}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Tokenization function for the cross-encoder (pairwise margin) setup.
def tokenize_cencoder_margin_function(examples):
    """Tokenize (anchor, positive) and (anchor, negative) text pairs.

    Each pair is passed to the module-level ``tokenizer`` as a two-segment
    input, padded to ``MAX_LENGTH`` and truncated to it when longer.

    Args:
        examples: Mapping with ``"anchor"``, ``"positive"`` and ``"negative"``
            entries (a batch of rows when used with ``Dataset.map(batched=True)``).

    Returns:
        dict with four entries:
            ``input_ids`` / ``attention_mask``         -- the (anchor, positive) pair
            ``input_ids_neg`` / ``attention_mask_neg`` -- the (anchor, negative) pair
    """
    # Both pairs are encoded with exactly the same padding/truncation settings.
    encode_kwargs = dict(padding="max_length", truncation=True, max_length=MAX_LENGTH)
    anchors = examples["anchor"]

    positive_encoding = tokenizer(anchors, examples["positive"], **encode_kwargs)
    negative_encoding = tokenizer(anchors, examples["negative"], **encode_kwargs)

    # Un-suffixed keys hold the positive pair, "_neg" keys hold the negative pair.
    features = {}
    for suffix, encoding in (("", positive_encoding), ("_neg", negative_encoding)):
        for field in ("input_ids", "attention_mask"):
            features[field + suffix] = encoding[field]
    return features

print("Tokenizing C-Encoder Margin dataset (this will take a while)...")
# Apply the pair tokenizer batch-wise across worker processes. The raw text
# columns are dropped from the result; only the tokenized features and any
# remaining columns (e.g. `labels`) are kept.
tokenized_datasets = raw_dataset.map(
    function=tokenize_cencoder_margin_function,
    remove_columns=["anchor", "positive", "negative"],
    batched=True,
    num_proc=4,  # number of worker processes; tune to the CPU cores available (e.g. in Docker)
)
# Have the dataset return torch tensors when indexed.
tokenized_datasets.set_format(type="torch")
print("Tokenization complete.")
display(tokenized_datasets)

Loading master triplet dataset from: data/processed/triplet_dataset.csv
Loaded 7013 triplets.
Initializing tokenizer: allenai/longformer-base-4096...




Tokenizing C-Encoder Margin dataset (this will take a while)...


Map (num_proc=4):   0%|          | 0/7013 [00:00<?, ? examples/s]

Tokenization complete.


Dataset({
    features: ['labels', 'input_ids', 'attention_mask', 'input_ids_neg', 'attention_mask_neg'],
    num_rows: 7013
})

In [3]:
# --- 3. Save ---
# Persist the tokenized dataset under TOKENIZED_OUTPUT_DIR via
# Dataset.save_to_disk, printing progress messages before and after.
print(f"Saving tokenized dataset to disk at: {TOKENIZED_OUTPUT_DIR}")
tokenized_datasets.save_to_disk(TOKENIZED_OUTPUT_DIR)
print("Save complete.")
print("\n--- Group B (Cross-Encoder) is ready for training. ---")

Saving tokenized dataset to disk at: data/processed/tokenized_cencoder_dataset


Saving the dataset (0/1 shards):   0%|          | 0/7013 [00:00<?, ? examples/s]

Save complete.

--- Group B (Cross-Encoder) is ready for training. ---
