In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict, load_dataset
from transformers import AutoTokenizer

# --- Configuration ---
# CSV that is read with pandas in the data-loading cell.
RAW_DATA_FILE = "data/processed/training_dataset_abstract_cleaned_v3.csv"

# Settings for Group A (Siamese)
# Checkpoint whose tokenizer encodes each abstract on its own.
SBERT_MODEL_CHECKPOINT = "allenai/scibert_scivocab_uncased"
# Passed as max_length to the tokenizer; every abstract is padded/truncated to it.
SBERT_MAX_LENGTH = 512
# Directory the Siamese tokenized DatasetDict is saved to.
SBERT_TOKENIZED_DATA_PATH = "tokenized_data/sbert_tokenized"

# Settings for Group B (Cross-Encoder)
# Checkpoint whose tokenizer encodes the two abstracts together as one pair.
CENCODER_MODEL_CHECKPOINT = "allenai/longformer-base-4096"
# Passed as max_length to the tokenizer; every encoded pair is padded/truncated to it.
CENCODER_MAX_LENGTH = 2048
# Directory the Cross-Encoder tokenized DatasetDict is saved to.
CENCODER_TOKENIZED_DATA_PATH = "tokenized_data/cencoder_tokenized"

# Worker-process count handed to dataset.map(num_proc=...).
NUM_PROC = 8 # adjust to match the number of CPU cores

In [None]:
# Load the original CSV, keep only complete rows, and split into train/validation.
df_full = (
    pd.read_csv(RAW_DATA_FILE)
    # A pair is unusable if either abstract or the label is missing.
    .dropna(subset=['abstract_a', 'abstract_b', 'label'])
    # Integer labels, used by the Siamese / Cross-BCE setups.
    .assign(label=lambda frame: frame['label'].astype(int))
)

# dropna leaves gaps in the index whenever it removes rows. With the default
# preserve_index=None, from_pandas would then store that non-range index as an
# extra "__index_level_0__" column that leaks into every saved dataset, and
# whether it appears would depend on the data. Drop the index explicitly so the
# schema is always the same.
raw_dataset = Dataset.from_pandas(df_full, preserve_index=False)

# Fixed seed keeps the 80/20 split reproducible across runs.
# NOTE(review): the split is random at row level; if the same abstract occurs in
# several rows it can end up in both splits -- confirm this is acceptable.
dataset_split = raw_dataset.train_test_split(test_size=0.2, seed=42)

# Expose the held-out part under the name 'validation' instead of 'test'.
dataset = DatasetDict({
    'train': dataset_split['train'],
    'validation': dataset_split['test']
})
print("Raw dataset prepared:")
print(dataset)

In [None]:
# Announce the stage, then load the tokenizer that belongs to
# SBERT_MODEL_CHECKPOINT. tokenize_siamese_function looks up the module-level
# name tokenizer_sbert when it is called, so this must run before the mapping.
print("--- Processing for Group A (Siamese) ---")
tokenizer_sbert = AutoTokenizer.from_pretrained(SBERT_MODEL_CHECKPOINT)

def tokenize_siamese_function(examples):
    """Tokenize both abstracts of each pair independently for the Siamese model.

    Each of the "abstract_a" and "abstract_b" columns is passed separately to the
    module-level ``tokenizer_sbert``, padded to and truncated at
    ``SBERT_MAX_LENGTH``.

    Args:
        examples: Mapping with "abstract_a" and "abstract_b" entries. It is used
            with ``batched=True``, so each entry is presumably a list of strings.

    Returns:
        A dict with four entries: "input_ids" / "attention_mask" taken from the
        first abstract's encoding and "input_ids_b" / "attention_mask_b" taken
        from the second's. Any other field the tokenizer returns is not forwarded.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": SBERT_MAX_LENGTH,
    }
    encoding_a = tokenizer_sbert(examples["abstract_a"], **encode_options)
    encoding_b = tokenizer_sbert(examples["abstract_b"], **encode_options)

    # (output column, source encoding, field to copy) -- order fixes column order.
    column_plan = (
        ("input_ids", encoding_a, "input_ids"),
        ("attention_mask", encoding_a, "attention_mask"),
        ("input_ids_b", encoding_b, "input_ids"),
        ("attention_mask_b", encoding_b, "attention_mask"),
    )
    return {column: encoding[field] for column, encoding, field in column_plan}

# Apply the Siamese tokenizer to every split in batches across NUM_PROC worker
# processes; the raw text and DOI columns are removed from the result.
tokenized_datasets_sbert = dataset.map(
    function=tokenize_siamese_function,
    remove_columns=["abstract_a", "abstract_b", "data_paper_doi"],
    num_proc=NUM_PROC,
    batched=True,
)

# Save to disk, then report where the data went.
tokenized_datasets_sbert.save_to_disk(SBERT_TOKENIZED_DATA_PATH)
print(f"Siamese tokenized data saved to {SBERT_TOKENIZED_DATA_PATH}")

In [None]:
# Announce the stage, then load the tokenizer that belongs to
# CENCODER_MODEL_CHECKPOINT. tokenize_cencoder_function looks up the module-level
# name tokenizer_cencoder when it is called, so this must run before the mapping.
print("\n--- Processing for Group B (Cross-Encoder) ---")
tokenizer_cencoder = AutoTokenizer.from_pretrained(CENCODER_MODEL_CHECKPOINT)

def tokenize_cencoder_function(examples):
    """Tokenize each abstract pair jointly for the Cross-Encoder model.

    The "abstract_a" and "abstract_b" columns are passed together, as first and
    second argument, to the module-level ``tokenizer_cencoder``, padded to and
    truncated at ``CENCODER_MAX_LENGTH``.

    Args:
        examples: Mapping with "abstract_a" and "abstract_b" entries. It is used
            with ``batched=True``, so each entry is presumably a list of strings.

    Returns:
        Whatever ``tokenizer_cencoder`` returns for the pair, unchanged.
    """
    first_texts = examples["abstract_a"]
    second_texts = examples["abstract_b"]
    pair_encoding = tokenizer_cencoder(
        first_texts,
        second_texts,
        max_length=CENCODER_MAX_LENGTH,
        truncation=True,
        padding="max_length",
    )
    return pair_encoding

# Apply the Cross-Encoder tokenizer to every split in batches across NUM_PROC
# worker processes; the raw text and DOI columns are removed from the result.
tokenized_datasets_cencoder = dataset.map(
    function=tokenize_cencoder_function,
    remove_columns=["abstract_a", "abstract_b", "data_paper_doi"],
    num_proc=NUM_PROC,
    batched=True,
)

# Save to disk, then report where the data went.
tokenized_datasets_cencoder.save_to_disk(CENCODER_TOKENIZED_DATA_PATH)
print(f"Cross-Encoder tokenized data saved to {CENCODER_TOKENIZED_DATA_PATH}")