## import


In [1]:
import pandas as pd
import os
import numpy as np
import torch
import torch.nn as nn
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer
)
from transformers.modeling_outputs import SequenceClassifierOutput
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import accelerate
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display

# --- 1. Select the compute device ---
# Use CUDA when PyTorch can see a GPU; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if device.type == "cuda":
    # Report the name of GPU index 0.
    print(f"✅ GPU is available. Device: {torch.cuda.get_device_name(0)}")
else:
    print("⚠️ GPU not found. Running on CPU.")

✅ GPU is available. Device: NVIDIA GeForce RTX 3080 Ti


## Settings


In [None]:
# --- 2. Configuration ---

# Backbone checkpoint: Longformer (this notebook switched over from SciBERT).
MODEL_CHECKPOINT = "allenai/longformer-base-4096"

# Input data: the final cleaned version of the abstract-pair training set.
TRAINING_FILE = "data/processed/training_dataset_abstract_cleaned_v3.csv"

# Directory that receives checkpoints and the trained model.
OUTPUT_MODEL_DIR = "models/cross_encoder_longformer_v1"

# Maximum tokenized length of a pair. A prior token-length analysis reported a
# maximum of 1998 tokens, so 2048 covers almost every pair.
MAX_LENGTH = 2048

# Optimisation hyper-parameters.
# Longformer is memory hungry, so the batch stays small. The original note
# sized 4 for a 16 GB Colab T4; drop to 2 or 1 on out-of-memory errors.
BATCH_SIZE = 4
LEARNING_RATE = 2e-5
EPOCHS = 1

print("Configuration set for Longformer.")

Configuration set for Longformer.


## model


## Load data and tokenize


In [3]:
# --- 4. Load the dataset and initialise the tokenizer ---

# Smoke-test setting: cap the number of CSV rows that are read.
SMOKE_TEST_SIZE = 200

print(f"Loading dataset (SMOKE TEST: {SMOKE_TEST_SIZE} rows)...")
# `nrows` limits how many rows pandas reads from the file.
df = pd.read_csv(TRAINING_FILE, nrows=SMOKE_TEST_SIZE)

# Discard pairs with a missing abstract or label, then store labels as ints.
df = df.dropna(subset=['abstract_a', 'abstract_b', 'label'])
df['label'] = df['label'].astype(int)
print(f"Loaded {len(df)} pairs.")

# preserve_index=False: once dropna() has removed rows the frame no longer has
# a plain range index, and from_pandas would otherwise carry that index into
# the dataset as an extra `__index_level_0__` column.
raw_dataset = Dataset.from_pandas(df, preserve_index=False)

# Hold out 20% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
dataset_split = raw_dataset.train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({
    'train': dataset_split['train'],
    'validation': dataset_split['test']
})
print(f"Dataset split: {dataset}")

# Load the tokenizer that matches the model checkpoint.
print("Initializing Longformer tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

# Tokenization function for the cross-encoder (NSP-like sentence-pair) setup.
def tokenize_nsp_function(examples):
    """Tokenize a batch of abstract pairs as joint two-segment inputs.

    Args:
        examples: Mapping with "abstract_a" and "abstract_b" entries; each
            pair is passed to the tokenizer as (first text, second text).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair batch,
        padded and truncated to ``MAX_LENGTH`` tokens.
    """
    first_texts = examples["abstract_a"]
    second_texts = examples["abstract_b"]
    # Fixed-length encoding: every pair is padded/truncated to MAX_LENGTH.
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LENGTH,
    }
    return tokenizer(first_texts, second_texts, **encode_options)

print(f"Tokenizing dataset (max_length={MAX_LENGTH})...")
tokenized_datasets = dataset.map(tokenize_nsp_function, batched=True)

# Drop every pre-tokenization column except the label. Deriving the list from
# the dataset itself (instead of hard-coding column names) avoids an error
# when an expected column such as `data_paper_doi` is absent, and also removes
# any extra column the CSV may carry.
non_model_columns = [name for name in dataset["train"].column_names if name != "label"]
tokenized_datasets = tokenized_datasets.remove_columns(non_model_columns)

# The label column is renamed to `labels` before the data is handed to Trainer.
tokenized_datasets = tokenized_datasets.rename_column("label", "labels")
tokenized_datasets.set_format("torch")
print("Tokenization complete.")

Loading dataset (SMOKE TEST: 200 rows)...
Loaded 200 pairs.
Dataset split: DatasetDict({
    train: Dataset({
        features: ['abstract_a', 'abstract_b', 'label', 'data_paper_doi'],
        num_rows: 160
    })
    validation: Dataset({
        features: ['abstract_a', 'abstract_b', 'label', 'data_paper_doi'],
        num_rows: 40
    })
})
Initializing Longformer tokenizer...




Tokenizing dataset (max_length=2048)...


Map:   0%|          | 0/160 [00:00<?, ? examples/s]

Map:   0%|          | 0/40 [00:00<?, ? examples/s]

Tokenization complete.


## Load model and configure training


In [4]:
# --- 5. Load the model and configure training ---
print(f"Loading model: {MODEL_CHECKPOINT}")

# Sequence-classification model with a 2-label head.
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_CHECKPOINT,
    num_labels=2,
)
# Move the weights to the device selected in the first cell.
model = model.to(device)

print("Model loaded.")

# Function that computes the evaluation metrics.
def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall for binary predictions.

    Args:
        eval_pred: A ``(predictions, labels)`` pair. ``predictions`` holds
            per-class scores with classes on axis 1; if it arrives as a tuple
            of outputs, the first element is taken as the scores.

    Returns:
        dict with float entries 'accuracy', 'f1', 'precision' and 'recall'.
    """
    predictions, labels = eval_pred
    # Defensive: unwrap a tuple of model outputs and keep the first element.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    preds = np.argmax(predictions, axis=1)
    # zero_division=0: when no positive is predicted (or none is present) the
    # undefined precision/recall is reported as 0 without emitting a warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    # Plain Python floats keep the logged metrics trivially serialisable.
    return {
        'accuracy': float(acc),
        'f1': float(f1),
        'precision': float(precision),
        'recall': float(recall),
    }

# Smoke-test settings.
MAX_TRAIN_STEPS = 10   # stop training after this many optimisation steps
EVAL_SAVE_STEPS = 5    # evaluate AND checkpoint every this many steps

# Training configuration.
training_args = TrainingArguments(
    output_dir=OUTPUT_MODEL_DIR,
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,

    # Epoch count comes from the configuration cell. While max_steps is set it
    # takes precedence (Trainer reports that it overrides num_train_epochs).
    num_train_epochs=EPOCHS,
    max_steps=MAX_TRAIN_STEPS,

    # Evaluation and checkpointing share one interval so that every evaluated
    # step also has a checkpoint available for load_best_model_at_end.
    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy` - confirm against the installed version.
    evaluation_strategy="steps",
    eval_steps=EVAL_SAVE_STEPS,
    save_strategy="steps",
    save_steps=EVAL_SAVE_STEPS,

    # Reload the checkpoint with the lowest validation loss when training
    # ends; the selection criterion is spelled out instead of left implicit.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,

    logging_steps=1,              # log the training loss at every step
    gradient_checkpointing=True,
)
print("Training arguments set for SMOKE TEST.")

Loading model: allenai/longformer-base-4096


Some weights of LongformerForSequenceClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded.
Training arguments set for SMOKE TEST.


## train start


In [None]:
# --- 6. Run training ---
# Wire the model, the tokenized splits, the tokenizer and the metric function
# into a single Trainer instance.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    eval_dataset=tokenized_datasets["validation"],
    train_dataset=tokenized_datasets["train"],
)

print("\n--- Starting Model Training (Longformer Cross-Encoder) ---")
trainer.train()
print("--- Model Training Complete ---")

max_steps is given, it will override any value given in num_train_epochs



--- Starting Model Training (Longformer Cross-Encoder) ---


Initializing global attention on CLS token...


## save model


In [None]:
# --- 7. Save the model ---
# Target directory: <OUTPUT_MODEL_DIR>/best_model
best_model_path = os.path.join(OUTPUT_MODEL_DIR, "best_model")

print("Training complete. Saving best model...")
# Write out the model currently held by the trainer.
trainer.save_model(best_model_path)
print(f"Model saved to {best_model_path}")

## visualize training loss


In [None]:
# --- 8. Visualise the training results ---
print("\n--- Visualizing Training Results ---")
log_history = trainer.state.log_history
df_log = pd.DataFrame(log_history)


def _rows_with(frame, column):
    """Return a copy of the rows where `column` is set; empty if the column is missing."""
    if column not in frame.columns:
        return frame.iloc[0:0].copy()
    return frame[frame[column].notna()].copy()


# Split the log into training-loss rows and evaluation rows. A missing column
# (e.g. no evaluation ran) yields an empty frame instead of a KeyError.
df_train = _rows_with(df_log, 'loss')
df_eval = _rows_with(df_log, 'eval_loss')

# The 'epoch' values are deliberately left fractional: per-step log entries
# within one epoch would all truncate to 0 as integers, stacking every point
# at x=0 so that seaborn aggregates them into a single marker.

# Apply the style BEFORE creating the figure so it affects this figure too.
plt.style.use('seaborn-v0_8-whitegrid')
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)

# Plot 1: loss over training.
if not df_train.empty:
    sns.lineplot(data=df_train, x='epoch', y='loss', label='Training Loss', ax=ax1, marker='o')
if not df_eval.empty:
    sns.lineplot(data=df_eval, x='epoch', y='eval_loss', label='Validation Loss', ax=ax1, marker='o')
ax1.set_title('Training vs. Validation Loss', fontsize=16)
ax1.set_ylabel('Loss')
if ax1.get_legend_handles_labels()[0]:
    ax1.legend()

# Plot 2: validation metrics over training.
metric_series = (
    ('eval_f1', 'Validation F1-Score'),
    ('eval_accuracy', 'Validation Accuracy'),
)
for metric_column, metric_label in metric_series:
    if not df_eval.empty and metric_column in df_eval.columns:
        sns.lineplot(data=df_eval, x='epoch', y=metric_column, label=metric_label, ax=ax2, marker='o')
ax2.set_title('Validation Metrics', fontsize=16)
ax2.set_ylabel('Score')
ax2.set_xlabel('Epoch')
if ax2.get_legend_handles_labels()[0]:
    ax2.legend()
ax2.set_ylim(0, 1)

plt.tight_layout()
plt.show()

# Report the evaluation entry with the lowest validation loss.
print("\n--- Best Model Evaluation Metrics (from validation set) ---")
if not df_eval.empty:
    best_run = df_eval.loc[df_eval['eval_loss'].idxmin()]
    print(f"Best Epoch (based on min eval_loss): {best_run['epoch']}")
    print(f"Best Validation Loss: {best_run['eval_loss']:.4f}")
    print(f"Best Validation F1: {best_run['eval_f1']:.4f}")
    print(f"Best Validation Accuracy: {best_run['eval_accuracy']:.4f}")
else:
    print("No evaluation steps were completed.")