## import


In [7]:
import pandas as pd
import os
import numpy as np
import torch
import torch.nn as nn
from datasets import Dataset, DatasetDict
from transformers import AutoTokenizer, AutoModel, BertPreTrainedModel, TrainingArguments, Trainer
from transformers.modeling_outputs import SequenceClassifierOutput
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
import accelerate

# --- 1. GPU check ---
# Select CUDA when PyTorch reports an available GPU, otherwise fall back to
# the CPU, then print which of the two was chosen.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
if device.type == "cpu":
    print("⚠️ 失敗。GPUが認識されていません。DockerのGPU設定を確認してください。")
else:
    print(f"✅ 成功！Dockerコンテナ経由でGPUを認識しました: {torch.cuda.get_device_name(0)}")

✅ 成功！Dockerコンテナ経由でGPUを認識しました: NVIDIA GeForce RTX 3080 Ti


## setting


In [8]:
# --- 2. Configuration ---

# Paths (relative to the Dockerfile WORKDIR, /app)
TRAINING_FILE = "data/processed/training_dataset_abstract.csv"   # training pairs CSV
OUTPUT_MODEL_DIR = "models/siamese_scibert_v1"                    # where the trained model is written

# Base checkpoint to fine-tune (SciBERT)
MODEL_CHECKPOINT = "allenai/scibert_scivocab_uncased"

# Hyperparameters
LEARNING_RATE = 2e-5
EPOCHS = 3             # number of training epochs
BATCH_SIZE = 16        # per-device batch size; tune to the available GPU memory
MAX_LENGTH = 512       # maximum number of input tokens per abstract

print("Configuration set.")

Configuration set.


## モデル


In [9]:
# --- 3. カスタムモデルクラスの定義 ---

class MeanPooling(nn.Module):
    """Mean pooling layer that takes the attention mask into account."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Average `last_hidden_state` over dimension 1, counting only unmasked positions.

        Args:
            last_hidden_state: tensor whose dimension 1 is reduced.
            attention_mask: mask with one fewer trailing dimension than
                `last_hidden_state`; it is expanded to the same size and used
                as a per-position weight (1 keeps a position, 0 drops it).

        Returns:
            The weighted sum over dimension 1 divided by the summed weights.
        """
        # Broadcast the mask across the last dimension as float weights.
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * weights).sum(1)
        # Clamp so a fully masked row divides by 1e-9 instead of 0.
        weight_total = weights.sum(1).clamp(min=1e-9)
        return masked_total / weight_total

class SiameseSciBERT(BertPreTrainedModel):
    """
    Method 1: two-input vector-comparison classifier (Siamese-SciBERT).

    Both inputs go through the same encoder (`self.bert`), are mean-pooled
    with their attention masks, and the pooled vectors are combined into a
    single feature vector that a linear layer classifies.
    """

    def __init__(self, config):
        super().__init__(config)
        self.bert = AutoModel.from_config(config)
        self.pooler = MeanPooling()
        # The classifier sees [x, y, |x - y|, x * y], hence 4 * hidden_size inputs.
        self.classifier = nn.Linear(config.hidden_size * 4, config.num_labels)
        self.init_weights()

    # The first input uses the un-suffixed names `input_ids` / `attention_mask`
    # (renamed from `input_ids_a` / `attention_mask_a`); the second input keeps
    # the `_b` suffix.
    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        input_ids_b=None,
        attention_mask_b=None,
        labels=None,
        **kwargs
    ):
        """Classify a pair of tokenized inputs.

        Args:
            input_ids, attention_mask: token ids and mask of the first input.
            input_ids_b, attention_mask_b: token ids and mask of the second input.
            labels: optional targets; when given, a cross-entropy loss is computed.
            **kwargs: accepted and ignored.

        Returns:
            SequenceClassifierOutput with `loss` (None when `labels` is None)
            and `logits`; `hidden_states` and `attentions` are always None.
        """
        # 1. Encode both inputs with the shared encoder.
        encoded_a = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        encoded_b = self.bert(input_ids=input_ids_b, attention_mask=attention_mask_b)

        # 2. Mask-aware mean pooling of each encoder output.
        vec_x = self.pooler(encoded_a.last_hidden_state, attention_mask)
        vec_y = self.pooler(encoded_b.last_hidden_state, attention_mask_b)

        # 3. Pair features: both vectors, their absolute difference and their product.
        pair_features = torch.cat(
            [vec_x, vec_y, torch.abs(vec_x - vec_y), vec_x * vec_y],
            dim=1,
        )

        # 4. Classification head.
        logits = self.classifier(pair_features)

        # 5. Loss, only when labels were supplied.
        if labels is None:
            loss = None
        else:
            criterion = nn.CrossEntropyLoss()
            loss = criterion(logits.view(-1, self.config.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=None,
            attentions=None,
        )

# Confirmation message printed once the class definitions in this cell have run.
print("Custom model class 'SiameseSciBERT' defined.")

Custom model class 'SiameseSciBERT' defined.


## データロードとトークナイズ


In [10]:
# --- 4. Load the dataset and tokenize it ---
print("Loading dataset...")
# Keep only rows that have both abstracts and a label, then store labels as ints.
df = (
    pd.read_csv(TRAINING_FILE)
    .dropna(subset=['abstract_a', 'abstract_b', 'label'])
    .astype({'label': int})
)

# preserve_index=False: as soon as dropna() removes a row the frame no longer
# has a default RangeIndex, and from_pandas would otherwise store that index
# as an extra `__index_level_0__` column in the dataset.
raw_dataset = Dataset.from_pandas(df, preserve_index=False)

# 80/20 split with a fixed seed; the held-out part is used as the validation set.
dataset_split = raw_dataset.train_test_split(test_size=0.2, seed=42)
dataset = DatasetDict({
    'train': dataset_split['train'],
    'validation': dataset_split['test']
})
print(f"Dataset loaded: {dataset}")

print("Initializing tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

def tokenize_siamese_function(examples):
    """Tokenize both abstracts of the given examples for the Siamese model.

    Args:
        examples: mapping with "abstract_a" and "abstract_b" entries; each
            entry is passed to the tokenizer as-is.

    Returns:
        dict with "input_ids" / "attention_mask" for abstract A and
        "input_ids_b" / "attention_mask_b" for abstract B. Abstract A uses the
        un-suffixed key names (renamed from "input_ids_a" / "attention_mask_a").
    """
    # Same tokenizer settings for both sides: pad and truncate to MAX_LENGTH.
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": MAX_LENGTH,
    }
    encoded_a = tokenizer(examples["abstract_a"], **tokenizer_options)
    encoded_b = tokenizer(examples["abstract_b"], **tokenizer_options)

    features = {}
    features["input_ids"] = encoded_a["input_ids"]
    features["attention_mask"] = encoded_a["attention_mask"]
    features["input_ids_b"] = encoded_b["input_ids"]
    features["attention_mask_b"] = encoded_b["attention_mask"]
    return features

print("Tokenizing dataset for Siamese model...")
# Tokenize in batches on 4 worker processes, drop the raw columns that are not
# model inputs, and rename the target column to "labels".
tokenized_datasets = (
    dataset
    .map(tokenize_siamese_function, batched=True, num_proc=4)
    .remove_columns(["abstract_a", "abstract_b", "data_paper_doi"])
    .rename_column("label", "labels")
)
# Return torch tensors when the datasets are indexed.
tokenized_datasets.set_format("torch")
print("Tokenization complete.")

Loading dataset...
Dataset loaded: DatasetDict({
    train: Dataset({
        features: ['abstract_a', 'abstract_b', 'label', 'data_paper_doi'],
        num_rows: 28252
    })
    validation: Dataset({
        features: ['abstract_a', 'abstract_b', 'label', 'data_paper_doi'],
        num_rows: 7063
    })
})
Initializing tokenizer...




Tokenizing dataset for Siamese model...


Map (num_proc=4):   0%|          | 0/28252 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/7063 [00:00<?, ? examples/s]

Tokenization complete.


## モデルのロードと訓練設定


In [11]:
# --- 5. Load the model and configure training ---
print(f"Loading custom model: {MODEL_CHECKPOINT}")
# Build the custom class from the checkpoint as a 2-class classifier
# (label 0 or 1), then move it to the selected device.
model = SiameseSciBERT.from_pretrained(MODEL_CHECKPOINT, num_labels=2)
model = model.to(device)
print("Custom model loaded.")

# Function that computes the evaluation metrics
def compute_metrics(eval_pred):
    """Compute accuracy, F1, precision and recall from an evaluation result.

    Args:
        eval_pred: a pair `(predictions, labels)`. `predictions` holds per-class
            scores with classes on axis 1; if it is a tuple of arrays, the
            first element is used (assumed to be the logits).

    Returns:
        dict with 'accuracy', 'f1', 'precision' and 'recall'. The last three
        use binary averaging.
    """
    predictions, labels = eval_pred
    # Some models return several prediction arrays; the class scores come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    preds = np.argmax(predictions, axis=1)
    # zero_division=0 yields the same 0.0 scores as the default when a class is
    # never predicted, but without emitting a warning on every evaluation.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='binary', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

# Training configuration
training_args = TrainingArguments(
    # Where checkpoints are written; nothing is pushed to the Hub.
    output_dir=OUTPUT_MODEL_DIR,
    push_to_hub=False,
    # Optimisation settings.
    num_train_epochs=EPOCHS,
    learning_rate=LEARNING_RATE,
    weight_decay=0.01,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    # Evaluate and save once per epoch, and reload the best checkpoint at the end.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Emit a training log entry every 100 steps.
    logging_steps=100,
)
print("Training arguments and metrics set.")

Loading custom model: allenai/scibert_scivocab_uncased


Some weights of SiameseSciBERT were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Custom model loaded.
Training arguments and metrics set.


## 訓練開始


In [12]:
# --- 6. Start training ---
# Wire the model, training arguments, tokenized splits, tokenizer and metric
# function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

print("\n--- Starting Model Training (Siamese Model) ---")
trainer.train()
print("--- Model Training Complete ---")


--- Starting Model Training (Siamese Model) ---


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

## モデルの保存


In [None]:
# --- 7. Save the model ---
best_model_path = os.path.join(OUTPUT_MODEL_DIR, "best_model")
print("Training complete. Saving best model...")
trainer.save_model(best_model_path)

# Per the volumes setting in docker-compose.yml, anything saved under
# /app/models/ inside this container is automatically mirrored (synced) to the
# /models/ folder on the local PC.
print(f"Model saved to {best_model_path}")

## 訓練結果の可視化


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from IPython.display import display

print("--- Visualizing Training Results ---")

# 1. Fetch the training history (log_history)
log_history = trainer.state.log_history

# 2. Convert the log entries to a pandas DataFrame
df_log = pd.DataFrame(log_history)

# Make sure every column read below exists. A run that was interrupted before
# the first training log or the first evaluation has no 'loss' / 'eval_*'
# entries, and indexing a missing column would raise KeyError before the
# "no evaluation" case could ever be reported.
for required_col in ('epoch', 'loss', 'eval_loss', 'eval_f1', 'eval_accuracy'):
    if required_col not in df_log.columns:
        df_log[required_col] = float('nan')

# 3. Split into training logs (loss) and validation logs (eval_loss etc.)
df_train = df_log[df_log['loss'].notna()].copy()
df_eval = df_log[df_log['eval_loss'].notna()].copy()

# Training rows keep their fractional epoch. Truncating them to int put every
# log of an epoch on the same x value and drew the training curve one epoch to
# the left of the validation points it should be compared with.
# Validation rows are shown as whole epochs (for display).
if not df_eval.empty:
    df_eval['epoch'] = df_eval['epoch'].round().astype(int)

# 4. Draw the charts (two plots stacked vertically)
# The style has to be selected BEFORE the figure is created: most style
# settings are read when the Figure/Axes are built, so calling style.use()
# after subplots() left this figure in the previously active style.
plt.style.use('seaborn-v0_8-whitegrid')
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 10), sharex=True)

# --- Plot 1: loss over time ---
sns.lineplot(data=df_train, x='epoch', y='loss', label='Training Loss', ax=ax1, marker='o')
sns.lineplot(data=df_eval, x='epoch', y='eval_loss', label='Validation Loss', ax=ax1, marker='o')
ax1.set_title('Training vs. Validation Loss', fontsize=16)
ax1.set_ylabel('Loss')
ax1.legend()

# --- Plot 2: evaluation metrics over time ---
sns.lineplot(data=df_eval, x='epoch', y='eval_f1', label='Validation F1-Score', ax=ax2, marker='o')
sns.lineplot(data=df_eval, x='epoch', y='eval_accuracy', label='Validation Accuracy', ax=ax2, marker='o')
ax2.set_title('Validation Metrics', fontsize=16)
ax2.set_ylabel('Score')
ax2.set_xlabel('Epoch')
ax2.legend()
ax2.set_ylim(0, 1)  # scores are shown on the full 0-1 (0%-100%) range

plt.tight_layout()
plt.show()

# --- Fix: changed how the best metrics are looked up ---
print("\n--- Best Model Evaluation Metrics (from validation set) ---")

# Uses df_eval, the validation-results DataFrame built earlier in this cell.
if df_eval.empty:
    print("No evaluation steps were completed.")
else:
    # The row with the smallest 'eval_loss' is treated as the best model.
    lowest_loss_idx = df_eval['eval_loss'].idxmin()
    best_run = df_eval.loc[lowest_loss_idx]

    print(f"Best Epoch (based on min eval_loss): {best_run['epoch']}")
    print(f"Best Validation Loss: {best_run['eval_loss']:.4f}")
    print(f"Best Validation F1: {best_run['eval_f1']:.4f}")
    print(f"Best Validation Accuracy: {best_run['eval_accuracy']:.4f}")