<a href="https://colab.research.google.com/github/haruto1586-f/YOLO-seg-git/blob/main/active_learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

データセットの準備とResNetの初期化

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader, Subset
import numpy as np

# Device selection: run on the GPU when CUDA is available, otherwise on the CPU.
_device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)

def get_resnet50_for_mnist():
    """Build a ResNet-50 adapted to MNIST and place it on ``device``.

    The network is created with ``weights=None`` (no pretrained weights are
    loaded). Its first convolution is replaced so that it accepts
    single-channel input, and its final fully connected layer is replaced by
    a 10-output linear layer.

    Returns:
        The customised model, already moved to the module-level ``device``.
    """
    net = models.resnet50(weights=None)

    # Stem convolution: same geometry as before, but with 1 input channel.
    net.conv1 = nn.Conv2d(
        in_channels=1,
        out_channels=64,
        kernel_size=(7, 7),
        stride=(2, 2),
        padding=(3, 3),
        bias=False,
    )

    # Classifier head: keep the incoming feature width, emit 10 class scores.
    net.fc = nn.Linear(net.fc.in_features, 10)

    net = net.to(device)
    return net

# Dataset preparation. Images are resized to 32x32 so that the ResNet can
# process them more easily, converted to tensors, then normalised with
# mean 0.5 and std 0.5.
_preprocessing_steps = [
    transforms.Resize(32),
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
]
transform = transforms.Compose(_preprocessing_steps)

# Both splits share the same storage root, download flag and preprocessing.
_mnist_kwargs = dict(root='./data', download=True, transform=transform)
full_train_dataset = datasets.MNIST(train=True, **_mnist_kwargs)
test_dataset = datasets.MNIST(train=False, **_mnist_kwargs)

# The test split is iterated in a fixed order (no shuffling).
test_loader = DataLoader(test_dataset, batch_size=256, shuffle=False)

100%|██████████| 9.91M/9.91M [00:00<00:00, 37.1MB/s]
100%|██████████| 28.9k/28.9k [00:00<00:00, 1.14MB/s]
100%|██████████| 1.65M/1.65M [00:00<00:00, 10.4MB/s]
100%|██████████| 4.54k/4.54k [00:00<00:00, 11.6MB/s]


エントロピーサンプリングとクラス指定手動サンプリングの実装

In [None]:
def entropy_sampling(model, unlabeled_indices, dataset, query_size):
    """Select the unlabeled samples whose predictions have the highest entropy.

    Args:
        model: Classifier whose output for a batch is a 2-D tensor of
            per-class scores (softmax is applied over dimension 1).
        unlabeled_indices: Sequence of indices into ``dataset`` that make up
            the unlabeled pool.
        dataset: Dataset yielding ``(input, label)`` pairs; the labels are
            ignored here.
        query_size: Maximum number of samples to select.

    Returns:
        A list with up to ``query_size`` entries of ``unlabeled_indices``,
        ordered from highest to lowest predictive entropy. The list is empty
        when the pool is empty or ``query_size`` is not positive.

    Note:
        The model is switched to eval mode and is left in that mode.
    """
    model.eval()

    # Degenerate requests are answered without running inference. A negative
    # ``query_size`` would otherwise slice from the end of the ranking and
    # silently return almost the whole pool.
    if query_size <= 0 or len(unlabeled_indices) == 0:
        return []

    # Send inputs to wherever the model's weights live rather than relying on
    # a module-level device; a parameter-free model receives inputs unmoved.
    first_param = next(model.parameters(), None)
    model_device = None if first_param is None else first_param.device

    pool_loader = DataLoader(Subset(dataset, unlabeled_indices), batch_size=256, shuffle=False)
    batch_entropies = []

    with torch.no_grad():
        for inputs, _ in pool_loader:
            if model_device is not None:
                inputs = inputs.to(model_device)
            probs = torch.softmax(model(inputs), dim=1)
            # Shannon entropy H = -sum(p * log(p)); the small epsilon keeps
            # log() finite when a probability is exactly zero.
            entropy = -torch.sum(probs * torch.log(probs + 1e-10), dim=1)
            batch_entropies.append(entropy.cpu().numpy())

    entropies = np.concatenate(batch_entropies)
    # Positions inside the pool, highest entropy first, cut to the query size.
    top_positions = np.argsort(entropies)[::-1][:query_size]

    # Translate pool positions back into the caller's dataset indices.
    return [unlabeled_indices[i] for i in top_positions]

def manual_class_sampling(unlabeled_indices, dataset, class_counts, rng=None):
    """Randomly draw a user-specified number of samples from each class.

    Args:
        unlabeled_indices: Sequence of indices into ``dataset`` that make up
            the unlabeled pool.
        dataset: Object exposing ``targets``, an indexable collection of class
            labels aligned with the dataset indices.
        class_counts: Mapping from class label to the number of samples
            requested for that class, e.g. ``{0: 10, 1: 20, 2: 5}``.
        rng: Optional ``numpy.random.Generator`` used for the draws. When
            omitted, NumPy's global random state is used.

    Returns:
        A list of entries of ``unlabeled_indices``, grouped in the iteration
        order of ``class_counts``. A class contributes fewer samples than
        requested when the pool does not contain enough of them, and nothing
        when the requested count is not positive.
    """
    # Gather the pool's labels with one vectorised lookup instead of indexing
    # ``dataset.targets`` once per pool element.
    pool = np.asarray(unlabeled_indices, dtype=np.intp)
    pool_labels = np.asarray(dataset.targets)[pool]

    sampler = np.random if rng is None else rng

    selected_indices = []
    for cls, count in class_counts.items():
        # Positions (within the pool) of every sample of this class.
        candidates = np.flatnonzero(pool_labels == cls)

        # Never ask for more samples than the pool can provide.
        take = min(count, len(candidates))
        if take > 0:
            chosen = sampler.choice(candidates, take, replace=False)
            selected_indices.extend(unlabeled_indices[i] for i in chosen)

    return selected_indices

学習ループと能動学習のメインサイクル

In [None]:
def train_model(model, train_loader, epochs=5, lr=0.001):
    """Train ``model`` with cross-entropy loss and the Adam optimiser.

    Args:
        model: Network to train; it is updated in place.
        train_loader: Iterable yielding ``(inputs, labels)`` batches.
        epochs: Number of full passes over ``train_loader``.
        lr: Learning rate passed to Adam.

    Returns:
        The same ``model`` object, left in training mode.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    # Batches are moved to the device holding the model's weights, so the
    # function does not depend on a module-level device. Adam has already
    # rejected a parameter-free model above, so a first parameter exists.
    model_device = next(model.parameters()).device

    model.train()
    for _ in range(epochs):
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(model_device), labels.to(model_device)
            optimizer.zero_grad()
            loss = criterion(model(inputs), labels)
            loss.backward()
            optimizer.step()
    return model

def evaluate_model(model, test_loader):
    """Compute the classification accuracy of ``model`` over ``test_loader``.

    Args:
        model: Classifier whose output for a batch is a 2-D tensor of
            per-class scores; the predicted class is the argmax over
            dimension 1.
        test_loader: Iterable yielding ``(inputs, labels)`` batches.

    Returns:
        The fraction of correctly classified samples as a float, or ``0.0``
        when ``test_loader`` yields no samples.

    Note:
        The model is switched to eval mode and is left in that mode.
    """
    model.eval()

    # Batches are moved to the device holding the model's weights; a
    # parameter-free model receives them unmoved.
    first_param = next(model.parameters(), None)
    model_device = None if first_param is None else first_param.device

    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            if model_device is not None:
                inputs, labels = inputs.to(model_device), labels.to(model_device)
            predicted = model(inputs).argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    # An empty loader would otherwise end in a ZeroDivisionError.
    if total == 0:
        return 0.0
    return correct / total

# --- Main active-learning settings ---
NUM_CYCLES = 5
INITIAL_TRAIN_SIZE = 100
QUERY_SIZE = 100

# Experiment switch.
# True: train from scratch every cycle, False: keep training the model from the previous cycle.
reset_model_each_cycle = True

# Sampling strategy switch ('entropy' or 'manual').
sampling_strategy = 'entropy'

# Per-class query counts used by the 'manual' strategy.
# Example setting: sample classes 0-4 heavily and classes 5-9 lightly.
manual_counts = {0: 15, 1: 15, 2: 15, 3: 15, 4: 15, 5: 5, 6: 5, 7: 5, 8: 5, 9: 5}

# Reject an unknown strategy before any training happens; otherwise the loop
# below would reach the index update with ``new_indices`` undefined (or stale
# from an earlier run).
if sampling_strategy not in ('entropy', 'manual'):
    raise ValueError(
        f"Unknown sampling_strategy {sampling_strategy!r}; expected 'entropy' or 'manual'"
    )

# Initial split: shuffle every training index, label the first
# INITIAL_TRAIN_SIZE of them and treat the remainder as the unlabeled pool.
all_indices = np.arange(len(full_train_dataset))
np.random.shuffle(all_indices)

labeled_indices = all_indices[:INITIAL_TRAIN_SIZE].tolist()
unlabeled_indices = all_indices[INITIAL_TRAIN_SIZE:].tolist()

# Model used for continued training (reused across cycles when not resetting).
model = get_resnet50_for_mnist()

for cycle in range(NUM_CYCLES):
    print(f"--- Cycle {cycle + 1} ---")
    print(f"Labeled data size: {len(labeled_indices)}")

    # Start from a freshly initialised model when resetting every cycle.
    if reset_model_each_cycle:
        model = get_resnet50_for_mnist()

    train_loader = DataLoader(Subset(full_train_dataset, labeled_indices), batch_size=32, shuffle=True)

    # Train and evaluate (few epochs, for demonstration purposes).
    model = train_model(model, train_loader, epochs=3)
    acc = evaluate_model(model, test_loader)
    print(f"Accuracy: {acc:.4f}")

    # Query new samples for the next cycle (skipped after the last cycle).
    if cycle < NUM_CYCLES - 1:
        if sampling_strategy == 'entropy':
            new_indices = entropy_sampling(model, unlabeled_indices, full_train_dataset, QUERY_SIZE)
        else:  # 'manual' -- the strategy name was validated above
            new_indices = manual_class_sampling(unlabeled_indices, full_train_dataset, manual_counts)

        # Move the queried indices from the unlabeled pool to the labeled set.
        # A set keeps the pool filter linear instead of scanning the query
        # list once per pool element.
        labeled_indices.extend(new_indices)
        queried = set(new_indices)
        unlabeled_indices = [idx for idx in unlabeled_indices if idx not in queried]