<a href="https://colab.research.google.com/github/jjbmsda/EnsembleModel/blob/main/EnsembleModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from torchvision.models import resnet18, densenet121
import torchaudio
from torchaudio.datasets import LIBRISPEECH
from torchaudio.transforms import MFCC
from sklearn.metrics import accuracy_score, roc_curve
import numpy as np
import os
import torch.nn.functional as F

# 사용자 정의 PadTrim 함수
def pad_trim(audio, target_length, pad_value=0):
    """Pad or trim the audio tensor to a fixed length."""
    length = audio.size(-1)
    if length > target_length:
        audio = audio[..., :target_length]
    elif length < target_length:
        pad_amount = target_length - length
        pad_shape = list(audio.shape[:-1]) + [pad_amount]
        padding = torch.full(pad_shape, pad_value, device=audio.device)
        audio = torch.cat([audio, padding], dim=-1)
    return audio

# 입력 크기 조정 함수
def pad_or_resize(audio, target_height=50, target_width=50):
    """오디오 데이터를 모델에 적합한 크기로 패딩 또는 리사이즈."""
    _, height, width = audio.shape
    if height < target_height or width < target_width:
        pad_height = max(0, target_height - height)
        pad_width = max(0, target_width - width)
        audio = F.pad(audio, (0, pad_width, 0, pad_height))
    elif height > target_height or width > target_width:
        audio = F.interpolate(audio.unsqueeze(0), size=(target_height, target_width), mode="bilinear", align_corners=False).squeeze(0)
    return audio

# LibriSpeechDataset 클래스
class LibriSpeechDataset(Dataset):
    def __init__(self, dataset, transform=None, target_length=None, label_to_index=None):
        self.dataset = dataset
        self.transform = transform
        self.target_length = target_length
        self.label_to_index = label_to_index

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        audio, sr, label, *_ = self.dataset[idx]
        if self.transform:
            audio = self.transform(audio)
        if self.target_length:
            audio = pad_trim(audio, self.target_length)
        audio = pad_or_resize(audio, target_height=50, target_width=50)
        if label not in self.label_to_index:
            raise ValueError(f"Label {label} not found in label_to_index")
        label_idx = self.label_to_index[label]
        label_tensor = torch.tensor(label_idx, dtype=torch.long)
        return audio, label_tensor

# MFCC 변환
mfcc_transform = MFCC(sample_rate=16000, n_mfcc=13, log_mels=True)

# 데이터 디렉토리 생성
os.makedirs("./data", exist_ok=True)

# LibriSpeech 데이터셋 다운로드 및 라벨 매핑 생성
train_data = LIBRISPEECH("./data", url="train-clean-100", download=True)
test_data = LIBRISPEECH("./data", url="test-clean", download=True)

# 라벨 매핑 생성
labels = set(label for _, _, label, *_ in train_data)  # 모든 라벨 수집
label_to_index = {label: idx for idx, label in enumerate(sorted(labels))}
index_to_label = {idx: label for label, idx in label_to_index.items()}

# 최대 길이 계산
max_length = 0
for audio, _, _, *_ in train_data:
    audio = audio[:, :80000]
    mfcc = mfcc_transform(audio)
    max_length = max(max_length, mfcc.shape[-1])
target_length = max(max_length, 50)

# 데이터셋 생성
train_dataset = LibriSpeechDataset(
    train_data,
    transform=mfcc_transform,
    target_length=target_length,
    label_to_index=label_to_index
)
test_dataset = LibriSpeechDataset(
    test_data,
    transform=mfcc_transform,
    target_length=target_length,
    label_to_index=label_to_index
)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False, num_workers=0)

# ResNet, DenseNet 및 Ensemble 모델 정의
class ResNetModel(nn.Module):
    def __init__(self, num_classes):
        super(ResNetModel, self).__init__()
        self.resnet = resnet18(pretrained=False, num_classes=num_classes)
        self.resnet.conv1 = nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.resnet.maxpool = nn.Identity()

    def forward(self, x):
        return self.resnet(x)

class DenseNetModel(nn.Module):
    def __init__(self, num_classes):
        super(DenseNetModel, self).__init__()
        self.densenet = densenet121(pretrained=False)
        self.densenet.features.conv0 = nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1, bias=False)
        self.densenet.features.pool0 = nn.Identity()
        self.densenet.classifier = nn.Linear(self.densenet.classifier.in_features, num_classes)

    def forward(self, x):
        return self.densenet(x)

class EnsembleModel(nn.Module):
    def __init__(self, resnet, densenet):
        super(EnsembleModel, self).__init__()
        self.resnet = resnet
        self.densenet = densenet

    def forward(self, x):
        resnet_out = self.resnet(x)
        densenet_out = self.densenet(x)
        return (resnet_out + densenet_out) / 2

# GPU 메모리 정리 및 모델 초기화
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_classes = len(labels)
resnet_model = ResNetModel(num_classes).to(device)
densenet_model = DenseNetModel(num_classes).to(device)
ensemble_model = EnsembleModel(resnet_model, densenet_model).to(device)

# 손실 함수 및 옵티마이저
criterion = nn.CrossEntropyLoss()
optimizer_resnet = optim.Adam(resnet_model.parameters(), lr=0.001)
optimizer_densenet = optim.Adam(densenet_model.parameters(), lr=0.001)
optimizer_ensemble = optim.Adam(ensemble_model.parameters(), lr=0.001)

# Mixed Precision 적용한 학습 함수
def train(model, dataloader, criterion, optimizer):
    model.train()
    total_loss = 0
    scaler = torch.cuda.amp.GradScaler()
    for inputs, labels in dataloader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        with torch.cuda.amp.autocast():
            outputs = model(inputs)
            loss = criterion(outputs, labels)
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        total_loss += loss.item()
    return total_loss / len(dataloader)

# 평가 함수
def evaluate(model, dataloader):
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for inputs, labels in dataloader:
            inputs, labels = inputs.to(device), labels.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            preds.extend(predicted.cpu().numpy())
            targets.extend(labels.cpu().numpy())
    acc = accuracy_score(targets, preds)
    return acc, preds, targets

# 학습 루프
epochs = 5
for epoch in range(epochs):
    resnet_loss = train(resnet_model, train_loader, criterion, optimizer_resnet)
    densenet_loss = train(densenet_model, train_loader, criterion, optimizer_densenet)
    ensemble_loss = train(ensemble_model, train_loader, criterion, optimizer_ensemble)

    resnet_acc, resnet_preds, resnet_targets = evaluate(resnet_model, test_loader)
    densenet_acc, densenet_preds, densenet_targets = evaluate(densenet_model, test_loader)
    ensemble_acc, ensemble_preds, ensemble_targets = evaluate(ensemble_model, test_loader)

    print(f"Epoch {epoch+1}/{epochs}")
    print(f"ResNet - Loss: {resnet_loss:.4f}, Accuracy: {resnet_acc:.4f}")
    print(f"DenseNet - Loss: {densenet_loss:.4f}, Accuracy: {densenet_acc:.4f}")
    print(f"Ensemble - Loss: {ensemble_loss:.4f}, Accuracy: {ensemble_acc:.4f}")


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


KeyError: 'HE HOPED THERE WOULD BE STEW FOR DINNER TURNIPS AND CARROTS AND BRUISED POTATOES AND FAT MUTTON PIECES TO BE LADLED OUT IN THICK PEPPERED FLOUR FATTENED SAUCE'