In [5]:
import os
import glob
import numpy as np
import torch
import torchaudio
import torchaudio.transforms as T
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
import torchvision
from torch.utils.data import DataLoader


In [2]:
# Count the audio files in each split/class folder
train_healthy_paths = glob.glob('./SVD/train/healthy/*.wav')
train_pathology_paths = glob.glob('./SVD/train/pathology/*.wav')
test_healthy_paths = glob.glob('./SVD/test/healthy/*.wav')
test_pathology_paths = glob.glob('./SVD/test/pathology/*.wav')

# One report line per subset
print(
    f'train healthy   : {len(train_healthy_paths)} audios',
    f'train_pathology : {len(train_pathology_paths)} audios',
    f'test_healthy    : {len(test_healthy_paths)} audios',
    f'test_pathology  : {len(test_pathology_paths)} audios',
    sep='\n',
)

train healthy   : 532 audios
train_pathology : 762 audios
test_healthy    : 100 audios
test_pathology  : 100 audios


In [3]:
# Dataset definition
def load_audios(paths):
    """Load every audio file in ``paths`` with ``torchaudio.load``.

    Args:
        paths: Iterable of audio file paths.

    Returns:
        A list with one ``[waveform, sample_rate, name]`` entry per path, in
        input order. ``name`` is the file's base name without its extension.
    """
    def _load_one(path):
        # Strip the directory and the extension to get a short identifier.
        stem, _ext = os.path.splitext(os.path.basename(path))
        waveform, sample_rate = torchaudio.load(path)
        return [waveform, sample_rate, stem]

    return [_load_one(path) for path in paths]

In [4]:
# Build the in-memory datasets (train/test x healthy/pathology)
h_train, p_train, h_test, p_test = (
    load_audios(subset_paths)
    for subset_paths in (
        train_healthy_paths,
        train_pathology_paths,
        test_healthy_paths,
        test_pathology_paths,
    )
)

In [5]:
# Data loaders: one recording per batch, iterated in list order
_single_item_opts = dict(batch_size=1, shuffle=False)
loader_h_train = DataLoader(h_train, **_single_item_opts)
loader_p_train = DataLoader(p_train, **_single_item_opts)
loader_h_test = DataLoader(h_test, **_single_item_opts)
loader_p_test = DataLoader(p_test, **_single_item_opts)

In [6]:
def create_melspectrogram_images(loader, label, t):
    """Render one mel-spectrogram PNG per recording yielded by ``loader``.

    Each image is written to ``./SVD/melspectrograms/{t}/{label}/{name}.png``;
    the directory is created if it does not exist.

    Args:
        loader: Iterable of batches ``data`` where ``data[0][0]`` is the
            waveform tensor, ``data[1][0]`` the sample rate and ``data[2][0]``
            the recording name (i.e. batches of size 1).
        label: Class sub-folder name (e.g. ``'healthy'``).
        t: Split sub-folder name (e.g. ``'train'``).
    """
    out_dir = f'./SVD/melspectrograms/{t}/{label}'
    os.makedirs(out_dir, exist_ok=True)

    # The transform only depends on the sample rate, so build it once per
    # distinct rate instead of once per recording.
    transforms_by_rate = {}

    for data in loader:
        waveform = data[0][0]
        # A DataLoader batches the integer sample rate into a tensor; hand
        # MelSpectrogram a plain int instead.
        sample_rate = int(data[1][0])
        name = data[2][0]

        melspectrogram = transforms_by_rate.get(sample_rate)
        if melspectrogram is None:
            melspectrogram = T.MelSpectrogram(sample_rate=sample_rate, n_fft=2048, hop_length=512, n_mels=128)
            transforms_by_rate[sample_rate] = melspectrogram
        mel_spec = melspectrogram(waveform)

        # Collapse the channel axis explicitly. For mono input this equals the
        # former squeeze(); multi-channel input is averaged rather than
        # producing an array imshow cannot draw, and a single-frame clip keeps
        # its 2-D shape.
        mel_power = mel_spec.mean(dim=0) if mel_spec.dim() == 3 else mel_spec
        mel_db = librosa.power_to_db(mel_power.numpy(), ref=np.max)

        fig, ax = plt.subplots(figsize=(10, 4))
        try:
            ax.imshow(mel_db, aspect='auto', origin='lower')
            ax.axis('off')
            fig.tight_layout(pad=0)
            fig.savefig(f'{out_dir}/{name}.png', bbox_inches='tight', pad_inches=0)
        finally:
            # Always release the figure so a failure cannot leak open figures.
            plt.close(fig)

In [7]:
# Generate the spectrogram images (takes about 5 minutes)
for _loader, _label, _split in (
    (loader_h_train, 'healthy', 'train'),
    (loader_p_train, 'pathology', 'train'),
    (loader_h_test, 'healthy', 'test'),
    (loader_p_test, 'pathology', 'test'),
):
    create_melspectrogram_images(_loader, _label, _split)

In [8]:
# Data augmentation helpers
def augmentation(mel_spec_tensor, freq_mask_param=30, time_mask_param=40):
    """Apply frequency masking followed by time masking to a spectrogram tensor.

    Args:
        mel_spec_tensor: Tensor passed to ``T.FrequencyMasking`` and then to
            ``T.TimeMasking``.
        freq_mask_param: ``freq_mask_param`` given to ``T.FrequencyMasking``
            (default 30, the value previously hard-coded).
        time_mask_param: ``time_mask_param`` given to ``T.TimeMasking``
            (default 40, the value previously hard-coded).

    Returns:
        The masked tensor.

    Note:
        Per the torchaudio documentation the masks are drawn at random, so the
        result differs between calls unless the torch RNG is seeded.
    """
    # Frequency masking
    freq_masking = T.FrequencyMasking(freq_mask_param=freq_mask_param)
    # Time masking
    time_masking = T.TimeMasking(time_mask_param=time_mask_param)

    return time_masking(freq_masking(mel_spec_tensor))

def create_augmented_melspectrogram_images(image_paths, label, t, augmentation_ratio):
    """Write masked copies of spectrogram images next to the originals.

    For the first ``int(n * augmentation_ratio)`` usable source images (``n``
    being the number of usable sources), the image is read, any alpha channel
    is dropped, ``augmentation`` is applied, and the result is saved as
    ``./SVD/melspectrograms/{t}/{label}/{name}_augmented.png``.

    Paths whose file stem already ends in ``_augmented`` are ignored, so
    re-running on a directory listing that contains earlier outputs does not
    create ``*_augmented_augmented.png`` files.

    Args:
        image_paths: Sized collection of image file paths.
        label: Class sub-folder name.
        t: Split sub-folder name.
        augmentation_ratio: Fraction of the usable source images to augment.
    """
    out_dir = f'./SVD/melspectrograms/{t}/{label}'
    os.makedirs(out_dir, exist_ok=True)

    # Only original images are augmentation sources.
    source_paths = [
        path for path in image_paths
        if not os.path.splitext(os.path.basename(path))[0].endswith('_augmented')
    ]
    # Clamp at zero so a negative ratio selects nothing rather than slicing
    # from the end of the list.
    augmented_count = max(int(len(source_paths) * augmentation_ratio), 0)

    for image_path in source_paths[:augmented_count]:
        mel_spec = plt.imread(image_path)

        if mel_spec.ndim == 3 and mel_spec.shape[2] == 4:
            mel_spec = mel_spec[:, :, :3]  # RGBA -> RGB

        # (H, W, C) image -> (1, C, H, W) tensor for the masking transforms.
        mel_spec_tensor = torch.from_numpy(mel_spec).permute(2, 0, 1).unsqueeze(0).float()

        augmented_mel_spec = augmentation(mel_spec_tensor)
        # Remove only the batch axis; a bare squeeze() would also drop a
        # height or width of 1.
        augmented_mel_spec_np = augmented_mel_spec.squeeze(0).permute(1, 2, 0).numpy()

        name = os.path.splitext(os.path.basename(image_path))[0]

        plt.imsave(f'{out_dir}/{name}_augmented.png', augmented_mel_spec_np, cmap='viridis')

# Directories holding the original training images
healthy_image_dir = 'SVD/melspectrograms/train/healthy'
pathology_image_dir = 'SVD/melspectrograms/train/pathology'


def _original_png_paths(image_dir):
    """Return the sorted paths of the PNG files in ``image_dir`` that are not augmented copies."""
    # Augmented images are written into the same directory, so they must be
    # filtered out here; otherwise re-running this cell would augment them again.
    return [
        os.path.join(image_dir, f)
        for f in sorted(os.listdir(image_dir))
        if f.endswith('.png') and not f.endswith('_augmented.png')
    ]


# Lists of original image paths
healthy_paths = _original_png_paths(healthy_image_dir)
pathology_paths = _original_png_paths(pathology_image_dir)

# Generate the augmented spectrogram images
create_augmented_melspectrogram_images(healthy_paths, 'healthy', 'train', augmentation_ratio=1)
create_augmented_melspectrogram_images(pathology_paths, 'pathology', 'train', augmentation_ratio=1)

In [2]:
# Count the spectrogram images in each split/class folder
train_healthy_images = glob.glob('./SVD/melspectrograms/train/healthy/*.png')
train_pathology_images = glob.glob('./SVD/melspectrograms/train/pathology/*.png')
test_healthy_images = glob.glob('./SVD/melspectrograms/test/healthy/*.png')
test_pathology_images = glob.glob('./SVD/melspectrograms/test/pathology/*.png')

# One report line per subset
print(
    f'train healthy   : {len(train_healthy_images)} images',
    f'train_pathology : {len(train_pathology_images)} images',
    f'test_healthy    : {len(test_healthy_images)} images',
    f'test_pathology  : {len(test_pathology_images)} images',
    sep='\n',
)

train healthy   : 1064 images
train_pathology : 1524 images
test_healthy    : 100 images
test_pathology  : 100 images


In [11]:
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import transforms
from torchvision.datasets import ImageFolder

def _image_transform():
    """Preprocessing used for both splits: convert to tensor, then resize to 299x299."""
    return transforms.Compose([
        transforms.ToTensor(),
        transforms.Resize((299, 299)),
    ])


# trainset
train_path = './SVD/melspectrograms/train'
trainset = ImageFolder(root=train_path, transform=_image_transform())
print(trainset)
print(f'\nclass : index\n{trainset.class_to_idx}')

# testset
test_path = './SVD/melspectrograms/test'
testset = ImageFolder(root=test_path, transform=_image_transform())
print(testset)
print(f'\nclass : index\n{testset.class_to_idx}')

# dataloader: shuffle only the training data
train_dataloader = DataLoader(dataset=trainset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(dataset=testset, batch_size=16, shuffle=False)


Dataset ImageFolder
    Number of datapoints: 2588
    Root location: ./SVD/melspectrograms/train
    StandardTransform
Transform: Compose(
               ToTensor()
               Resize(size=(299, 299), interpolation=bilinear, max_size=None, antialias=True)
           )

class : index
{'healthy': 0, 'pathology': 1}
Dataset ImageFolder
    Number of datapoints: 200
    Root location: ./SVD/melspectrograms/test
    StandardTransform
Transform: Compose(
               ToTensor()
               Resize(size=(299, 299), interpolation=bilinear, max_size=None, antialias=True)
           )

class : index
{'healthy': 0, 'pathology': 1}


In [11]:
import torch

# Report whether a CUDA device is usable
cuda_ok = torch.cuda.is_available()
print(cuda_ok)

# Use the GPU when present, otherwise fall back to the CPU
device = torch.device("cuda") if cuda_ok else torch.device("cpu")
print(f'Using {device} device')


True
Using cuda device


In [14]:
# 사전학습모델 불러오기
import numpy as np
from torchvision.models import resnet50, ResNet50_Weights, efficientnet_b0, EfficientNet_B0_Weights, inception_v3, Inception_V3_Weights

def _binary_head(in_features):
    """Classifier head shared by all backbones: Dropout(0.5) then a 2-way linear layer."""
    return nn.Sequential(
        nn.Dropout(p=0.5),
        nn.Linear(in_features, 2)
    )


# ResNet50: swap the final fully connected layer for the binary head
resnet50_model = resnet50(weights=ResNet50_Weights.DEFAULT)
resnet50_model.fc = _binary_head(2048)

# EfficientNet-B0: swap the classifier for the binary head
efficientnet_b0_model = efficientnet_b0(weights=EfficientNet_B0_Weights.DEFAULT)
efficientnet_b0_model.classifier = _binary_head(1280)

# Inception-v3: swap the final fully connected layer for the binary head
inception_v3_model = inception_v3(weights=Inception_V3_Weights.DEFAULT)
inception_v3_model.fc = _binary_head(2048)


# Move the models to the selected device
resnet50_model = resnet50_model.to(device)
efficientnet_b0_model = efficientnet_b0_model.to(device)
inception_v3_model = inception_v3_model.to(device)

# Loss function and one Adam optimizer per model, all with the same learning rate
criterion = nn.CrossEntropyLoss()
_learning_rate = 0.0005
resnet50_optimizer = optim.Adam(resnet50_model.parameters(), lr=_learning_rate)
efficientnet_b0_optimizer = optim.Adam(efficientnet_b0_model.parameters(), lr=_learning_rate)
inception_v3_optimizer = optim.Adam(inception_v3_model.parameters(), lr=_learning_rate)


In [None]:
def _train_classifier(model, optimizer, loader, criterion, device, num_epochs=10, log_every=10):
    """Train ``model`` on ``loader`` and print windowed loss/accuracy.

    Args:
        model: Network to train. Its output is either a logits tensor or an
            object with a ``logits`` attribute (as Inception-v3 returns in
            training mode); only the main logits are used for the loss.
        optimizer: Optimizer bound to ``model``'s parameters.
        loader: Iterable of batches where ``data[0]`` holds the images and
            ``data[1]`` the integer labels.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Device the batches are moved to.
        num_epochs: Number of passes over ``loader``.
        log_every: Print the mean loss and accuracy of the last ``log_every``
            batches every ``log_every`` batches. Batches after the last full
            window of an epoch are trained on but not reported.
    """
    # confusion() puts the models in eval mode; make sure dropout/batch-norm
    # train (and Inception-v3 exposes `.logits`) even if this cell is re-run
    # after evaluation.
    model.train()

    for epoch in range(num_epochs):
        running_loss = 0.0
        correct = 0
        total = 0

        for i, data in enumerate(loader):
            images, labels = data[0].to(device), data[1].to(device)
            optimizer.zero_grad()
            outputs = model(images)
            logits = getattr(outputs, 'logits', outputs)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(logits.detach(), 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            if (i + 1) % log_every == 0:
                accuracy = 100 * correct / total
                # Average over the window actually accumulated (the
                # EfficientNet loop used to divide by 100 instead of 10).
                print(f'[epoch: {epoch+1} / batch: {i+1:3d}] loss: {running_loss/log_every:.4f}, accuracy: {accuracy:.2f}%')
                running_loss = 0.0
                correct = 0
                total = 0


# resnet training
_train_classifier(resnet50_model, resnet50_optimizer, train_dataloader, criterion, device)
print('Finished Training')

# efficient training
_train_classifier(efficientnet_b0_model, efficientnet_b0_optimizer, train_dataloader, criterion, device)
print('Finished Training')

# Inception-v3 training
_train_classifier(inception_v3_model, inception_v3_optimizer, train_dataloader, criterion, device)
print('Finished Training Inception-v3')

# Save the ResNet50 model
resnet50_path = './SVD/resnet50_model.pth'
torch.save(resnet50_model.state_dict(), resnet50_path)

# Save the EfficientNet-B0 model
efficientnet_b0_path = './SVD/efficientnet_b0_model2.pth'
torch.save(efficientnet_b0_model.state_dict(), efficientnet_b0_path)

# Save the Inception-v3 model
inception_v3_path = './SVD/inception_v3_model.pth'
torch.save(inception_v3_model.state_dict(), inception_v3_path)


In [None]:
weights = [0.25, 0.25, 0.5]  # ensemble weights for ResNet50, EfficientNet-B0, Inception-v3 (in that order)

In [None]:
from sklearn.metrics import confusion_matrix
# confusion matrix
def confusion(resnet50_model, efficientnet_b0_model, inception_v3_model, loader, weights):
    """Compute the confusion matrix of the weighted three-model ensemble.

    Each model's logits are turned into class probabilities with softmax, the
    probabilities are combined as
    ``weights[0] * resnet + weights[1] * efficientnet + weights[2] * inception``
    and the arg-max class is taken as the ensemble prediction.

    Args:
        resnet50_model, efficientnet_b0_model, inception_v3_model: The three
            classifiers. They are switched to eval mode and left that way.
        loader: Iterable of batches where ``data[0]`` holds the images and
            ``data[1]`` the integer labels.
        weights: Sequence with at least three numeric ensemble weights.

    Returns:
        The 2x2 matrix from ``sklearn.metrics.confusion_matrix`` with rows as
        true classes and columns as predicted classes, for labels 0 and 1.
    """
    y_true = []
    ensemble_preds = []

    for model in (resnet50_model, efficientnet_b0_model, inception_v3_model):
        model.eval()

    with torch.no_grad():
        for data in loader:
            images, labels = data[0].to(device), data[1].to(device)

            # Per-model class probabilities
            resnet50_probabilities = nn.functional.softmax(resnet50_model(images), dim=1)
            efficientnet_b0_probabilities = nn.functional.softmax(efficientnet_b0_model(images), dim=1)
            inception_v3_probabilities = nn.functional.softmax(inception_v3_model(images), dim=1)

            ensemble_probabilities = (
                (weights[0] * resnet50_probabilities)
                + (weights[1] * efficientnet_b0_probabilities)
                + (weights[2] * inception_v3_probabilities)
            )
            preds = torch.argmax(ensemble_probabilities, dim=1)

            y_true.extend(labels.tolist())
            ensemble_preds.extend(preds.tolist())

    # Fix the label set so the result is always 2x2, even when only one class
    # appears among the labels and predictions; callers unpack four cells.
    cm = confusion_matrix(y_true, ensemble_preds, labels=[0, 1])
    return cm


# Evaluate the weighted ensemble on the test loader and show its confusion matrix
cm = confusion(
    resnet50_model=resnet50_model,
    efficientnet_b0_model=efficientnet_b0_model,
    inception_v3_model=inception_v3_model,
    loader=test_dataloader,
    weights=weights,
)
print(cm)

# metrics
def metrics(cm):
    """Compute and print accuracy, recall, precision and F1 from a 2x2 confusion matrix.

    Args:
        cm: 2x2 confusion matrix (array or nested list) whose flattened order
            is ``tn, fp, fn, tp``.

    Returns:
        Tuple ``(accuracy, recall, precision, f1)``. A ratio whose denominator
        is zero (e.g. precision when nothing was predicted positive) is
        reported as 0.0 instead of NaN.

    Raises:
        ValueError: If ``cm`` does not contain exactly four cells.
    """
    # tolist() yields plain Python numbers, so the guarded divisions below
    # cannot emit numpy divide-by-zero warnings.
    tn, fp, fn, tp = np.asarray(cm).ravel().tolist()

    def _ratio(numerator, denominator):
        return numerator / denominator if denominator else 0.0

    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    recall = _ratio(tp, tp + fn)
    precision = _ratio(tp, tp + fp)
    f1 = _ratio(2 * recall * precision, recall + precision)
    print(f'accuracy: {accuracy:.4f}, recall: {recall:.4f}, precision: {precision:.4f}, f1: {f1:.4f}')
    return accuracy, recall, precision, f1

# Compute and print accuracy, recall, precision and F1 from the ensemble's confusion matrix
accuracy, recall, precision, f1 = metrics(cm)