# create spectrogram images

In [29]:
import os
import glob
import numpy as np
import torch
import torchaudio
import torchaudio.transforms as T
import librosa
import librosa.display
import IPython.display as ipd
import matplotlib.pyplot as plt
from torch.utils.data import DataLoader


In [30]:
# Count the .wav recordings available for every split/class folder
train_healthy_paths = glob.glob('./SVD/train/healthy/*.wav')
train_pathology_paths = glob.glob('./SVD/train/pathology/*.wav')
test_healthy_paths = glob.glob('./SVD/test/healthy/*.wav')
test_pathology_paths = glob.glob('./SVD/test/pathology/*.wav')
print(
    f'train healthy   : {len(train_healthy_paths)} audios',
    f'train_pathology : {len(train_pathology_paths)} audios',
    f'test_healthy    : {len(test_healthy_paths)} audios',
    f'test_pathology  : {len(test_pathology_paths)} audios',
    sep='\n',
)

train healthy   : 532 audios
train_pathology : 762 audios
test_healthy    : 100 audios
test_pathology  : 100 audios


In [31]:
# Dataset definition
def load_audios(paths):
    """Load every audio file in `paths` into memory.

    Args:
        paths: iterable of audio file paths.

    Returns:
        A list with one `[waveform, sample_rate, name]` entry per path, where
        `waveform` and `sample_rate` come from `torchaudio.load` and `name` is
        the file name without its directory and extension.
    """

    def _load_one(path):
        # File stem, e.g. './a/b/123.wav' -> '123'
        stem = os.path.splitext(os.path.basename(path))[0]
        waveform, sample_rate = torchaudio.load(path)
        return [waveform, sample_rate, stem]

    return [_load_one(path) for path in paths]

In [32]:
# Build the in-memory datasets (healthy/pathology x train/test)
h_train, p_train, h_test, p_test = (
    load_audios(audio_paths)
    for audio_paths in (
        train_healthy_paths,
        train_pathology_paths,
        test_healthy_paths,
        test_pathology_paths,
    )
)

In [33]:
# One-clip-at-a-time loaders, kept in file order (no shuffling)
loader_h_train, loader_p_train, loader_h_test, loader_p_test = (
    DataLoader(audio_dataset, batch_size=1, shuffle=False)
    for audio_dataset in (h_train, p_train, h_test, p_test)
)

In [34]:
def create_melspectrogram_images(loader, label, t):
    """Render one mel-spectrogram PNG per audio clip yielded by `loader`.

    Args:
        loader: iterable of size-1 batches laid out as
            (waveform batch, sample-rate batch, name batch), e.g. a
            DataLoader wrapped around the output of `load_audios`.
        label: class folder name used in the output path.
        t: split folder name used in the output path.

    Images are written to ./SVD/melspectrograms/{t}/{label}/{name}.png
    (the directory is created if needed). Returns None.
    """
    out_dir = f'./SVD/melspectrograms/{t}/{label}'
    os.makedirs(out_dir, exist_ok=True)

    # Build the transform once per distinct sample rate instead of once per clip
    transforms_by_rate = {}

    for data in loader:
        waveform = data[0][0]
        # After DataLoader collation the sample rate arrives wrapped in a tensor;
        # the transform expects a plain int.
        sample_rate = int(data[1][0])
        name = data[2][0]

        # Down-mix multi-channel audio so the spectrogram below is always 2-D
        if waveform.dim() > 1 and waveform.size(0) > 1:
            waveform = waveform.mean(dim=0, keepdim=True)

        if sample_rate not in transforms_by_rate:
            transforms_by_rate[sample_rate] = T.MelSpectrogram(
                sample_rate=sample_rate, n_fft=2048, hop_length=512, n_mels=128
            )
        mel_spec = transforms_by_rate[sample_rate](waveform)

        # Drop only the channel axis; a bare squeeze() would also drop a
        # length-1 time axis and break imshow for very short clips.
        mel_db = librosa.power_to_db(mel_spec.squeeze(0).numpy(), ref=np.max)

        fig, ax = plt.subplots(figsize=(10, 4))
        ax.imshow(mel_db, aspect='auto', origin='lower')
        ax.axis('off')
        fig.tight_layout(pad=0)
        fig.savefig(os.path.join(out_dir, f'{name}.png'), bbox_inches='tight', pad_inches=0)
        # Close explicitly so figures do not accumulate across the loop
        plt.close(fig)

In [35]:
# Generate the spectrogram images for every split/class (takes about 5 minutes)
create_melspectrogram_images(loader_h_train, label='healthy', t='train')
create_melspectrogram_images(loader_p_train, label='pathology', t='train')
create_melspectrogram_images(loader_h_test, label='healthy', t='test')
create_melspectrogram_images(loader_p_test, label='pathology', t='test')

In [36]:
# Data augmentation
def augmentation(mel_spec_tensor):
    """Apply a random frequency mask followed by a random time mask.

    Args:
        mel_spec_tensor: tensor accepted by torchaudio's masking transforms
            (they treat the last two dimensions as frequency and time).

    Returns:
        The tensor after `FrequencyMasking(freq_mask_param=30)` and then
        `TimeMasking(time_mask_param=40)` have been applied.
    """
    masking_steps = (
        T.FrequencyMasking(freq_mask_param=30),  # frequency masking
        T.TimeMasking(time_mask_param=40),       # time masking
    )
    masked = mel_spec_tensor
    for apply_mask in masking_steps:
        masked = apply_mask(masked)
    return masked

def create_augmented_melspectrogram_images(image_paths, label, t, augmentation_ratio):
    """Write masked copies of spectrogram images next to the originals.

    Args:
        image_paths: list of image files to augment.
        label: class folder name used in the output path.
        t: split folder name used in the output path.
        augmentation_ratio: fraction of `image_paths` (taken from the front of
            the list) to augment; values <= 0 augment nothing, values >= 1
            augment every image.

    Each output is saved as
    ./SVD/melspectrograms/{t}/{label}/{name}_augmented.png. Returns None.
    """
    dir_aug = f'./SVD/melspectrograms/{t}/{label}'
    os.makedirs(dir_aug, exist_ok=True)

    # Clamp so a negative ratio means "nothing" rather than a negative slice bound
    augmented_count = max(0, int(len(image_paths) * augmentation_ratio))

    for image_path in image_paths[:augmented_count]:
        mel_spec = plt.imread(image_path)

        if mel_spec.ndim == 3 and mel_spec.shape[2] == 4:
            mel_spec = mel_spec[:, :, :3]  # RGBA -> RGB

        # HWC image -> (1, C, H, W) so the masks act on the image rows/columns
        mel_spec_tensor = torch.from_numpy(mel_spec).permute(2, 0, 1).unsqueeze(0).float()

        augmented_mel_spec = augmentation(mel_spec_tensor)
        # Remove only the batch axis that was added above, then go back to HWC
        augmented_mel_spec_np = augmented_mel_spec.squeeze(0).permute(1, 2, 0).numpy()

        name = os.path.splitext(os.path.basename(image_path))[0]

        # No cmap: matplotlib ignores colormaps for RGB(A) arrays
        plt.imsave(os.path.join(dir_aug, f'{name}_augmented.png'), augmented_mel_spec_np)

# Directories holding the original training images
healthy_image_dir = 'SVD/melspectrograms/train/healthy'
pathology_image_dir = 'SVD/melspectrograms/train/pathology'

# Paths of the original images only. Files ending in '_augmented.png' are
# skipped so that re-running this cell does not augment already-augmented
# images; sorting makes the processing order reproducible.
healthy_paths = sorted(
    os.path.join(healthy_image_dir, f)
    for f in os.listdir(healthy_image_dir)
    if f.endswith('.png') and not f.endswith('_augmented.png')
)
pathology_paths = sorted(
    os.path.join(pathology_image_dir, f)
    for f in os.listdir(pathology_image_dir)
    if f.endswith('.png') and not f.endswith('_augmented.png')
)

# Generate the augmented spectrogram images
create_augmented_melspectrogram_images(healthy_paths, 'healthy', 'train', augmentation_ratio=1)
create_augmented_melspectrogram_images(pathology_paths, 'pathology', 'train', augmentation_ratio=1)

In [37]:
# Count the spectrogram images available for every split/class folder
train_healthy_images = glob.glob('./SVD/melspectrograms/train/healthy/*.png')
train_pathology_images = glob.glob('./SVD/melspectrograms/train/pathology/*.png')
test_healthy_images = glob.glob('./SVD/melspectrograms/test/healthy/*.png')
test_pathology_images = glob.glob('./SVD/melspectrograms/test/pathology/*.png')
print(
    f'train healthy   : {len(train_healthy_images)} images',
    f'train_pathology : {len(train_pathology_images)} images',
    f'test_healthy    : {len(test_healthy_images)} images',
    f'test_pathology  : {len(test_pathology_images)} images',
    sep='\n',
)

train healthy   : 1064 images
train_pathology : 1524 images
test_healthy    : 100 images
test_pathology  : 100 images


# classification

In [38]:
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import transforms
from torchvision.datasets import ImageFolder

# Image folders: each class lives in its own sub-directory of the split root
train_path = './SVD/melspectrograms/train'
test_path = './SVD/melspectrograms/test'

# Training set: convert to tensor, then resize to 256x256
trainset = ImageFolder(
    root=train_path,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Resize((256, 256)),
    ]),
)
print(trainset)
print(f'\nclass : index\n{trainset.class_to_idx}')

# Test set: same preprocessing as the training set
testset = ImageFolder(
    root=test_path,
    transform=transforms.Compose([
        transforms.ToTensor(),
        transforms.Resize((256, 256)),
    ]),
)
print(testset)
print(f'\nclass : index\n{testset.class_to_idx}')

# Batched loaders: shuffled for training, fixed order for evaluation
train_dataloader = DataLoader(dataset=trainset, batch_size=32, shuffle=True)
test_dataloader = DataLoader(dataset=testset, batch_size=16, shuffle=False)


Dataset ImageFolder
    Number of datapoints: 2588
    Root location: ./SVD/melspectrograms/train
    StandardTransform
Transform: Compose(
               ToTensor()
               Resize(size=(256, 256), interpolation=bilinear, max_size=None, antialias=True)
           )

class : index
{'healthy': 0, 'pathology': 1}
Dataset ImageFolder
    Number of datapoints: 200
    Root location: ./SVD/melspectrograms/test
    StandardTransform
Transform: Compose(
               ToTensor()
               Resize(size=(256, 256), interpolation=bilinear, max_size=None, antialias=True)
           )

class : index
{'healthy': 0, 'pathology': 1}


In [39]:
# GPU: report whether this PyTorch build/machine supports the MPS backend
print(torch.backends.mps.is_built())
print(torch.backends.mps.is_available())

# Prefer MPS, then CUDA, and fall back to the CPU so the notebook also runs on
# machines without an accelerator instead of failing at the first .to(device).
if torch.backends.mps.is_available():
    device = torch.device("mps")
elif torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f'Using {device} device')

True
True
Using mps device


In [40]:
# 사전학습모델 불러오기
import numpy as np
from torchvision.models import resnet50, ResNet50_Weights, efficientnet_b0, EfficientNet_B0_Weights

# ResNet50: pretrained backbone (torchvision DEFAULT weights) with a new
# dropout + 2-class linear head, moved to the selected device
resnet50_model = resnet50(weights=ResNet50_Weights.DEFAULT)
resnet50_model.fc = nn.Sequential(nn.Dropout(p=0.5), nn.Linear(2048, 2))
resnet50_model = resnet50_model.to(device)

# EfficientNet-B0: same treatment, its classifier is replaced instead of `fc`
efficientnet_b0_model = efficientnet_b0(weights=EfficientNet_B0_Weights.DEFAULT)
efficientnet_b0_model.classifier = nn.Sequential(nn.Dropout(p=0.5), nn.Linear(1280, 2))
efficientnet_b0_model = efficientnet_b0_model.to(device)

# Shared loss function and one Adam optimizer per model
criterion = nn.CrossEntropyLoss()
resnet50_optimizer = optim.Adam(params=resnet50_model.parameters(), lr=0.0005)
efficientnet_b0_optimizer = optim.Adam(params=efficientnet_b0_model.parameters(), lr=0.0005)


In [41]:
# Shared training routine used for both backbones
def _train_model(model, optimizer, dataloader, criterion, device, num_epochs=10, log_interval=10):
    """Train `model` on `dataloader` and print running loss/accuracy.

    Args:
        model: network to optimize; it is switched to training mode first.
        optimizer: optimizer built over `model`'s parameters.
        dataloader: iterable yielding (images, labels) batches.
        criterion: loss function called as `criterion(outputs, labels)`.
        device: device the batches are moved to before the forward pass.
        num_epochs: number of passes over `dataloader`.
        log_interval: report (and reset) the running statistics every this
            many batches; the printed loss is the mean over those batches.
    """
    # The evaluation cells call .eval(); switch back so Dropout/BatchNorm act
    # as training layers even when this cell is re-run after an evaluation.
    model.train()

    for epoch in range(num_epochs):
        running_loss = 0.0
        correct = 0
        total = 0

        for i, (images, labels) in enumerate(dataloader):
            images, labels = images.to(device), labels.to(device)
            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            _, predicted = torch.max(outputs.detach(), 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

            if (i + 1) % log_interval == 0:
                accuracy = 100 * correct / total
                # Divide by the number of batches accumulated since the last report
                print(f'[epoch: {epoch+1} / batch: {i+1:3d}] loss: {running_loss/log_interval:.4f}, accuracy: {accuracy:.2f}%')
                running_loss = 0.0
                correct = 0
                total = 0

    print('Finished Training')


# ResNet50 training
_train_model(resnet50_model, resnet50_optimizer, train_dataloader, criterion, device)

# EfficientNet-B0 training
_train_model(efficientnet_b0_model, efficientnet_b0_optimizer, train_dataloader, criterion, device)

# Save the ResNet50 weights
resnet50_path = './SVD/resnet50_model.pth'
torch.save(resnet50_model.state_dict(), resnet50_path)

# Save the EfficientNet-B0 weights
efficientnet_b0_path = './SVD/efficientnet_b0_model.pth'
torch.save(efficientnet_b0_model.state_dict(), efficientnet_b0_path)


[epoch: 1 / batch:  10] loss: 0.6287, accuracy: 64.69%
[epoch: 1 / batch:  20] loss: 0.5090, accuracy: 74.06%
[epoch: 1 / batch:  30] loss: 0.4770, accuracy: 70.62%
[epoch: 1 / batch:  40] loss: 0.4609, accuracy: 79.38%
[epoch: 1 / batch:  50] loss: 0.3947, accuracy: 82.50%
[epoch: 1 / batch:  60] loss: 0.4033, accuracy: 82.50%
[epoch: 1 / batch:  70] loss: 0.3278, accuracy: 84.38%
[epoch: 1 / batch:  80] loss: 0.3431, accuracy: 84.38%
[epoch: 2 / batch:  10] loss: 0.2698, accuracy: 87.81%
[epoch: 2 / batch:  20] loss: 0.2144, accuracy: 90.62%
[epoch: 2 / batch:  30] loss: 0.2925, accuracy: 85.94%
[epoch: 2 / batch:  40] loss: 0.2419, accuracy: 93.12%
[epoch: 2 / batch:  50] loss: 0.3277, accuracy: 89.69%
[epoch: 2 / batch:  60] loss: 0.2834, accuracy: 87.19%
[epoch: 2 / batch:  70] loss: 0.2335, accuracy: 90.31%
[epoch: 2 / batch:  80] loss: 0.1880, accuracy: 93.75%
[epoch: 3 / batch:  10] loss: 0.1461, accuracy: 94.38%
[epoch: 3 / batch:  20] loss: 0.1259, accuracy: 95.94%
[epoch: 3 

In [42]:
from sklearn.metrics import confusion_matrix

# Get the ensemble predictions for a dataset and summarize them as a confusion matrix
def confusion(resnet50_model, efficientnet_b0_model, loader, resnet50_weight, efficientnet_b0_weight):
    """Confusion matrix of the weighted two-model ensemble on `loader`.

    Both models are put in eval mode (and left there). For each batch the
    softmax outputs of the two models are combined as
    `resnet50_weight * p_resnet + efficientnet_b0_weight * p_efficientnet`
    and the class with the largest combined score is the prediction.

    Args:
        resnet50_model: first model of the ensemble.
        efficientnet_b0_model: second model of the ensemble.
        loader: iterable yielding (images, labels) batches.
        resnet50_weight: weight applied to the first model's probabilities.
        efficientnet_b0_weight: weight applied to the second model's probabilities.

    Returns:
        A 2x2 array laid out as [[tn, fp], [fn, tp]] for class indices 0 and 1.
    """
    # Weights may arrive as numpy scalars (e.g. from np.arange); use plain
    # Python floats so the products below do not rely on numpy/torch interop.
    resnet50_weight = float(resnet50_weight)
    efficientnet_b0_weight = float(efficientnet_b0_weight)

    y_true = []
    ensemble_preds = []

    resnet50_model.eval()
    efficientnet_b0_model.eval()

    with torch.no_grad():
        for data in loader:
            images, labels = data[0].to(device), data[1]
            resnet50_outputs = resnet50_model(images)
            efficientnet_b0_outputs = efficientnet_b0_model(images)

            # Normalize the logits with softmax before mixing the two models
            resnet50_probabilities = nn.functional.softmax(resnet50_outputs, dim=1)
            efficientnet_b0_probabilities = nn.functional.softmax(efficientnet_b0_outputs, dim=1)

            ensemble_probabilities = (resnet50_weight * resnet50_probabilities) + (efficientnet_b0_weight * efficientnet_b0_probabilities)
            preds = torch.argmax(ensemble_probabilities, dim=1)

            y_true.extend(labels.tolist())
            ensemble_preds.extend(preds.tolist())

    # Fix the label set so the matrix is always 2x2, even when one class never
    # shows up among the true or predicted labels.
    cm = confusion_matrix(y_true, ensemble_preds, labels=[0, 1])
    return cm

# Ensemble weight search
def weight_search(resnet50_model, efficientnet_b0_model, loader):
    """Grid-search the ensemble weights that maximize accuracy on `loader`.

    The first model's weight is scanned over 0.30, 0.35, ..., 0.70 and the
    second model's weight is its complement to 1. Each candidate is scored
    with `confusion` followed by `metrics`; on ties the first candidate wins.

    Args:
        resnet50_model: first model of the ensemble.
        efficientnet_b0_model: second model of the ensemble.
        loader: iterable yielding (images, labels) batches.

    Returns:
        `(best_weights, best_metrics)` where `best_weights` is
        `(resnet50_weight, efficientnet_b0_weight)` and `best_metrics` is
        `(accuracy, recall, precision, f1)` of that candidate.
    """
    # Start below any reachable accuracy so that a result is always returned,
    # even if every candidate scores 0.
    best_accuracy = float('-inf')
    best_weights = None
    best_metrics = None

    # Weight range 0.3 ~ 0.7 in steps of 0.05; linspace gives an exact number of
    # points, unlike arange with a float step and a fudged stop value.
    weight_range = np.linspace(0.3, 0.7, 9)
    for resnet50_weight in weight_range:
        resnet50_weight = float(resnet50_weight)
        efficientnet_b0_weight = 1.0 - resnet50_weight
        # Evaluate the ensemble with the candidate weights
        cm = confusion(resnet50_model, efficientnet_b0_model, loader, resnet50_weight, efficientnet_b0_weight)
        accuracy, recall, precision, f1 = metrics(cm)
        # Keep the candidate with the highest accuracy seen so far
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_weights = (resnet50_weight, efficientnet_b0_weight)
            best_metrics = (accuracy, recall, precision, f1)

    return best_weights, best_metrics

# Metric computation
def metrics(cm):
    """Compute accuracy, recall, precision and F1 from a 2x2 confusion matrix.

    Args:
        cm: 2x2 confusion matrix laid out as [[tn, fp], [fn, tp]].

    Returns:
        `(accuracy, recall, precision, f1)` as Python floats. A ratio whose
        denominator is zero (e.g. precision when nothing was predicted
        positive) is reported as 0.0 instead of NaN.
    """
    tn, fp, fn, tp = (int(v) for v in np.asarray(cm).ravel())

    def _ratio(numerator, denominator):
        # Guard against empty classes / no positive predictions
        return numerator / denominator if denominator else 0.0

    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    recall = _ratio(tp, tp + fn)
    precision = _ratio(tp, tp + fp)
    f1 = _ratio(2 * recall * precision, recall + precision)

    return accuracy, recall, precision, f1

# Search for the best ensemble weights
# NOTE(review): the weights are selected on test_dataloader, so the metrics
# printed below are tuned on the same data they are reported for.
best_weights, best_metrics = weight_search(resnet50_model, efficientnet_b0_model, test_dataloader)
best_resnet50_weight = best_weights[0]
best_efficientnet_b0_weight = best_weights[1]
accuracy, recall, precision, f1 = best_metrics

# Recompute the confusion matrix with the best weights
cm = confusion(
    resnet50_model,
    efficientnet_b0_model,
    test_dataloader,
    best_resnet50_weight,
    best_efficientnet_b0_weight,
)

# Report the best weights, their confusion matrix and their metrics
print(f"Best Weights: ResNet50 = {best_resnet50_weight:.2f}, EfficientNet-B0 = {best_efficientnet_b0_weight:.2f}")
print(cm)
print(f'accuracy: {accuracy:.4f}, recall: {recall:.4f}, precision: {precision:.4f}, f1: {f1:.4f}')

Best Weights: ResNet50 = 0.45, EfficientNet-B0 = 0.55
[[84 16]
 [23 77]]
accuracy: 0.8050, recall: 0.7700, precision: 0.8280, f1: 0.7979


In [43]:
from sklearn.metrics import confusion_matrix

# confusion matrix
def confusion(resnet50_model, efficientnet_b0_model, loader, weights, efficientnet_b0_weight=None):
    """Confusion matrix of the weighted two-model ensemble on `loader`.

    Both models are put in eval mode (and left there). For each batch the
    softmax outputs of the two models are mixed with the given weights and
    the class with the largest combined score is the prediction.

    Args:
        resnet50_model: first model of the ensemble.
        efficientnet_b0_model: second model of the ensemble.
        loader: iterable yielding (images, labels) batches.
        weights: `[resnet50_weight, efficientnet_b0_weight]`; or, when
            `efficientnet_b0_weight` is also given, the ResNet50 weight alone.
        efficientnet_b0_weight: optional second weight. Accepting it keeps
            five-argument callers (such as `weight_search`) working after
            this definition replaces the earlier `confusion`.

    Returns:
        A 2x2 array laid out as [[tn, fp], [fn, tp]] for class indices 0 and 1.
    """
    if efficientnet_b0_weight is None:
        resnet50_weight, efficientnet_b0_weight = weights[0], weights[1]
    else:
        resnet50_weight = weights
    # Plain floats: the weights may be numpy scalars
    resnet50_weight = float(resnet50_weight)
    efficientnet_b0_weight = float(efficientnet_b0_weight)

    y_true = []
    ensemble_preds = []
    resnet50_model.eval()
    efficientnet_b0_model.eval()
    with torch.no_grad():
        for data in loader:
            images, labels = data[0].to(device), data[1]
            resnet50_outputs = resnet50_model(images)
            efficientnet_b0_outputs = efficientnet_b0_model(images)
            # Softmax turns the logits into probabilities before mixing
            resnet50_probabilities = nn.functional.softmax(resnet50_outputs, dim=1)
            efficientnet_b0_probabilities = nn.functional.softmax(efficientnet_b0_outputs, dim=1)
            ensemble_probabilities = (resnet50_weight * resnet50_probabilities) + (efficientnet_b0_weight * efficientnet_b0_probabilities)
            preds = torch.argmax(ensemble_probabilities, dim=1)
            y_true.extend(labels.tolist())
            ensemble_preds.extend(preds.tolist())
    # Fix the label set so the matrix is always 2x2, even when one class never
    # shows up among the true or predicted labels.
    cm = confusion_matrix(y_true, ensemble_preds, labels=[0, 1])
    return cm

# Confusion matrix of the ensemble for manually chosen weights
weights = [0.4, 0.6]  # [ResNet50, EfficientNet-B0]
cm = confusion(
    resnet50_model,
    efficientnet_b0_model,
    loader=test_dataloader,
    weights=weights,
)
print(cm)

# metrics
def metrics(cm):
    """Compute and print accuracy, recall, precision and F1 from a 2x2 confusion matrix.

    Args:
        cm: 2x2 confusion matrix laid out as [[tn, fp], [fn, tp]].

    Returns:
        `(accuracy, recall, precision, f1)` as Python floats. A ratio whose
        denominator is zero (e.g. precision when nothing was predicted
        positive) is reported as 0.0 instead of NaN.
    """
    tn, fp, fn, tp = (int(v) for v in np.asarray(cm).ravel())

    def _ratio(numerator, denominator):
        # Guard against empty classes / no positive predictions
        return numerator / denominator if denominator else 0.0

    accuracy = _ratio(tp + tn, tp + tn + fp + fn)
    recall = _ratio(tp, tp + fn)
    precision = _ratio(tp, tp + fp)
    f1 = _ratio(2 * recall * precision, recall + precision)
    print(f'accuracy: {accuracy:.4f}, recall: {recall:.4f}, precision: {precision:.4f}, f1: {f1:.4f}')
    return accuracy, recall, precision, f1

# Compute (and, via metrics, print) accuracy, recall, precision and F1 for `cm`
accuracy, recall, precision, f1 = metrics(cm)

[[83 17]
 [23 77]]
accuracy: 0.8000, recall: 0.7700, precision: 0.8191, f1: 0.7938
