In [1]:
import os
import numpy as np
import librosa
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt


In [2]:
def extract_mel(file_path, sr=16000, n_mels=64):
    y, _ = librosa.load(file_path, sr=sr)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    mel_norm = (mel_db + 40) / 40  # 정규화: [-40, 0] dB → [0, 1]
    return mel_norm



In [6]:
def extract_mel(file_path, sr=16000, n_mels=64):
    y, _ = librosa.load(file_path, sr=sr)
    mel = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    mel_norm = (mel_db + 40) / 40
    return mel_norm

root_dir = r"D:\AI+XDL\dataset\fan"  # fan 폴더의 상위 경로

mel_data_array = []
label_array = []

for machine_id in os.listdir(root_dir):
    id_path = os.path.join(root_dir, machine_id)
    if not os.path.isdir(id_path):
        continue

    for status in ['normal', 'abnormal']:
        status_path = os.path.join(id_path, status)
        if not os.path.isdir(status_path):
            continue

        label = 0 if status == 'normal' else 1

        for file in os.listdir(status_path):
            if file.endswith(".wav"):
                file_path = os.path.join(status_path, file)
                mel = extract_mel(file_path)
                mel_data_array.append(mel)
                label_array.append(label)

mel_data_array = np.array(mel_data_array)
label_array = np.array(label_array)

print(f"Loaded {len(label_array)} samples")


Loaded 5550 samples


In [7]:
class MelSpectrogramDataset(Dataset):
    def __init__(self, mel_list, label_list):
        self.mel_list = mel_list
        self.label_list = label_list

    def __len__(self):
        return len(self.mel_list)

    def __getitem__(self, idx):
        mel = self.mel_list[idx]
        label = self.label_list[idx]

        mel_tensor = torch.tensor(mel, dtype=torch.float32)
        label_tensor = torch.tensor(label, dtype=torch.long)

        return mel_tensor, label_tensor


In [8]:
dataset = MelSpectrogramDataset(mel_data_array, label_array)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)


In [9]:
class CRNN(nn.Module):
    def __init__(self, input_shape, n_classes):
        super(CRNN, self).__init__()

        self.cnn = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

        dummy_input = torch.zeros(1, 1, *input_shape)
        cnn_out = self.cnn(dummy_input)
        _, c, f, t = cnn_out.shape
        self.rnn_input_size = f * c

        self.rnn = nn.LSTM(self.rnn_input_size, 64, batch_first=True, bidirectional=True)

        self.classifier = nn.Sequential(
            nn.Linear(64 * 2, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, n_classes)
        )

    def forward(self, x):
        x = self.cnn(x)
        x = x.permute(0, 3, 1, 2)  # (B, T, C, F)
        x = x.contiguous().view(x.shape[0], x.shape[1], -1)
        x, _ = self.rnn(x)
        x = x[:, -1, :]
        return self.classifier(x)


In [10]:
model = CRNN(input_shape=(64, mel_data_array.shape[2]), n_classes=2)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    for mel, label in dataloader:
        mel = mel.unsqueeze(1)  # (B, 1, 64, T)
        output = model(mel)
        loss = criterion(output, label)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss:.4f}")


Epoch 1/10, Loss: 100.0276
Epoch 2/10, Loss: 92.9578
Epoch 3/10, Loss: 81.8352
Epoch 4/10, Loss: 71.2486
Epoch 5/10, Loss: 60.7396
Epoch 6/10, Loss: 57.1009
Epoch 7/10, Loss: 49.9172
Epoch 8/10, Loss: 42.9956
Epoch 9/10, Loss: 42.6176
Epoch 10/10, Loss: 37.6059


In [11]:
torch.save(model.state_dict(), "crnn_mimii_fan.pth")


In [12]:
# 모델 구조 먼저 다시 선언
model = CRNN(input_shape=(64, mel_data_array.shape[2]), n_classes=2)
model.load_state_dict(torch.load("crnn_mimii_fan.pth"))
model.eval()  # 평가 모드


CRNN(
  (cnn): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (rnn): LSTM(512, 64, batch_first=True, bidirectional=True)
  (classifier): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=64, out_features=2, bias=True)
  )
)

In [13]:
def predict_audio(file_path, model, device="cpu"):
    mel = extract_mel(file_path)  # (64, T)
    
    if mel.shape[1] < 313:
        # 패딩 (너무 짧은 경우)
        pad_width = 313 - mel.shape[1]
        mel = np.pad(mel, ((0, 0), (0, pad_width)), mode='constant')
    elif mel.shape[1] > 313:
        # 자르기 (너무 긴 경우)
        mel = mel[:, :313]
        
    mel_tensor = torch.tensor(mel, dtype=torch.float32).unsqueeze(0).unsqueeze(0)  # (1, 1, 64, 313)

    with torch.no_grad():
        output = model(mel_tensor.to(device))
        pred = torch.argmax(output, dim=1).item()
        prob = torch.softmax(output, dim=1).squeeze().cpu().numpy()

    return pred, prob


In [None]:
# 예: 테스트할 오디오 파일 경로
test_path = "test/path"

pred_label, prob = predict_audio(test_path, model)

status = "이상(abnormal)" if pred_label == 1 else "정상(normal)"
print(f"예측 결과: {status}")
print(f"정상 확률: {prob[0]:.4f}, 이상 확률: {prob[1]:.4f}")


  y, _ = librosa.load(file_path, sr=sr)
	Deprecated as of librosa version 0.10.0.
	It will be removed in librosa version 1.0.
  y, sr_native = __audioread_load(path, offset, duration, dtype)


FileNotFoundError: [Errno 2] No such file or directory: 'D:/AI+XDL/dataset/fan/id_00/abnormal/id_00_abnormal_00000000.wav'