In [2]:
import os
import numpy as np
import librosa
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt


In [3]:
def extract_mel(file_path, sr=16000, n_mels=64):
    """Load an audio file and return its scaled log-mel spectrogram.

    The file is loaded with librosa at sample rate ``sr``, converted to a
    mel power spectrogram with ``n_mels`` bands, expressed in dB relative to
    the clip's own maximum (so the loudest bin is 0 dB), and finally mapped
    through ``(dB + 40) / 40``.

    Scaling note: 0 dB maps to 1.0 and -40 dB maps to 0.0. Bins quieter than
    -40 dB map to negative values; ``librosa.power_to_db`` is called with its
    default ``top_db`` (documented as 80), so the output spans [-1, 1] rather
    than [0, 1].

    Args:
        file_path: path of the audio file to load.
        sr: sample rate passed to ``librosa.load`` and to the mel transform.
        n_mels: number of mel bands.

    Returns:
        The scaled log-mel spectrogram array produced by librosa/NumPy.
    """
    waveform, _ = librosa.load(file_path, sr=sr)
    power_spec = librosa.feature.melspectrogram(y=waveform, sr=sr, n_mels=n_mels)
    # Per-clip reference: dB values are relative to this clip's peak power.
    return (librosa.power_to_db(power_spec, ref=np.max) + 40) / 40



In [4]:
# `extract_mel` is defined in the preceding cell and reused here; redefining it
# in this cell would silently shadow that definition.

# Dataset root: each sub-folder is one machine ID containing `normal/` and
# `abnormal/` folders of .wav clips.
root_dir = r"D:\AI+XDL\dataset\fan"

mel_data_array = []
label_array = []

# os.listdir returns entries in arbitrary order; sorting keeps the sample order
# (and therefore anything indexed against it) reproducible across machines.
for machine_id in sorted(os.listdir(root_dir)):
    id_path = os.path.join(root_dir, machine_id)
    if not os.path.isdir(id_path):
        continue

    for status in ['normal', 'abnormal']:
        status_path = os.path.join(id_path, status)
        if not os.path.isdir(status_path):
            continue

        # Class index used by the classifier: 0 = normal, 1 = abnormal.
        label = 0 if status == 'normal' else 1

        for file in sorted(os.listdir(status_path)):
            if file.endswith(".wav"):
                file_path = os.path.join(status_path, file)
                mel = extract_mel(file_path)
                mel_data_array.append(mel)
                label_array.append(label)

# Fail here with a clear message instead of later on `mel_data_array.shape[2]`.
if not mel_data_array:
    raise RuntimeError(
        f"No .wav files found under {root_dir!r}; "
        "expected <machine_id>/<normal|abnormal>/*.wav"
    )

# np.stack requires every spectrogram to have the same shape and raises a
# ValueError otherwise, rather than producing a ragged/object array.
mel_data_array = np.stack(mel_data_array)
label_array = np.array(label_array)

print(f"Loaded {len(label_array)} samples")


Loaded 5550 samples


In [5]:
class MelSpectrogramDataset(Dataset):
    """Map-style dataset pairing spectrograms with integer class labels.

    Both containers are stored as given and indexed in parallel; the dataset
    length is the length of ``mel_list``.
    """

    def __init__(self, mel_list, label_list):
        self.mel_list, self.label_list = mel_list, label_list

    def __len__(self):
        return len(self.mel_list)

    def __getitem__(self, idx):
        """Return ``(spectrogram, label)`` as float32 / int64 tensors."""
        raw_mel, raw_label = self.mel_list[idx], self.label_list[idx]
        return (
            torch.tensor(raw_mel, dtype=torch.float32),
            torch.tensor(raw_label, dtype=torch.long),
        )


In [6]:
# Wrap the in-memory arrays in a Dataset and serve them in shuffled
# mini-batches of 32 samples.
dataset = MelSpectrogramDataset(mel_list=mel_data_array, label_list=label_array)
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)


In [7]:
class CRNN(nn.Module):
    """Convolutional-recurrent classifier for single-channel spectrograms.

    Pipeline: two Conv-BatchNorm-ReLU-MaxPool stages, a bidirectional LSTM
    run along the (pooled) time axis, and a small MLP head applied to the
    LSTM output at the last time step.

    Args:
        input_shape: ``(n_freq_bins, n_frames)`` of one input spectrogram.
            Only used at construction time to size the LSTM input.
        n_classes: number of output logits.
    """

    def __init__(self, input_shape, n_classes):
        super().__init__()
        hidden_size = 64  # LSTM hidden units per direction

        self.cnn = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, padding=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(2),

            nn.Conv2d(16, 32, kernel_size=3, padding=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(2)
        )

        # Probe the CNN output shape with a dummy input. This must run in eval
        # mode and without autograd: in training mode BatchNorm would fold the
        # all-zeros dummy batch into its running statistics before any real
        # data has been seen.
        was_training = self.cnn.training
        self.cnn.eval()
        with torch.no_grad():
            cnn_out = self.cnn(torch.zeros(1, 1, *input_shape))
        self.cnn.train(was_training)

        _, c, f, _ = cnn_out.shape
        # Each time step fed to the LSTM is the flattened (channels, freq) slice.
        self.rnn_input_size = f * c

        self.rnn = nn.LSTM(self.rnn_input_size, hidden_size, batch_first=True, bidirectional=True)

        self.classifier = nn.Sequential(
            nn.Linear(hidden_size * 2, 64),  # * 2: forward + backward directions
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, n_classes)
        )

    def forward(self, x):
        """Return class logits of shape ``(B, n_classes)`` for ``x`` of shape ``(B, 1, F, T)``."""
        x = self.cnn(x)            # (B, C, F', T') after the two 2x poolings
        x = x.permute(0, 3, 1, 2)  # (B, T', C, F')
        x = x.contiguous().view(x.shape[0], x.shape[1], -1)  # (B, T', C * F')
        x, _ = self.rnn(x)         # (B, T', 2 * hidden)
        # NOTE(review): this takes both directions at the last time step; for
        # the backward direction that is its state after seeing only the final
        # frame. Left as-is so existing checkpoints keep their behaviour.
        x = x[:, -1, :]
        return self.classifier(x)


In [14]:
# Model, optimiser and loss for the two-class (normal / abnormal) task.
# The time dimension is taken from the loaded spectrogram array.
model = CRNN(input_shape=(64, mel_data_array.shape[2]), n_classes=2)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()
num_epochs = 30

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0  # sum of per-batch mean losses over this epoch

    for mel, label in dataloader:
        # Insert the channel axis expected by Conv2d: (B, 64, T) -> (B, 1, 64, T).
        mel = mel[:, None]

        optimizer.zero_grad()
        output = model(mel)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {running_loss:.4f}")


Epoch 1/30, Loss: 98.6494
Epoch 2/30, Loss: 91.2964
Epoch 3/30, Loss: 80.4827
Epoch 4/30, Loss: 72.4535
Epoch 5/30, Loss: 64.7064
Epoch 6/30, Loss: 52.6553
Epoch 7/30, Loss: 45.0429
Epoch 8/30, Loss: 41.2857
Epoch 9/30, Loss: 37.9990
Epoch 10/30, Loss: 31.9977
Epoch 11/30, Loss: 30.9600
Epoch 12/30, Loss: 26.7861
Epoch 13/30, Loss: 23.5932
Epoch 14/30, Loss: 19.5062
Epoch 15/30, Loss: 18.1995
Epoch 16/30, Loss: 18.9687
Epoch 17/30, Loss: 15.6025
Epoch 18/30, Loss: 12.7435
Epoch 19/30, Loss: 11.4171
Epoch 20/30, Loss: 12.1144
Epoch 21/30, Loss: 8.8913
Epoch 22/30, Loss: 9.0950
Epoch 23/30, Loss: 6.2131
Epoch 24/30, Loss: 6.8407
Epoch 25/30, Loss: 18.2185
Epoch 26/30, Loss: 6.1346
Epoch 27/30, Loss: 3.5244
Epoch 28/30, Loss: 3.2880
Epoch 29/30, Loss: 7.8595
Epoch 30/30, Loss: 3.2597


In [15]:
# Persist only the learned parameters/buffers (state dict), not the module object.
torch.save(obj=model.state_dict(), f="crnn_mimii_fan.pth")


In [16]:
# Re-create the model architecture first, then restore the trained weights.
model = CRNN(input_shape=(64, mel_data_array.shape[2]), n_classes=2)
# map_location="cpu" makes loading independent of the device the checkpoint
# was saved from; inference below runs on CPU unless told otherwise.
model.load_state_dict(torch.load("crnn_mimii_fan.pth", map_location="cpu"))
model.eval()  # evaluation mode (Dropout off, BatchNorm uses running statistics)


CRNN(
  (cnn): Sequential(
    (0): Conv2d(1, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (rnn): LSTM(512, 64, batch_first=True, bidirectional=True)
  (classifier): Sequential(
    (0): Linear(in_features=128, out_features=64, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=64, out_features=2, bias=True)
  )
)

In [17]:
def predict_audio(file_path, model, device="cpu", n_frames=313):
    """Classify a single audio file with a trained model.

    The file's spectrogram (from ``extract_mel``) is zero-padded or cropped
    along its second axis to exactly ``n_frames`` columns, given batch and
    channel axes, and passed through ``model`` without gradient tracking.
    The model is put into eval mode for the call and its previous
    train/eval mode is restored afterwards.

    Args:
        file_path: path of the audio file to classify.
        model: module mapping a ``(1, 1, n_mels, n_frames)`` tensor to logits
            of shape ``(1, n_classes)``.
        device: device the input tensor is moved to. The model itself is not
            moved, so it must already live on this device.
        n_frames: number of time frames the input is padded/cropped to.
            Defaults to 313 — presumably the frame count of the training
            clips; confirm against ``mel_data_array.shape[2]``.

    Returns:
        ``(pred, prob)``: the arg-max class index as a Python int, and a
        NumPy array of softmax probabilities, one per class.
    """
    mel = extract_mel(file_path)

    frames = mel.shape[1]
    if frames < n_frames:
        # Too short: zero-pad at the end of the time axis.
        mel = np.pad(mel, ((0, 0), (0, n_frames - frames)), mode='constant')
    elif frames > n_frames:
        # Too long: keep only the first n_frames frames.
        mel = mel[:, :n_frames]

    # Add batch and channel axes: (n_mels, n_frames) -> (1, 1, n_mels, n_frames).
    mel_tensor = torch.tensor(mel, dtype=torch.float32).unsqueeze(0).unsqueeze(0)

    # Inference must not run in training mode: Dropout would be active and
    # BatchNorm would use (and update from) the statistics of this one sample.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model(mel_tensor.to(device))
            pred = torch.argmax(output, dim=1).item()
            prob = torch.softmax(output, dim=1).squeeze().cpu().numpy()
    finally:
        model.train(was_training)

    return pred, prob


In [None]:
# Running tallies of the predictions made over the folder.
normal_count = 0
abnormal_count = 0
total = 0

target_folder = "D:/AI+XDL/archive/dev_data_fan/train"

for filename in os.listdir(target_folder):
    if not filename.endswith(".wav"):
        continue

    path = os.path.join(target_folder, filename)
    try:
        pred, prob = predict_audio(path, model)
        label = "이상" if pred != 0 else "정상"
        print(f"{filename}: {label} (정상확률: {prob[0]:.3f}, 이상확률: {prob[1]:.3f})")

        # Class 0 counts as normal; anything else counts as abnormal.
        normal_count += int(pred == 0)
        abnormal_count += int(pred != 0)
        total += 1
    except Exception as e:
        # Best effort: report the failing file and carry on with the rest.
        print(f"[오류] {filename}: {e}")

# Print summary statistics of the predictions.
if total == 0:
    print("예측된 오디오가 없습니다.")
else:
    print("\n🔍 예측 결과 요약:")
    print(f"정상: {normal_count}개 ({normal_count / total * 100:.1f}%)")
    print(f"이상: {abnormal_count}개 ({abnormal_count / total * 100:.1f}%)")
    print(f"총 예측 수: {total}개")



00000000.wav: 이상 (정상확률: 0.000, 이상확률: 1.000)
00000001.wav: 이상 (정상확률: 0.000, 이상확률: 1.000)
00000002.wav: 이상 (정상확률: 0.019, 이상확률: 0.981)
00000003.wav: 이상 (정상확률: 0.000, 이상확률: 1.000)
00000004.wav: 이상 (정상확률: 0.000, 이상확률: 1.000)
00000005.wav: 이상 (정상확률: 0.000, 이상확률: 1.000)
00000006.wav: 이상 (정상확률: 0.001, 이상확률: 0.999)
00000007.wav: 이상 (정상확률: 0.000, 이상확률: 1.000)
00000008.wav: 이상 (정상확률: 0.001, 이상확률: 0.999)
00000009.wav: 이상 (정상확률: 0.000, 이상확률: 1.000)
00000010.wav: 이상 (정상확률: 0.001, 이상확률: 0.999)
00000011.wav: 이상 (정상확률: 0.005, 이상확률: 0.995)
00000012.wav: 이상 (정상확률: 0.000, 이상확률: 1.000)
00000013.wav: 이상 (정상확률: 0.000, 이상확률: 1.000)
00000014.wav: 이상 (정상확률: 0.001, 이상확률: 0.999)
00000015.wav: 이상 (정상확률: 0.003, 이상확률: 0.997)
00000016.wav: 이상 (정상확률: 0.000, 이상확률: 1.000)
00000017.wav: 이상 (정상확률: 0.000, 이상확률: 1.000)
00000018.wav: 이상 (정상확률: 0.000, 이상확률: 1.000)
00000019.wav: 이상 (정상확률: 0.008, 이상확률: 0.992)
00000020.wav: 이상 (정상확률: 0.013, 이상확률: 0.987)
00000021.wav: 이상 (정상확률: 0.000, 이상확률: 1.000)
00000022.wav: 이상 (정상확률: 0.002, 이