# Instrument Recognition (Signals and Systems)
This is an instrument recognition project based on the IRMAS dataset.
## Define the IRMASDataset class

In [9]:
import os
import torch
from torch.utils.data import Dataset
import librosa
import numpy as np

class IRMASDataset(Dataset):
    """Dataset that loads audio clips and returns fixed-size MFCC features.

    Every item is a ``(features, label)`` pair. Without a ``transform`` the
    features are a float32 tensor of shape ``(1, 40, 40)`` laid out as
    (channel, time frames, MFCC coefficients); the label is a scalar
    ``torch.long`` class index.
    """

    def __init__(self, file_paths, labels, transform=None, max_len=22050*3,
                 label_to_idx=None):
        """
        Args:
            file_paths (list): Paths of the audio files.
            labels (list): Label of each file, aligned with ``file_paths``.
            transform (callable, optional): Called with the ``(1, 40, 40)``
                NumPy MFCC array; its return value is used as the features.
            max_len (int): Fixed waveform length in samples. Longer clips are
                truncated, shorter clips are zero-padded at the end.
            label_to_idx (dict, optional): Existing label -> class-index
                mapping to reuse, e.g. the mapping of the training dataset when
                building a validation dataset. When omitted, the mapping is
                derived from the sorted distinct values of ``labels``.

        Raises:
            ValueError: If ``label_to_idx`` is given and does not contain
                every value in ``labels``.
        """
        self.file_paths = file_paths
        self.labels = labels
        self.transform = transform
        self.max_len = max_len

        if label_to_idx is None:
            self.unique_labels = sorted(set(labels))
            self.label_to_idx = {label: idx for idx, label in enumerate(self.unique_labels)}
        else:
            # Reusing a mapping keeps class indices identical across datasets
            # even when one split is missing some of the classes.
            missing = sorted(set(labels) - set(label_to_idx))
            if missing:
                raise ValueError(f"labels missing from label_to_idx: {missing}")
            self.label_to_idx = dict(label_to_idx)
            self.unique_labels = sorted(self.label_to_idx, key=self.label_to_idx.get)

    def __len__(self):
        """Return the number of audio files in the dataset."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(features, label_index)``."""
        audio_path = self.file_paths[idx]
        label = self.labels[idx]
        # Every clip is resampled to 22.05 kHz on load.
        y, sr = librosa.load(audio_path, sr=22050)

        # Force a fixed waveform length: truncate long clips, zero-pad short ones.
        y = librosa.util.fix_length(y, size=self.max_len)

        # Extract MFCC features; the result is (n_mfcc, time_frames).
        mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40)
        # Keep (or zero-pad to) exactly 40 time frames.
        # NOTE(review): with librosa's default hop length this keeps only the
        # start of a 3-second clip - confirm that this crop is intended.
        mfcc = librosa.util.fix_length(mfcc, size=40, axis=1)

        # Rearrange for the CNN: (channels, height, width).
        mfcc = mfcc.T  # shape: (time_frames, n_mfcc)
        mfcc = np.expand_dims(mfcc, axis=0)  # shape: (1, time_frames, n_mfcc)

        if self.transform:
            mfcc = self.transform(mfcc)
        else:
            mfcc = torch.tensor(mfcc, dtype=torch.float32)

        label = self.label_to_idx[label]
        label = torch.tensor(label, dtype=torch.long)

        return mfcc, label

## Data Loading & Preprocessing

In [10]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Collect audio file paths and their labels
def get_file_paths_and_labels(root_dir):
    """Collect ``.wav`` files under ``root_dir`` together with their labels.

    The directory tree is walked recursively in sorted order so the result is
    reproducible. The label of a file is determined as follows:

    * a file inside a sub-folder is labelled with the name of the folder that
      directly contains it (e.g. ``root/pia/[pia][cla]1346__3.wav`` -> ``pia``);
    * a file directly inside ``root_dir`` must be named like
      ``'01-violin-A-1.wav'`` and is labelled with the second ``-``-separated
      field (``violin``).

    Args:
        root_dir (str): Directory to scan.

    Returns:
        tuple: ``(file_paths, labels)``, two lists of equal length.

    Raises:
        OSError: If ``root_dir`` (or a sub-folder) cannot be listed, e.g.
            ``FileNotFoundError`` when ``root_dir`` does not exist.
        ValueError: If a ``.wav`` file directly inside ``root_dir`` has no
            ``-``-separated label field.
    """
    def _reraise(error):
        # os.walk swallows listing errors by default; surface them instead so a
        # wrong path is not mistaken for an empty dataset.
        raise error

    file_paths = []
    labels = []
    root_abs = os.path.abspath(root_dir)
    for dirpath, dirnames, filenames in os.walk(root_dir, onerror=_reraise):
        dirnames.sort()  # in-place sort makes os.walk visit sub-folders in order
        in_root = os.path.abspath(dirpath) == root_abs
        for filename in sorted(filenames):
            if not filename.endswith('.wav'):
                continue
            if in_root:
                # File name format: '01-violin-A-1.wav'
                parts = filename.split('-')
                if len(parts) < 2:
                    raise ValueError(
                        f"cannot derive a label from file name {filename!r}: "
                        "expected a name like '01-violin-A-1.wav' or a file "
                        "inside a per-class sub-folder"
                    )
                label = parts[1]
            else:
                label = os.path.basename(dirpath)
            file_paths.append(os.path.join(dirpath, filename))
            labels.append(label)
    return file_paths, labels

# Locate the training clips and the clips used for validation.
# NOTE(review): train_test_split is imported in this cell but never used -
# confirm whether a hold-out split of the training data was intended instead
# of reading a separate validation directory.
train_dir = './IRMAS/IRMAS-TrainingData'
val_dir = './IRMAS/IRMAS-TestingData-Part1'

train_files, train_labels = get_file_paths_and_labels(train_dir)
val_files, val_labels = get_file_paths_and_labels(val_dir)

# Fail fast with a readable message instead of a sampler error further down.
if not train_files:
    raise RuntimeError(f"no .wav files found in training directory {train_dir!r}")
if not val_files:
    raise RuntimeError(f"no .wav files found in validation directory {val_dir!r}")

# Build the datasets.
train_dataset = IRMASDataset(train_files, train_labels)
val_dataset = IRMASDataset(val_files, val_labels)

# Each dataset derives its label -> index mapping from its own labels, so the
# two mappings disagree whenever the label sets differ. Reuse the training
# mapping for validation so a class index means the same thing in both.
unseen_val_labels = sorted(set(val_labels) - set(train_dataset.label_to_idx))
if unseen_val_labels:
    raise ValueError(f"validation labels not present in training data: {unseen_val_labels}")
val_dataset.unique_labels = train_dataset.unique_labels
val_dataset.label_to_idx = train_dataset.label_to_idx

# Build the data loaders; only the training data is shuffled.
batch_size = 32
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

## CNN Model for MFCC Classification


In [11]:
import torch.nn as nn
import torch.nn.functional as F

class SimpleCNN(nn.Module):
    """Three-stage convolutional classifier for single-channel MFCC maps.

    Each stage is Conv(3x3, padding=1) -> BatchNorm -> ReLU -> MaxPool(2), so
    the spatial size is halved (rounding down) three times. The result is
    resampled to 5x5 before the fully connected head, which lets the network
    accept inputs of shape ``(batch, 1, H, W)`` with ``H, W >= 8``. For the
    40x40 inputs the head was sized for, the feature map is already 5x5 and
    the resampling step is a no-op.

    Args:
        num_classes (int): Number of output logits.
    """

    def __init__(self, num_classes):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(32)
        self.pool1 = nn.MaxPool2d(2, 2)

        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.bn2 = nn.BatchNorm2d(64)
        self.pool2 = nn.MaxPool2d(2, 2)

        self.conv3 = nn.Conv2d(64, 128, kernel_size=3, padding=1)
        self.bn3 = nn.BatchNorm2d(128)
        self.pool3 = nn.MaxPool2d(2, 2)

        self.dropout = nn.Dropout(0.3)

        # A 40x40 input is halved three times (40 -> 20 -> 10 -> 5), giving a
        # 128 x 5 x 5 feature map. The adaptive pool has no parameters (so
        # saved state dicts stay compatible) and maps any other spatial size
        # onto the same 5x5 grid instead of failing in fc1.
        self.adaptive_pool = nn.AdaptiveAvgPool2d((5, 5))
        self.fc1 = nn.Linear(128 * 5 * 5, 256)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        # Shapes below are for the default 40x40 input.
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))  # [batch, 32, 20, 20]
        x = self.pool2(F.relu(self.bn2(self.conv2(x))))  # [batch, 64, 10, 10]
        x = self.pool3(F.relu(self.bn3(self.conv3(x))))  # [batch, 128, 5, 5]
        x = self.adaptive_pool(x)  # [batch, 128, 5, 5] for any input size
        x = x.flatten(1)  # [batch, 128 * 5 * 5]
        x = self.dropout(F.relu(self.fc1(x)))
        x = self.fc2(x)
        return x

## Training & Validation


In [21]:
import torch
import torch.optim as optim
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from tqdm import tqdm

# Choose the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'使用设备: {device}')

# Build the network with one output per distinct training label.
num_classes = len(train_dataset.unique_labels)
model = SimpleCNN(num_classes=num_classes).to(device)

# Loss function and optimizer.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Number of epochs and the best validation accuracy seen so far.
num_epochs = 30
best_val_acc = 0.0

for epoch in range(num_epochs):
    # ---------------- training pass ----------------
    model.train()
    running_loss = 0.0
    all_preds, all_labels = [], []

    train_batches = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Training")
    for batch_x, batch_y in train_batches:
        batch_x = batch_x.to(device)
        batch_y = batch_y.to(device)

        optimizer.zero_grad()
        logits = model(batch_x)
        batch_loss = criterion(logits, batch_y)
        batch_loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so the epoch figure below is
        # a per-sample average.
        running_loss += batch_loss.item() * batch_x.size(0)
        predicted = torch.max(logits, 1)[1]
        all_preds.extend(predicted.cpu().numpy())
        all_labels.extend(batch_y.cpu().numpy())

    epoch_loss = running_loss / len(train_loader.dataset)
    epoch_acc = accuracy_score(all_labels, all_preds)

    # ---------------- validation pass ----------------
    model.eval()
    val_running_loss = 0.0
    val_all_preds, val_all_labels = [], []

    with torch.no_grad():
        val_batches = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} - Validation")
        for batch_x, batch_y in val_batches:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            logits = model(batch_x)
            batch_loss = criterion(logits, batch_y)

            val_running_loss += batch_loss.item() * batch_x.size(0)
            predicted = torch.max(logits, 1)[1]
            val_all_preds.extend(predicted.cpu().numpy())
            val_all_labels.extend(batch_y.cpu().numpy())

    # Per-sample validation loss plus support-weighted classification metrics.
    val_loss = val_running_loss / len(val_loader.dataset)
    val_acc = accuracy_score(val_all_labels, val_all_preds)
    val_f1 = f1_score(val_all_labels, val_all_preds, average='weighted')
    val_precision = precision_score(val_all_labels, val_all_preds, average='weighted')
    val_recall = recall_score(val_all_labels, val_all_preds, average='weighted')

    print(f'Epoch [{epoch+1}/{num_epochs}] '
          f'Train Loss: {epoch_loss:.4f} Train Acc: {epoch_acc:.4f} | '
          f'Val Loss: {val_loss:.4f} Val Acc: {val_acc:.4f} '
          f'Val Precision: {val_precision:.4f} Val Recall: {val_recall:.4f} Val F1: {val_f1:.4f}')

    # Checkpoint the weights whenever validation accuracy strictly improves.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), 'best_irmas_cnn.pth')
        print(f'最佳模型已保存，验证准确率: {best_val_acc:.4f}')

训练集文件数量: 6705
训练集示例:
文件: ./IRMAS/IRMAS-TrainingData/pia/[pia][cla]1346__3.wav, 标签: piano
文件: ./IRMAS/IRMAS-TrainingData/pia/[pia][cla]1291__1.wav, 标签: piano
文件: ./IRMAS/IRMAS-TrainingData/pia/[pia][jaz_blu]1490__3.wav, 标签: piano
文件: ./IRMAS/IRMAS-TrainingData/pia/027__[pia][nod][cla]1398__2.wav, 标签: piano
文件: ./IRMAS/IRMAS-TrainingData/pia/[pia][jaz_blu]1524__1.wav, 标签: piano


NameError: name 'train_test_split' is not defined

## Model Evaluation


In [20]:
# Rebuild the architecture and restore the best checkpoint written during training.
model = SimpleCNN(num_classes=num_classes).to(device)
# map_location moves the stored tensors onto the current device, so a
# checkpoint saved on a GPU machine also loads on a CPU-only machine.
model.load_state_dict(torch.load('best_irmas_cnn.pth', map_location=device))
model.eval()

# Example evaluation function
def evaluate(model, loader):
    """Run ``model`` over ``loader`` and print classification metrics.

    The model is switched to evaluation mode for the duration of the call and
    its previous training/evaluation mode is restored afterwards, so the
    result does not depend on the mode the caller left the model in.
    Batches are moved to the notebook-level ``device`` before the forward pass.

    Args:
        model (nn.Module): Classifier returning ``(batch, num_classes)`` logits.
        loader: Iterable yielding ``(inputs, labels)`` tensor batches.

    Prints accuracy and support-weighted precision, recall and F1 score.
    """
    all_preds = []
    all_labels = []
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for inputs, labels in tqdm(loader, desc="Evaluating"):
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                # The predicted class is the index of the largest logit.
                _, preds = torch.max(outputs, 1)
                all_preds.extend(preds.cpu().numpy())
                all_labels.extend(labels.cpu().numpy())
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    precision = precision_score(all_labels, all_preds, average='weighted')
    recall = recall_score(all_labels, all_preds, average='weighted')
    print(f'Accuracy: {acc:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}, F1 Score: {f1:.4f}')

# Evaluate the restored model on the validation set
evaluate(model, val_loader)

训练集文件数量: 6705
训练集示例:
文件: ./IRMAS/IRMAS-TrainingData/pia/[pia][cla]1346__3.wav, 标签: piano
文件: ./IRMAS/IRMAS-TrainingData/pia/[pia][cla]1291__1.wav, 标签: piano
文件: ./IRMAS/IRMAS-TrainingData/pia/[pia][jaz_blu]1490__3.wav, 标签: piano
文件: ./IRMAS/IRMAS-TrainingData/pia/027__[pia][nod][cla]1398__2.wav, 标签: piano
文件: ./IRMAS/IRMAS-TrainingData/pia/[pia][jaz_blu]1524__1.wav, 标签: piano


NameError: name 'train_test_split' is not defined