# Test

In [1]:
import torchaudio
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import os

In [2]:
# 全局參數設置
n_fft = 512
win_length = None  # 設置為 None 時，默認等於 n_fft
hop_length = n_fft // 2
n_mels = 40  # 確保這個值與模型定義時使用的值相同
sample_rate = 44100  # 請確保這個值正確

mel_spectrogram_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate = sample_rate,
    n_fft = n_fft,
    win_length = win_length,
    hop_length = hop_length,
    n_mels = n_mels,
    f_min = 0,
    f_max = sample_rate/2
)

### 音頻數據的準備與處理
你可以使用 `torchaudio` 來處理音頻數據，將其加載並轉換為頻譜圖。`torchaudio` 可以非常方便地處理音頻格式。

In [3]:
clean_data_dir = "../voice/data/"
mixed_data_dir = "../sound/"
clean_data = []
mixed_data = []
target_size = float('inf')

for file_name in os.listdir(mixed_data_dir):
    clean_file_name = file_name.split('_')[1].split('.')[0] + '.wav'
    waveform_clean, sample_rate = torchaudio.load(''.join([clean_data_dir, clean_file_name]))
    waveform_mixed, sample_rate = torchaudio.load(''.join([mixed_data_dir, file_name]))
    waveform_clean_mono = waveform_clean.mean(dim = 0, keepdim = True)
    waveform_mixed_mono = waveform_mixed.mean(dim = 0, keepdim = True)
    
    # 生成 Mel Spectrogram
    mel_spectrogram_clean = mel_spectrogram_transform(waveform_clean_mono)
    mel_spectrogram_mixed = mel_spectrogram_transform(waveform_mixed_mono)
    
    # 確保兩個頻譜圖具有相同的大小
    target_size = min(target_size, min(mel_spectrogram_clean.shape[-1], mel_spectrogram_mixed.shape[-1]))
    target_size = (target_size // 8) * 8  # 确保目标大小是8的倍数
    clean_data.append(mel_spectrogram_clean)
    mixed_data.append(mel_spectrogram_mixed)
    
    
for i in range(len(clean_data)):
    clean_data[i] = clean_data[i][..., :target_size]
    mixed_data[i] = mixed_data[i][..., :target_size]

print("data length:", len(clean_data))
print("data shape:", clean_data[0].shape)
print("data shape:", mixed_data[0].shape)

data length: 1100
data shape: torch.Size([1, 40, 376])
data shape: torch.Size([1, 40, 376])


### 構建 Denoise Autoencoder 模型
這是一個簡單的自編碼器的 PyTorch 模型範例，使用了卷積層來處理音頻的頻譜數據：

In [4]:
class DenoiseAutoencoder(nn.Module):
    def __init__(self):
        super(DenoiseAutoencoder, self).__init__()
        # 編碼器
        self.encoder = nn.Sequential(
            nn.Conv2d(1, 16, kernel_size=3, stride=2, padding=1),  # 修改這裡的in_channels為2
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU()
        )
        # 解碼器
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(32, 16, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(16, 1, kernel_size=3, stride=2, padding=1, output_padding=1),  # 修改這裡的out_channels為2
            nn.Sigmoid()  # Sigmoid將輸出值範圍限制在[0, 1]
        )
        
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x


### 準備數據
你可以將嘈雜音頻和乾淨音頻分別準備成訓練集的輸入和標籤。這裡的 `DataLoader` 用來批量加載數據。

In [5]:
class AudioDataset(Dataset):
    def __init__(self, mixed_data, clean_data):
        self.mixed_data = mixed_data
        self.clean_data = clean_data
    
    def __len__(self):
        return len(self.mixed_data)
    
    def __getitem__(self, idx):
        return self.mixed_data[idx], self.clean_data[idx]

# 假設已經有 noisy_data 和 clean_data 作為音頻數據的 Mel spectrogram
dataset = AudioDataset(mixed_data, clean_data)
dataloader = DataLoader(dataset, batch_size = len(mixed_data), shuffle = True)

### 訓練模型
我們可以使用均方誤差（MSE）作為損失函數，並使用Adam優化器來進行模型參數的更新。

In [6]:
# 確認是否有可用的 GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# 初始化模型、損失函數和優化器
model = DenoiseAutoencoder().to(device)  # 確保模型在設備上
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)  # learning rate

# 訓練過程
num_epochs = 50
best_loss = float('inf')  # 初始化最佳損失值

for epoch in range(num_epochs):
    model.train()  # 訓練模式
    for noisy, clean in dataloader:
        # 將數據移動到設備 (如果你使用 GPU)
        noisy, clean = noisy.to(device), clean.to(device)

        # 前向傳播
        outputs = model(noisy)
        loss = criterion(outputs, clean)
        
        # 反向傳播與優化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    # 儲存最佳模型
    if loss.item() < best_loss:
        best_loss = loss.item()
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': best_loss,
        }, 'best_model.pth')
    
    print(f'Epoch [{epoch+1:02d}/{num_epochs}], Loss: {loss.item():.4f}, Best Loss: {best_loss:.4f}')


Using device: cpu
Epoch [01/50], Loss: 5290.2378, Best Loss: 5290.2378
Epoch [02/50], Loss: 5288.1973, Best Loss: 5288.1973
Epoch [03/50], Loss: 5286.7041, Best Loss: 5286.7041
Epoch [04/50], Loss: 5285.3872, Best Loss: 5285.3872
Epoch [05/50], Loss: 5284.1001, Best Loss: 5284.1001
Epoch [06/50], Loss: 5282.8613, Best Loss: 5282.8613
Epoch [07/50], Loss: 5281.7007, Best Loss: 5281.7007
Epoch [08/50], Loss: 5280.7222, Best Loss: 5280.7222
Epoch [09/50], Loss: 5279.8457, Best Loss: 5279.8457
Epoch [10/50], Loss: 5279.1172, Best Loss: 5279.1172
Epoch [11/50], Loss: 5278.5967, Best Loss: 5278.5967
Epoch [12/50], Loss: 5278.1665, Best Loss: 5278.1665
Epoch [13/50], Loss: 5277.6533, Best Loss: 5277.6533
Epoch [14/50], Loss: 5277.3096, Best Loss: 5277.3096
Epoch [15/50], Loss: 5277.1333, Best Loss: 5277.1333
Epoch [16/50], Loss: 5277.0171, Best Loss: 5277.0171
Epoch [17/50], Loss: 5276.9341, Best Loss: 5276.9341
Epoch [18/50], Loss: 5276.8740, Best Loss: 5276.8740
Epoch [19/50], Loss: 5276.83

### 應用模型
當訓練結束後，你可以將新的嘈雜音頻輸入到模型中，輸出去噪後的乾淨音頻。

In [7]:
# 加載最佳模型
checkpoint = torch.load('best_model.pth')
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)  # 移動模型到設備（如果使用 GPU）

# 設置模型為評估模式
model.eval()

with torch.no_grad():
    # 加載測試音訊
    waveform_test, sample_rate = torchaudio.load("test_mixed.wav")
    
    # 如果需要將音頻轉換為單聲道
    waveform_test_mono = waveform_test.mean(dim=0, keepdim=True)
    
    # 將波形轉換為 Mel Spectrogram
    mel_spectrogram_test = mel_spectrogram_transform(waveform_test_mono.to(device))
    
    # 使用模型去噪 Mel Spectrogram
    denoised_mel = model(mel_spectrogram_test.unsqueeze(0))  # 加入批次維度 (batch size)
    
    # 移除批次維度並轉換回線性頻譜
    inverse_melscale = torchaudio.transforms.InverseMelScale(
        n_stft=(n_fft // 2 + 1),
        n_mels=n_mels,
        sample_rate=sample_rate
    )
    
    # linear_spec = inverse_melscale(denoised_mel.squeeze(0))  # 移除批次維度
    linear_spec = inverse_melscale(mel_spectrogram_test)
    
    # 創建幅度譜並使用 Griffin-Lim 重建波形
    magnitude_spec = linear_spec.unsqueeze(0)  # 添加回批次維度
    
    griffin_lim = torchaudio.transforms.GriffinLim(
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        power=1.0,  # 因為已經有幅度譜而不是功率譜
        n_iter=32
    )
    
    denoised_waveform = griffin_lim(magnitude_spec)
    
    # 保存去噪後的音訊
    if denoised_waveform.dim() == 1:
        denoised_waveform = denoised_waveform.unsqueeze(0)
        
    if denoised_waveform.dim() == 3:
        denoised_waveform = denoised_waveform.squeeze(0)
    
    torchaudio.save('denoised_output.wav', denoised_waveform.cpu(), sample_rate)  # 移回CPU以保存音頻


  checkpoint = torch.load('best_model.pth')
