# Test

### 音頻數據的準備與處理
你可以使用 `torchaudio` 來處理音頻數據，將其加載並轉換為頻譜圖。`torchaudio` 可以非常方便地處理音頻格式。

In [76]:
import torchaudio
import torch
import torch.nn.functional as F

# 加載音頻
waveform_clean, sample_rate = torchaudio.load("test_clean.wav")
waveform_noisy, sample_rate = torchaudio.load("test_noisy.wav")
waveform_mixed, sample_rate = torchaudio.load("test_mixed.wav")

# 全局參數設置
n_fft = 1024
win_length = None  # 設置為 None 時，默認等於 n_fft
hop_length = 512
n_mels = 80  # 確保這個值與模型定義時使用的值相同
half_n_mels = n_mels // 2
sample_rate = 44100  # 請確保這個值正確

# 生成 Mel Spectrogram
mel_spectrogram_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate = sample_rate,
    n_fft = n_fft,
    win_length = win_length,
    hop_length = hop_length,
    n_mels = half_n_mels,
    f_min = 0,
    f_max = sample_rate/2
)
mel_spectrogram_clean = mel_spectrogram_transform(waveform_clean)
mel_spectrogram_noisy = mel_spectrogram_transform(waveform_noisy)
mel_spectrogram_mixed = mel_spectrogram_transform(waveform_mixed)

# 確保兩個頻譜圖具有相同的大小
target_size = (mel_spectrogram_clean.shape[-1] // 8) * 8  # 確保大小是8的倍數
mel_spectrogram_clean = mel_spectrogram_clean[..., :target_size]
mel_spectrogram_noisy = mel_spectrogram_noisy[..., :target_size]
mel_spectrogram_mixed = mel_spectrogram_mixed[..., :target_size]

# 如果仍然需要調整大小，可以使用插值
# if mel_spectrogram_clean.shape != mel_spectrogram_noisy.shape:
#     mel_spectrogram_noisy = F.interpolate(mel_spectrogram_noisy.unsqueeze(0), 
#                                           size=mel_spectrogram_clean.shape[-2:],
#                                           mode='bilinear',
#                                           align_corners=False).squeeze(0)

mel_spectrogram_noisy2 = mel_spectrogram_noisy
mel_spectrogram_clean2 = mel_spectrogram_clean
mel_spectrogram_mixed2 = mel_spectrogram_mixed

print("Clean Mel Spectrogram shape:", mel_spectrogram_clean.shape)
print("Noisy Mel Spectrogram shape:", mel_spectrogram_noisy.shape)
print("Mixed Mel Spectrogram shape:", mel_spectrogram_mixed.shape)

noisy_data = torch.cat((mel_spectrogram_noisy, mel_spectrogram_noisy), 1)
clean_data = torch.cat((mel_spectrogram_clean, mel_spectrogram_clean), 1)
mixed_data = torch.cat((mel_spectrogram_mixed, mel_spectrogram_mixed), 1)
# noisy_data = torch.cat((mel_spectrogram_noisy, mel_spectrogram_noisy2), 1)
# clean_data = torch.cat((mel_spectrogram_clean, mel_spectrogram_clean2), 1)
# mixed_data = torch.cat((mel_spectrogram_mixed, mel_spectrogram_mixed2), 1)

print("Noisy data shape:", noisy_data.shape)
print("Clean data shape:", clean_data.shape)
print("Mixed data shape:", mixed_data.shape)

# 確保這兩個形狀完全相同
assert noisy_data.shape == clean_data.shape and clean_data.shape == mixed_data.shape, "Shapes of noisy and clean data do not match"

Clean Mel Spectrogram shape: torch.Size([2, 40, 376])
Noisy Mel Spectrogram shape: torch.Size([2, 40, 376])
Mixed Mel Spectrogram shape: torch.Size([2, 40, 376])
Noisy data shape: torch.Size([2, 80, 376])
Clean data shape: torch.Size([2, 80, 376])
Mixed data shape: torch.Size([2, 80, 376])


### 構建 Denoise Autoencoder 模型
這是一個簡單的自編碼器的 PyTorch 模型範例，使用了卷積層來處理音頻的頻譜數據：

In [77]:
import torch.nn as nn

class DenoiseAutoencoder(nn.Module):
    def __init__(self):
        super(DenoiseAutoencoder, self).__init__()
        # 編碼器
        self.encoder = nn.Sequential(
            nn.Conv2d(2, 16, kernel_size=3, stride=2, padding=1),  # 修改這裡的in_channels為2
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv2d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU()
        )
        # 解碼器
        self.decoder = nn.Sequential(
            nn.ConvTranspose2d(64, 32, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(32, 16, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose2d(16, 2, kernel_size=3, stride=2, padding=1, output_padding=1),  # 修改這裡的out_channels為2
            nn.Sigmoid()  # Sigmoid將輸出值範圍限制在[0, 1]
        )
        
    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x


### 準備數據
你可以將嘈雜音頻和乾淨音頻分別準備成訓練集的輸入和標籤。這裡的 `DataLoader` 用來批量加載數據。

In [78]:
from torch.utils.data import DataLoader, Dataset

class AudioDataset(Dataset):
    def __init__(self, noisy_data, clean_data):
        self.noisy_data = noisy_data
        self.clean_data = clean_data
    
    def __len__(self):
        return len(self.noisy_data)
    
    def __getitem__(self, idx):
        return self.noisy_data[idx], self.clean_data[idx]

# 假設已經有 noisy_data 和 clean_data 作為音頻數據的 Mel spectrogram
# dataset = AudioDataset(noisy_data, clean_data)
# dataset = AudioDataset(mixed_data, clean_data)
dataset = AudioDataset(mel_spectrogram_mixed, mel_spectrogram_clean)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

### 訓練模型
我們可以使用均方誤差（MSE）作為損失函數，並使用Adam優化器來進行模型參數的更新。

In [79]:
import torch.optim as optim

# 初始化模型、損失函數和優化器
model = DenoiseAutoencoder()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# 訓練過程
num_epochs = 50
for epoch in range(num_epochs):
    for noisy, clean in dataloader:
        # 將數據搬到GPU，如果可用
        noisy = noisy.unsqueeze(1).float()  # 添加頻道維度
        clean = clean.unsqueeze(1).float()
        
        noisy = noisy.repeat(1, 2, 1, 1)
        clean = clean.repeat(1, 2, 1, 1)
        
        print(noisy.shape)
        print(clean.shape)
        
        # 前向傳播
        outputs = model(noisy)
        print(outputs.shape)
        loss = criterion(outputs, clean)
        
        # 反向傳播與優化
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

torch.Size([2, 2, 40, 376])
torch.Size([2, 2, 40, 376])
torch.Size([2, 2, 40, 376])
Epoch [1/50], Loss: 35717.0938
torch.Size([2, 2, 40, 376])
torch.Size([2, 2, 40, 376])
torch.Size([2, 2, 40, 376])
Epoch [2/50], Loss: 35708.4102
torch.Size([2, 2, 40, 376])
torch.Size([2, 2, 40, 376])
torch.Size([2, 2, 40, 376])
Epoch [3/50], Loss: 35702.4141
torch.Size([2, 2, 40, 376])
torch.Size([2, 2, 40, 376])
torch.Size([2, 2, 40, 376])
Epoch [4/50], Loss: 35699.1914
torch.Size([2, 2, 40, 376])
torch.Size([2, 2, 40, 376])
torch.Size([2, 2, 40, 376])
Epoch [5/50], Loss: 35696.7109
torch.Size([2, 2, 40, 376])
torch.Size([2, 2, 40, 376])
torch.Size([2, 2, 40, 376])
Epoch [6/50], Loss: 35694.8555
torch.Size([2, 2, 40, 376])
torch.Size([2, 2, 40, 376])
torch.Size([2, 2, 40, 376])
Epoch [7/50], Loss: 35693.3242
torch.Size([2, 2, 40, 376])
torch.Size([2, 2, 40, 376])
torch.Size([2, 2, 40, 376])
Epoch [8/50], Loss: 35692.4570
torch.Size([2, 2, 40, 376])
torch.Size([2, 2, 40, 376])
torch.Size([2, 2, 40, 37

### 應用模型
當訓練結束後，你可以將新的嘈雜音頻輸入到模型中，輸出去噪後的乾淨音頻。

In [80]:
# 应用模型
model.eval()  # 设置模型为评估模式

# 假设 denoised_output 的形状为 (batch_size, 80, 376)
denoised_output = model(mixed_data)

# 转换为 2D 张量
denoised_output_2d = denoised_output.view(denoised_output.size(0), -1)  # 转换为 [batch_size, 80 * 376]

print("去噪后的输出形状:", denoised_output.shape)
print("逆梅尔变换前的形状:", denoised_output_2d.shape)

# 进行逆梅尔变换
inverse_melscale = torchaudio.transforms.InverseMelScale(
    n_stft=(n_fft // 2) + 1,
    n_mels=n_mels,  # 确保与 denoised_output 一致
    sample_rate=sample_rate,
    f_min=0,
    f_max=sample_rate / 2
)

# 执行逆梅尔变换
reconstructed_stft = inverse_melscale(denoised_output)

# 使用 Griffin-Lim 重建波形
griffin_lim = torchaudio.transforms.GriffinLim(n_fft=n_fft, n_iter=32)
denoised_waveform = griffin_lim(reconstructed_stft)

# 将 denoised_waveform 转换为 2D 张量 (1, n_samples) 以符合保存要求
# 这里我们假设需要保存第一个批次的输出
if denoised_waveform.dim() == 3:
    denoised_waveform = denoised_waveform.squeeze(0)  # 如果是 [batch_size, 1, n_samples] 转为 [1, n_samples]

# 确保 denoised_waveform 是一个不需要梯度的张量
denoised_waveform = denoised_waveform.detach()  # 创建一个不需要梯度的副本

# 保存音频
torchaudio.save("denoised_output.wav", denoised_waveform, sample_rate)  # 确保输出为 [1, n_samples]
print("去噪后的音频已保存为 'denoised_output.wav'")

# 模型架构检查
print("\n模型架构:")
print(model)

去噪后的输出形状: torch.Size([2, 80, 376])
逆梅尔变换前的形状: torch.Size([2, 30080])
去噪后的音频已保存为 'denoised_output.wav'

模型架构:
DenoiseAutoencoder(
  (encoder): Sequential(
    (0): Conv2d(2, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (1): ReLU()
    (2): Conv2d(16, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (3): ReLU()
    (4): Conv2d(32, 64, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))
    (5): ReLU()
  )
  (decoder): Sequential(
    (0): ConvTranspose2d(64, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), output_padding=(1, 1))
    (1): ReLU()
    (2): ConvTranspose2d(32, 16, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), output_padding=(1, 1))
    (3): ReLU()
    (4): ConvTranspose2d(16, 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), output_padding=(1, 1))
    (5): Sigmoid()
  )
)
