# Test：使用 Denoise Autoencoder 對音頻 Mel 頻譜圖進行去噪

In [295]:
import torchaudio
import torch
import torch.nn.functional as F
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim
import os

In [None]:
# Global spectrogram settings shared by the cells below.
n_fft = 512
win_length = None  # None means "use n_fft as the window length"
hop_length = n_fft // 2
n_mels = 56  # must stay equal to the value the model was designed around
sample_rate = 44100  # make sure this matches the audio files

# Waveform -> Mel spectrogram transform covering 0 Hz up to the Nyquist frequency.
mel_spectrogram_transform = torchaudio.transforms.MelSpectrogram(
    sample_rate=sample_rate,
    n_fft=n_fft,
    win_length=win_length,
    hop_length=hop_length,
    n_mels=n_mels,
    f_min=0,
    f_max=sample_rate / 2,
)

### 音頻數據的準備與處理
你可以使用 `torchaudio` 來處理音頻數據，將其加載並轉換為頻譜圖。`torchaudio` 可以非常方便地處理音頻格式。

In [None]:
clean_data_dir = "../clean_data/"
mixed_data_dir = "../mixed_data/"
clean_data = []
mixed_data = []

# Smallest frame count seen across all files; stays None until the first pair
# is loaded. It must be initialised before the loop: reading it unassigned
# raises NameError on a fresh kernel.
target_size = None

# sorted() gives a reproducible sample order (os.listdir order is unspecified).
for file_name in sorted(os.listdir(mixed_data_dir)):
    # Each mixed file is expected to have a clean counterpart with the same name.
    # Per-file rates get their own names so the global `sample_rate` that
    # configured the Mel transform is not overwritten.
    waveform_clean, clean_sr = torchaudio.load(os.path.join(clean_data_dir, file_name))
    waveform_mixed, mixed_sr = torchaudio.load(os.path.join(mixed_data_dir, file_name))

    if clean_sr != sample_rate or mixed_sr != sample_rate:
        print(
            f"warning: {file_name} has sample rate clean={clean_sr}, mixed={mixed_sr}; "
            f"the Mel transform is configured for {sample_rate}"
        )

    # Down-mix to one channel: the model's first convolution takes a single
    # input channel and the inference cell down-mixes the same way. For audio
    # that is already mono this leaves the samples unchanged.
    waveform_clean = waveform_clean.mean(dim=0, keepdim=True)
    waveform_mixed = waveform_mixed.mean(dim=0, keepdim=True)

    # Compute the Mel spectrograms.
    mel_spectrogram_clean = mel_spectrogram_transform(waveform_clean)
    mel_spectrogram_mixed = mel_spectrogram_transform(waveform_mixed)

    # Track the shortest time axis so every sample can be cropped to one size.
    pair_frames = min(mel_spectrogram_clean.shape[-1], mel_spectrogram_mixed.shape[-1])
    target_size = pair_frames if target_size is None else min(target_size, pair_frames)

    clean_data.append(mel_spectrogram_clean)
    mixed_data.append(mel_spectrogram_mixed)

if not clean_data:
    raise RuntimeError(f"no audio files found in {mixed_data_dir!r}")

# Round down to a multiple of 8: the autoencoder halves the time axis three
# times and then doubles it three times, so only multiples of 8 come back out
# with the same length as the input.
target_size = (target_size // 8) * 8
if target_size == 0:
    raise ValueError("shortest spectrogram has fewer than 8 frames; nothing left after cropping")

# Crop every spectrogram to the common length along the time axis.
clean_data = [spec[..., :target_size] for spec in clean_data]
mixed_data = [spec[..., :target_size] for spec in mixed_data]

print("data length:", len(clean_data))
print("data shape:", clean_data[0].shape)
print("data shape:", mixed_data[0].shape)

### 構建 Denoise Autoencoder 模型
這是一個簡單的自編碼器的 PyTorch 模型範例，使用了卷積層來處理音頻的頻譜數據：

In [297]:
class DenoiseAutoencoder(nn.Module):
    """Convolutional autoencoder for single-channel 2-D inputs.

    Encoder: three 3x3, stride-2 convolutions (1 -> 16 -> 32 -> 64 channels),
    each followed by ReLU.
    Decoder: three 3x3, stride-2 transposed convolutions
    (64 -> 32 -> 16 -> 1 channels) with ReLU between them and a Sigmoid at the
    end, so every output value lies in [0, 1].
    """

    def __init__(self):
        super().__init__()
        down_cfg = dict(kernel_size=3, stride=2, padding=1)
        up_cfg = dict(kernel_size=3, stride=2, padding=1, output_padding=1)

        # Encoder: Conv2d + ReLU per stage; the input has exactly one channel.
        down_layers = []
        for c_in, c_out in ((1, 16), (16, 32), (32, 64)):
            down_layers.append(nn.Conv2d(c_in, c_out, **down_cfg))
            down_layers.append(nn.ReLU())
        self.encoder = nn.Sequential(*down_layers)

        # Decoder: mirrors the encoder; the last stage emits one channel and is
        # squashed by a Sigmoid instead of a ReLU.
        up_layers = [
            nn.ConvTranspose2d(64, 32, **up_cfg),
            nn.ReLU(),
            nn.ConvTranspose2d(32, 16, **up_cfg),
            nn.ReLU(),
            nn.ConvTranspose2d(16, 1, **up_cfg),
            nn.Sigmoid(),
        ]
        self.decoder = nn.Sequential(*up_layers)

    def forward(self, x):
        """Encode ``x`` and decode it again, returning the reconstruction."""
        return self.decoder(self.encoder(x))


### 準備數據
你可以將嘈雜音頻和乾淨音頻分別準備成訓練集的輸入和標籤。這裡的 `DataLoader` 用來批量加載數據。

In [298]:
class AudioDataset(Dataset):
    """Index-aligned pairs of (mixed, clean) samples.

    ``mixed_data`` supplies the inputs and ``clean_data`` the targets; item
    ``i`` is the tuple ``(mixed_data[i], clean_data[i])``.
    """

    def __init__(self, mixed_data, clean_data):
        self.mixed_data, self.clean_data = mixed_data, clean_data

    def __len__(self):
        # The dataset size is taken from the mixed (input) collection.
        return len(self.mixed_data)

    def __getitem__(self, idx):
        noisy_item = self.mixed_data[idx]
        clean_item = self.clean_data[idx]
        return noisy_item, clean_item

# Assumes `mixed_data` and `clean_data` already hold the Mel spectrograms
# (mixed = model input, clean = training target). Batches of 14 are drawn in
# shuffled order.
dataset = AudioDataset(mixed_data=mixed_data, clean_data=clean_data)
dataloader = DataLoader(dataset=dataset, batch_size=14, shuffle=True)

### 訓練模型
我們可以使用均方誤差（MSE）作為損失函數，並使用Adam優化器來進行模型參數的更新。

In [None]:
# Build the model, the loss function and the optimizer.
model = DenoiseAutoencoder()
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# NOTE(review): the model ends in a Sigmoid, so its outputs lie in [0, 1],
# while the targets are Mel spectrograms that are not normalised anywhere in
# this notebook. Confirm whether the targets should be scaled into [0, 1]
# (or the final activation changed); otherwise the MSE cannot reach zero for
# target values above 1.

# Training loop.
num_epochs = 100
for epoch in range(num_epochs):
    model.train()  # be explicit in case the model was switched to eval mode
    running_loss = 0.0
    num_batches = 0

    for noisy, clean in dataloader:
        # Forward pass.
        outputs = model(noisy)
        loss = criterion(outputs, clean)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # Without this guard an empty loader surfaces as a NameError on `loss`.
        raise RuntimeError("dataloader yielded no batches; check that the dataset is not empty")

    # Report the mean of the per-batch losses instead of only the last batch,
    # which is noisy and depends on the shuffle order.
    epoch_loss = running_loss / num_batches
    print(f'Epoch [{epoch+1:02d}/{num_epochs}], Loss: {epoch_loss:.4f}')

### 應用模型
當訓練結束後，你可以將新的嘈雜音頻輸入到模型中，輸出去噪後的乾淨音頻。

In [None]:
# Apply the trained model to a new noisy recording.
model.eval()  # switch the model to evaluation mode

# Keep the file's rate in its own variable so the global `sample_rate` used to
# configure the Mel transform is not overwritten.
waveform_test, test_sample_rate = torchaudio.load("test_mixed.wav")
mel_sample_rate = mel_spectrogram_transform.sample_rate
if test_sample_rate != mel_sample_rate:
    print(
        f"warning: test file sample rate is {test_sample_rate}, "
        f"but the Mel transform is configured for {mel_sample_rate}"
    )

# Down-mix to mono: (channels, n_samples) -> (1, n_samples).
waveform_test_mono = waveform_test.mean(dim=0, keepdim=True)
mel_spectrogram_test = mel_spectrogram_transform(waveform_test_mono)  # (1, n_mels, frames)
num_frames = mel_spectrogram_test.shape[-1]

# Add the batch dimension explicitly -> (1, 1, n_mels, frames) and run the
# forward pass without autograd bookkeeping (inference only).
with torch.no_grad():
    denoised_output = model(mel_spectrogram_test.unsqueeze(0))

print("去噪后的输出形状:", denoised_output.shape)

# NOTE(review): the model's Sigmoid bounds the output to [0, 1]; if the training
# targets were rescaled, the inverse scaling has to be applied here — confirm.

# Drop the batch dimension -> (1, n_mels, frames'). The decoder can return up to
# 7 frames more than the input when the frame count is not a multiple of 8, so
# crop back to the input length.
denoised_output = denoised_output.squeeze(0)[..., :num_frames].contiguous().float()

# Mel -> linear-frequency inverse. It must use the same filterbank settings as
# the forward Mel transform, so the rate comes from that transform rather than
# from the test file.
inverse_melscale = torchaudio.transforms.InverseMelScale(
    n_stft=(n_fft // 2) + 1,     # number of STFT frequency bins
    n_mels=n_mels,               # must equal the Mel-bin count of denoised_output
    sample_rate=mel_sample_rate,
    f_min=0,
    f_max=mel_sample_rate / 2,
)

# Run the inverse Mel transform.
reconstructed_stft = inverse_melscale(denoised_output)

# Reconstruct a waveform with Griffin-Lim, using the same STFT framing as the
# forward transform.
griffin_lim = torchaudio.transforms.GriffinLim(
    n_fft=n_fft,
    n_iter=32,
    win_length=win_length,
    hop_length=hop_length,
)
denoised_waveform = griffin_lim(reconstructed_stft)

# torchaudio.save expects a 2-D tensor (channels, n_samples).
if denoised_waveform.dim() == 1:
    denoised_waveform = denoised_waveform.unsqueeze(0).float()
    print("denoised_waveform shape:", denoised_waveform.shape)

# Make sure the tensor carries no autograd history before saving
# (detach() shares storage; it does not copy).
denoised_waveform = denoised_waveform.detach()

# Save at the test file's own rate: the STFT frames were computed on its
# samples, so the reconstructed signal is on the same time base.
torchaudio.save("denoised_output.wav", denoised_waveform, test_sample_rate)
print("去噪后的音频已保存为 'denoised_output.wav'")

# Show the model architecture.
print("\n模型架构:")
print(model)