In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
from torchaudio.transforms import MelSpectrogram
from scipy.io.wavfile import write
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

In [2]:
# Download and load the LibriSpeech "train-clean-100" split into ./librispeech_data
dataset = torchaudio.datasets.LIBRISPEECH(root='./librispeech_data', url="train-clean-100", download=True)

# Sampling rate (Hz) passed to the Mel-spectrogram transform below
sample_rate = 16000

# Mel-spectrogram transform definition
mel_spectrogram = MelSpectrogram(
    sample_rate=sample_rate,
    n_fft=2048,          # FFT size
    hop_length=512,      # hop length between successive frames
    n_mels=128,          # number of Mel frequency bins
    power=2.0            # exponent applied to the magnitude spectrogram (2.0 = squared)
)
# Inspect the first sample: its contents, its Python type, and its number of fields
print(dataset[0])
print(type(dataset[0]))
print(len(dataset[0]))
# Preprocessing
def preprocess(sample, max_length=754, transform=None, eps=1e-8):
    """Convert one dataset sample into a fixed-length, normalized spectrogram.

    Args:
        sample: Sequence whose first three items are
            ``(waveform, sample_rate, transcript)``; any further items are
            ignored. Only the waveform is used -- the sample's own rate is
            not checked against the transform's configuration.
        max_length: Target size of dimension 2 (the frame axis). Shorter
            spectrograms are zero-padded on the right, longer ones are
            truncated.
        transform: Callable mapping the waveform to a 3-D spectrogram
            tensor. Defaults to the module-level ``mel_spectrogram``.
        eps: Lower bound applied to the standard deviation so that a
            constant spectrogram (e.g. digital silence) normalizes to zeros
            instead of NaN/inf.

    Returns:
        Tensor with ``size(2) == max_length``, shifted by its global mean and
        divided by its global standard deviation (clamped to ``eps``).
    """
    waveform, _sr, _transcript, *_rest = sample
    if transform is None:
        # Resolved at call time so the notebook-level transform is picked up.
        transform = mel_spectrogram
    mel_spec = transform(waveform)

    n_frames = mel_spec.size(2)
    if n_frames < max_length:
        mel_spec = torch.nn.functional.pad(mel_spec, (0, max_length - n_frames))
    elif n_frames > max_length:
        mel_spec = mel_spec[:, :, :max_length]

    # Statistics are taken after padding, so padded zeros contribute to them.
    # Clamping leaves the result unchanged whenever std >= eps.
    std = mel_spec.std().clamp_min(eps)
    return (mel_spec - mel_spec.mean()) / std

# Apply preprocessing to every sample in the dataset (all results are held in a list, then stacked)
processed_data = [preprocess(sample) for sample in dataset]
mel_specs = torch.stack(processed_data)

# Split mel_specs into training and validation sets (20% held out, fixed random_state)
train_data, val_data = train_test_split(mel_specs, test_size=0.2, random_state=42)

# Wrap each split in a TensorDataset
train_dataset = TensorDataset(train_data)
val_dataset = TensorDataset(val_data)

# Build DataLoaders; only the training loader shuffles
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False)

(tensor([[-0.0065, -0.0055, -0.0062,  ...,  0.0033,  0.0005, -0.0095]]), 16000, 'CHAPTER ONE MISSUS RACHEL LYNDE IS SURPRISED MISSUS RACHEL LYNDE LIVED JUST WHERE THE AVONLEA MAIN ROAD DIPPED DOWN INTO A LITTLE HOLLOW FRINGED WITH ALDERS AND LADIES EARDROPS AND TRAVERSED BY A BROOK', 103, 1240, 0)
<class 'tuple'>
6


In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
device =  torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
class WaveNet(nn.Module):
    """Stack of dilated 1-D convolution blocks with a parallel 1x1-conv path.

    Each of the ``num_layers`` stages owns:
      * a block ``Conv1d(kernel_size=2, dilation=2, padding=1) -> BatchNorm1d
        -> ReLU -> Dropout(0.3)``, and
      * a 1x1 ``Conv1d`` on the residual path.
    With these kernel/dilation/padding values the time axis keeps its length.
    A final 1x1 convolution maps to ``out_channels``.

    Args:
        num_layers: Number of stages (0 yields just the final 1x1 conv).
        num_filters: Channel width produced by every stage.
        in_channels: Channel count of the input tensor.
        out_channels: Channel count of the output tensor.
    """

    def __init__(self, num_layers, num_filters, in_channels, out_channels):
        super().__init__()
        self.num_layers = num_layers
        self.num_filters = num_filters

        self.conv_layers = nn.ModuleList()
        self.residual_layers = nn.ModuleList()  # 1x1 convs for the residual path

        # NOTE(review): every stage uses the same dilation (2); a classic
        # WaveNet doubles it per layer -- confirm this is intentional.
        channels = in_channels
        for _ in range(num_layers):
            self.conv_layers.append(nn.Sequential(
                nn.Conv1d(in_channels=channels, out_channels=num_filters, kernel_size=2, dilation=2, padding=1),
                nn.BatchNorm1d(num_filters),
                nn.ReLU(),
                nn.Dropout(0.3)
            ))
            self.residual_layers.append(nn.Conv1d(in_channels=channels, out_channels=num_filters, kernel_size=1))
            channels = num_filters  # every later stage consumes num_filters channels

        # `channels` is the width that actually reaches the head: num_filters
        # after at least one stage, or the raw input width when num_layers == 0
        # (previously hard-wired to num_filters, which broke the 0-layer case).
        self.final_conv = nn.Conv1d(in_channels=channels, out_channels=out_channels, kernel_size=1)

    def forward(self, x):
        """Map ``(batch, in_channels, time)`` to ``(batch, out_channels, time)``."""
        residual = x
        for conv, res in zip(self.conv_layers, self.residual_layers):
            x = conv(x)
            # NOTE(review): the residual path is fed by the previous residual
            # output (a chain of 1x1 convs on the original input), not by the
            # previous stage's sum -- confirm this is the intended wiring.
            residual = res(residual)
            x = x + residual  # residual connection
        return self.final_conv(x)



In [5]:
# Same device selection as the earlier cell, re-evaluated here (GPU if available, else CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


In [8]:
# Model hyperparameters
num_layers = 15
num_filters = 128
in_channels = 128  # number of output channels of the Mel-spectrogram
out_channels = in_channels  # matches the Mel-spectrogram channel count
model = WaveNet(num_layers, num_filters, in_channels, out_channels).to(device)
# Mean absolute error between the model output and its target
criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=0.00005)
# Cosine-annealed learning rate with warm restarts, configured with T_0=5 and T_mult=2
scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2)

torch.Size([1, 128, 1024])

In [17]:
# Show which device the model's first parameter tensor lives on
print(next(model.parameters()).device)

cuda:0


In [15]:
def init_weights(m):
    """Xavier-uniform initialise a ``Conv1d``'s weight and zero its bias.

    Modules of any other type are left untouched, which makes this function
    suitable for ``nn.Module.apply``.
    """
    if not isinstance(m, nn.Conv1d):
        return

    nn.init.xavier_uniform_(m.weight)
    bias = m.bias
    if bias is not None:
        nn.init.zeros_(bias)

# Run init_weights on the model and each of its submodules
model.apply(init_weights)

WaveNet(
  (conv_layers): ModuleList(
    (0-14): 15 x Sequential(
      (0): Conv1d(128, 128, kernel_size=(2,), stride=(1,), padding=(1,), dilation=(2,))
      (1): BatchNorm1d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
      (3): Dropout(p=0.3, inplace=False)
    )
  )
  (residual_layers): ModuleList(
    (0-14): 15 x Conv1d(128, 128, kernel_size=(1,), stride=(1,))
  )
  (final_conv): Conv1d(128, 128, kernel_size=(1,), stride=(1,))
)

In [16]:
# Training loop
def run_epoch(model, dataloader, criterion, device, optimizer=None):
    """Run one full pass over ``dataloader`` and return the mean per-sample loss.

    The model is asked to reconstruct its own input (the target passed to
    ``criterion`` is the input batch itself).

    Args:
        model: Module to train or evaluate.
        dataloader: Yields one-element batches ``(tensor,)``; dimension 1 of
            the tensor is squeezed away before it is fed to the model.
        criterion: Loss called as ``criterion(output, batch)``.
        device: Device the batch is moved to.
        optimizer: When given, the model is put in train mode and updated
            after every batch; when ``None`` the model is put in eval mode
            and gradients are disabled.

    Returns:
        Sum of ``loss * batch_size`` over all batches divided by
        ``len(dataloader.dataset)``.
    """
    is_training = optimizer is not None
    model.train(is_training)

    running_loss = 0.0
    with torch.set_grad_enabled(is_training):
        # `batch` (not `mel_specs`) so the dataset tensor built during
        # preprocessing is not overwritten by the loop variable.
        for (batch,) in dataloader:
            batch = batch.to(device).squeeze(1)

            output = model(batch)
            loss = criterion(output, batch)

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            # Weight by batch size so a smaller final batch is averaged correctly.
            running_loss += loss.item() * batch.size(0)

    return running_loss / len(dataloader.dataset)


num_epochs = 100
for epoch in range(num_epochs):
    # Training phase
    epoch_train_loss = run_epoch(model, train_dataloader, criterion, device, optimizer)

    # Validation phase
    epoch_val_loss = run_epoch(model, val_dataloader, criterion, device)

    # Report training and validation loss for every epoch
    print(f'Epoch [{epoch+1}/{num_epochs}], Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}')

    # The scheduler is stepped once per epoch
    scheduler.step()


Epoch [1/100], Train Loss: 0.1335, Val Loss: 0.0799



KeyboardInterrupt



In [20]:
import numpy as np

def mel_spec_to_waveform(mel_spec, sample_rate=16000, n_fft=2048, hop_length=512, n_mels=None):
    """Reconstruct a waveform from a Mel-spectrogram.

    Args:
        mel_spec: Mel-spectrogram tensor whose second-to-last dimension is the
            Mel-bin axis; the transforms are moved to its device.
        sample_rate: Sampling rate used to build the inverse Mel filter bank
            (previously accepted but never forwarded).
        n_fft: FFT size; the linear spectrogram has ``n_fft // 2 + 1`` bins.
        hop_length: Hop length handed to Griffin-Lim.
        n_mels: Number of Mel bins. ``None`` (default) reads it from
            ``mel_spec``; for a 128-bin input this equals the former
            hard-coded value.

    Returns:
        Waveform tensor produced by Griffin-Lim.
    """
    if n_mels is None:
        n_mels = mel_spec.size(-2)

    # Undo the Mel projection: Mel bins -> linear-frequency bins.
    inverse_mel_transform = torchaudio.transforms.InverseMelScale(
        n_stft=(n_fft // 2 + 1),
        n_mels=n_mels,
        sample_rate=sample_rate,
    ).to(mel_spec.device)
    spec = inverse_mel_transform(mel_spec)

    # Estimate phase and invert the STFT with the Griffin-Lim algorithm.
    # NOTE(review): GriffinLim is left at its default `power`; confirm it
    # matches the power used when the Mel-spectrogram was computed.
    griffin_lim = torchaudio.transforms.GriffinLim(n_fft=n_fft, hop_length=hop_length).to(mel_spec.device)
    waveform = griffin_lim(spec)

    return waveform
def generate_audio(model, input_seq, sample_rate=16000):
    """Run ``model`` on ``input_seq`` and convert its output to a waveform.

    Args:
        model: Module whose output is treated as a Mel-spectrogram. It is
            switched to eval mode and stays in eval mode afterwards.
        input_seq: Input tensor passed to ``model`` unchanged.
        sample_rate: Sampling rate forwarded to ``mel_spec_to_waveform``
            (previously accepted but ignored).

    Returns:
        Waveform tensor returned by ``mel_spec_to_waveform``.
    """
    model.eval()
    with torch.no_grad():
        output = model(input_seq)

    # Convert the model output (Mel-spectrogram) back into a waveform.
    waveform = mel_spec_to_waveform(output, sample_rate=sample_rate)

    return waveform

# Prepare the model input (a single example of random noise).
# Uses the notebook-wide `device` instead of a hard-coded "cuda" so the cell
# also runs on CPU-only machines.
input_seq = torch.randn(1, 128, 754).to(device)  # shaped to match the model's input size

# Generate audio
generated_waveform = generate_audio(model, input_seq)

# Convert the generated audio to a numpy array
generated_waveform_np = generated_waveform.squeeze().cpu().numpy()

# Clip samples into the [-1.0, 1.0] range (values outside it are saturated, not rescaled)
generated_waveform_np = np.clip(generated_waveform_np, -1.0, 1.0)

# Convert to 16-bit PCM
generated_waveform_np = (generated_waveform_np * 32767).astype(np.int16)

# Save as an audio file
write('generated_audio100.wav', sample_rate, generated_waveform_np)

print("생성된 오디오 파일이 저장되었습니다.")

생성된 오디오 파일이 저장되었습니다.


In [22]:
from IPython.display import Audio
# Display an inline audio player for the generated file
Audio('generated_audio100.wav')