In [5]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import DataLoader, Dataset
import os
from tqdm import tqdm

# Pick the compute device: CUDA when a usable GPU is present, CPU otherwise
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Define the autoencoder model
class AudioAutoencoder(nn.Module):
    """1-D convolutional autoencoder for waveforms laid out as (batch, 1, time).

    The encoder stacks four stride-2 ``Conv1d`` layers
    (1 -> 16 -> 32 -> 64 -> 128 channels). With kernel 3 / padding 1 each layer
    halves the time axis, rounding up, so the latent has ``ceil(time / 16)``
    frames. The decoder mirrors it with four stride-2 ``ConvTranspose1d``
    layers, each exactly doubling the time axis, and ends in ``Tanh`` so
    reconstructions lie in [-1, 1].
    """

    def __init__(self):
        super().__init__()

        # Encoder: every layer halves the temporal resolution
        self.encoder = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv1d(16, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv1d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv1d(64, 128, kernel_size=3, stride=2, padding=1),
            nn.ReLU()
        )

        # Decoder: output_padding=1 makes every layer exactly double the length
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(64, 32, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(32, 16, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(16, 1, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.Tanh()
        )

    def forward(self, x):
        """Reconstruct ``x`` and return a tensor with the same shape as ``x``.

        The decoder always emits ``16 * ceil(time / 16)`` samples, which is
        longer than the input whenever ``time`` is not a multiple of 16. The
        surplus tail is trimmed so the result can be compared element-wise
        with the input (e.g. by a reconstruction loss).
        """
        num_samples = x.shape[-1]
        reconstruction = self.decoder(self.encoder(x))
        return reconstruction[..., :num_samples]

    def encode(self, x):
        """Return the latent representation of ``x``: (batch, 128, ceil(time / 16))."""
        return self.encoder(x)

    def decode(self, x):
        """Map a latent tensor back to a waveform 16x longer along the time axis (untrimmed)."""
        return self.decoder(x)

# Custom dataset for audio files
class AudioDataset(Dataset):
    """Dataset of fixed-length, mono, peak-normalized audio segments.

    Every ``.wav`` / ``.mp3`` file located directly inside ``audio_dir`` is one
    item. Indexing loads the file with ``torchaudio.load`` and returns a tensor
    of shape ``(1, segment_length)``.

    Args:
        audio_dir: Directory to scan (not recursive).
        sample_rate: Rate that loaded audio is resampled to when it differs.
        segment_length: Number of samples in every returned segment.
    """

    # Extensions accepted during discovery; compared case-insensitively
    _EXTENSIONS = ('.wav', '.mp3')

    def __init__(self, audio_dir, sample_rate=16000, segment_length=16000):
        # os.listdir returns entries in arbitrary order; sort so that an index
        # refers to the same file on every run.
        self.audio_files = sorted(
            os.path.join(audio_dir, name)
            for name in os.listdir(audio_dir)
            if name.lower().endswith(self._EXTENSIONS)
        )
        self.sample_rate = sample_rate
        self.segment_length = segment_length

    def __len__(self):
        """Number of audio files discovered in the directory."""
        return len(self.audio_files)

    def __getitem__(self, idx):
        """Load file ``idx`` and return one prepared ``(1, segment_length)`` segment.

        For files longer than ``segment_length`` the segment position is drawn
        at random, so repeated indexing can return different crops.
        """
        waveform, sr = torchaudio.load(self.audio_files[idx])
        return self._prepare(waveform, sr)

    def _prepare(self, waveform, sr):
        """Downmix, resample, crop or pad, and peak-normalize a loaded waveform.

        Args:
            waveform: Tensor whose dim 0 is treated as channels and dim 1 as time.
            sr: Sample rate of ``waveform``.

        Returns:
            Tensor of shape ``(1, segment_length)``.
        """
        # Convert to mono if there is more than one channel
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample if needed
        if sr != self.sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.sample_rate)
            waveform = resampler(waveform)

        num_samples = waveform.shape[1]
        if num_samples > self.segment_length:
            # randint's upper bound is exclusive: +1 keeps the final window
            # reachable. Convert to a plain int before using it as a slice bound.
            last_start = num_samples - self.segment_length
            start = int(torch.randint(0, last_start + 1, (1,)))
            waveform = waveform[:, start:start + self.segment_length]
        else:
            # Right-pad with zeros if too short (a no-op when already exact)
            waveform = torch.nn.functional.pad(
                waveform, (0, self.segment_length - num_samples)
            )

        # Peak-normalize; a silent (all-zero) segment is returned as-is because
        # dividing by a zero peak would fill it with NaNs.
        peak = torch.max(torch.abs(waveform))
        if peak > 0:
            waveform = waveform / peak

        return waveform

# Training function
def train_autoencoder(model, dataloader, num_epochs=100, lr=0.001):
    """Train ``model`` to reconstruct its inputs, minimizing MSE with Adam.

    Args:
        model: ``nn.Module`` mapping a batch to a reconstruction of the same shape.
        dataloader: Iterable yielding input batches (tensors). It does not need
            to implement ``__len__``.
        num_epochs: Number of passes over ``dataloader``.
        lr: Adam learning rate.

    Returns:
        The same ``model`` instance, updated in place and left in training mode.
    """
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    # Send batches to wherever the model's weights live, so the function works
    # regardless of how the caller placed the model.
    model_device = next(model.parameters()).device

    # An earlier evaluation pass may have left the model in eval mode
    model.train()

    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0

        for data in tqdm(dataloader):
            data = data.to(model_device)

            # Forward pass: the input is its own reconstruction target
            outputs = model(data)
            loss = criterion(outputs, data)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        # Average over the batches actually seen; an empty loader reports nan
        # instead of raising ZeroDivisionError.
        mean_loss = running_loss / num_batches if num_batches else float('nan')
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {mean_loss:.4f}')

    return model

# Generate new audio by interpolating in latent space
def generate_audio(model, audio1, audio2, num_steps=10):
    """Decode a straight-line latent path running from ``audio1`` to ``audio2``.

    Both inputs are encoded with ``model.encode``, ``num_steps`` evenly spaced
    blends of the two latents are formed, and each blend is decoded with
    ``model.decode``.

    Args:
        model: Module exposing ``encode`` and ``decode``.
        audio1: Input tensor at the start of the path (weight 0).
        audio2: Input tensor at the end of the path (weight 1); its latent must
            have the same shape as that of ``audio1``.
        num_steps: Number of samples to generate, endpoints included.

    Returns:
        List of ``num_steps`` CPU tensors ordered from the reconstruction of
        ``audio1`` (first) to the reconstruction of ``audio2`` (last).
    """
    # Remember the current mode so a model that is mid-training is not left in
    # eval mode as a side effect of generating samples.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Run on the device that holds the model's weights
            first_param = next(model.parameters(), None)
            if first_param is not None:
                audio1 = audio1.to(first_param.device)
                audio2 = audio2.to(first_param.device)

            # Encode both audio clips
            latent1 = model.encode(audio1)
            latent2 = model.encode(audio2)

            generated_samples = []

            # Interpolate between the two latents: weight 0 -> latent1, weight 1 -> latent2
            for alpha in np.linspace(0, 1, num_steps):
                weight = float(alpha)  # plain float avoids NumPy/Tensor scalar mixing
                interpolated = (1 - weight) * latent1 + weight * latent2
                generated_samples.append(model.decode(interpolated).cpu())

            return generated_samples
    finally:
        model.train(was_training)

ModuleNotFoundError: No module named 'torchaudio'

In [None]:

# Settings for this run
AUDIO_DIR = "//data/audio_data"  # directory scanned for .wav / .mp3 files
BATCH_SIZE = 32
NUM_EPOCHS = 50
NUM_INTERPOLATION_STEPS = 5

# Create an instance of the autoencoder
model = AudioAutoencoder().to(device)

# Load the dataset from AUDIO_DIR
dataset = AudioDataset(AUDIO_DIR)
if len(dataset) == 0:
    # Stop here with a clear message rather than failing later during loading/training
    raise ValueError(f"No .wav or .mp3 files found in {AUDIO_DIR!r}")
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Train the model
model = train_autoencoder(model, dataloader, num_epochs=NUM_EPOCHS)

# Save the model
torch.save(model.state_dict(), "audio_autoencoder.pth")

# Generate new audio by interpolating between the first two dataset items
if len(dataset) >= 2:
    audio1 = dataset[0].unsqueeze(0)  # Add batch dimension
    audio2 = dataset[1].unsqueeze(0)
    generated_samples = generate_audio(
        model, audio1, audio2, num_steps=NUM_INTERPOLATION_STEPS
    )

    # Save the midpoint of the interpolation; squeeze(0) drops the batch
    # dimension again before the tensor is handed to torchaudio.save
    sample = generated_samples[len(generated_samples) // 2].squeeze(0)
    # Write at the dataset's own sample rate so the file header matches the audio
    torchaudio.save("generated_audio.wav", sample, dataset.sample_rate)