In [None]:
pip install torch torchaudio soundfile



In [None]:
import torch
import torch.nn as nn
import torchaudio
import torch.nn.functional as F
import os


In [None]:
class SingleFileDataset(torch.utils.data.Dataset):
    """Fixed-length, non-overlapping segments cut from a single audio file.

    The file is loaded once, averaged down to one channel, resampled to
    ``sample_rate`` when its native rate differs, and then cut into
    consecutive windows of ``int(sample_rate * segment_duration)`` samples.
    The final window is right-padded with zeros so every item has the same
    shape ``(1, num_samples)``.

    Args:
        file_path: Path of the audio file, handed to ``torchaudio.load``.
        sample_rate: Target sample rate in Hz.
        segment_duration: Length of each segment in seconds.

    Attributes:
        sample_rate: The target sample rate passed in.
        num_samples: Number of samples per segment.
        wav: The full mono waveform after optional resampling.
        segments: List of ``(1, num_samples)`` tensors, in file order.

    Raises:
        ValueError: If ``sample_rate * segment_duration`` truncates to a
            segment length that is not positive.
    """

    def __init__(self, file_path, sample_rate=16000, segment_duration=1.0):
        self.sample_rate = sample_rate
        self.num_samples = int(sample_rate * segment_duration)

        # Fail before touching the file: a zero length would otherwise surface
        # as an unrelated range() error and a negative one as an empty dataset.
        if self.num_samples <= 0:
            raise ValueError(
                "sample_rate * segment_duration must give at least one sample "
                f"per segment, got {self.num_samples}"
            )

        # Load file; the code below treats dim 0 as channels and dim 1 as time.
        wav, sr = torchaudio.load(file_path)

        # Mono: average the channels but keep the channel dimension.
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)

        # Resample if needed
        if sr != sample_rate:
            wav = torchaudio.functional.resample(wav, sr, sample_rate)

        self.wav = wav
        self.segments = self._split_into_segments(wav, self.num_samples)

    @staticmethod
    def _split_into_segments(wav, num_samples):
        """Cut ``wav`` along dim 1 into zero-padded chunks of ``num_samples``.

        Args:
            wav: Tensor whose second dimension is the time axis.
            num_samples: Positive length of every returned chunk.

        Returns:
            A list of tensors, each with ``num_samples`` entries along dim 1.
            The list is empty when ``wav`` has no samples.

        Raises:
            ValueError: If ``num_samples`` is not positive.
        """
        if num_samples <= 0:
            raise ValueError(f"num_samples must be positive, got {num_samples}")

        # torch.split would hand back one empty chunk here; an empty waveform
        # should produce no segments at all.
        if wav.shape[1] == 0:
            return []

        segments = list(torch.split(wav, num_samples, dim=1))

        # Only the last chunk can be short; pad it on the right with zeros.
        shortfall = num_samples - segments[-1].shape[1]
        if shortfall > 0:
            segments[-1] = F.pad(segments[-1], (0, shortfall))
        return segments

    def __len__(self):
        """Return the number of segments."""
        return len(self.segments)

    def __getitem__(self, idx):
        """Return the segment at position ``idx``."""
        return self.segments[idx]


In [None]:
class Encoder(nn.Module):
    """Three-stage strided 1-D convolutional encoder.

    Each stage is a ``Conv1d`` (kernel 7, stride 2, padding 3) followed by a
    ReLU; the channel count grows 1 -> 64 -> 128 -> 256.
    """

    def __init__(self):
        super().__init__()
        channel_plan = (1, 64, 128, 256)

        stages = []
        for in_ch, out_ch in zip(channel_plan[:-1], channel_plan[1:]):
            stages.append(nn.Conv1d(in_ch, out_ch, kernel_size=7, stride=2, padding=3))
            stages.append(nn.ReLU())

        # Layers are appended conv, relu, conv, relu, ... so the convolutions
        # sit at indices 0, 2 and 4 of the Sequential.
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the convolution stack to ``x`` and return the feature map."""
        features = self.net(x)
        return features


In [None]:
class Decoder(nn.Module):
    """Three-stage transposed-convolution decoder ending in ``Tanh``.

    Each stage is a ``ConvTranspose1d`` (kernel 7, stride 2, padding 3,
    output_padding 1); the channel count shrinks 256 -> 128 -> 64 -> 1.
    The first two stages are followed by ReLU, the last by Tanh.
    """

    def __init__(self):
        super().__init__()

        def upsample(in_ch, out_ch):
            # All three stages share the same kernel geometry.
            return nn.ConvTranspose1d(
                in_ch, out_ch, kernel_size=7, stride=2, padding=3, output_padding=1
            )

        self.net = nn.Sequential(
            upsample(256, 128), nn.ReLU(),
            upsample(128, 64), nn.ReLU(),
            upsample(64, 1), nn.Tanh(),
        )

    def forward(self, x):
        """Apply the transposed-convolution stack to ``x`` and return the result."""
        decoded = self.net(x)
        return decoded


In [None]:
class AudioAutoEncoder(nn.Module):
    """Autoencoder that chains ``Encoder`` and ``Decoder``."""

    def __init__(self):
        super().__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()

    def forward(self, x):
        """Encode ``x`` and return the decoder's reconstruction of it."""
        return self.decoder(self.encoder(x))


In [None]:
!pip install TorchCodec
import torchcodec



In [None]:
# Path to your uploaded audio
audio_path = "/content/tts_output_2.wav"  # Replace with uploaded file name

# Seed PyTorch's global RNG so weight initialisation and the DataLoader's
# shuffle order repeat across "Restart & Run All". GPU kernels may still
# introduce small run-to-run numerical differences.
torch.manual_seed(0)

device = "cuda" if torch.cuda.is_available() else "cpu"

dataset = SingleFileDataset(audio_path)
# One segment per step, visited in a freshly shuffled order every epoch.
dataloader = torch.utils.data.DataLoader(dataset, batch_size=1, shuffle=True)

model = AudioAutoEncoder().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
# Mean absolute error between the reconstruction and the input waveform.
criterion = nn.L1Loss()


In [None]:
num_epochs = 3000

# Put the model in training mode explicitly. The export cell later in the
# notebook calls model.eval(), so re-running this cell afterwards would
# otherwise continue training a model still flagged as evaluating.
model.train()

for epoch in range(num_epochs):
    total_loss = 0.0
    for segment in dataloader:
        segment = segment.to(device)  # (1, 1, 16000)

        # The autoencoder is trained to reproduce its own input.
        recon = model(segment)
        loss = criterion(recon, segment)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean per-batch loss over this epoch.
    avg_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} | L1 Loss: {avg_loss:.6f}")


Epoch 1/3000 | L1 Loss: 0.127835
Epoch 2/3000 | L1 Loss: 0.098208
Epoch 3/3000 | L1 Loss: 0.073993
Epoch 4/3000 | L1 Loss: 0.055032
Epoch 5/3000 | L1 Loss: 0.056675
Epoch 6/3000 | L1 Loss: 0.055309
Epoch 7/3000 | L1 Loss: 0.051951
Epoch 8/3000 | L1 Loss: 0.052054
Epoch 9/3000 | L1 Loss: 0.048151
Epoch 10/3000 | L1 Loss: 0.047230
Epoch 11/3000 | L1 Loss: 0.046103
Epoch 12/3000 | L1 Loss: 0.044685
Epoch 13/3000 | L1 Loss: 0.044140
Epoch 14/3000 | L1 Loss: 0.042034
Epoch 15/3000 | L1 Loss: 0.041502
Epoch 16/3000 | L1 Loss: 0.039932
Epoch 17/3000 | L1 Loss: 0.037784
Epoch 18/3000 | L1 Loss: 0.035778
Epoch 19/3000 | L1 Loss: 0.033292
Epoch 20/3000 | L1 Loss: 0.031440
Epoch 21/3000 | L1 Loss: 0.028609
Epoch 22/3000 | L1 Loss: 0.026635
Epoch 23/3000 | L1 Loss: 0.024730
Epoch 24/3000 | L1 Loss: 0.022670
Epoch 25/3000 | L1 Loss: 0.021155
Epoch 26/3000 | L1 Loss: 0.020562
Epoch 27/3000 | L1 Loss: 0.020300
Epoch 28/3000 | L1 Loss: 0.020449
Epoch 29/3000 | L1 Loss: 0.019781
Epoch 30/3000 | L1 Loss

In [None]:
os.makedirs("outputs", exist_ok=True)
model.eval()

# Write files at the rate the dataset resampled to, so constructing the
# dataset with a non-default sample_rate does not yield mislabelled WAVs.
output_sample_rate = dataset.sample_rate

# Inference only: no gradients are needed while exporting audio.
with torch.no_grad():
    for i, segment in enumerate(dataset):
        seg = segment.unsqueeze(0).to(device)  # add the batch dimension
        recon = model(seg)

        # [0] drops the batch dimension again before saving.
        torchaudio.save(f"outputs/original_{i}.wav", seg.cpu()[0], output_sample_rate)
        torchaudio.save(f"outputs/reconstructed_{i}.wav", recon.cpu()[0], output_sample_rate)

print("Saved reconstructed audio in outputs/")

Saved reconstructed audio in outputs/


In [None]:
from IPython.display import Audio, display
import glob
import re


def _natural_key(path):
    """Sort key that compares digit runs numerically ("_2" before "_10")."""
    # re.split with a capturing group alternates text and digit tokens, so
    # tokens at the same position always have the same type.
    return [int(tok) if tok.isdecimal() else tok for tok in re.split(r"(\d+)", path)]


# A plain sorted() is lexicographic and would list original_10 before
# original_2 once there are more than ten segments.
output_files = sorted(glob.glob("outputs/*.wav"), key=_natural_key)

for file_path in output_files:
    print(f"Playing {file_path}:")
    display(Audio(file_path))


Playing outputs/original_0.wav:


Playing outputs/original_1.wav:


Playing outputs/reconstructed_0.wav:


Playing outputs/reconstructed_1.wav:


In [None]:
# Mount Google Drive (Colab only: the google.colab package is needed).
from google.colab import drive

drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)