In [8]:
pip install torchaudio


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchaudio
import matplotlib.pyplot as plt
import numpy as np
from torch.utils.data import DataLoader, Dataset
import os
from tqdm import tqdm

# Choose the compute device once: CUDA when a usable GPU is present, CPU otherwise.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Define the autoencoder model
class AudioAutoencoder(nn.Module):
    """1-D convolutional autoencoder operating on single-channel waveforms.

    The encoder stacks four stride-2 ``Conv1d`` layers
    (1 -> 16 -> 32 -> 64 -> 128 channels), each followed by ``ReLU``; every
    stage halves the time axis, rounding up. The decoder mirrors it with four
    stride-2 ``ConvTranspose1d`` layers, each doubling the time axis, and ends
    with ``Tanh`` so reconstructions lie in [-1, 1].
    """

    def __init__(self):
        super().__init__()

        # Encoder: the layer order and Sequential indices are kept as-is so
        # previously saved state_dicts (encoder.0.weight, ...) still load.
        self.encoder = nn.Sequential(
            nn.Conv1d(1, 16, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv1d(16, 32, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv1d(32, 64, kernel_size=3, stride=2, padding=1),
            nn.ReLU(),
            nn.Conv1d(64, 128, kernel_size=3, stride=2, padding=1),
            nn.ReLU()
        )

        # Decoder: output_padding=1 makes each layer exactly double the length.
        self.decoder = nn.Sequential(
            nn.ConvTranspose1d(128, 64, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(64, 32, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(32, 16, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.ReLU(),
            nn.ConvTranspose1d(16, 1, kernel_size=3, stride=2, padding=1, output_padding=1),
            nn.Tanh()
        )

    def forward(self, x):
        """Reconstruct ``x`` and return a tensor with the same shape as ``x``.

        Args:
            x: Waveform tensor whose last dimension is time and whose channel
                dimension has size 1.

        Returns:
            The decoded reconstruction, cropped to the input's length.
        """
        num_samples = x.shape[-1]
        reconstruction = self.decoder(self.encoder(x))
        # The encoder rounds the length up at every stage, so the decoder
        # emits the next multiple of 16 samples. Trim the surplus so the
        # output lines up with the input (a no-op when the length is already
        # divisible by 16) and a reconstruction loss can compare them.
        return reconstruction[..., :num_samples]

    def encode(self, x):
        """Return the latent feature map (128 channels) for waveform ``x``."""
        return self.encoder(x)

    def decode(self, x):
        """Map a latent feature map back to a waveform; the time axis grows 16x."""
        return self.decoder(x)

# Custom dataset for audio files
class AudioDataset(Dataset):
    """Dataset of fixed-length, mono, peak-normalized audio segments.

    Each item is produced from one ``.wav``/``.mp3`` file found directly
    inside ``audio_dir`` (sub-directories are not searched).

    Args:
        audio_dir: Directory containing the audio files.
        sample_rate: Rate every clip is resampled to.
        segment_length: Number of samples in each returned segment.
    """

    # Extensions recognised as audio; matched case-insensitively.
    AUDIO_EXTENSIONS = ('.wav', '.mp3')

    def __init__(self, audio_dir, sample_rate=16000, segment_length=16000):
        # os.listdir returns entries in arbitrary order; sort so that a given
        # index refers to the same file on every run and machine.
        self.audio_files = sorted(
            os.path.join(audio_dir, f) for f in os.listdir(audio_dir)
            if f.lower().endswith(self.AUDIO_EXTENSIONS)
        )
        self.sample_rate = sample_rate
        self.segment_length = segment_length

    def __len__(self):
        """Number of audio files discovered in the directory."""
        return len(self.audio_files)

    def __getitem__(self, idx):
        """Load file ``idx`` and return a ``(1, segment_length)`` waveform.

        Clips longer than ``segment_length`` are cropped at a random offset
        (so repeated access to the same index may return different windows);
        shorter clips are zero-padded on the right.
        """
        waveform, sr = torchaudio.load(self.audio_files[idx])

        # Convert to mono if the file has several channels
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # Resample if needed
        if sr != self.sample_rate:
            resampler = torchaudio.transforms.Resample(sr, self.sample_rate)
            waveform = resampler(waveform)

        num_samples = waveform.shape[1]
        if num_samples > self.segment_length:
            # randint's upper bound is exclusive; the +1 lets the final
            # window (ending on the last sample) be drawn as well.
            last_start = num_samples - self.segment_length
            start = int(torch.randint(0, last_start + 1, (1,)))
            waveform = waveform[:, start:start + self.segment_length]
        elif num_samples < self.segment_length:
            # Pad with trailing zeros if too short
            waveform = torch.nn.functional.pad(
                waveform, (0, self.segment_length - num_samples)
            )

        # Peak-normalize to [-1, 1]. A silent segment has a peak of 0; leave
        # it as zeros rather than dividing by zero and returning NaNs.
        peak = torch.max(torch.abs(waveform))
        if peak > 0:
            waveform = waveform / peak

        return waveform

# Training function
def train_autoencoder(model, dataloader, num_epochs=100, lr=0.001):
    """Train ``model`` to reconstruct its input with Adam and an MSE loss.

    Args:
        model: Autoencoder whose forward pass returns a tensor shaped like
            its input.
        dataloader: Iterable of input batches; each batch is moved to the
            module-level ``device`` before the forward pass.
        num_epochs: Number of full passes over ``dataloader``.
        lr: Adam learning rate.

    Returns:
        The same ``model`` instance, trained in place.

    Raises:
        ValueError: If training is requested but ``dataloader`` has no batches.
    """
    # Fail early with a clear message instead of a ZeroDivisionError when the
    # per-epoch average loss is computed over zero batches.
    if num_epochs > 0 and len(dataloader) == 0:
        raise ValueError(
            "dataloader is empty; check that the audio directory contains .wav/.mp3 files"
        )

    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    # generate_audio() may have left the model in eval mode; make sure
    # training-only behaviour is active before optimizing.
    model.train()

    for epoch in range(num_epochs):
        running_loss = 0.0

        for data in tqdm(dataloader):
            data = data.to(device)

            # Forward pass: the target is the input itself.
            outputs = model(data)
            loss = criterion(outputs, data)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {running_loss/len(dataloader):.4f}')

    return model

# Generate new audio by interpolating in latent space
def generate_audio(model, audio1, audio2, num_steps=10):
    """Decode evenly spaced blends of the latent codes of two waveforms.

    For each ``alpha`` in ``np.linspace(0, 1, num_steps)`` the latent code
    ``alpha * encode(audio1) + (1 - alpha) * encode(audio2)`` is decoded.
    The returned list therefore starts at the reconstruction of ``audio2``
    (``alpha = 0``) and ends at the reconstruction of ``audio1``
    (``alpha = 1``).

    Args:
        model: Module exposing ``encode`` and ``decode``.
        audio1: Input tensor accepted by ``model.encode``.
        audio2: Input tensor with the same shape as ``audio1``.
        num_steps: Number of interpolation points, endpoints included.

    Returns:
        List of ``num_steps`` decoded tensors, all on the CPU.
    """
    # Run on whatever device the model lives on instead of relying on a
    # global; a parameter-free model leaves the inputs where they are.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else audio1.device

    # Remember the current mode so that generating samples does not silently
    # leave the model in eval mode for later training.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Encode both audio clips
            latent1 = model.encode(audio1.to(target_device))
            latent2 = model.encode(audio2.to(target_device))

            generated_samples = []

            # Interpolate between the two latent codes
            for alpha in np.linspace(0, 1, num_steps):
                weight = float(alpha)  # plain float avoids numpy-scalar/tensor mixing
                interpolated = weight * latent1 + (1 - weight) * latent2
                generated_samples.append(model.decode(interpolated).cpu())

            return generated_samples
    finally:
        model.train(was_training)

In [11]:

# Create an instance of the autoencoder
model = AudioAutoencoder().to(device)

# Load the training clips. The directory is resolved relative to the current
# user's home so the notebook is not tied to one machine's absolute path.
audio_data_dir = os.path.expanduser("~/Downloads/voice-classification/data/audio_data")
dataset = AudioDataset(audio_data_dir)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Train the model
model = train_autoencoder(model, dataloader, num_epochs=50)

# Save the trained weights
torch.save(model.state_dict(), "audio_autoencoder.pth")

# Generate new audio by blending the first two clips in latent space
if len(dataset) >= 2:
    audio1 = dataset[0].unsqueeze(0)  # Add batch dimension
    audio2 = dataset[1].unsqueeze(0)
    generated_samples = generate_audio(model, audio1, audio2, num_steps=5)

    # Save the midpoint blend (index 2 of 5 steps), written at the rate the
    # dataset resampled to rather than a separately hard-coded number.
    sample = generated_samples[len(generated_samples) // 2].squeeze(0)
    torchaudio.save("generated_audio.wav", sample, dataset.sample_rate)

100%|██████████| 26/26 [00:10<00:00,  2.50it/s]


Epoch [1/50], Loss: 0.3686


100%|██████████| 26/26 [00:10<00:00,  2.39it/s]


Epoch [2/50], Loss: 0.1510


100%|██████████| 26/26 [00:10<00:00,  2.41it/s]


Epoch [3/50], Loss: 0.0743


100%|██████████| 26/26 [00:10<00:00,  2.43it/s]


Epoch [4/50], Loss: 0.0527


100%|██████████| 26/26 [00:11<00:00,  2.25it/s]


Epoch [5/50], Loss: 0.0425


100%|██████████| 26/26 [00:10<00:00,  2.52it/s]


Epoch [6/50], Loss: 0.0366


100%|██████████| 26/26 [00:10<00:00,  2.54it/s]


Epoch [7/50], Loss: 0.0330


100%|██████████| 26/26 [00:11<00:00,  2.17it/s]


Epoch [8/50], Loss: 0.0292


100%|██████████| 26/26 [00:10<00:00,  2.52it/s]


Epoch [9/50], Loss: 0.0262


100%|██████████| 26/26 [00:10<00:00,  2.54it/s]


Epoch [10/50], Loss: 0.0238


100%|██████████| 26/26 [00:10<00:00,  2.46it/s]


Epoch [11/50], Loss: 0.0228


100%|██████████| 26/26 [00:10<00:00,  2.41it/s]


Epoch [12/50], Loss: 0.0224


100%|██████████| 26/26 [00:10<00:00,  2.40it/s]


Epoch [13/50], Loss: 0.0204


100%|██████████| 26/26 [00:10<00:00,  2.49it/s]


Epoch [14/50], Loss: 0.0194


100%|██████████| 26/26 [00:10<00:00,  2.43it/s]


Epoch [15/50], Loss: 0.0185


100%|██████████| 26/26 [00:10<00:00,  2.49it/s]


Epoch [16/50], Loss: 0.0175


100%|██████████| 26/26 [00:10<00:00,  2.44it/s]


Epoch [17/50], Loss: 0.0167


100%|██████████| 26/26 [00:10<00:00,  2.48it/s]


Epoch [18/50], Loss: 0.0156


100%|██████████| 26/26 [00:10<00:00,  2.47it/s]


Epoch [19/50], Loss: 0.0137


100%|██████████| 26/26 [00:10<00:00,  2.37it/s]


Epoch [20/50], Loss: 0.0138


100%|██████████| 26/26 [00:10<00:00,  2.55it/s]


Epoch [21/50], Loss: 0.0141


100%|██████████| 26/26 [00:10<00:00,  2.47it/s]


Epoch [22/50], Loss: 0.0125


100%|██████████| 26/26 [00:11<00:00,  2.36it/s]


Epoch [23/50], Loss: 0.0110


100%|██████████| 26/26 [00:10<00:00,  2.43it/s]


Epoch [24/50], Loss: 0.0119


100%|██████████| 26/26 [00:10<00:00,  2.52it/s]


Epoch [25/50], Loss: 0.0100


100%|██████████| 26/26 [00:10<00:00,  2.53it/s]


Epoch [26/50], Loss: 0.0097


100%|██████████| 26/26 [00:11<00:00,  2.17it/s]


Epoch [27/50], Loss: 0.0088


100%|██████████| 26/26 [00:10<00:00,  2.52it/s]


Epoch [28/50], Loss: 0.0085


100%|██████████| 26/26 [00:09<00:00,  2.62it/s]


Epoch [29/50], Loss: 0.0076


100%|██████████| 26/26 [00:10<00:00,  2.52it/s]


Epoch [30/50], Loss: 0.0075


100%|██████████| 26/26 [00:10<00:00,  2.59it/s]


Epoch [31/50], Loss: 0.0065


100%|██████████| 26/26 [00:10<00:00,  2.54it/s]


Epoch [32/50], Loss: 0.0066


100%|██████████| 26/26 [00:09<00:00,  2.74it/s]


Epoch [33/50], Loss: 0.0071


100%|██████████| 26/26 [00:09<00:00,  2.73it/s]


Epoch [34/50], Loss: 0.0054


100%|██████████| 26/26 [00:09<00:00,  2.62it/s]


Epoch [35/50], Loss: 0.0053


100%|██████████| 26/26 [00:10<00:00,  2.59it/s]


Epoch [36/50], Loss: 0.0055


100%|██████████| 26/26 [00:11<00:00,  2.27it/s]


Epoch [37/50], Loss: 0.0049


100%|██████████| 26/26 [00:11<00:00,  2.33it/s]


Epoch [38/50], Loss: 0.0050


100%|██████████| 26/26 [00:10<00:00,  2.49it/s]


Epoch [39/50], Loss: 0.0049


100%|██████████| 26/26 [00:10<00:00,  2.56it/s]


Epoch [40/50], Loss: 0.0047


100%|██████████| 26/26 [00:11<00:00,  2.29it/s]


Epoch [41/50], Loss: 0.0050


100%|██████████| 26/26 [00:10<00:00,  2.42it/s]


Epoch [42/50], Loss: 0.0041


100%|██████████| 26/26 [00:10<00:00,  2.54it/s]


Epoch [43/50], Loss: 0.0050


100%|██████████| 26/26 [00:15<00:00,  1.71it/s]


Epoch [44/50], Loss: 0.0040


100%|██████████| 26/26 [00:10<00:00,  2.52it/s]


Epoch [45/50], Loss: 0.0039


100%|██████████| 26/26 [00:09<00:00,  2.80it/s]


Epoch [46/50], Loss: 0.0049


100%|██████████| 26/26 [00:09<00:00,  2.73it/s]


Epoch [47/50], Loss: 0.0040


100%|██████████| 26/26 [00:09<00:00,  2.81it/s]


Epoch [48/50], Loss: 0.0038


100%|██████████| 26/26 [00:08<00:00,  2.90it/s]


Epoch [49/50], Loss: 0.0037


100%|██████████| 26/26 [00:08<00:00,  2.89it/s]

Epoch [50/50], Loss: 0.0044





## Training on a single audio file

In [12]:

# Directory holding the single recording; resolved from the user's home so the
# path is not tied to one machine.
single_clip_dir = os.path.expanduser("~/Downloads/voice-classification/data/single_data_jeevan")
dataset = AudioDataset(single_clip_dir)
dataloader = DataLoader(dataset, batch_size=1, shuffle=True)

# Continues training the existing `model` (it is not re-initialized here).
model = train_autoencoder(model, dataloader, num_epochs=50)

if len(dataset) > 0:
    # generate_audio moves its inputs to the right device and returns CPU
    # tensors, so no explicit transfers are needed around the call.
    audio = dataset[0].unsqueeze(0)  # Add batch dimension
    # Both endpoints are the same clip, so every step is its reconstruction.
    generated_samples = generate_audio(model, audio, audio, num_steps=5)

    sample = generated_samples[2].squeeze(0)
    torchaudio.save("jeevan_one_sample_generated_audio.wav", sample, dataset.sample_rate)

100%|██████████| 1/1 [00:00<00:00,  8.44it/s]


Epoch [1/50], Loss: 0.0020


100%|██████████| 1/1 [00:00<00:00, 47.18it/s]


Epoch [2/50], Loss: 0.3855


100%|██████████| 1/1 [00:00<00:00, 54.75it/s]


Epoch [3/50], Loss: 0.0278


100%|██████████| 1/1 [00:00<00:00, 80.02it/s]


Epoch [4/50], Loss: 0.0502


100%|██████████| 1/1 [00:00<00:00, 56.14it/s]


Epoch [5/50], Loss: 0.1676


100%|██████████| 1/1 [00:00<00:00, 18.11it/s]


Epoch [6/50], Loss: 0.1376


100%|██████████| 1/1 [00:00<00:00, 60.69it/s]


Epoch [7/50], Loss: 0.0793


100%|██████████| 1/1 [00:00<00:00, 66.90it/s]


Epoch [8/50], Loss: 0.0295


100%|██████████| 1/1 [00:00<00:00, 57.75it/s]


Epoch [9/50], Loss: 0.0092


100%|██████████| 1/1 [00:00<00:00, 69.47it/s]


Epoch [10/50], Loss: 0.0150


100%|██████████| 1/1 [00:00<00:00, 53.77it/s]


Epoch [11/50], Loss: 0.0473


100%|██████████| 1/1 [00:00<00:00, 84.86it/s]


Epoch [12/50], Loss: 0.0488


100%|██████████| 1/1 [00:00<00:00, 62.90it/s]


Epoch [13/50], Loss: 0.0571


100%|██████████| 1/1 [00:00<00:00, 59.60it/s]


Epoch [14/50], Loss: 0.0345


100%|██████████| 1/1 [00:00<00:00, 83.69it/s]


Epoch [15/50], Loss: 0.0285


100%|██████████| 1/1 [00:00<00:00, 58.88it/s]


Epoch [16/50], Loss: 0.0196


100%|██████████| 1/1 [00:00<00:00, 60.65it/s]


Epoch [17/50], Loss: 0.0089


100%|██████████| 1/1 [00:00<00:00, 46.60it/s]


Epoch [18/50], Loss: 0.0271


100%|██████████| 1/1 [00:00<00:00, 42.18it/s]


Epoch [19/50], Loss: 0.0339


100%|██████████| 1/1 [00:00<00:00, 13.00it/s]


Epoch [20/50], Loss: 0.0142


100%|██████████| 1/1 [00:00<00:00, 60.19it/s]


Epoch [21/50], Loss: 0.0104


100%|██████████| 1/1 [00:00<00:00, 59.32it/s]


Epoch [22/50], Loss: 0.0150


100%|██████████| 1/1 [00:00<00:00, 50.58it/s]


Epoch [23/50], Loss: 0.0103


100%|██████████| 1/1 [00:00<00:00, 44.27it/s]


Epoch [24/50], Loss: 0.0203


100%|██████████| 1/1 [00:00<00:00, 41.88it/s]


Epoch [25/50], Loss: 0.0045


100%|██████████| 1/1 [00:00<00:00, 44.18it/s]


Epoch [26/50], Loss: 0.0055


100%|██████████| 1/1 [00:00<00:00, 46.58it/s]


Epoch [27/50], Loss: 0.0019


100%|██████████| 1/1 [00:00<00:00, 41.12it/s]


Epoch [28/50], Loss: 0.0035


100%|██████████| 1/1 [00:00<00:00, 48.99it/s]


Epoch [29/50], Loss: 0.0089


100%|██████████| 1/1 [00:00<00:00, 42.43it/s]


Epoch [30/50], Loss: 0.0066


100%|██████████| 1/1 [00:00<00:00, 45.30it/s]


Epoch [31/50], Loss: 0.0061


100%|██████████| 1/1 [00:00<00:00, 26.45it/s]


Epoch [32/50], Loss: 0.0082


100%|██████████| 1/1 [00:00<00:00, 11.79it/s]


Epoch [33/50], Loss: 0.0126


100%|██████████| 1/1 [00:00<00:00, 25.24it/s]


Epoch [34/50], Loss: 0.0052


100%|██████████| 1/1 [00:00<00:00, 34.67it/s]


Epoch [35/50], Loss: 0.0023


100%|██████████| 1/1 [00:00<00:00, 33.13it/s]


Epoch [36/50], Loss: 0.0051


100%|██████████| 1/1 [00:00<00:00, 33.36it/s]


Epoch [37/50], Loss: 0.0083


100%|██████████| 1/1 [00:00<00:00, 34.83it/s]


Epoch [38/50], Loss: 0.0010


100%|██████████| 1/1 [00:00<00:00, 35.90it/s]


Epoch [39/50], Loss: 0.0258


100%|██████████| 1/1 [00:00<00:00, 31.55it/s]


Epoch [40/50], Loss: 0.0090


100%|██████████| 1/1 [00:00<00:00, 26.49it/s]


Epoch [41/50], Loss: 0.0039


100%|██████████| 1/1 [00:00<00:00, 33.02it/s]


Epoch [42/50], Loss: 0.0085


100%|██████████| 1/1 [00:00<00:00, 46.94it/s]


Epoch [43/50], Loss: 0.0077


100%|██████████| 1/1 [00:00<00:00, 36.66it/s]


Epoch [44/50], Loss: 0.0063


100%|██████████| 1/1 [00:00<00:00, 36.29it/s]


Epoch [45/50], Loss: 0.0107


100%|██████████| 1/1 [00:00<00:00, 34.02it/s]


Epoch [46/50], Loss: 0.0066


100%|██████████| 1/1 [00:00<00:00,  6.35it/s]


Epoch [47/50], Loss: 0.0029


100%|██████████| 1/1 [00:00<00:00, 25.15it/s]


Epoch [48/50], Loss: 0.0056


100%|██████████| 1/1 [00:00<00:00, 36.87it/s]


Epoch [49/50], Loss: 0.0046


100%|██████████| 1/1 [00:00<00:00, 48.42it/s]

Epoch [50/50], Loss: 0.0029





In [None]:

# Create a fresh instance of the autoencoder for a longer training run.
# NOTE(review): num_epochs below is still 50, the same as the earlier run;
# raise it if a longer run is what is intended.
model = AudioAutoencoder().to(device)

# Load the training clips. The directory is resolved relative to the current
# user's home so the notebook is not tied to one machine's absolute path.
audio_data_dir = os.path.expanduser("~/Downloads/voice-classification/data/audio_data")
dataset = AudioDataset(audio_data_dir)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Train the model
model = train_autoencoder(model, dataloader, num_epochs=50)

# Save the trained weights (overwrites any existing audio_autoencoder.pth)
torch.save(model.state_dict(), "audio_autoencoder.pth")

# Generate new audio by blending the first two clips in latent space
if len(dataset) >= 2:
    audio1 = dataset[0].unsqueeze(0)  # Add batch dimension
    audio2 = dataset[1].unsqueeze(0)
    generated_samples = generate_audio(model, audio1, audio2, num_steps=5)

    # Save the midpoint blend (index 2 of 5 steps), written at the rate the
    # dataset resampled to rather than a separately hard-coded number.
    sample = generated_samples[len(generated_samples) // 2].squeeze(0)
    torchaudio.save("generated_audio.wav", sample, dataset.sample_rate)

Use teacher forcing or a recurrent model (for time-domain signals)

Train on the sequence a, b, c and generate the shifted sequence b, c, d

Try an LSTM, e.g. with the Keras/TensorFlow "Sequential" API

Apply PCA to everything first, then learn the teacher-forced model on the PCA components, and then decode.

PCA will clean up the noise.

Then I can do ICA (independent component analysis) -- works much better on audio data

ICA treats the data as a mixture of the underlying components --> works wonders for noise


Set checkpoints to save progress during long training runs
