In [None]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import joblib
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm

# Custom Dataset
class CustomDataset(Dataset):
    """Positionally paired mel-spectrogram / MFCC arrays stored as ``.npy`` files.

    Both directories are listed, filtered to names ending in ``.npy`` and
    sorted. Item ``i`` is the ``i``-th mel file together with the ``i``-th
    MFCC file; file names themselves are never compared, so the two
    directories must sort into the same order for the pairs to be meaningful.

    Args:
        mel_dir: Directory holding the mel-spectrogram ``.npy`` files.
        mfcc_dir: Directory holding the MFCC ``.npy`` files.

    Raises:
        ValueError: If either directory contains no ``.npy`` file.
    """

    def __init__(self, mel_dir, mfcc_dir):
        self.mel_files = self._list_npy(mel_dir)
        self.mfcc_files = self._list_npy(mfcc_dir)

        if not (self.mel_files and self.mfcc_files):
            raise ValueError("No .npy files found in one or both input directories.")

    @staticmethod
    def _list_npy(directory):
        """Return the sorted full paths of every ``.npy`` entry in *directory*."""
        paths = [
            os.path.join(directory, name)
            for name in os.listdir(directory)
            if name.endswith(".npy")
        ]
        paths.sort()
        return paths

    def __len__(self):
        # Only indices present in both listings can be served, so the shorter
        # listing decides the dataset length.
        return min(map(len, (self.mel_files, self.mfcc_files)))

    def __getitem__(self, idx):
        """Load the ``idx``-th pair as float32 tensors and return ``(mel, mfcc)``."""
        # Mel is loaded before MFCC; the two are returned separately and are
        # only merged later, at batching time.
        return tuple(
            torch.tensor(np.load(paths[idx]), dtype=torch.float32)
            for paths in (self.mel_files, self.mfcc_files)
        )

# Custom collate function for dynamic padding
def collate_fn(batch):
    """Zero-pad a batch of ``(mel, mfcc)`` pairs to one width and merge them.

    Every tensor is right-padded along its last dimension up to the largest
    ``shape[1]`` found in the batch (mel and MFCC considered together). The
    padded mel tensors and the padded MFCC tensors are each stacked into a
    batch and then concatenated along dimension 1.

    Assumes each tensor is 2-D, laid out as (features, time) — TODO confirm
    against the feature-extraction step.

    Args:
        batch: Sequence of ``(mel, mfcc)`` tensor pairs.

    Returns:
        ``(inputs, inputs)`` — the same merged tensor twice, because the
        autoencoder's target is its own input.
    """
    mels, mfccs = zip(*batch)

    # One shared width for the whole batch so mel and MFCC can be concatenated.
    target_width = max(t.shape[1] for t in (*mels, *mfccs))

    def _pad_and_stack(tensors):
        # (0, k) appends k zeros at the end of the last dimension.
        return torch.stack(
            [F.pad(t, (0, target_width - t.shape[1])) for t in tensors]
        )

    # Mel rows come first, MFCC rows after them, along dimension 1.
    merged = torch.cat((_pad_and_stack(mels), _pad_and_stack(mfccs)), dim=1)
    return merged, merged

# Define the model
class VoiceConversionModel(nn.Module):
    """Fully connected autoencoder applied to flattened 3-D inputs.

    The encoder is a stack of ``Linear -> ReLU`` pairs going from
    ``input_dim`` through each width in ``hidden_dims``. The decoder mirrors
    it back down to ``input_dim`` and has no activation after its final
    layer. With the default widths the layer order (and therefore the
    ``state_dict`` keys) is identical to the original hand-written stack.

    Args:
        input_dim: Size of one flattened sample, i.e. the product of the two
            trailing dimensions of the tensors passed to ``forward``.
        hidden_dims: Encoder layer widths, in order. Defaults to 1024
            increments up to 8192.

    Raises:
        ValueError: If ``hidden_dims`` is empty.
    """

    # Encoder with 1024 increments up to 8192
    DEFAULT_HIDDEN_DIMS = (1024, 2048, 3072, 4096, 5120, 6144, 7168, 8192)

    def __init__(self, input_dim, hidden_dims=DEFAULT_HIDDEN_DIMS):
        super().__init__()

        hidden_dims = tuple(hidden_dims)
        if not hidden_dims:
            raise ValueError("hidden_dims must contain at least one layer width.")

        widths = (input_dim, *hidden_dims)

        # Encoder: every Linear (including the last) is followed by a ReLU.
        encoder_layers = []
        for n_in, n_out in zip(widths, widths[1:]):
            encoder_layers += [nn.Linear(n_in, n_out), nn.ReLU()]
        self.encoder = nn.Sequential(*encoder_layers)

        # Decoder (mirroring the encoder); the output layer stays linear so
        # reconstructions are not clipped to non-negative values.
        mirrored = widths[::-1]
        decoder_layers = []
        for n_in, n_out in zip(mirrored, mirrored[1:]):
            decoder_layers += [nn.Linear(n_in, n_out), nn.ReLU()]
        decoder_layers.pop()
        self.decoder = nn.Sequential(*decoder_layers)

    def forward(self, x):
        """Encode and decode ``x``, returning a tensor of the same shape.

        Args:
            x: 3-D tensor whose two trailing dimensions multiply to
                ``input_dim``.

        Returns:
            The reconstruction, reshaped back to ``x``'s original shape.
        """
        batch_size, feat_dim, time_dim = x.shape  # Get original shape
        # reshape (not view) so non-contiguous inputs, e.g. transposed or
        # sliced tensors, are accepted instead of raising.
        x = x.reshape(batch_size, -1)
        x = self.encoder(x)
        x = self.decoder(x)
        return x.reshape(batch_size, feat_dim, time_dim)  # Back to original shape

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
BATCH_SIZE = 32
LEARNING_RATE = 0.00016813084287417308  # Updated best learning rate
EPOCHS = 70  # Updated to 70 epochs
NORM_EPS = 1e-8  # Added to the batch std so a constant batch cannot divide by zero

# Define your input directories
MEL_INPUT_DIR = r"C:\Users\cl502_11\MG\Feature Extraction\DataChunk1 (29 Files)\80_10_10\mel_spectrograms\train"
MFCC_INPUT_DIR = r"C:\Users\cl502_11\MG\Feature Extraction\DataChunk1 (29 Files)\80_10_10\mfccs\train"
OUTPUT_DIR = r"C:\Users\cl502_11\MG\Models\VQ-VAE\AE2"


def normalize_batch(batch):
    """Standardize *batch* with its own scalar mean and standard deviation."""
    return (batch - batch.mean()) / (batch.std() + NORM_EPS)


def cpu_state_dict(module):
    """Return *module*'s state dict with every tensor moved to the CPU.

    Pickling CUDA tensors would make the saved file unloadable on a machine
    without a GPU; CPU tensors load everywhere.
    """
    state = module.state_dict()
    for key in list(state):
        state[key] = state[key].cpu()
    return state


# Load dataset
dataset = CustomDataset(MEL_INPUT_DIR, MFCC_INPUT_DIR)
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# Get input dimension dynamically
# NOTE(review): collate_fn pads each batch to that batch's own maximum width,
# so this size is only valid for every batch if all samples share one width —
# confirm for the dataset in use.
sample_input, _ = next(iter(train_loader))
input_dim = sample_input.view(sample_input.shape[0], -1).shape[1]  # Flattened feature size

# Initialize model, loss, optimizer
model = VoiceConversionModel(input_dim).to(device)
criterion = nn.MSELoss()
optimizer = optim.RMSprop(model.parameters(), lr=LEARNING_RATE)  # Updated optimizer
# `verbose=True` is deprecated in recent PyTorch releases; learning-rate
# changes are reported explicitly after each scheduler step instead.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

# Training loop
train_losses = []
val_losses = []
for epoch in range(EPOCHS):
    model.train()
    epoch_loss = 0.0
    loop = tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}", leave=True)
    for inputs, _ in loop:
        # The autoencoder must reconstruct exactly what it is fed, so the
        # normalized batch serves as both input and target. Comparing against
        # the raw batch would ask the model to undo a per-batch normalization
        # whose statistics it never sees.
        inputs = normalize_batch(inputs.to(device))

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, inputs)
        loss.backward()
        optimizer.step()

        batch_loss = loss.item()
        epoch_loss += batch_loss
        loop.set_postfix(loss=batch_loss)

    train_loss = epoch_loss / len(train_loader)
    train_losses.append(train_loss)

    # Evaluate on the held-out split; fall back to the training loss when the
    # validation split is empty (very small datasets).
    monitored_loss = train_loss
    if len(val_loader) > 0:
        model.eval()
        val_total = 0.0
        with torch.no_grad():
            for inputs, _ in val_loader:
                inputs = normalize_batch(inputs.to(device))
                val_total += criterion(model(inputs), inputs).item()
        monitored_loss = val_total / len(val_loader)
        val_losses.append(monitored_loss)

    # Update learning rate based on the monitored loss
    lr_before = optimizer.param_groups[0]["lr"]
    scheduler.step(monitored_loss)
    lr_after = optimizer.param_groups[0]["lr"]
    if lr_after != lr_before:
        print(f"Epoch {epoch+1}: reducing learning rate from {lr_before:.3e} to {lr_after:.3e}")

# Save the trained encoder and decoder separately in joblib format
# NOTE(review): the joblib format is kept so existing loaders keep working;
# PyTorch's own torch.save is the documented way to persist state dicts.
# The saved decoder now produces outputs on the normalized scale.
os.makedirs(OUTPUT_DIR, exist_ok=True)
joblib.dump(cpu_state_dict(model.encoder), os.path.join(OUTPUT_DIR, "encoder.joblib"))
joblib.dump(cpu_state_dict(model.decoder), os.path.join(OUTPUT_DIR, "decoder.joblib"))


Epoch 1/70: 100%|██████████| 20/20 [00:03<00:00,  6.29it/s, loss=2.43e+3]
Epoch 2/70: 100%|██████████| 20/20 [00:02<00:00,  7.01it/s, loss=2.11e+3]
Epoch 3/70: 100%|██████████| 20/20 [00:02<00:00,  7.05it/s, loss=937]    
Epoch 4/70: 100%|██████████| 20/20 [00:02<00:00,  6.98it/s, loss=794]   
Epoch 5/70: 100%|██████████| 20/20 [00:02<00:00,  6.95it/s, loss=344]
Epoch 6/70: 100%|██████████| 20/20 [00:02<00:00,  6.78it/s, loss=349]
Epoch 7/70: 100%|██████████| 20/20 [00:02<00:00,  7.07it/s, loss=381]
Epoch 8/70: 100%|██████████| 20/20 [00:02<00:00,  7.03it/s, loss=305]
Epoch 9/70: 100%|██████████| 20/20 [00:02<00:00,  6.93it/s, loss=546]
Epoch 10/70: 100%|██████████| 20/20 [00:02<00:00,  6.97it/s, loss=346]
Epoch 11/70: 100%|██████████| 20/20 [00:02<00:00,  7.10it/s, loss=311]
Epoch 12/70: 100%|██████████| 20/20 [00:02<00:00,  7.06it/s, loss=297]
Epoch 13/70: 100%|██████████| 20/20 [00:02<00:00,  6.96it/s, loss=646]
Epoch 14/70: 100%|██████████| 20/20 [00:02<00:00,  7.05it/s, loss=477]


['C:\\Users\\cl502_11\\MG\\Models\\VQ-VAE\\AE2\\decoder.joblib']