In [1]:
!pip install librosa soundfile matplotlib torch torchaudio

import os
import librosa
import librosa.display
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd
import torch
import torch
import torch.nn as nn
import torch.nn.functional as F



Collecting librosa
  Downloading librosa-0.11.0-py3-none-any.whl.metadata (8.7 kB)
Collecting soundfile
  Downloading soundfile-0.13.1-py2.py3-none-win_amd64.whl.metadata (16 kB)
Collecting torch
  Downloading torch-2.9.1-cp313-cp313-win_amd64.whl.metadata (30 kB)
Collecting torchaudio
  Downloading torchaudio-2.9.1-cp313-cp313-win_amd64.whl.metadata (6.9 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.1.0-py3-none-any.whl.metadata (9.0 kB)
Collecting pooch>=1.1 (from librosa)
  Downloading pooch-1.8.2-py3-none-any.whl.metadata (10 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-1.0.0-cp312-abi3-win_amd64.whl.metadata (5.6 kB)
Collecting standard-aifc (from librosa)
  Downloading standard_aifc-3.13.0-py3-none-any.whl.metadata (969 bytes)
Collecting standard-sunau (from librosa)
  Downloading standard_sunau-3.13.0-py3-none-any.whl.metadata (914 bytes)
Collecting standard-chunk (from standard-aifc->librosa)
  Downloading standard_chunk-3.13.0-py3-non

In [2]:
import pandas as pd
import os

DATA_DIR = r"C:\Users\Dell\Downloads\archive (1)"
metadata_path = os.path.join(DATA_DIR, "metadata.csv")

metadata = pd.read_csv(metadata_path)

metadata["speaker_id"] = metadata["speaker_id"].astype(str)

metadata.head()



FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Dell\\Downloads\\archive (1)\\metadata.csv'

In [None]:
import os
import librosa
import librosa.display
import matplotlib.pyplot as plt
import IPython.display as ipd
import numpy as np

# 1. Screenshot ke mutabiq sahi paths
BASE_DIR = r"C:\Users\Dell\project\archive (1)"
WAV_DIR = os.path.join(BASE_DIR, "wavs")
METADATA_PATH = os.path.join(BASE_DIR, "metadata.csv")

# 2. Check karein ke folder mil raha hai
if os.path.exists(WAV_DIR):
    # Wavs folder ke andar ki pehli file uthayein
    all_files = [f for f in os.listdir(WAV_DIR) if f.endswith('.wav')]
    sample_file = all_files[0]
    audio_path = os.path.join(WAV_DIR, sample_file)
    
    print(f"‚úÖ Total files found: {len(all_files)}")
    print(f"‚úÖ Loading sample: {sample_file}")

    # 3. Audio Load & Play
    y, sr = librosa.load(audio_path, sr=16000)
    ipd.display(ipd.Audio(y, rate=sr))

    # 4. Mel-Spectrogram Visualization
    plt.figure(figsize=(10, 4))
    S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=80)
    S_dB = librosa.power_to_db(S, ref=np.max)
    librosa.display.specshow(S_dB, sr=sr, x_axis='time', y_axis='mel', cmap='magma')
    plt.colorbar(format='%+2.0f dB')
    plt.title(f"Spectrogram of {sample_file}")
    plt.show()
else:
    print("‚ùå check folder path is wrong")

In [None]:
#Step 1: Preprocessing (Saving Tensors)
# Create a folder to save processed data

SAVE_DIR = os.path.join(BASE_DIR, "processed_tensors")
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)

def preprocess_and_save():
    all_files = [f for f in os.listdir(WAV_DIR) if f.endswith('.wav')]
    print(f"Processing {len(all_files)} files...")
    
    for file_name in all_files[:500]: # Processing first 500 for testing
        file_path = os.path.join(WAV_DIR, file_name)
        # Load and Normalize
        y, sr = librosa.load(file_path, sr=16000)
        y, _ = librosa.effects.trim(y)
        
        # Convert to Mel-Spectrogram
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=80)
        S_dB = librosa.power_to_db(S, ref=np.max)
        
        # Save as PyTorch Tensor
        tensor_data = torch.FloatTensor(S_dB)
        torch.save(tensor_data, os.path.join(SAVE_DIR, file_name.replace('.wav', '.pt')))

preprocess_and_save()
print("‚úÖ Preprocessing Complete.")

In [None]:
#Step 2: Define the GAN Architecture
import torch.nn as nn

class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        self.main = nn.Sequential(
            nn.Conv1d(80, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(128, 80, kernel_size=3, padding=1),
            nn.Tanh()
        )
    def forward(self, x):
        return self.main(x)

class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        self.main = nn.Sequential(
            nn.Conv1d(80, 64, kernel_size=3, stride=2, padding=1),
            nn.LeakyReLU(0.2),
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.main(x)

# Initialize models
generator = Generator()
discriminator = Discriminator()
print("‚úÖ Models Initialized.")

In [None]:
#Step 3: Setup the DataLoader
from torch.utils.data import DataLoader, Dataset

class VoiceDataset(Dataset):
    def __init__(self, tensor_dir):
        self.file_list = [f for f in os.listdir(tensor_dir) if f.endswith('.pt')]
        self.tensor_dir = tensor_dir
    def __len__(self):
        return len(self.file_list)
    def __getitem__(self, idx):
        return torch.load(os.path.join(self.tensor_dir, self.file_list[idx]))

# Create the loader
dataset = VoiceDataset(SAVE_DIR)
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
print(f"‚úÖ DataLoader ready with {len(dataset)} items.")

In [None]:
#Step 4: Shape Test (Crucial)
# Grab one batch
test_batch = next(iter(dataloader))
print(f"Input Shape: {test_batch.shape}")

# Pass through Generator
gen_output = generator(test_batch)
print(f"Generator Output Shape: {gen_output.shape}")

# Pass through Discriminator
disc_output = discriminator(gen_output)
print(f"Discriminator Decision Shape: {disc_output.shape}")

if gen_output.shape == test_batch.shape:
    print("üöÄ Shape Test Passed! You are ready for training.")

In [None]:
# 1. Initialize Models
generator = Generator()
discriminator = Discriminator()

# 2. Optimizers (Adam is best for GANs)
g_optimizer = torch.optim.Adam(generator.parameters(), lr=0.0001, betas=(0.5, 0.9))
d_optimizer = torch.optim.Adam(discriminator.parameters(), lr=0.0001, betas=(0.5, 0.9))

# 3. Loss Function (Binary Cross Entropy for Real vs Fake)
criterion = torch.nn.BCELoss()

print("‚úÖ Optimizers and Loss Functions initialized.")

In [None]:
#Phase 3: Preprocessing (Generating Tensors)
import torch
import os
import librosa
import numpy as np
from tqdm import tqdm

# Define the save directory
SAVE_DIR = os.path.join(BASE_DIR, "processed_tensors")
if not os.path.exists(SAVE_DIR):
    os.makedirs(SAVE_DIR)

def preprocess_and_save(limit=1000):
    all_files = [f for f in os.listdir(WAV_DIR) if f.endswith('.wav')]
    print(f"Processing {min(len(all_files), limit)} files...")
    
    for file_name in tqdm(all_files[:limit]):
        file_path = os.path.join(WAV_DIR, file_name)
        
        # 1. Load and normalize audio
        y, sr = librosa.load(file_path, sr=16000)
        y, _ = librosa.effects.trim(y)
        
        # 2. Extract Mel-Spectrogram
        # We use 80 bands as it is standard for voice conversion
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=80)
        S_dB = librosa.power_to_db(S, ref=np.max)
        
        # 3. Save as PyTorch Tensor
        tensor_data = torch.FloatTensor(S_dB)
        save_path = os.path.join(SAVE_DIR, file_name.replace('.wav', '.pt'))
        torch.save(tensor_data, save_path)

# Run this to create your dataset
preprocess_and_save(limit=1000)

In [None]:
#Phase 4: Defining the Model Architecture
import torch.nn as nn

class Generator(nn.Module):
    def __init__(self):
        super(Generator, self).__init__()
        # Simple Encoder-Decoder style for audio features
        self.main = nn.Sequential(
            nn.Conv1d(80, 128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv1d(128, 80, kernel_size=3, padding=1),
            nn.Tanh()
        )
    def forward(self, x):
        return self.main(x)

class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        self.main = nn.Sequential(
            nn.Conv1d(80, 64, kernel_size=3, stride=2, padding=1),
            nn.LeakyReLU(0.2),
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
            nn.Linear(64, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.main(x)

# Initialize models
generator = Generator()
discriminator = Discriminator()
print("‚úÖ Generator and Discriminator classes defined.")

In [None]:
#Metadata Mapping
#Phase 5: Creating the DataLoader
# Create a dictionary to map filenames to speaker IDs
# This is essential for the GAN to learn individual voice characteristics
file_to_speaker = dict(zip(metadata['file_name'], metadata['speaker_id']))

# Verify the mapping
test_file = "p300_169.wav"
if test_file in file_to_speaker:
    print(f"‚úÖ Mapping Success: File {test_file} is Speaker {file_to_speaker[test_file]}")

In [None]:
#The Training Loop (The Final Piece)
import torch.optim as optim

# Hyperparameters
epochs = 50
lr = 0.0002

# Initialize Optimizers
g_optimizer = optim.Adam(generator.parameters(), lr=lr, betas=(0.5, 0.999))
d_optimizer = optim.Adam(discriminator.parameters(), lr=lr, betas=(0.5, 0.999))
criterion = nn.BCELoss()

print("Starting Training...")

for epoch in range(epochs):
    for i, mels in enumerate(dataloader):
        
        batch_size = mels.size(0)
        real_label = torch.ones(batch_size, 1)
        fake_label = torch.zeros(batch_size, 1)

        # ---------------------
        #  Train Discriminator
        # ---------------------
        d_optimizer.zero_grad()
        
        # Real loss
        output_real = discriminator(mels)
        loss_real = criterion(output_real, real_label)
        
        # Fake loss
        fake_mels = generator(mels) 
        output_fake = discriminator(fake_mels.detach())
        loss_fake = criterion(output_fake, fake_label)
        
        d_loss = (loss_real + loss_fake) / 2
        d_loss.backward()
        d_optimizer.step()

        # -----------------
        #  Train Generator
        # -----------------
        g_optimizer.zero_grad()
        
        output_g = discriminator(fake_mels)
        g_loss = criterion(output_g, real_label) # Generator wants to be seen as 'Real'
        
        g_loss.backward()
        g_optimizer.step()

    print(f"Epoch [{epoch}/{epochs}] | D Loss: {d_loss.item():.4f} | G Loss: {g_loss.item():.4f}")

In [None]:
# Save the trained weights
torch.save(generator.state_dict(), "generator_vctk.pth")
torch.save(discriminator.state_dict(), "discriminator_vctk.pth")
print("‚úÖ Models saved successfully!")

In [None]:
import librosa.display

# 1. Get a sample from the dataloader
generator.eval() # Set to evaluation mode
with torch.no_grad():
    samples = next(iter(dataloader))
    source_mel = samples[0].unsqueeze(0) # Take the first sample
    
    # 2. Generate "Fake" Mel-Spectrogram
    converted_mel = generator(source_mel)

# 3. Convert Mel-Spectrogram back to Audio (Vocoder step)
# Since we don't have a trained Vocoder like HiFi-GAN yet, we use Griffin-Lim
converted_mel_numpy = converted_mel.squeeze().cpu().numpy()
# Reverse the log-scale and normalization we did earlier
mel_inverted = librosa.db_to_power(converted_mel_numpy * 40 - 40)
audio_out = librosa.feature.inverse.mel_to_audio(mel_inverted, sr=16000)

# 4. Listen to the result
print("Original Speaker converted through Generator:")
ipd.display(ipd.Audio(audio_out, rate=16000))

In [None]:
# Save the weights to your local folder
torch.save(generator.state_dict(), "generator_vctk.pth")
torch.save(discriminator.state_dict(), "discriminator_vctk.pth")
print("‚úÖ Models saved as .pth files")

In [None]:
import librosa.display

# 1. Set to evaluation mode
generator.eval() 

with torch.no_grad():
    # 2. Get a batch and pick the first sample
    test_batch = next(iter(dataloader))
    source_mel = test_batch[0].unsqueeze(0) 
    
    # 3. Generate the converted version
    converted_mel = generator(source_mel)

# 4. Visualize the Result
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
librosa.display.specshow(source_mel.squeeze().cpu().numpy(), y_axis='mel')
plt.title("Original Spectrogram")

plt.subplot(1, 2, 2)
librosa.display.specshow(converted_mel.squeeze().cpu().numpy(), y_axis='mel')
plt.title("Generated Spectrogram")
plt.show()

In [None]:
# Convert Mel-Spectrogram back to Audio
converted_mel_np = converted_mel.squeeze().cpu().numpy()

# Note: We must undo any normalization/log-scaling done during preprocessing
# For example, if you used power_to_db, we use db_to_power
mel_inverted = librosa.db_to_power(converted_mel_np)

# Reconstruct the waveform
audio_signal = librosa.feature.inverse.mel_to_audio(mel_inverted, sr=16000)

# Play the audio in Jupyter
print("Listen to the converted voice:")
ipd.display(ipd.Audio(audio_signal, rate=16000))

In [None]:
import torch
import os

# 1. Define the model architecture from the hub
repo = 'nvidia/DeepLearningExamples:torchhub'
model_name = 'nvidia_hifigan'

# Load the entrypoint
hub_output = torch.hub.load(repo, model_name, pretrained=False, trust_repo=True)

# FIX: If hub_output is a tuple, take the first element (the actual model)
if isinstance(hub_output, tuple):
    vocoder = hub_output[0]
else:
    vocoder = hub_output

# 2. Path to the checkpoint we downloaded earlier
checkpoint_path = os.path.join(os.path.expanduser("~"), "hifigan_cpu_ckpt.pt")

# 3. Load the weights
if os.path.exists(checkpoint_path):
    print("Loading weights onto CPU...")
    checkpoint = torch.load(checkpoint_path, map_location=torch.device('cpu'))
    
    # Load the state dict into the model
    vocoder.load_state_dict(checkpoint['generator'])
    vocoder.eval()
    print("‚úÖ SUCCESS: HiFi-GAN Vocoder is ready!")
else:
    print("‚ùå Checkpoint file not found. Please run the download step again.")

In [None]:
import librosa
import numpy as np
import torch

def get_mel_spectrogram(wav_path):
    # 1. Load at 22050Hz (HiFi-GAN's native rate)
    y, sr = librosa.load(wav_path, sr=22050)
    
    # 2. Standard HiFi-GAN parameters
    n_fft = 1024
    hop_length = 256
    win_length = 1024
    n_mels = 80
    fmin = 0
    fmax = 8000
    
    # 3. Generate Mel
    mel_spec = librosa.feature.melspectrogram(
        y=y, sr=sr, n_fft=n_fft, hop_length=hop_length, 
        win_length=win_length, n_mels=n_mels, fmin=fmin, fmax=fmax
    )
    
    # 4. Log Scaling (Crucial for Neural Vocoders)
    mel_spec = np.log(np.clip(mel_spec, a_min=1e-5, a_max=None))
    return torch.FloatTensor(mel_spec).unsqueeze(0)

# TEST: Use a REAL file from your wavs folder
test_file_path = os.path.join(WAV_DIR, all_files[0])
input_mel = get_mel_spectrogram(test_file_path)

with torch.no_grad():
    # Generate audio
    output_audio = vocoder(input_mel)

# Play this result
import IPython.display as ipd
ipd.display(ipd.Audio(output_audio.squeeze().cpu().numpy(), rate=22050))