# Imports

In [1]:
import librosa
import numpy as np
import torch
import torch.nn.functional as F
import torchaudio
import torchaudio.transforms as T
import torch.nn as nn
import nemo.collections.asr as nemo_asr
import json
from torch.utils.data import Dataset,DataLoader

      def forward(
    
      def backward(ctx, grad_output):
    
      def forward(
    
      def backward(ctx, grad_output):
    


# Utils

## Content Embeddings

In [2]:
def extract_content_embeddings(mel_spectrogram_tensor, encoder,device):
    mel_tensor = mel_spectrogram_tensor.to(device)
    lengths = torch.full((mel_tensor.shape[0],), mel_tensor.shape[2], dtype=torch.int32).to(device)

    # # Ensure the input tensor has the correct shape
    # if len(mel_tensor.shape) == 2:
    #     # Adding batch dimension (1, mel_channels, time_frames)
    #     mel_tensor = mel_tensor.unsqueeze(0)

    encoder.eval()
    with torch.no_grad():
        # content_embeddings = encoder(mel_tensor)
        content_embeddings, _ = encoder(
            audio_signal=mel_tensor,
            length=lengths
        )        

    return content_embeddings 

## Duration Augmented Content Embeddings 

In [3]:
def cosine_similarity(v1, v2):
    return F.cosine_similarity(v1.unsqueeze(0), v2.unsqueeze(0), dim=-1).item()

def group_similar_vectors(vectors, threshold, vector_duration):
    grouped_vectors = []
    durations = []

    current_group = [vectors[:, 0]]  # Start with the first vector (slice across batch dimension)
    current_duration = vector_duration

    for i in range(1, vectors.shape[-1]):
        sim = cosine_similarity(vectors[:, i], vectors[:, i - 1])
        # print(sim)
        # If cosine similarity is above threshold, group the vectors
        if sim > threshold:
            # print("Exceeded Threshold")
            current_group.append(vectors[:, i])
            current_duration += vector_duration
        else:
            # Compute the average of the current group and save it
            averaged_vector = torch.mean(torch.stack(current_group, dim=-1), dim=-1)
            grouped_vectors.append(averaged_vector)
            durations.append(current_duration)

            # Start a new group
            current_group = [vectors[:, i]]
            current_duration = vector_duration

    # Append the last group
    if current_group:
        averaged_vector = torch.mean(torch.stack(current_group, dim=-1), dim=-1)
        grouped_vectors.append(averaged_vector)
        durations.append(current_duration)

    return torch.stack(grouped_vectors, dim=-1), durations


def duration_augmented_representation(content_embeddings, T=0.925, vector_duration=46.44):
    # Remove the batch dimension for processing
    content_vectors = content_embeddings.squeeze(0)  # Shape: [256, num_content_vectors]

    grouped_vectors, new_durations = group_similar_vectors(content_vectors, T, vector_duration)

    # Convert durations to seconds if needed
    new_durations_in_seconds = [d / 1000 for d in new_durations]

    # Add the batch dimension back to the output
    grouped_vectors = grouped_vectors.unsqueeze(0)  # Shape: [1, 256, num_grouped_vectors]

    return grouped_vectors, new_durations_in_seconds

## Pitch Contour Feature Extraction

In [69]:
def extract_f0(audio_path, sr=22050):
    """
    Extract the fundamental frequency (F0) contour using PYin algorithm.
    
    Args:
    - audio_path (str): Path to the audio file.
    - sr (int): Sampling rate. Default is 22050.
    
    Returns:
    - f0_tensor (torch.Tensor): Extracted F0 contour as a PyTorch tensor.
    - pitch_tensor (torch.Tensor): Extracted pitch contour as a PyTorch tensor.
    """
    # Load the audio file
    y, sr = librosa.load(audio_path, sr=sr)

    # Compute the F0 (fundamental frequency) using PYin
    f0_contour, voiced_flag, voiced_probs = librosa.pyin(
        y, 
        fmin=librosa.note_to_hz('C2'),  # Minimum pitch (in Hz)
        fmax=librosa.note_to_hz('C7')   # Maximum pitch (in Hz)
    )
    
    # Replace unvoiced frames (None) with zeros or some placeholder
    f0_contour = np.nan_to_num(f0_contour)
    print("reahced here")
    # Convert to PyTorch tensor
    f0_tensor = torch.tensor(f0_contour, dtype=torch.float32)
    print("reahced here 1")
    # Optionally, if you want to extract a pitch contour (e.g., using voiced probabilities)
    # pitch_tensor = torch.tensor(voiced_probs, dtype=torch.float32)  # Using voiced probabilities as pitch contour

    return f0_tensor


# Function to normalize the F0 contour
def normalize_f0(f0_contour):

    # Filter out unvoiced (NaN or 0 values)
    # voiced_f0 = f0_contour[f0_contour > 0]  # Exclude unvoiced frames
    
    # Compute mean and standard deviation only for voiced frames
    mean_f0 = np.mean(f0_contour)
    std_f0 = np.std(f0_contour)
    
    # Normalize F0 contour (keep NaNs for unvoiced frames)
    normalized_f0 = (f0_contour - mean_f0) / std_f0

    # Optionally: Clip or apply ReLU to remove negative values (if required)
    # normalized_f0 = np.clip(normalized_f0, 0, None)
    
    return normalized_f0


## Speaker Embedding 

In [None]:
speaker_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(model_name='titanet_large')

# Dataset

In [5]:
class SynthesizerNoTransformDataset(Dataset):
    def __init__(self, data,content_encoder,speaker_encoder, device):
        # Load the JSON file
        # with open(json_file, 'r') as f:
        self.data = data
        self.content_encoder = content_encoder
        self.speaker_encoder = speaker_encoder
        self.device = device            
    
    def __len__(self):
        # Return the number of entries in the dataset
        return len(self.data)
    
    def extract_content_embeddings(self, mel_spectrogram_tensor):
        mel_tensor = mel_spectrogram_tensor.unsqueeze(0).to(self.device)  # Add batch dimension
        lengths = torch.full((mel_tensor.shape[0],), mel_tensor.shape[2], dtype=torch.int32).to(self.device)

        self.content_encoder.eval()
        with torch.no_grad():
            content_embeddings, _ = self.content_encoder(
                audio_signal=mel_tensor,
                length=lengths
            )

        return content_embeddings.cpu()   
    
    def extract_speaker_embeddings(self, audio_path):
        self.speaker_encoder.eval()
        with torch.no_grad():
            speaker_embedding = self.speaker_encoder.get_embedding(audio_path).cpu()
        return speaker_embedding      
    
    def __getitem__(self, idx):
        mel_path = self.data[idx]['mel_filepath']
        audio_path = self.data[idx]['audio_filepath']
        # print(audio_path)
        mel_spectrogram = torch.from_numpy(np.load(mel_path))
        content_embeddings = self.extract_content_embeddings(mel_spectrogram) 
        speaker_embeddings = self.extract_speaker_embeddings(audio_path)  
        # duration_augmented_content_embeddings, durations = self.duration_augmented_representation(content_embeddings)
        # print(duration_augmented_content_embeddings.shape)
        # print("Reached herte amin")
        # normalized_pitch_contour =self.normalize_f0(audio_path)
        # print(normalized_pitch_contour.shape)
        
        return content_embeddings,speaker_embeddings,audio_path
    # ,duration_augmented_content_embeddings
    # ,normalized_pitch_contour

In [6]:
def collate_fn(batch):
    """
    Pads the content embeddings, speaker embeddings, and duration-augmented content embeddings in the batch to the same length.
    
    Args:
    - batch (list of tuples): Each tuple contains content_embeddings, speaker_embeddings, duration_augmented_content_embeddings.
    
    Returns:
    - padded_content_embeddings (torch.Tensor): Padded content embeddings.
    - speaker_batch (torch.Tensor): Speaker embeddings.
    - padded_duration_augmented_embeddings (torch.Tensor): Padded duration-augmented content embeddings.
    """
    # Unzip the batch into separate components
    content_embeddings, speaker_embeddings = zip(*batch)
    
    # Find the maximum length in the batch for padding
    max_len_content = max([embedding.size(-1) for embedding in content_embeddings])
    # max_len_duration = max([embedding.size(-1) for embedding in duration_augmented_content_embeddings])
    
    # Pad the content embeddings to the maximum length
    padded_content_embeddings = [F.pad(embedding, (0, max_len_content - embedding.size(-1))) for embedding in content_embeddings]
    
    # Pad the duration-augmented content embeddings to the maximum length
    # padded_duration_augmented_embeddings = [F.pad(embedding, (0, max_len_duration - embedding.size(-1))) for embedding in duration_augmented_content_embeddings]

    # Stack the speaker embeddings (assuming they are already of fixed size)
    speaker_batch = torch.stack(speaker_embeddings)
    # duration_augmented_content_batch = torch.stack(duration_augmented_content_embeddings)
    # normalized_pitch_contour_batch = torch.stack(normalized_pitch_contour)
    
    # Stack the padded embeddings
    padded_content_embeddings = torch.stack(padded_content_embeddings)
    # f0_tensors_padded = pad_sequence(normalized_pitch_contour, batch_first=True, padding_value=0.0)
    # padded_duration_augmented_embeddings = torch.stack(padded_duration_augmented_embeddings)

    return padded_content_embeddings, speaker_batch

In [7]:
data_path = "/home/keagan/Documents/projects/SelfVC/data/val_manifest.json"
with open(data_path, 'r') as file:
    data = json.load(file)



In [9]:
class ConformerEncoder256(nn.Module):
    def __init__(self, original_encoder):
        super(ConformerEncoder256, self).__init__()
        self.encoder = original_encoder
        # Add a 1D convolution with stride to downsample to 256
        self.downsample = nn.Conv1d(in_channels=512, out_channels=256, kernel_size=1, stride=2) # Example

    def forward(self, audio_signal, length):
        # Pass through original encoder
        embeddings, lengths = self.encoder(audio_signal=audio_signal, length=length)
        # Reduce the embedding size
        embeddings = self.downsample(embeddings)
        return embeddings, lengths

In [10]:
conformer_encoder_256 = torch.load('/home/keagan/Documents/projects/SelfVC/models/conformer_encoder_v2.pth')
speaker_model = nemo_asr.models.EncDecSpeakerLabelModel.from_pretrained(model_name='titanet_large')

      conformer_encoder_256 = torch.load('/home/keagan/Documents/projects/SelfVC/models/conformer_encoder_v2.pth')
    


[NeMo I 2024-10-18 12:44:47 cloud:58] Found existing object /home/keagan/.cache/torch/NeMo/NeMo_1.23.0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.
[NeMo I 2024-10-18 12:44:47 cloud:64] Re-using file from: /home/keagan/.cache/torch/NeMo/NeMo_1.23.0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo
[NeMo I 2024-10-18 12:44:47 common:924] Instantiating model from pre-trained checkpoint


[NeMo W 2024-10-18 12:44:47 modelPT:165] If you intend to do training or fine-tuning, please call the ModelPT.setup_training_data() method and provide a valid configuration file to setup the train data loader.
    Train config : 
    manifest_filepath: /manifests/combined_fisher_swbd_voxceleb12_librispeech/train.json
    sample_rate: 16000
    labels: null
    batch_size: 64
    shuffle: true
    is_tarred: false
    tarred_audio_filepaths: null
    tarred_shard_strategy: scatter
    augmentor:
      noise:
        manifest_path: /manifests/noise/rir_noise_manifest.json
        prob: 0.5
        min_snr_db: 0
        max_snr_db: 15
      speed:
        prob: 0.5
        sr: 16000
        resample_type: kaiser_fast
        min_speed_rate: 0.95
        max_speed_rate: 1.05
    num_workers: 15
    pin_memory: true
    
[NeMo W 2024-10-18 12:44:47 modelPT:172] If you intend to do validation, please call the ModelPT.setup_validation_data() or ModelPT.setup_multiple_validation_data() method 

[NeMo I 2024-10-18 12:44:47 features:289] PADDING: 16


      return torch.load(model_weights, map_location='cpu')
    


[NeMo I 2024-10-18 12:44:47 save_restore_connector:249] Model EncDecSpeakerLabelModel was successfully restored from /home/keagan/.cache/torch/NeMo/NeMo_1.23.0/titanet-l/11ba0924fdf87c049e339adbf6899d48/titanet-l.nemo.


In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dataset = SynthesizerNoTransformDataset(data,conformer_encoder_256,speaker_model,device)

In [12]:
batch_size = 4  # Adjust as needed
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True,collate_fn=collate_fn)

In [13]:
for i, (content_embeddings,speaker_embeddings,duration_augmented_content_embeddings,normalized_pitch_contour) in enumerate(dataloader):
    # duration_augmented_content_embeddings
    # normalized_pitch_contour
    print(f"Batch {i+1}:")
    print(f"content_embeddings Shape: {content_embeddings.shape}")
    print(f"speaker_embeddings Shape: {speaker_embeddings.shape}")
    print(f"duration_augmented_content_embeddings Shape: {duration_augmented_content_embeddings.shape}")
    print(f"normalized_pitch_contour Shape: {normalized_pitch_contour.shape}")
    # Optionally, break after first batch for testing
    if i == 0:
        break

/home/keagan/Documents/projects/SelfVC/data/audios/7484_39971_000015_000000.wav


      with torch.cuda.amp.autocast(enabled=False):
    
      with torch.cuda.amp.autocast(enabled=False):
    


Reached herte amin


      with torch.cuda.amp.autocast(enabled=False):
    
      with torch.cuda.amp.autocast(enabled=False):
    


torch.Size([127])
/home/keagan/Documents/projects/SelfVC/data/audios/5012_31097_000008_000001.wav
Reached herte amin
torch.Size([603])
/home/keagan/Documents/projects/SelfVC/data/audios/56_1733_000012_000001.wav
Reached herte amin
torch.Size([72])
/home/keagan/Documents/projects/SelfVC/data/audios/666_11243_000007_000003.wav
Reached herte amin
torch.Size([446])


RuntimeError: stack expects each tensor to be equal size, but got [127] at entry 0 and [603] at entry 1

In [100]:
f0_tensor = extract_f0("/home/keagan/Documents/projects/SelfVC/data/audios/4957_36386_000058_000002.wav", sr=22050)

reahced here
reahced here 1


In [101]:
f0_tensor

tensor([  0.0000,   0.0000,   0.0000,   0.0000,   0.0000, 174.6141, 169.6432,
        167.6947, 165.7685, 161.0494, 158.2827, 169.6432, 220.0000, 245.5194,
        261.6256, 278.7883, 285.3047, 283.6615, 257.1310, 216.2205, 258.6205,
        245.5194, 233.0819, 223.8455, 220.0000, 220.0000, 222.5563, 225.1423,
        248.3722, 246.9417, 248.3722, 252.7136, 260.1187, 264.6655, 260.1187,
          0.0000,   0.0000, 220.0000, 211.2820, 204.0850, 194.8689, 190.4180,
        191.5211, 197.1331, 197.1331, 186.0689, 211.2820, 210.0652, 204.0850,
        202.9096, 202.9096, 207.6523, 207.6523,   0.0000,   0.0000, 248.3722,
        249.8110, 252.7136, 261.6256, 269.2918, 277.1826, 283.6615,   0.0000,
          0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
          0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
          0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.0000,
          0.0000,   0.0000,   0.0000,   0.0000,   0.0000,   0.00

In [97]:
def normalize_f0(audio_path, sr=22050):
    """
    Normalize the F0 contour.
    
    Args:
    - f0_tensor (torch.Tensor): F0 contour as a PyTorch tensor.
    
    Returns:
    - normalized_f0 (torch.Tensor): Normalized F0 contour.
    """
    f0_tensor = extract_f0(audio_path, sr=22050)
    # Filter out unvoiced (NaN or 0 values)
    # voiced_f0 = f0_tensor[f0_tensor > 0]  # Exclude unvoiced frames
    
    # Compute mean and standard deviation only for voiced frames
    mean_f0 = f0_tensor.mean()
    std_f0 = f0_tensor.std()
    
    # Normalize F0 contour (keep NaNs for unvoiced frames)
    normalized_f0 = (f0_tensor - mean_f0) / std_f0

    return normalized_f0     

In [99]:
f0_tensor_norm = normalize_f0("/home/keagan/Documents/projects/SelfVC/data/audios/4957_36386_000058_000002.wav", sr=22050)

reahced here
reahced here 1


In [102]:
f0_tensor_norm

tensor([-1.1538, -1.1538, -1.1538, -1.1538, -1.1538,  0.3828,  0.3390,  0.3219,
         0.3049,  0.2634,  0.2391,  0.3390,  0.7822,  1.0067,  1.1485,  1.2995,
         1.3568,  1.3424,  1.1089,  0.7489,  1.1220,  1.0067,  0.8973,  0.8160,
         0.7822,  0.7822,  0.8047,  0.8274,  1.0318,  1.0193,  1.0318,  1.0700,
         1.1352,  1.1752,  1.1352, -1.1538, -1.1538,  0.7822,  0.7055,  0.6421,
         0.5610,  0.5218,  0.5316,  0.5809,  0.5809,  0.4836,  0.7055,  0.6947,
         0.6421,  0.6318,  0.6318,  0.6735,  0.6735, -1.1538, -1.1538,  1.0318,
         1.0445,  1.0700,  1.1485,  1.2159,  1.2854,  1.3424, -1.1538, -1.1538,
        -1.1538, -1.1538, -1.1538, -1.1538, -1.1538, -1.1538, -1.1538, -1.1538,
        -1.1538, -1.1538, -1.1538, -1.1538, -1.1538, -1.1538, -1.1538, -1.1538,
        -1.1538, -1.1538, -1.1538, -1.1538, -1.1538, -1.1538, -1.1538, -1.1538,
        -1.1538, -1.1538, -1.1538, -1.1538,  0.6841,  0.8855,  1.0445,  1.1485,
         1.1752,  1.1887,  1.2023, -1.15