In [15]:
import librosa
import numpy as np
# import torchaudio

# Normalized Pitch Contour Extraction

In [22]:
def extract_f0(audio_path, sr=22050):
    """
    Extract the fundamental frequency (F0) contour using the pYIN algorithm.

    Args:
    - audio_path (str): Path to the audio file.
    - sr (int): Sampling rate the audio is resampled to on load. Default is 22050.

    Returns:
    - f0_contour (np.ndarray): Extracted F0 contour in Hz, one value per
      analysis frame. Unvoiced frames are reported as 0.0.

    Note:
    Only the contour is returned. A matching time axis can be built by the
    caller with librosa.times_like(f0_contour, sr=sr).
    """
    # Load the audio file; `sr` is rebound to the rate librosa actually used,
    # which matters when the caller passes sr=None (native rate).
    y, sr = librosa.load(audio_path, sr=sr)

    # Compute the F0 using pYIN. The sampling rate must be forwarded
    # explicitly: pyin otherwise assumes its own default of 22050 Hz and
    # would mis-scale every frequency for audio loaded at any other rate.
    f0_contour, _, _ = librosa.pyin(
        y,
        fmin=librosa.note_to_hz('C2'),  # Minimum pitch (in Hz)
        fmax=librosa.note_to_hz('C7'),  # Maximum pitch (in Hz)
        sr=sr,
    )

    # pYIN marks unvoiced frames with NaN; replace them with 0.0 as a placeholder
    f0_contour = np.nan_to_num(f0_contour)

    return f0_contour

# Function to normalize the F0 contour
def normalize_f0(f0_contour):
    """
    Normalize the F0 contour using the mean and standard deviation of its
    voiced frames.

    Unvoiced frames (zeros, negatives, NaNs or infinities) are excluded from
    the statistics and are left at 0.0 in the output, so the zero placeholders
    do not bias the mean or inflate the standard deviation.

    Args:
    - f0_contour (np.ndarray): F0 contour to normalize; unvoiced frames are
      expected to be marked with 0 (or NaN).

    Returns:
    - normalized_f0 (np.ndarray): Normalized F0 contour with the same shape as
      the input. Voiced frames are z-scored; unvoiced frames are 0.0. If there
      are no voiced frames, or all voiced frames share one value (zero
      standard deviation), an all-zero contour is returned.
    """
    f0 = np.asarray(f0_contour, dtype=float)

    # Voiced frames are finite and strictly positive. The errstate guard keeps
    # the NaN comparison quiet on older NumPy versions.
    with np.errstate(invalid="ignore"):
        voiced = np.isfinite(f0) & (f0 > 0)

    # Start from the unvoiced placeholder everywhere
    normalized_f0 = np.zeros_like(f0)
    if not voiced.any():
        return normalized_f0

    # Compute mean and standard deviation only for voiced frames
    voiced_f0 = f0[voiced]
    mean_f0 = voiced_f0.mean()
    std_f0 = voiced_f0.std()

    # A flat contour has zero spread: every voiced frame equals the mean, so
    # its z-score is defined as 0 rather than dividing by zero.
    if std_f0 > 0:
        normalized_f0[voiced] = (voiced_f0 - mean_f0) / std_f0

    return normalized_f0

In [23]:
# audio_path = '/home/keagan/Documents/projects/SelfVC/data/archive/14/208/14_208_000001_000000.wav'
# f0 = extract_f0(audio_path, sr=22050)
# normalized_f0 = normalize_f0(f0)