# crossfAIder - pruebas

## 1 - Extract features for A and B using VAE

Try using EnCodec by Meta (Facebook): https://huggingface.co/docs/transformers/model_doc/encodec

In [14]:
import torch
import torchaudio
from transformers import EncodecModel, AutoProcessor
import numpy as np

## Load Pretrained Encodec model capable of extracting audio features

In [10]:
# Load the pretrained EnCodec model and its matching processor from one checkpoint id
ENCODEC_CHECKPOINT = "facebook/encodec_24khz"
model = EncodecModel.from_pretrained(ENCODEC_CHECKPOINT)
processor = AutoProcessor.from_pretrained(ENCODEC_CHECKPOINT)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
def extract_encodec_features(audio_path):
    """Load an audio file and encode it with the module-level EnCodec ``model``.

    The file is read with ``torchaudio.load``, resampled to
    ``processor.sampling_rate``, downmixed to mono when the processor is a
    mono one, and then passed through ``processor`` and ``model.encode``.

    Args:
        audio_path: Path handed to ``torchaudio.load``.

    Returns:
        Tuple ``(audio_codes, audio_scales)`` taken from the ``model.encode`` output.
    """
    # torchaudio returns a 2-D (channels, time) tensor plus the file's sample rate.
    waveform, sr = torchaudio.load(audio_path)
    # Bring the audio to the sample rate the processor was configured with.
    waveform = torchaudio.functional.resample(
        waveform, orig_freq=sr, new_freq=processor.sampling_rate
    )

    # A mono feature extractor (feature_size == 1) rejects 2-D input with
    # "Expected mono audio", so average the channels into a 1-D signal first.
    if getattr(processor, "feature_size", 1) == 1:
        waveform = waveform.mean(dim=0)

    # Prepare input for EnCodec; the processor works on NumPy arrays and
    # returns PyTorch tensors because of return_tensors="pt".
    inputs = processor(
        raw_audio=waveform.numpy(),
        sampling_rate=processor.sampling_rate,
        return_tensors="pt",
    )

    # Encode audio into its compressed representation (no gradients needed for inference).
    with torch.no_grad():
        encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"])

    return encoder_outputs.audio_codes, encoder_outputs.audio_scales

## Define an interpolation function

**TODO:** try alternatives to linear interpolation:

Spherical Interpolation (SLERP): More natural blending in latent space.

Bezier Curves: Non-linear transitions for smoother effects.

In [None]:
import numpy as np


def interpolate_encodec_features(audio_codes_A, audio_codes_B, steps=10):
    """Linearly blend two code arrays/tensors from Track A towards Track B.

    Args:
        audio_codes_A: Representation of Track A (weight 1 at the first step).
        audio_codes_B: Representation of Track B (weight 1 at the last step).
        steps: Number of evenly spaced blend weights between 0 and 1, inclusive.

    Returns:
        List with ``steps`` entries; entry ``i`` is
        ``(1 - w_i) * audio_codes_A + w_i * audio_codes_B``.

    NOTE(review): EnCodec ``audio_codes`` look like discrete codebook indices,
    so the blended values are generally non-integer -- confirm how the decoder
    is expected to consume them.
    """
    # Weight 0 reproduces Track A, weight 1 reproduces Track B.
    blend_weights = np.linspace(0, 1, steps)
    return [
        (1 - weight) * audio_codes_A + weight * audio_codes_B
        for weight in blend_weights
    ]

## Decode back to waveform

In [None]:
def decode_encodec_features(interpolated_codes, audio_scales=None):
    """Decodes interpolated EnCodec latent representations back to audio.

    Args:
        interpolated_codes: Iterable of code tensors/arrays, each shaped like the
            ``audio_codes`` returned by ``model.encode``.
        audio_scales: Optional per-frame scales to hand to ``model.decode`` for
            every item. When omitted, one ``None`` per frame is used, which
            skips rescaling.

    Returns:
        List of NumPy arrays, one decoded waveform per entry of
        ``interpolated_codes``, with singleton dimensions squeezed out.
    """
    interpolated_audio = []

    for codes in interpolated_codes:
        codes = torch.as_tensor(codes)

        # The decoder looks codes up in a codebook, which needs integer indices.
        # Arithmetic interpolation yields floats, so snap them to the nearest index.
        # NOTE(review): nearest-index rounding only makes decoding possible; the
        # neighbouring codebook entries are not guaranteed to sound "in between".
        if torch.is_floating_point(codes):
            codes = codes.round().long()

        # model.decode indexes audio_scales once per frame, so it must be a
        # sequence (passing a bare None raises a TypeError).
        frame_scales = audio_scales if audio_scales is not None else [None] * len(codes)

        with torch.no_grad():
            decoded = model.decode(
                audio_codes=codes, audio_scales=frame_scales, padding_mask=None
            )

        # decode returns an output object (or a tuple when return_dict is off);
        # in both cases the first element holds the waveform tensor.
        audio_values = decoded[0]
        interpolated_audio.append(audio_values.squeeze().cpu().numpy())

    return interpolated_audio

## Run full process

In [None]:
# Extract features from Track A (end) and Track B (start)
audio_codes_A, _ = extract_encodec_features("trackA_end.wav")
audio_codes_B, _ = extract_encodec_features("trackB_start.wav")

# Clips of different duration produce code tensors of different length, which
# cannot be blended element-wise. Trim both to the shared length first.
# NOTE(review): assumes the last axis of audio_codes is the frame (time) axis -- confirm.
common_frames = min(audio_codes_A.shape[-1], audio_codes_B.shape[-1])
audio_codes_A = audio_codes_A[..., :common_frames]
audio_codes_B = audio_codes_B[..., :common_frames]

# Interpolate between the two tracks
interpolated_codes = interpolate_encodec_features(audio_codes_A, audio_codes_B, steps=10)

# Decode the interpolated representations back to audio
audio_transitions = decode_encodec_features(interpolated_codes)

In [None]:
# Save the midpoint of the transition as a file
import soundfile as sf

# Pick the middle step instead of a fixed index so this cell still works when
# fewer steps are generated (for the default 10 steps this is still index 5).
middle_step = len(audio_transitions) // 2
sf.write("transition.wav", audio_transitions[middle_step], samplerate=processor.sampling_rate)