# crossfAIder - pruebas

## 1 - Extract features for A and B using VAE

Try using Encodec by facebook https://huggingface.co/docs/transformers/model_doc/encodec

In [1]:
import torch
import torchaudio
from transformers import EncodecModel, AutoProcessor
import numpy as np
import IPython
import torchaudio.transforms as T
from pydub import AudioSegment

  from .autonotebook import tqdm as notebook_tqdm


## Load Pretrained Encodec model capable of extracting audio features

In [2]:
# Load the EnCodec model
model = EncodecModel.from_pretrained("facebook/encodec_24khz")
processor = AutoProcessor.from_pretrained("facebook/encodec_24khz")

  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [None]:
def extract_encodec_features(audio_path, duration=5, last=True):

    # Load MP3 file using pydub
    audio = AudioSegment.from_mp3(audio_path)

    # Determine start time
    start_time = (audio.duration_seconds - duration) if last else 0  # Last X seconds or first X seconds

    # Extract the required segment
    segment = audio[start_time * 1000 : (start_time + duration) * 1000]  # Convert to milliseconds

    # Convert to MONO
    segment = segment.set_channels(1)

    # Convert to waveform tensor
    samples = torch.tensor(segment.get_array_of_samples()).float()
    waveform = samples / (2**15)  # Normalize (convert int16 to float)

    # Ensure correct shape for EnCodec: (1, samples) instead of (samples, 1, 1)
    waveform = waveform.unsqueeze(0)  # Add batch dimension → Shape (1, samples)

    # Resample to EnCodec’s required sample rate (24 kHz)
    waveform = T.Resample(orig_freq=audio.frame_rate, new_freq=processor.sampling_rate)(waveform)

    # Ensure correct shape: (channels, samples) → EnCodec expects (1, samples)
    waveform = waveform.squeeze(0)  # Remove batch dim → Now shape (1, samples)

    # Prepare for EnCodec
    inputs = processor(raw_audio=waveform, sampling_rate=processor.sampling_rate, return_tensors="pt")

    # Encode to latent space
    with torch.no_grad():
        encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"])

    return encoder_outputs.audio_codes  # Return only the audio codes

## Define an interpolation function

TODO try alternatives to Linear Interpolation:

Spherical Interpolation (SLERP): More natural blending in latent space.

Bezier Curves: Non-linear transitions for smoother effects.

In [None]:
import torch
import numpy as np

def interpolate_encodec_features(audio_codes_A, audio_codes_B, steps=10):
    
    interpolations = []
    
    for alpha in np.linspace(0, 1, steps):
        # Ensure interpolation preserves shape (batch, streams, codebooks, frames)
        interpolated_code = (1 - alpha) * audio_codes_A + alpha * audio_codes_B
        interpolations.append(interpolated_code)
    
    # Stack into a tensor of shape (steps, batch, streams, codebooks, frames)
    return torch.cat(interpolations, dim=0)  # Now it's a single tensor with batch dimension


## Decode back to waveform

In [102]:
def decode_encodec_features(interpolated_codes):
    """Decodes EnCodec latent codes into audio waveforms."""
    interpolated_audio = []
    
    for codes in interpolated_codes:
        # Add batch dimension if not present
        if len(codes.shape) == 3:  # [n_q, T] → [1, n_q, T]
            codes = codes.unsqueeze(0)
        
        with torch.no_grad():
            # Get the scales - this is critical
            # For EnCodec, scales are usually calculated during encoding
            # Since we're not encoding, we need to create default scales
            # The shape should match the codes: [batch_size, channels, T]
            # or whatever shape your model expects
            
            # Option 1: Create default scales (all ones)
            scales_shape = (codes.shape[0], 1, codes.shape[-1])  # [batch, 1, frames]
            audio_scales = torch.ones(scales_shape, device=codes.device)
            
            # Decode with scales
            decoded_audio = model.decode(
                audio_codes=codes,
                audio_scales=audio_scales,
                padding_mask=None
            )
        
        decoded_audio = decoded_audio.squeeze().cpu().numpy()
        interpolated_audio.append(decoded_audio)
    
    return interpolated_audio

## Run full process

In [92]:
# Extract last 5 seconds of Track A
trackA_codes = extract_encodec_features("./tracks/trackA.mp3", duration=5, last=True)
# Extract first 5 seconds of Track B
trackB_codes = extract_encodec_features("./tracks/trackB.mp3", duration=5, last=True)

print(trackA_codes.shape)
print(trackB_codes.shape)

torch.Size([1, 1, 2, 375])
torch.Size([1, 1, 2, 375])


In [95]:
# Generate interpolated features
interpolated_codes = interpolate_encodec_features(trackA_codes, trackB_codes, steps=10)

print(interpolated_codes.shape)

torch.Size([10, 1, 2, 375])


In [103]:
# Decode back to audio
audio_transitions = decode_encodec_features(interpolated_codes)

RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)

In [None]:
# Save the 5th interpolated step as an MP3 transition file
save_as_mp3(audio_transitions[5], "transition.mp3")