# crossfAIder - pruebas

## 1 - Extract features for A and B using VAE

Try using Encodec by facebook https://huggingface.co/docs/transformers/model_doc/encodec

In [1]:
import torch
import torchaudio
from transformers import EncodecModel, AutoProcessor
import numpy as np
import IPython
import torchaudio.transforms as T
from pydub import AudioSegment

  from .autonotebook import tqdm as notebook_tqdm


## Load Pretrained Encodec model capable of extracting audio features

In [2]:
# Load the EnCodec model
model = EncodecModel.from_pretrained("facebook/encodec_24khz")
processor = AutoProcessor.from_pretrained("facebook/encodec_24khz")

  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.48, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [21]:
def extract_encodec_features(audio_path, duration=5, last=True):
    """
    Extract EnCodec latent features from an MP3 file.
    
    Args:
        audio_path (str): Path to the MP3 file.
        duration (float): Duration of the extracted segment in seconds (default: 5s).
        last (bool): If True, extracts the last `duration` seconds. If False, extracts the first `duration` seconds.

    Returns:
        Tensor: Encoded audio_codes (latent representation).
    """
    # Load MP3 file using pydub
    audio = AudioSegment.from_mp3(audio_path)

    # Determine start time
    start_time = (audio.duration_seconds - duration) if last else 0  # Last X seconds or first X seconds

    # Extract the required segment
    segment = audio[start_time * 1000 : (start_time + duration) * 1000]  # Convert to milliseconds

    # Convert to MONO
    segment = segment.set_channels(1)

    # Convert to waveform tensor
    samples = torch.tensor(segment.get_array_of_samples()).float()
    waveform = samples / (2**15)  # Normalize (convert int16 to float)

    # Ensure correct shape for EnCodec: (1, samples) instead of (samples, 1, 1)
    waveform = waveform.unsqueeze(0)  # Add batch dimension → Shape (1, samples)

    # Resample to EnCodec’s required sample rate (24 kHz)
    waveform = T.Resample(orig_freq=audio.frame_rate, new_freq=processor.sampling_rate)(waveform)

    # Ensure correct shape: (channels, samples) → EnCodec expects (1, samples)
    waveform = waveform.squeeze(0)  # Remove batch dim → Now shape (1, samples)

    # Prepare for EnCodec
    inputs = processor(raw_audio=waveform, sampling_rate=processor.sampling_rate, return_tensors="pt")

    # Encode to latent space
    with torch.no_grad():
        encoder_outputs = model.encode(inputs["input_values"], inputs["padding_mask"])

    return encoder_outputs.audio_codes  # Return only the audio codes

## Define an interpolation function

TODO try alternatives to Linear Interpolation:

Spherical Interpolation (SLERP): More natural blending in latent space.

Bezier Curves: Non-linear transitions for smoother effects.

In [53]:
import torch
import numpy as np

def interpolate_encodec_features(audio_codes_A, audio_codes_B, steps=10):
    """Interpolates between two sets of EnCodec latent representations."""
    interpolations = []
    
    for alpha in np.linspace(0, 1, steps):
        # Linear interpolation in latent space
        interpolated_code = (1 - alpha) * audio_codes_A + alpha * audio_codes_B
        interpolations.append(interpolated_code)
    
    # Stack tensors into a single batch tensor
    return torch.stack(interpolations)  # Now it's a single tensor instead of a list


## Decode back to waveform

In [57]:
def decode_encodec_features(interpolated_codes):
    """Decodes EnCodec latent codes into audio waveforms."""
    
    if not isinstance(interpolated_codes, torch.Tensor):
        raise ValueError(f"Expected torch.Tensor but got {type(interpolated_codes)}")

    # Ensure correct batch processing
    interpolated_audio = []
    
    for i in range(interpolated_codes.shape[0]):  # Loop through batch dimension
        with torch.no_grad():
            decoded_audio = model.decode(audio_codes=interpolated_codes[i].unsqueeze(0),
                                         audio_scales=None, padding_mask=None)
        
        interpolated_audio.append(decoded_audio.squeeze().cpu().numpy())

    return interpolated_audio

## Save output as mp3

In [6]:
from pydub import AudioSegment
import soundfile as sf

def save_as_mp3(audio_data, output_path, sr=24000):
    """Saves an array of audio samples as an MP3 file."""
    # Convert numpy array to WAV file first
    temp_wav = "temp_transition.wav"
    sf.write(temp_wav, audio_data, sr)
    
    # Convert WAV to MP3 using pydub
    sound = AudioSegment.from_wav(temp_wav)
    sound.export(output_path, format="mp3")

## Run full process

In [49]:
# Extract last 5 seconds of Track A
trackA_codes = extract_encodec_features("./tracks/trackA.mp3", duration=5, last=True)

In [37]:
print(trackA_codes)

tensor([[[[ 62,  62,  62,  62, 408, 408, 408, 408, 408, 408, 408, 408,  62,  62,
            62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,
           408, 408,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,
            62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,
            62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,
            62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,
            62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,
            62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,
            62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,
            62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,
            62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,
            62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,  62,
            62,  62,  62,  6

In [50]:
# Extract first 5 seconds of Track B
trackB_codes = extract_encodec_features("./tracks/trackB.mp3", duration=5, last=True)

In [36]:
print(trackB_codes)

tensor([[[[ 862,  293,  868,  651,  967,  136,  275,  255,  868,  321,  967,
            811,  862,  293,  131,  565,  670,  811,  325,  293,  670,  696,
             73,  696,   80,   73,  293,  670,  696,  457,  811,   80,  875,
            293,  756,  811,  875,  811,  310,  131,  991,  131,  457,  900,
             80,  670,  957,  670,  131,  321,  696,  293,  753,  293,  900,
            131,   52,  811,   91,  879,  310,  875,  321,  325, 1022,  879,
            971,  604,  875,  699,  904,  491,  310,  228,  370,  228,  224,
            228,  879,  310,  604,   52,  699,  724,  228,  724,  629,  999,
            904,  604,  879,  904,  325,  228,  432,  904,  430,  432,  834,
            904,  904,  724,  904,  432, 1017,  604,  257,  834,  430,  257,
            724,  228,  430,  834,  724,  432,  855,  855, 1019,  855, 1019,
            430, 1017,  855, 1017, 1019,  257,  106,  855, 1017,  106,  855,
            876, 1019, 1017,  855, 1019,  855,  876,  876,  430, 1019,  855,

In [54]:
# Generate interpolated audio codes
interpolated_codes = interpolate_encodec_features(trackA_codes, trackB_codes, steps=10)

In [55]:
print(type(interpolated_codes))
print(interpolated_codes)

<class 'torch.Tensor'>
tensor([[[[[ 62.0000,  62.0000,  62.0000,  ...,  62.0000,  62.0000,
             62.0000],
           [913.0000, 424.0000, 424.0000,  ..., 518.0000, 518.0000,
            518.0000]]]],



        [[[[150.8889,  87.6667, 151.5556,  ...,  62.0000,  62.0000,
             62.0000],
           [845.6666, 401.3333, 453.7778,  ..., 518.0000, 518.0000,
            518.0000]]]],



        [[[[239.7778, 113.3333, 241.1111,  ...,  62.0000,  62.0000,
             62.0000],
           [778.3334, 378.6667, 483.5555,  ..., 518.0000, 518.0000,
            518.0000]]]],



        ...,



        [[[[684.2222, 241.6667, 688.8889,  ...,  62.0000,  62.0000,
             62.0000],
           [441.6667, 265.3333, 632.4445,  ..., 518.0000, 518.0000,
            518.0000]]]],



        [[[[773.1111, 267.3333, 778.4445,  ...,  62.0000,  62.0000,
             62.0000],
           [374.3333, 242.6667, 662.2222,  ..., 518.0000, 518.0000,
            518.0000]]]],



        [[[[862.0000,

In [None]:
# Convert interpolated features back to audio
audio_transitions = decode_encodec_features(interpolated_codes)

TypeError: 'NoneType' object is not subscriptable

In [None]:
# Save the 5th interpolated step as an MP3 transition file
save_as_mp3(audio_transitions[5], "transition.mp3")