In [None]:
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from IPython.display import Audio
import soundfile as sf
import numpy as np

In [None]:
# The processor and the model are both loaded from the same pretrained
# checkpoint, so name it once and reuse it for the two loaders.
checkpoint = "facebook/musicgen-small"
processor = AutoProcessor.from_pretrained(checkpoint)
model = MusicgenForConditionalGeneration.from_pretrained(checkpoint)

In [None]:
# Turn the text prompt into PyTorch tensors the model can consume.
inputs = processor(
    text=["Classic Rock song with guitar solo in 30 seconds"],
    padding=True,
    return_tensors="pt",
)

# Generate audio for the prompt, capped at 1536 newly generated tokens.
audio_values = model.generate(**inputs, max_new_tokens=1536)

# Sampling rate (Hz) as declared by the model's audio encoder configuration.
sampling_rate = model.config.audio_encoder.sampling_rate

# Convert the first item of the batch to NumPy exactly once. The .cpu() hop is
# a no-op on CPU but keeps this cell working if the model is moved to an
# accelerator, where .numpy() alone would raise.
# NOTE(review): layout is assumed to be (channels, samples) - confirm against
# the MusicGen documentation.
waveform = audio_values[0].cpu().numpy()

# Prepare the data for the WAV file: clamp to [-1.0, 1.0], store as float32,
# and promote a bare 1-D signal to a single-row 2-D array so the transpose
# below always has a channel axis to move.
audio_values_np = np.atleast_2d(
    np.clip(waveform, -1.0, 1.0).astype(np.float32)
)

# Save as a .wav file; the transpose puts the channel axis last for the writer.
sf.write('generated_audio_02.wav', audio_values_np.T, sampling_rate)

# Play the generated audio in the notebook (un-clipped waveform, as before).
# This must stay the last expression in the cell for the player to render.
Audio(waveform, rate=sampling_rate)