In [None]:
# utilising musicgen to create music transitions between 2 songs
# musicgen huggingface colab was helpful in creating this code

In [None]:
# Shell escape: run nvidia-smi to report which NVIDIA GPU (if any) is attached to this runtime.
!nvidia-smi

In [None]:
# the installations that work for us

In [None]:
!pip install --upgrade --quiet pip

!pip uninstall -y cudf-cu12 pylibcudf-cu12
!pip install --upgrade pyarrow>=21.0.0 datasets[audio]

In [None]:
# instantiate the small MusicGen checkpoint; `model` is used by the generation cell

from transformers import MusicgenForConditionalGeneration

checkpoint = "facebook/musicgen-small"
model = MusicgenForConditionalGeneration.from_pretrained(checkpoint)

In [None]:
# commands for ffmpeg to get the combined audio that is sent into the model:

#   ffmpeg -sseof -5 -i audio1.mp3 -t 5 part1.mp3
#   ffmpeg -ss 0 -i audio2.mp3 -t 5 part2.mp3
#   echo -e "file 'part1.mp3'\nfile 'part2.mp3'" > list.txt
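#   ffmpeg -f concat -safe 0 -i list.txt -c copy output.mp3
#   (the loading cell below reads "input.mp3": rename output.mp3 to input.mp3,
#    or write the concat result to input.mp3 directly)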


In [None]:
# prepare the combined clip for the model: decode, downmix to one channel, resample to 32 kHz
import torch
import torchaudio

# torchaudio.load returns the waveform tensor together with its sample rate
wav1, sr1 = torchaudio.load("input.mp3")

# average over dim 0 (the channel axis in torchaudio's default layout) and keep
# that axis so the tensor stays 2-D
mono = wav1.mean(dim=0, keepdim=True)

# 32000 Hz is the rate later passed to the processor as `sampling_rate`
resampler = torchaudio.transforms.Resample(orig_freq=sr1, new_freq=32000)
wav1 = resampler(mono)

In [None]:
# actual generation: condition MusicGen on the combined clip plus a text prompt,
# then play the result inline and save it as a WAV file
from transformers import AutoProcessor, MusicgenForConditionalGeneration
from IPython.display import Audio, display
import torch
# import the submodule explicitly; a bare `import scipy` does not guarantee
# that scipy.io.wavfile has been loaded
import scipy.io.wavfile

# run on the GPU when the runtime has one (the .cpu() calls below already expect that)
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained("facebook/musicgen-small")
# `model` was already created by the model-loading cell above; reuse it instead of
# loading a second copy of the same checkpoint
model.to(device)

inputs = processor(
    audio=wav1.squeeze().numpy(),  # drop the size-1 channel axis and hand over a numpy array
    sampling_rate=32000,  # the rate wav1 was resampled to in the formatting cell
    text=["there are two songs within this audio, blend them together and make a seamless transition"],
    return_tensors="pt",
).to(device)

audio_values = model.generate(**inputs, do_sample=True, guidance_scale=3, max_new_tokens=256)

# use the model's own output rate for both playback and the saved file so they cannot disagree
sampling_rate = model.config.audio_encoder.sampling_rate

# Audio(...) is not the last expression of the cell, so it must be displayed explicitly
display(Audio(audio_values[0].cpu().numpy(), rate=sampling_rate))

scipy.io.wavfile.write("musicgen_outcombined.wav", rate=sampling_rate, data=audio_values[0, 0].cpu().numpy())
