# AudioLDM

## Install Git LFS

In [None]:
!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | sudo bash
!sudo apt-get install git-lfs

## Clone the model from Hugging Face

In [None]:
!git clone https://huggingface.co/haoheliu/AudioLDM-S-Full
!mv AudioLDM-S-Full/audioldm-s-full AudioLDM-S-Full/audioldm-s-full.ckpt

## Install AudioLDM

In [None]:
!git clone https://github.com/jkrukowski/AudioLDM.git && cd AudioLDM && git checkout -t origin/removed-model-download && cd ..
!pip install -e AudioLDM/

In [None]:
# Add the cloned AudioLDM checkout to the module search path so that
# `import audioldm` resolves in this session.
# NOTE(review): presumably needed because the editable pip install is not
# picked up until the runtime restarts — confirm.
import sys
sys.path.append('./AudioLDM/')

In [None]:
import numpy as np
import random
import uuid
import librosa
import soundfile as sf
from IPython.display import Audio, display
from audioldm import text_to_audio, style_transfer, build_model, latent_diffusion

# Build the model once from the locally cloned checkpoint. The helper
# functions `text2audio` and `styleaudio` read this module-level instance.
audioldm = build_model(ckpt_path='./AudioLDM-S-Full/audioldm-s-full.ckpt')

def round_to_multiple(number, multiple):
  """Round `number` to the nearest multiple of `multiple`, never returning 0.

  The step count is computed with the builtin `round`, so exact halves go to
  the even step (e.g. 6.25 with a multiple of 2.5 gives 5.0).

  Args:
    number: Value to snap.
    multiple: Step size; must be non-zero because it is used as a divisor.

  Returns:
    The nearest multiple, or `multiple` itself when that multiple would be 0.
  """
  snapped = round(number / multiple) * multiple
  # A zero result is replaced by one full step.
  return snapped if snapped != 0 else multiple

def text2audio(text, duration, guidance_scale, random_seed, n_candidates, steps):
  """Generate audio for a text prompt with the module-level `audioldm` model.

  Thin wrapper around `audioldm.text_to_audio`.

  Args:
    text: Prompt, passed as the second positional argument.
    duration: Forwarded as `duration`.
    guidance_scale: Forwarded as `guidance_scale`.
    random_seed: Passed as the third positional argument.
    n_candidates: Converted with `int()` and forwarded as
      `n_candidate_gen_per_text`.
    steps: Forwarded as `ddim_steps`.

  Returns:
    The result of `text_to_audio`. If that result has length 1, its single
    element is returned instead.
  """
  generation_options = dict(
    duration=duration,
    guidance_scale=guidance_scale,
    ddim_steps=steps,
    n_candidate_gen_per_text=int(n_candidates),
  )
  result = text_to_audio(audioldm, text, random_seed, **generation_options)
  # Unwrap a one-element result so callers get the item directly.
  return result[0] if len(result) == 1 else result

def styleaudio(text, duration, audio_path, strength, guidance_scale, random_seed, steps):
  """Run style transfer on an audio file with the module-level `audioldm` model.

  Thin wrapper around `audioldm.style_transfer`.

  Args:
    text: Prompt, passed as the second positional argument.
    duration: Forwarded as `duration`.
    audio_path: Path of the source audio, passed as the third positional
      argument.
    strength: Passed as the fourth positional argument.
    guidance_scale: Forwarded as `guidance_scale`.
    random_seed: Passed as the fifth positional argument.
    steps: Forwarded as `ddim_steps`.

  Returns:
    The result of `style_transfer`. If that result has length 1, its single
    element is returned instead.
  """
  transfer_options = dict(
    duration=duration,
    guidance_scale=guidance_scale,
    ddim_steps=steps,
  )
  result = style_transfer(
    audioldm, text, audio_path, strength, random_seed, **transfer_options
  )
  # Unwrap a one-element result so callers get the item directly.
  return result[0] if len(result) == 1 else result

## Text2Audio

In [None]:
# Text prompt describing the audio to generate. It is named `prompt` rather
# than `input` so the builtin `input()` is not shadowed for the rest of the
# session.
prompt = "blend of haunting soundscapes and minimalist electronic music. The resulting sound would be characterized by manipulated and processed samples, beats, and textures, combined with atmospheric and nostalgic elements. Repetitive loops, glitch elements, and sound decay would create a mesmerizing and hypnotic effect. The focus would be on sound design, textures, and timbres, while incorporating elements of emotion and memory. The resulting music would be experimental, dreamy, and immersive, offering a unique and captivating listening experience."
# A fresh seed is drawn on every run and embedded in the output filename.
seed = random.randint(0, 10_000_000)
generated_audio = text2audio(
  prompt,
  duration=20,
  guidance_scale=10,
  random_seed=seed,
  n_candidates=10,
  steps=500,
)
out_file = f'text2audio_{uuid.uuid4()}_{seed}.wav'
# NOTE(review): 16000 is presumably the sample rate of the model output — confirm.
sf.write(out_file, generated_audio.T, 16000, subtype='PCM_24')
print(out_file)
display(Audio(out_file, autoplay=False))

## Style audio

In [None]:
# Source clip for the style transfer. The path is defined once so that the
# duration analysis and the transfer always use the same file.
init_audio_path = "/content/32a88eb2-620a-4f9d-8ecb-7e9783bc9bc5_6839090.wav"
init_y, init_sr = librosa.load(init_audio_path, sr=None, mono=True)
# `y` and `sr` are passed by keyword because librosa 0.10+ made them
# keyword-only; keywords also work on older releases.
init_duration = librosa.get_duration(y=init_y, sr=init_sr)
# Clips shorter than 20 are snapped to the nearest multiple of 2.5; anything
# longer is capped at 20.
duration = round_to_multiple(init_duration, 2.5) if init_duration < 20 else 20
# A fresh seed is drawn on every run and embedded in the output filename.
seed = random.randint(0, 10_000_000)
out_file = f'styled_{uuid.uuid4()}_{seed}.wav'
generated_audio = styleaudio(
  "gregorian choir singing",
  duration=duration,
  audio_path=init_audio_path,
  strength=0.5,
  guidance_scale=10,
  random_seed=seed,
  steps=500,
)
# NOTE(review): 16000 is presumably the sample rate of the model output — confirm.
sf.write(out_file, generated_audio.T, 16000, subtype='PCM_24')
print(out_file)
display(Audio(out_file, autoplay=False))