### 1. Import libraries

In [1]:
import os
# CUDA configuration removed
from audioldm2.p2p_pipeline import AudioLDM2Pipeline
from diffusers import DDIMScheduler, DDIMInverseScheduler
import torch
import nltk
import IPython
import soundfile as sf
import datetime
from lpmc.music_captioning import captioning
from audioldm2.embedding_calculator import EmbeddingCalculator

# Seed shared by the torch generators created in later cells.
RANDOM_SEED = 42

# Checkpoint identifier handed to `from_pretrained`.
model_ckpt = "cvssp/audioldm2"

# Load the pipeline in float32. The earlier `.to("cuda")` call was removed,
# so no explicit device move happens here.
pipeline = AudioLDM2Pipeline.from_pretrained(
    model_ckpt,
    torch_dtype=torch.float32,
)

# Replace the pipeline's scheduler with a DDIM scheduler built from the existing
# scheduler config, then build the inverse scheduler from the new DDIM config.
pipeline.scheduler = DDIMScheduler.from_config(pipeline.scheduler.config)
pipeline.inverse_scheduler = DDIMInverseScheduler.from_config(pipeline.scheduler.config)

  from .autonotebook import tqdm as notebook_tqdm
Cannot initialize model with low cpu memory usage because `accelerate` was not found in the environment. Defaulting to `low_cpu_mem_usage=False`. It is strongly recommended to install `accelerate` for faster and less memory-intense model loading. You can do so with: 
```
pip install accelerate
```
.
Loading pipeline components...: 100%|██████████| 11/11 [00:09<00:00,  1.20it/s]


AssertionError: Torch not compiled with CUDA enabled

### 2. Real audio inversion (optional)

In [None]:
# Concept pair for the edit: the concept to replace and the concept to introduce.
before_concept, after_concept = "piano", "saxophone"

In [None]:
# Build the text prompt for the input audio file.
audio_path = "sample_audio/bach_anh114.wav"
# Presumably produces a text caption describing the audio at `audio_path` — confirm against lpmc.
caption = captioning.captioning(audio_path)
# Post-process the caption using the source concept; the result is the prompt used by later cells.
prompt = EmbeddingCalculator.postprocessing_caption(caption, before_concept)
print(prompt)

### 3. Calculating Δ

In [None]:
# The calculator wraps the loaded pipeline; prompt_length is the number of
# NLTK word tokens in the prompt.
embedding_calculator = EmbeddingCalculator(
    embedding_model=pipeline,
    prompt_length=len(nltk.word_tokenize(prompt)),
)

# Calling the calculator with the concept pair returns four embeddings, which
# are passed to the diffusion call in a later cell.
(
    source_embeddings,
    generated_source_embeddings,
    target_embeddings,
    generated_target_embeddings,
) = embedding_calculator(before_concept, after_concept)

In [None]:
# Invert the input audio with the prompt to obtain latents; the second return
# value is kept as `audio_origin`.
# A new generator seeded with RANDOM_SEED is created for this call.
g_cpu = torch.Generator().manual_seed(RANDOM_SEED)
inv_latents, audio_origin = pipeline.invert(prompt, audio_path=audio_path, generator=g_cpu, guidance_scale=1)

### 4. Diffusion

In [None]:
# Fresh generator seeded with RANDOM_SEED for this cell.
g_cpu = torch.Generator().manual_seed(RANDOM_SEED)

audios_before_, audios_after_ = pipeline(
    prompt,
    negative_prompt="low quality",
    # Concept pair being swapped.
    before_concept=before_concept,
    after_concept=after_concept,
    # Embeddings produced by the embedding calculator cell.
    source_embeddings=source_embeddings,
    generated_source_embeddings=generated_source_embeddings,
    target_embeddings=target_embeddings,
    generated_target_embeddings=generated_target_embeddings,
    # Start from the latents returned by `pipeline.invert`.
    latents=inv_latents,
    generator=g_cpu,
    # Sampling settings.
    num_inference_steps=100,
    audio_length_in_s=10,
    guidance_scale=3.5,
    cross_attention_guidance_amount=0.04,
    # num_waveforms_per_prompt=3,  (left disabled)
)

### 5. Display results

In [None]:
# Take the first waveform from each of the two pipeline results.
audios_before, audios_after = audios_before_.audios[0], audios_after_.audios[0]

# Single sample rate (Hz) used for both notebook playback and the saved files.
SAMPLE_RATE = 16000

IPython.display.display(IPython.display.Audio(audios_before, rate=SAMPLE_RATE))
IPython.display.display(IPython.display.Audio(audios_after, rate=SAMPLE_RATE))

# save
# Create the output directory up front: writing into a missing directory fails,
# which would otherwise break this cell on a fresh checkout.
output_dir = "outputs"
os.makedirs(output_dir, exist_ok=True)

# Month/day/hour/minute/second stamp used as the filename prefix.
current_time = datetime.datetime.now().strftime("%m%d%H%M%S")

sf.write(f"{output_dir}/{current_time}_{before_concept}_to_{after_concept}_before.wav", audios_before, SAMPLE_RATE)
sf.write(f"{output_dir}/{current_time}_{before_concept}_to_{after_concept}_after.wav", audios_after, SAMPLE_RATE)