# Sheet

In [3]:
import os

os.chdir("/data/notebook_files/Pengi")

In [4]:
from Pengi.wrapper import PengiWrapper as Pengi

pengi = Pengi(config="base")

generated_responses = pengi.generate(audio_paths=["/data/notebook_files/violin_0.wav"],
                                   text_prompts=["generate audio caption "],
                                   add_texts=[","],
                                   max_len=30,
                                   beam_size=6,
                                   temperature=1.0,
                                   stop_token=' <|endoftext|>'
                                   )

In [8]:
pengi_output_prompts = generated_responses[0][0]
pengi_output_prompts

## DO NOT EDIT CELL BELOW

This codeblock below is our "ground truth" for reference.

In [20]:
"""make variations of input image"""
import os, sys
os.chdir("/data/notebook_files/Make-An-Audio-main")

print(os.path.abspath("."))
import argparse, os, sys, glob
import PIL
import torch
import numpy as np
from omegaconf import OmegaConf
from PIL import Image
from tqdm import tqdm, trange
from itertools import islice
from einops import rearrange, repeat
from torchvision.utils import make_grid
from torch import autocast
import librosa
# from contextlib import nullcontext
import time
from pytorch_lightning import seed_everything
import math
from ldm.util import instantiate_from_config
from ldm.models.diffusion.ddim import DDIMSampler
from vocoder.bigvgan.models import VocoderBigVGAN
# from ldm.data.extract_mel_spectrogram import TRANSFORMS_22050,TRANSFORMS_16000
from preprocess.NAT_mel import MelNet
import soundfile

batch_max_length = 624
SAMPLE_RATE= 16000

def chunk(it, size):
    it = iter(it)
    return iter(lambda: tuple(islice(it, size)), ())


def load_model_from_config(config, ckpt, verbose=True):
    print(f"Loading model from {ckpt}")
    pl_sd = torch.load(ckpt, map_location="cpu")
    if "global_step" in pl_sd:
        print(f"Global Step: {pl_sd['global_step']}")
    sd = pl_sd["state_dict"]
    model = instantiate_from_config(config.model)
    m, u = model.load_state_dict(sd, strict=False)
    if len(m) > 0 and verbose:
        print("missing keys:")
        print(m)
    if len(u) > 0 and verbose:
        print("unexpected keys:")
        print(u)

    model.cuda()
    model.eval()
    return model

def load_audio(path,transform,sr=16000,batch_max_length=624):# load wav and return mel
    wav,_ = librosa.load(path,sr=sr)

    audio = transform(wav) # (1,melbins,T)
    if audio.shape[2] <= batch_max_length:
        n_repeat = math.ceil((batch_max_length + 1) / audio.shape[1])
        audio = audio.repeat(1,1, n_repeat)

    audio = audio[..., :batch_max_length].unsqueeze(0) # shape [1,1,80,batch_max_length]
    return audio

def load_img(path):# load mel
    audio = np.load(path)
    if audio.shape[1] <= batch_max_length:
        n_repeat = math.ceil((batch_max_length + 1) / audio.shape[1])
        audio = np.tile(audio, reps=(1, n_repeat))

    audio = audio[:, :batch_max_length]
    audio = torch.FloatTensor(audio)[None, None, :, :] # [1,1,80,batch_max_length]
    return audio


class Opt:
    def __init__(self):
        self.config = "/data/notebook_files/Make-An-Audio-main/configs/text_to_audio/txt2audio_args.yaml"
        self.ckpt = "/data/notebook_files/Make-An-Audio-main/useful_ckpts/maa1_full.ckpt"
        self.vocoder_ckpt = "/data/notebook_files/Make-An-Audio-main/useful_ckpts/bigvnat"
        self.outdir = "/data/notebook_files/audio-outputs-with-pengi"
        self.from_file = None
        
        self.strength = 0.3
        self.seed = 42
        self.scale = 3.0
        self.n_samples = 2
        self.n_iter = 1

        self.ddim_steps = 100
        self.ddim_eta = 0.0

        self.prompt = "a warm melodic sounding violin"
        self.init_audio = "/data/notebook_files/violin_0.wav"
        self.pengi_iterations = 5

def main():
    opt = Opt()
    seed_everything(opt.seed)

    config = OmegaConf.load(f"{opt.config}")
    model = load_model_from_config(config, f"{opt.ckpt}")

    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = model.to(device)

    hparams = {
        'audio_sample_rate': SAMPLE_RATE,
        'audio_num_mel_bins':80,
        'fft_size': 1024,
        'win_size': 1024,
        'hop_size': 256,
        'fmin': 0,
        'fmax': 8000,
        'batch_max_length': 1248, 
        'mode': 'pad', # pad,none,
    }
    melnet = MelNet(hparams)
    sampler = DDIMSampler(model)
    vocoder = VocoderBigVGAN(opt.vocoder_ckpt,device)

    os.makedirs(opt.outdir, exist_ok=True)
    outpath = opt.outdir

    batch_size = opt.n_samples # 一个prompt产生n_samples个结果
    if not opt.from_file: # load prompts from this file
        prompt = opt.prompt
        assert prompt is not None
        data = [batch_size * [prompt]]
    else:
        print(f"reading prompts from {opt.from_file}")
        with open(opt.from_file, "r") as f:
            data = f.read().splitlines()
            data = list(chunk(data, batch_size))


    sample_path = os.path.join(outpath, "samples")
    os.makedirs(sample_path, exist_ok=True)
    base_count = len(os.listdir(sample_path))

    assert os.path.isfile(opt.init_audio)
    init_image = load_audio(opt.init_audio,transform=melnet).to(device)
    init_image = repeat(init_image, '1 ... -> b ...', b=batch_size)
    init_latent = model.get_first_stage_encoding(model.encode_first_stage(init_image))  # move to latent space
    sampler.make_schedule(ddim_num_steps=opt.ddim_steps, ddim_eta=opt.ddim_eta, verbose=False)

    assert 0. <= opt.strength <= 1., 'can only work with strength in [0.0, 1.0]'
    t_enc = int(opt.strength * opt.ddim_steps)
    print(f"target t_enc is {t_enc} steps")

    with torch.no_grad():
        with model.ema_scope():
            tic = time.time()
            all_samples = list()
            for n in trange(opt.n_iter, desc="Sampling"):
                for prompts in tqdm(data, desc="data"):
                    uc = None
                    if opt.scale != 1.0: # default=5.0
                        uc = model.get_learned_conditioning(batch_size * [""])
                    if isinstance(prompts, tuple):
                        prompts = list(prompts)
                    c = model.get_learned_conditioning(prompts)
                    z_enc = sampler.stochastic_encode(init_latent, torch.tensor([t_enc]*batch_size).to(device)) # [B, channel, c, h]
                    # decode it
                    samples = sampler.decode(z_enc, c, t_enc, unconditional_guidance_scale=opt.scale,
                                                unconditional_conditioning=uc,)

                    x_samples = model.decode_first_stage(samples)
                    print(x_samples.shape)
                    for x_sample in x_samples:
                        spec = x_sample[0].cpu().numpy()
                        spec_ori = init_image[0][0].cpu().numpy()
                        print(x_sample.shape,spec.shape,init_image.shape)
                        wav = vocoder.vocode(spec)
                        wav_ori = vocoder.vocode(spec_ori)
                        soundfile.write(os.path.join(outpath, f'{prompt.replace(" ", "-")}.wav'), wav, SAMPLE_RATE, 'FLOAT')
                        soundfile.write(os.path.join(outpath, f'{prompt.replace(" ", "-")}_ori.wav'), wav_ori, SAMPLE_RATE, 'FLOAT')
                        base_count += 1
                    all_samples.append(x_samples)


    print(f"Your samples are ready and waiting for you here: \n{outpath} \n"
            f" \nEnjoy.")


/data/notebook_files/Make-An-Audio-main


## EDIT THE CODE CELL BELOW

This will represent our automated prompt-tuning with Pengi and Make-an-Audio

In [7]:
import sys
sys.path.insert(0, "/data/notebook_files/Pengi")

In [8]:
print(sys.path)

['/data/notebook_files/Pengi', '/data/notebook_files/Pengi', '/data/notebook_files', '/opt/datalore/python', '/var/datalore/manager/.pip', '/data/workspace_files', '/opt/python/lib/python38.zip', '/opt/python/lib/python3.8', '/opt/python/lib/python3.8/lib-dynload', '', '/opt/python/envs/minimal/lib/python3.8/site-packages', '/opt/python/envs/minimal/lib/python3.8/site-packages/IPython/extensions', '/home/datalore/.ipython', '']


In [25]:
import os
from Pengi.wrapper import PengiWrapper as Pengi

opt = Opt()
seed_everything(opt.seed)

config = OmegaConf.load(f"{opt.config}")
model = load_model_from_config(config, f"{opt.ckpt}")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

hparams = {
    'audio_sample_rate': SAMPLE_RATE,
    'audio_num_mel_bins':80,
    'fft_size': 1024,
    'win_size': 1024,
    'hop_size': 256,
    'fmin': 0,
    'fmax': 8000,
    'batch_max_length': 1248, 
    'mode': 'pad', # pad,none,
}
melnet = MelNet(hparams)
sampler = DDIMSampler(model)
vocoder = VocoderBigVGAN(opt.vocoder_ckpt,device)

os.makedirs(opt.outdir, exist_ok=True)
outpath = opt.outdir

batch_size = opt.n_samples # 一个prompt产生n_samples个结果
if not opt.from_file: # load prompts from this file
    prompt = opt.prompt
    assert prompt is not None
    data = [batch_size * [prompt]]
else:
    print(f"reading prompts from {opt.from_file}")
    with open(opt.from_file, "r") as f:
        data = f.read().splitlines()
        data = list(chunk(data, batch_size))


sample_path = os.path.join(outpath, "samples")
os.makedirs(sample_path, exist_ok=True)
base_count = len(os.listdir(sample_path))

assert os.path.isfile(opt.init_audio)
init_image = load_audio(opt.init_audio,transform=melnet).to(device)
init_image = repeat(init_image, '1 ... -> b ...', b=batch_size)
init_latent = model.get_first_stage_encoding(model.encode_first_stage(init_image))  # move to latent space
sampler.make_schedule(ddim_num_steps=opt.ddim_steps, ddim_eta=opt.ddim_eta, verbose=False)

assert 0. <= opt.strength <= 1., 'can only work with strength in [0.0, 1.0]'
t_enc = int(opt.strength * opt.ddim_steps)
print(f"target t_enc is {t_enc} steps")

pengi_output_prompts = {prompt[0]: [] for prompt in data}

with torch.no_grad():
    with model.ema_scope():
        tic = time.time()
        all_samples = list()
        old_encoded = None
        last_pengi_output = None

        for prompts in tqdm(data, desc="data"): # goal prompt
            uc = None
            
            if opt.scale != 1.0: # default=5.0
                uc = model.get_learned_conditioning(batch_size * [""])
            
            for pengi_i in range(opt.pengi_iterations):
                if old_encoded is None:
                    c = model.get_learned_conditioning(prompts)
                else:
                    pengi_encoded = model.get_learned_conditioning(last_pengi_output)
                    c = old_encoded + (old_encoded - pengi_encoded)
                

                old_encoded = c
                    
                # new_encoded_prompt = old_encoded + (old_encoded - result_encoded)

                z_enc = sampler.stochastic_encode(init_latent, torch.tensor([t_enc]*batch_size).to(device)) # [B, channel, c, h]
                # decode it
                samples = sampler.decode(z_enc, c, t_enc, unconditional_guidance_scale=opt.scale,
                                            unconditional_conditioning=uc,)

                x_samples = model.decode_first_stage(samples)
                print(x_samples.shape)
                for x_sample in x_samples:
                    spec = x_sample[0].cpu().numpy()
                    spec_ori = init_image[0][0].cpu().numpy()
                    print(x_sample.shape,spec.shape,init_image.shape)
                    wav = vocoder.vocode(spec)
                    wav_ori = vocoder.vocode(spec_ori)
                    soundfile.write(os.path.join(outpath, f'{prompts[0]}-{pengi_i}.wav'), wav, SAMPLE_RATE, 'FLOAT')
                    soundfile.write(os.path.join(outpath, f'{prompts[0]}-{pengi_i}_ori.wav'), wav_ori, SAMPLE_RATE, 'FLOAT')
                    base_count += 1

                all_samples.append(x_samples)
                pengi = Pengi(config="base")

                generated_responses = pengi.generate(audio_paths=[os.path.join(outpath, f'{prompts[0]}-{pengi_i}.wav')],
                                                text_prompts=["generate audio caption "],
                                                add_texts=[","],
                                                max_len=30,
                                                beam_size=6,
                                                temperature=1.0,
                                                stop_token=' <|endoftext|>')
                
                last_pengi_output = generated_responses[0][0][0] # this is the pengi prompts, taking the first of the 6 elements.
                pengi_output_prompts[prompts[0]].append(last_pengi_output)

print(f"Your samples are ready and waiting for you here: \n{outpath} \n"
        f" \nEnjoy.")
print(f"{pengi_output_prompts}")

Loading model from /data/notebook_files/Make-An-Audio-main/useful_ckpts/maa1_full.ckpt
Global Step: 3722264
LatentDiffusion_audio: Running in eps-prediction mode
DiffusionWrapper has 160.22 M params.
making attention of type 'vanilla' with 256 in_channels
making attention of type 'vanilla' with 256 in_channels
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
Working with z of shape (1, 4, 78, 78) = 24336 dimensions.
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 256 in_channels
making attention of type 'vanilla' with 256 in_channels
making attention of type 'vanilla' with 256 in_channels
TextEncoder comes with 111.32 M params.
target t_enc is 30 steps
torc

Global seed set to 42
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
data:   0%|          | 0/1 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, pl

In [23]:
prompt

'a warm melodic sounding violin'

In [16]:
[x[0] for x in data]

['a bird chirping']

In [30]:
c.shape

torch.Size([2, 77, 1024])

In [32]:
z_enc.shape

torch.Size([2, 4, 10, 78])

It seems like we're typically losing *adjectives*

# Sheet 2

# Pengi Prompt Tuning on Make-an-Audio
## Add all the necessary packages to PATH

In [1]:
import sys
sys.path.insert(0, "/data/notebook_files/Pengi")
sys.path.insert(0, "/data/notebook_files/Make-An-Audio-main")

## Import all the modules that are necessary for the experiment to work

In [2]:
"""make variations of input image"""
import argparse, os, glob
import PIL
import torch
import numpy as np
from omegaconf import OmegaConf
from PIL import Image
from tqdm import tqdm, trange
from itertools import islice
from einops import rearrange, repeat
from torchvision.utils import make_grid
from torch import autocast
import librosa
# from contextlib import nullcontext
import time
from pytorch_lightning import seed_everything
import math
from ldm.util import instantiate_from_config
from ldm.models.diffusion.ddim import DDIMSampler
from vocoder.bigvgan.models import VocoderBigVGAN
# from ldm.data.extract_mel_spectrogram import TRANSFORMS_22050,TRANSFORMS_16000
from preprocess.NAT_mel import MelNet
import soundfile

batch_max_length = 624
SAMPLE_RATE= 16000

  from .autonotebook import tqdm as notebook_tqdm


## Create helper functions

In [3]:
def chunk(it, size):
    it = iter(it)
    return iter(lambda: tuple(islice(it, size)), ())


def load_model_from_config(config, ckpt, verbose=True):
    print(f"Loading model from {ckpt}")
    pl_sd = torch.load(ckpt, map_location="cpu")
    if "global_step" in pl_sd:
        print(f"Global Step: {pl_sd['global_step']}")
    sd = pl_sd["state_dict"]
    model = instantiate_from_config(config.model)
    m, u = model.load_state_dict(sd, strict=False)
    if len(m) > 0 and verbose:
        print("missing keys:")
        print(m)
    if len(u) > 0 and verbose:
        print("unexpected keys:")
        print(u)

    model.cuda()
    model.eval()
    return model


def load_audio(path,transform,sr=16000,batch_max_length=624):# load wav and return mel
    wav,_ = librosa.load(path,sr=sr)

    audio = transform(wav) # (1,melbins,T)
    if audio.shape[2] <= batch_max_length:
        n_repeat = math.ceil((batch_max_length + 1) / audio.shape[1])
        audio = audio.repeat(1,1, n_repeat)

    audio = audio[..., :batch_max_length].unsqueeze(0) # shape [1,1,80,batch_max_length]
    return audio


def load_img(path):# load mel
    audio = np.load(path)
    if audio.shape[1] <= batch_max_length:
        n_repeat = math.ceil((batch_max_length + 1) / audio.shape[1])
        audio = np.tile(audio, reps=(1, n_repeat))

    audio = audio[:, :batch_max_length]
    audio = torch.FloatTensor(audio)[None, None, :, :] # [1,1,80,batch_max_length]
    return audio


class Opt:
    def __init__(self):
        self.config = "/data/notebook_files/Make-An-Audio-main/configs/text_to_audio/txt2audio_args.yaml"
        self.ckpt = "/data/notebook_files/Make-An-Audio-main/useful_ckpts/maa1_full.ckpt"
        self.vocoder_ckpt = "/data/notebook_files/Make-An-Audio-main/useful_ckpts/bigvnat"
        self.outdir = "/data/notebook_files/audio-outputs-with-pengi"
        self.from_file = None
        
        self.strength = 0.3
        self.seed = 42
        self.scale = 3.0
        self.n_samples = 2
        self.n_iter = 1

        self.ddim_steps = 100
        self.ddim_eta = 0.0

        self.prompt = "a warm melodic sounding violin"
        self.init_audio = "/data/notebook_files/violin_0.wav"
        self.pengi_iterations = 5

## Run experiment 1
In this experiment, we will be iterating on the sound by modifying the prompt embedding by trying to empasize the parts that were missed in the previous run.

In [None]:
from Pengi.wrapper import PengiWrapper as Pengi

opt = Opt()
seed_everything(opt.seed)

config = OmegaConf.load(f"{opt.config}")
model = load_model_from_config(config, f"{opt.ckpt}")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

hparams = {
    'audio_sample_rate': SAMPLE_RATE,
    'audio_num_mel_bins':80,
    'fft_size': 1024,
    'win_size': 1024,
    'hop_size': 256,
    'fmin': 0,
    'fmax': 8000,
    'batch_max_length': 1248, 
    'mode': 'pad', # pad,none,
}

melnet = MelNet(hparams)
sampler = DDIMSampler(model)
vocoder = VocoderBigVGAN(opt.vocoder_ckpt,device)

os.makedirs(opt.outdir, exist_ok=True)
outpath = opt.outdir

batch_size = opt.n_samples # 一个prompt产生n_samples个结果
if not opt.from_file: # load prompts from this file
    prompt = opt.prompt
    assert prompt is not None
    data = [batch_size * [prompt]]
else:
    print(f"reading prompts from {opt.from_file}")
    with open(opt.from_file, "r") as f:
        data = f.read().splitlines()
        data = list(chunk(data, batch_size))


sample_path = os.path.join(outpath, "samples")
os.makedirs(sample_path, exist_ok=True)
base_count = len(os.listdir(sample_path))


assert os.path.isfile(opt.init_audio)
init_image = load_audio(opt.init_audio,transform=melnet).to(device)
init_image = repeat(init_image, '1 ... -> b ...', b=batch_size)
init_latent = model.get_first_stage_encoding(model.encode_first_stage(init_image))  # move to latent space
sampler.make_schedule(ddim_num_steps=opt.ddim_steps, ddim_eta=opt.ddim_eta, verbose=False)


assert 0. <= opt.strength <= 1., 'can only work with strength in [0.0, 1.0]'
t_enc = int(opt.strength * opt.ddim_steps)
print(f"target t_enc is {t_enc} steps")


pengi_output_prompts = {prompt[0]: [] for prompt in data}


with torch.no_grad():
    with model.ema_scope():
        tic = time.time()
        all_samples = list()
        old_encoded = None
        last_pengi_output = None

        for prompts in tqdm(data, desc="data"): # goal prompt
            uc = None
            
            if opt.scale != 1.0: # default=5.0
                uc = model.get_learned_conditioning(batch_size * [""])
            
            for pengi_i in range(opt.pengi_iterations):
                if old_encoded is None:
                    c = model.get_learned_conditioning(prompts)
                else:
                    pengi_encoded = model.get_learned_conditioning(last_pengi_output)
                    c = old_encoded + (old_encoded - pengi_encoded)
                
                old_encoded = c

                z_enc = sampler.stochastic_encode(init_latent, torch.tensor([t_enc]*batch_size).to(device)) # [B, channel, c, h]
                # decode it
                samples = sampler.decode(z_enc, c, t_enc, unconditional_guidance_scale=opt.scale,
                                            unconditional_conditioning=uc,)

                x_samples = model.decode_first_stage(samples)
                print(x_samples.shape)
                for x_sample in x_samples:
                    spec = x_sample[0].cpu().numpy()
                    spec_ori = init_image[0][0].cpu().numpy()
                    print(x_sample.shape,spec.shape,init_image.shape)
                    wav = vocoder.vocode(spec)
                    wav_ori = vocoder.vocode(spec_ori)
                    soundfile.write(os.path.join(outpath, f'{prompts[0]}-{pengi_i}.wav'), wav, SAMPLE_RATE, 'FLOAT')
                    soundfile.write(os.path.join(outpath, f'{prompts[0]}-{pengi_i}_ori.wav'), wav_ori, SAMPLE_RATE, 'FLOAT')
                    base_count += 1

                all_samples.append(x_samples)
                pengi = Pengi(config="base")

                generated_responses = pengi.generate(audio_paths=[os.path.join(outpath, f'{prompts[0]}-{pengi_i}.wav')],
                                                text_prompts=["generate audio caption "],
                                                add_texts=[","],
                                                max_len=30,
                                                beam_size=6,
                                                temperature=1.0,
                                                stop_token=' <|endoftext|>')
                
                last_pengi_output = generated_responses[0][0][0] # this is the pengi prompts, taking the first of the 6 elements.
                pengi_output_prompts[prompts[0]].append(last_pengi_output)

print(f"{pengi_output_prompts}")

### Conclusion
It appears we are diverging from the goal after the second iteration.

## Experiment 2
Try to interpolate the prompts

In [13]:
from Pengi.wrapper import PengiWrapper as Pengi

opt = Opt()
opt.init_audio = "/data/notebook_files/source_unmodified_audios/harp.wav"
opt.prompt = "A beautiful, melodic sounding harp"
opt.outdir = "/data/notebook_files/audio-outputs-with-pengi/experiment2"
seed_everything(opt.seed)

config = OmegaConf.load(f"{opt.config}")
model = load_model_from_config(config, f"{opt.ckpt}")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

hparams = {
    'audio_sample_rate': SAMPLE_RATE,
    'audio_num_mel_bins':80,
    'fft_size': 1024,
    'win_size': 1024,
    'hop_size': 256,
    'fmin': 0,
    'fmax': 8000,
    'batch_max_length': 1248, 
    'mode': 'pad', # pad,none,
}

melnet = MelNet(hparams)
sampler = DDIMSampler(model)
vocoder = VocoderBigVGAN(opt.vocoder_ckpt,device)

os.makedirs(opt.outdir, exist_ok=True)
outpath = opt.outdir

batch_size = opt.n_samples # 一个prompt产生n_samples个结果
if not opt.from_file: # load prompts from this file
    prompt = opt.prompt
    assert prompt is not None
    data = [batch_size * [prompt]]
else:
    print(f"reading prompts from {opt.from_file}")
    with open(opt.from_file, "r") as f:
        data = f.read().splitlines()
        data = list(chunk(data, batch_size))


sample_path = os.path.join(outpath, "samples")
os.makedirs(sample_path, exist_ok=True)
base_count = len(os.listdir(sample_path))


assert os.path.isfile(opt.init_audio)
init_image = load_audio(opt.init_audio,transform=melnet).to(device)
init_image = repeat(init_image, '1 ... -> b ...', b=batch_size)
init_latent = model.get_first_stage_encoding(model.encode_first_stage(init_image))  # move to latent space
sampler.make_schedule(ddim_num_steps=opt.ddim_steps, ddim_eta=opt.ddim_eta, verbose=False)


assert 0. <= opt.strength <= 1., 'can only work with strength in [0.0, 1.0]'
t_enc = int(opt.strength * opt.ddim_steps)
print(f"target t_enc is {t_enc} steps")


pengi_output_prompts = {prompt[0]: [] for prompt in data}
a = 0.5

with torch.no_grad():
    with model.ema_scope():
        tic = time.time()
        all_samples = list()
        old_encoded = None
        last_pengi_output = None

        for prompts in tqdm(data, desc="data"): # goal prompt
            uc = None
            
            if opt.scale != 1.0: # default=5.0
                uc = model.get_learned_conditioning(batch_size * [""])
            
            for pengi_i in range(opt.pengi_iterations):
                if old_encoded is None:
                    c = model.get_learned_conditioning(prompts)
                else:
                    pengi_encoded = model.get_learned_conditioning(last_pengi_output)
                    c = (a * pengi_encoded) + ((1 - a) * old_encoded)
                
                old_encoded = c

                z_enc = sampler.stochastic_encode(init_latent, torch.tensor([t_enc]*batch_size).to(device)) # [B, channel, c, h]
                # decode it
                samples = sampler.decode(z_enc, c, t_enc, unconditional_guidance_scale=opt.scale,
                                            unconditional_conditioning=uc,)

                x_samples = model.decode_first_stage(samples)
                print(x_samples.shape)
                for x_sample in x_samples:
                    spec = x_sample[0].cpu().numpy()
                    spec_ori = init_image[0][0].cpu().numpy()
                    print(x_sample.shape,spec.shape,init_image.shape)
                    wav = vocoder.vocode(spec)
                    wav_ori = vocoder.vocode(spec_ori)
                    soundfile.write(os.path.join(outpath, f'{prompts[0]}-{pengi_i}.wav'), wav, SAMPLE_RATE, 'FLOAT')
                    soundfile.write(os.path.join(outpath, f'{prompts[0]}-{pengi_i}_ori.wav'), wav_ori, SAMPLE_RATE, 'FLOAT')
                    base_count += 1

                all_samples.append(x_samples)
                pengi = Pengi(config="base")

                generated_responses = pengi.generate(audio_paths=[os.path.join(outpath, f'{prompts[0]}-{pengi_i}.wav')],
                                                text_prompts=["generate audio caption "],
                                                add_texts=[","],
                                                max_len=30,
                                                beam_size=6,
                                                temperature=1.0,
                                                stop_token=' <|endoftext|>')
                
                last_pengi_output = generated_responses[0][0][0] # this is the pengi prompts, taking the first of the 6 elements.
                pengi_output_prompts[prompts[0]].append(last_pengi_output)

print(f"{pengi_output_prompts}")

Loading model from /data/notebook_files/Make-An-Audio-main/useful_ckpts/maa1_full.ckpt
Global Step: 3722264
LatentDiffusion_audio: Running in eps-prediction mode
DiffusionWrapper has 160.22 M params.
making attention of type 'vanilla' with 256 in_channels
making attention of type 'vanilla' with 256 in_channels
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
Working with z of shape (1, 4, 78, 78) = 24336 dimensions.
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 256 in_channels
making attention of type 'vanilla' with 256 in_channels
making attention of type 'vanilla' with 256 in_channels
TextEncoder comes with 111.32 M params.
target t_enc is 30 steps
torc

Global seed set to 42
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
data:   0%|          | 0/1 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, pl

### Observations
It seems like we're losing *adjectival* language in our Pengi outputs. 

Could we coerce it to use more adjectives?
If we can accomplish this, will it make an useful difference in our outputs?



We're also currently doing top 1. What if we weighted in a top-K sense (using Pengi's generative results, rescaled to a pdf) and then used a tree-type structure with pruning later to branch results (and possibly pass original inputs through as well)? 

## Idea for experiment 3:
original_prompt: "Make the violin sound warmer"

description: "Right now this audio is ..."

description + original_prompt -> bert

feed the new audio with the new bert embedding.

In [19]:
from Pengi.wrapper import PengiWrapper as Pengi

opt = Opt()
opt.outdir = "/data/notebook_files/audio-outputs-with-pengi/experiment3"
seed_everything(opt.seed)

config = OmegaConf.load(f"{opt.config}")
model = load_model_from_config(config, f"{opt.ckpt}")

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

hparams = {
    'audio_sample_rate': SAMPLE_RATE,
    'audio_num_mel_bins':80,
    'fft_size': 1024,
    'win_size': 1024,
    'hop_size': 256,
    'fmin': 0,
    'fmax': 8000,
    'batch_max_length': 1248, 
    'mode': 'pad', # pad,none,
}

melnet = MelNet(hparams)
sampler = DDIMSampler(model)
vocoder = VocoderBigVGAN(opt.vocoder_ckpt,device)

os.makedirs(opt.outdir, exist_ok=True)
outpath = opt.outdir

batch_size = opt.n_samples # 一个prompt产生n_samples个结果
if not opt.from_file: # load prompts from this file
    prompt = opt.prompt
    assert prompt is not None
    data = [batch_size * [prompt]]
else:
    print(f"reading prompts from {opt.from_file}")
    with open(opt.from_file, "r") as f:
        data = f.read().splitlines()
        data = list(chunk(data, batch_size))


sample_path = os.path.join(outpath, "samples")
os.makedirs(sample_path, exist_ok=True)
base_count = len(os.listdir(sample_path))


assert os.path.isfile(opt.init_audio)
init_image = load_audio(opt.init_audio,transform=melnet).to(device)
init_image = repeat(init_image, '1 ... -> b ...', b=batch_size)
init_latent = model.get_first_stage_encoding(model.encode_first_stage(init_image))  # move to latent space
sampler.make_schedule(ddim_num_steps=opt.ddim_steps, ddim_eta=opt.ddim_eta, verbose=False)


assert 0. <= opt.strength <= 1., 'can only work with strength in [0.0, 1.0]'
t_enc = int(opt.strength * opt.ddim_steps)
print(f"target t_enc is {t_enc} steps")


pengi_output_prompts = {prompt[0]: [] for prompt in data}
a = 0.5

with torch.no_grad():
    with model.ema_scope():
        tic = time.time()
        all_samples = list()
        old_encoded = None
        last_pengi_output = None

        for prompts in tqdm(data, desc="data"): # goal prompt
            uc = None
            
            if opt.scale != 1.0: # default=5.0
                uc = model.get_learned_conditioning(batch_size * [""])
            
            for pengi_i in range(opt.pengi_iterations):
                if old_encoded is None:
                    c = model.get_learned_conditioning(prompts)
                else:
                    pengi_encoded = model.get_learned_conditioning(last_pengi_output)
                    c = (a * pengi_encoded) + ((1 - a) * old_encoded)
                
                old_encoded = c

                z_enc = sampler.stochastic_encode(init_latent, torch.tensor([t_enc]*batch_size).to(device)) # [B, channel, c, h]
                # decode it
                samples = sampler.decode(z_enc, c, t_enc, unconditional_guidance_scale=opt.scale,
                                            unconditional_conditioning=uc,)

                x_samples = model.decode_first_stage(samples)
                print(x_samples.shape)
                for x_sample in x_samples:
                    spec = x_sample[0].cpu().numpy()
                    spec_ori = init_image[0][0].cpu().numpy()
                    print(x_sample.shape,spec.shape,init_image.shape)
                    wav = vocoder.vocode(spec)
                    wav_ori = vocoder.vocode(spec_ori)
                    soundfile.write(os.path.join(outpath, f'{prompts[0]}-{pengi_i}.wav'), wav, SAMPLE_RATE, 'FLOAT')
                    soundfile.write(os.path.join(outpath, f'{prompts[0]}-{pengi_i}_ori.wav'), wav_ori, SAMPLE_RATE, 'FLOAT')
                    base_count += 1

                all_samples.append(x_samples)
                pengi = Pengi(config="base")
                
                beam_size = 8
                random_prompt_index = np.random.randint(0, beam_size)
                generated_responses = pengi.generate(audio_paths=[os.path.join(outpath, f'{prompts[0]}-{pengi_i}.wav')],
                                                text_prompts=["generate an audio caption with stylistic adjectives "],
                                                add_texts=[","],
                                                max_len=30,
                                                beam_size=beam_size,
                                                temperature=2,
                                                stop_token=' <|endoftext|>')
                
                last_pengi_output = generated_responses[0][0][random_prompt_index] # this is the pengi prompts, taking the first of the 6 elements.
                pengi_output_prompts[prompts[0]].append(last_pengi_output)

print(f"{pengi_output_prompts}")

Loading model from /data/notebook_files/Make-An-Audio-main/useful_ckpts/maa1_full.ckpt
Global Step: 3722264
LatentDiffusion_audio: Running in eps-prediction mode
DiffusionWrapper has 160.22 M params.
making attention of type 'vanilla' with 256 in_channels
making attention of type 'vanilla' with 256 in_channels
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
Working with z of shape (1, 4, 78, 78) = 24336 dimensions.
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 512 in_channels
making attention of type 'vanilla' with 256 in_channels
making attention of type 'vanilla' with 256 in_channels
making attention of type 'vanilla' with 256 in_channels
TextEncoder comes with 111.32 M params.
target t_enc is 30 steps
torc

Global seed set to 42
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
data:   0%|          | 0/1 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, pl

## Idea for experiment 4:

original prompt: "Make the violin sound warmer warmer warmer warmer"

description: "Right now this audio is ..."

description + original_prompt -> bert

feed the new audio with the new bert embedding.