In [1]:
import os, gc, torchaudio, pydub, re
from typing import Literal
import random
import wandb, datetime
import numpy as np
import torch
from torch import nn, optim
from torch.optim import lr_scheduler
from accelerate import Accelerator, notebook_launcher
from torch.cuda.amp import GradScaler
from safetensors.torch import save_model
from transformers import LlamaModel
from time import time
from tqdm.auto import tqdm
from torch.utils.data import DataLoader, IterableDataset
from datasets import load_dataset, Audio, Features
from huggingface_hub import login
from transformers import (
    AutoTokenizer,
    EncodecModel,
    AutoProcessor,
    LlamaModel,
    LlamaConfig,
    LlamaForCausalLM
)

In [2]:
# Pull API credentials from Kaggle's secret store so no token is hard-coded in the notebook.
from kaggle_secrets import UserSecretsClient
user_secrets = UserSecretsClient()
hk = user_secrets.get_secret("hfkey")  # passed to huggingface_hub `login` below
wkey = user_secrets.get_secret("wandb")  # consumed later by `logger` via `wandb.login(key=wkey)`

# Authenticate this session with the Hugging Face Hub.
login(hk)

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [3]:

class config:
    # Namespace of run-wide settings; attributes are read directly off the class (never instantiated here).
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # CUDA when available, otherwise CPU
    outpath = "samples"  # directory joined with the generated .wav filename in `epoch_sample`


class model_configs:
    # Hugging Face Hub repository ids referred to by this notebook.
    encodec_id = "facebook/encodec_32khz"  # loaded as `encodec_model` / `audio_processor`
    llama_id = "meta-llama/Llama-3.2-1B"  # only appears in commented-out tokenizer code in the visible cells
    canary_id = "tensorkelechi/canary_mini"  # not referenced in the visible cells


class data_configs:
    # Namespace of data-pipeline settings.
    sample_rate = 32000  # Hz; used for clip length, the codec processor call and when saving .wav files
    split = 1000  # value reported by `MusicData.__len__`
    max_duration = 5  # seconds; `trimpad_audio` cuts/pads every clip to sample_rate * max_duration samples
    dtype = torch.float16  # not referenced in the visible cells
    batch_size = 4  # batch size of `mini_train_loader`
    dataset_id = "benjamin-paine/freesound-laion-640k"  # dataset streamed by `load_dataset`
    mini_dataset_id = "lewtun/music_genres"  # not referenced in the visible cells
    processed_repo_id = "tensorkelechi/freesound_mini"  # not referenced in the visible cells


class train_configs:
    # Namespace of training hyper-parameters and output file names.
    precision = torch.float16  # not referenced in the visible cells (the trainer hard-codes torch.float16)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # only used in commented-out code below
    grad_steps = 4  # micro-batches between optimizer steps; the step loss is divided by this value
    epochs = 2  # default `epoch_count` of `trainer`
    lr = 1e-4  # AdamW learning rate
    sft_file = 'kaminari.safetensors'  # target of `save_model` at the end of training
    model_file = 'kaminari.pth'  # target of the final `torch.save(model.state_dict(), ...)`
    outpath = 'kaminari'  # not referenced in the visible cells

In [4]:

# Vocabulary conventions for embedding codec tokens in text.
music_prefix = "🎶"  # marker placed inside every code token, e.g. "<🎶123>"
start_of_music = "<somu>"  # opening tag of a serialised clip
end_of_music = "<eomu>"  # closing tag of a serialised clip
music_codebook_size = 2048  # offset applied per codebook layer when serialising codes
music_codebook_num = 4  # number of codebook layers assumed when decoding
music_vocab_size = 8192  # number of "<🎶N>" tokens added to the tokenizer (2048 * 4)

# Delimiter lookup used by the token-to-string helpers.
music_tokens = {
#     "prefix": music_prefix,
    "sos": start_of_music,
    "eos": end_of_music,
}


def modality_tokens_to_string(tokens):
    """
    Serialise a 2-D grid of codec tokens into one delimited string.

    `tokens` is indexed as ``tokens[layer][position]``. Positions are walked
    in order and, within each position, every layer is emitted in turn. Each
    code is shifted by ``music_codebook_size * layer`` so codes from different
    layers map to distinct "<🎶N>" tokens. The result is wrapped in the
    start/end-of-music tags.
    """
    prefix = music_prefix
    opening = music_tokens["sos"]
    closing = music_tokens["eos"]

    pieces = [opening]
    for idx in range(len(tokens[0])):
        # one token per codebook layer for this position
        pieces.extend(
            f"<{prefix}{tokens[layer_idx][idx] + music_codebook_size * layer_idx}>"
            for layer_idx in range(len(tokens))
        )
    pieces.append(closing)

    return "".join(pieces)


In [5]:
def clear_mem():
    """
    Release unused host and CUDA memory.

    The Python garbage collector runs first so that tensors kept alive only
    by unreachable objects are freed; only then can
    ``torch.cuda.empty_cache()`` release their now-unoccupied cached blocks.
    Calling the two in the opposite order leaves those blocks in the cache.
    """
    gc.collect()
    torch.cuda.empty_cache()


def trimpad_audio(audio):
    """
    Force a 1-D waveform to exactly ``sample_rate * max_duration`` samples.

    Longer clips are truncated; shorter clips are extended at the end with
    reflect padding. A zero-length clip has nothing to reflect (NumPy raises
    ``ValueError`` for it), so it is filled with zeros instead of crashing
    the data pipeline.

    Args:
        audio: sequence of samples supporting ``len()`` and slicing
            (assumed 1-D — the length check uses the first axis).

    Returns:
        torch.Tensor holding the fixed-length waveform.
    """
    samples = int(data_configs.sample_rate * data_configs.max_duration)
    n_present = len(audio)

    if n_present > samples:
        audio = audio[:samples]
    else:
        # reflect needs at least one real sample to mirror
        pad_mode = "reflect" if n_present > 0 else "constant"
        audio = np.pad(audio, (0, samples - n_present), mode=pad_mode)

    return torch.as_tensor(audio)


def seed_everything(seed=333):
    """Seed the global RNGs of `random`, NumPy and PyTorch with the same value."""
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)


seed_everything()

In [6]:
"""
Code for audio/music tokenization and model processing
"""
def prepare_tokenizer(tokenizer, tokens: list):
    """
    Extend `tokenizer` with the music vocabulary and a pad token.

    Adds one "<🎶N>" token for every N in ``range(music_vocab_size)``, then
    the caller-supplied `tokens`, then registers '[pad]' as the pad token.
    The tokenizer is modified in place and also returned.
    """
    code_tokens = []
    for x in range(music_vocab_size):
        code_tokens.append(f"<{music_prefix}{x}>")

    tokenizer.add_tokens(code_tokens)
    tokenizer.add_tokens(tokens)
    tokenizer.add_special_tokens({'pad_token': '[pad]'})

    return tokenizer


# encoding/compressing music/audio waveform to tokens
def encode_music(audio, encodec_model, audio_processor):
    """
    Turn a raw waveform into discrete codec tokens.

    The clip is first trimmed/padded to the configured fixed length, run
    through `audio_processor`, and encoded by `encodec_model` without
    gradient tracking.

    Returns:
        (codes, masks): ``audio_codes[0][0]`` of the encoder output and the
        processor's padding mask. In the notebook's recorded run
        ``audio_codes`` had shape (1, 1, 4, 250), so `codes` was (4, 250).
    """
    clip = trimpad_audio(audio)

    # preprocess audio waveform for encoding
    processed = audio_processor(
        raw_audio=clip,
        sampling_rate=data_configs.sample_rate,
        return_tensors='pt',
    )
    input_values = processed["input_values"]
    masks = processed["padding_mask"]  # handed back to the caller alongside the codes

    # tokenize/encode with pretrained neural codec
    with torch.no_grad():
        encoded = encodec_model.encode(input_values, masks)

    return encoded.audio_codes[0][0], masks


def tokens2string(tokens):
    """
    Render a 2-D grid of music codec tokens as one tagged string.

    `tokens` is indexed ``tokens[layer][position]``. Output order is
    position-major: for each position, one "<🎶N>" token per layer, where N
    is the code plus ``music_codebook_size * layer``. The token run is wrapped
    in the start/end-of-music tags and prefixed with " - " so it can be
    appended directly after a text label.
    """
    prefix = music_prefix
    start = music_tokens["sos"]
    end = music_tokens["eos"]

    body = "".join(
        f"<{prefix}{tokens[layer_idx][idx] + music_codebook_size * layer_idx}>"
        for idx in range(len(tokens[0]))
        for layer_idx in range(len(tokens))
    )
    return f" - {start}{body}{end}"



def extractor2(text, tag1=start_of_music, tag2=end_of_music):
    """
    Pull out the text found between `tag1` and `tag2`.

    Results (same as before, without relying on exceptions for control flow):
      * `tag1` missing            -> `text` unchanged;
      * `tag1` found, no `tag2`   -> everything after `tag1`;
      * both found                -> the stripped text between them, unless it
        is empty, in which case everything after `tag1` is returned.

    Args:
        text: string to search.
        tag1: opening tag (defaults to the start-of-music tag).
        tag2: closing tag (defaults to the end-of-music tag).

    Returns:
        The extracted substring.
    """
    open_at = text.find(tag1)
    if open_at == -1:
        return text

    start = open_at + len(tag1)
    close_at = text.find(tag2, start)
    if close_at == -1:
        return text[start:]

    extracted_text = text[start:close_at].strip()
    # an empty span falls back to the tail after the opening tag
    return extracted_text if extracted_text else text[start:]


# for audio decoding
def content2rvq_codes(content, codebook_size=2048, codebook_num=4):
    """
    Rebuild a codec code tensor from a serialised token string.

    Every run of digits in `content` is read as one code, reduced modulo
    `codebook_size` (undoing the per-layer offset), and the flat sequence is
    regrouped into `codebook_num` rows. Trailing codes that do not fill a
    complete group are dropped. Intermediate sizes are printed for debugging.

    Returns:
        int64 tensor of shape (1, 1, codebook_num, n) on `config.device`.
    """
    raw_codes = [int(code) for code in re.findall(r"\d+", content)]
    print(len(raw_codes))

    flat = np.array([code % codebook_size for code in raw_codes])
    print(flat.shape)

    n = flat.shape[0] // codebook_num
    print(n)

    # keep only whole groups of `codebook_num` codes
    flat = flat[: n * codebook_num]
    print(flat.shape)

    # rows of the reshaped array are positions; transpose to layer-major
    grid = flat.reshape(n, codebook_num).T
    print(grid.shape)

    # prepend two singleton axes
    batched = grid[np.newaxis, np.newaxis, ...]
    print(batched.shape)

    codes = torch.tensor(batched).long().to(config.device)
    print(codes.shape)
    return codes


def decode_music(content):
    """
    Decode codec codes back into a waveform with the global `encodec_model`.

    Args:
        content: code tensor as produced by `content2rvq_codes`.

    Returns:
        CPU tensor: the decoded audio with its leading axis squeezed.

    Notes:
        * ``decode`` may return an output container rather than a bare tensor;
          indexing ``[0]`` yields the audio tensor for both the tuple and the
          container form, so ``.shape`` is read from the tensor, not from the
          container.
        * The codes are moved to the codec's device first, because
          `content2rvq_codes` places them on `config.device` while the codec
          is never moved in the visible cells.
    """
    decoded = encodec_model.decode(content.to(encodec_model.device), [None])
    audio_values = decoded[0]
    print(f'decoded audio = {audio_values.shape}')
    return audio_values.squeeze(0).detach().cpu()

In [7]:
# dataset preparation
# Streams the "train" split of `data_configs.dataset_id` and resamples the
# "audio" column to 32 kHz. (An earlier comment here named
# lewtun/music_genres; that id is only stored as data_configs.mini_dataset_id.)
music_data = load_dataset(
    data_configs.dataset_id,
    split="train",
    streaming=True,
    trust_remote_code=True,
).cast_column("audio", Audio(sampling_rate=32000))

# Snapshot of the feature schema before mapping; only used by the commented-out
# `features=` argument below.
data_features = music_data.features.copy()

# Collapse each record's list of tags into one space-separated string.
music_data = music_data.map(
    lambda r: {"tags": " ".join(r["tags"])}#, features=Features(data_features)
)

# Limit the stream to its first 4000 records.
music_data = music_data.take(4000)


music_data

README.md:   0%|          | 0.00/4.11k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/1352 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/123 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/1352 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/123 [00:00<?, ?it/s]

IterableDataset({
    features: Unknown,
    n_shards: 1352
})

In [8]:
# Audio encoder, Facebook EnCodec 32 kHz
encodec_model = EncodecModel.from_pretrained(model_configs.encodec_id)

audio_processor = AutoProcessor.from_pretrained(
    model_configs.encodec_id
)  # preprocessor for neural audio codec

# Switch the codec to evaluation mode. This alone does not disable gradient
# tracking; `encode_music` wraps the encoder call in `torch.no_grad()` for that.
encodec_model=encodec_model.eval()
type(encodec_model)

config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/236M [00:00<?, ?B/s]

  WeightNorm.apply(module, name, dim)
  self.register_buffer("padding_total", torch.tensor(kernel_size - stride, dtype=torch.int64), persistent=False)


preprocessor_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

transformers.models.encodec.modeling_encodec.EncodecModel

In [9]:
class MusicData(IterableDataset):
    """
    Iterable dataset yielding tokenized "<tags> - <somu>…<eomu>" training rows.

    Each source record's audio is encoded with the global EnCodec model,
    serialised to a token string, appended to the record's tag text and
    tokenized to a fixed length of 1024.

    NOTE(review): `__len__` reports `data_configs.split`, which is independent
    of how many records the wrapped dataset actually yields — confirm intended.
    """

    def __init__(self, tokenizer, dataset=music_data):
        self.tokenizer = tokenizer
        self.dataset = dataset

    def __len__(self):
        return data_configs.split

    def __iter__(self):
        for sample in self.dataset:
            # encode_music returns (codes, masks); only the codes are serialised
            codes_and_mask = encode_music(
                sample["audio"]["array"],
                encodec_model=encodec_model,
                audio_processor=audio_processor,
            )
            audio_string = tokens2string(codes_and_mask[0])
            data_string = sample["tags"] + audio_string

            encoded = self.tokenizer(
                data_string,
                return_tensors='pt',
                truncation=True,
                padding='max_length',
                max_length=1024,
            )

            yield {
                "input_ids": encoded["input_ids"],
                "attention_mask": encoded["attention_mask"],
            }

In [10]:
from transformers import LlamaTokenizer

# Text tokenizer; extended with the music vocabulary in a later cell.
tokenizer = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")

# LLM tokenizer
# tokenizer = AutoTokenizer.from_pretrained(
# #     model_configs.llama_id,
# )
# # tokenizer = prepare_tokenizer(tokenizer)

# Llama model architecture, for initial experiments
llama_config = LlamaConfig(
    num_attention_heads=16,
    num_hidden_layers=8,
    num_key_value_heads=4,
    hidden_size=1024,
    intermediate_size=4096,
#     head_dim=32,
)

# Randomly initialised base `LlamaModel` built from the config above (no
# pretrained weights are loaded here). An `lm_head` is attached by hand in a
# later cell.
# NOTE(review): `bird_call` calls `model.generate`, which normally needs a
# causal-LM class such as the imported `LlamaForCausalLM` — confirm.
tiny_llama = LlamaModel(config=llama_config)
tiny_llama.config

tokenizer_config.json:   0%|          | 0.00/1.54k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


LlamaConfig {
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_act": "silu",
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "max_position_embeddings": 2048,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 16,
  "num_hidden_layers": 8,
  "num_key_value_heads": 4,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-06,
  "rope_scaling": null,
  "rope_theta": 10000.0,
  "tie_word_embeddings": false,
  "transformers_version": "4.44.2",
  "use_cache": true,
  "vocab_size": 32000
}

In [12]:
# Start/end-of-music tags to register alongside the "<🎶N>" code tokens.
tokens = list(music_tokens.values())

# Extend the tokenizer, then grow the model's embedding table to the new vocabulary size.
tokenizer = prepare_tokenizer(tokenizer, tokens)
tiny_llama.resize_token_embeddings(len(tokenizer))
# Attach an output projection (hidden size -> vocabulary); `trainer` applies it
# to the model's hidden states via `model.lm_head(...)`.
tiny_llama.lm_head = nn.Linear(llama_config.hidden_size, len(tokenizer), bias=False)
type(tiny_llama), tiny_llama.config

(transformers.models.llama.modeling_llama.LlamaModel,
 LlamaConfig {
   "attention_bias": false,
   "attention_dropout": 0.0,
   "bos_token_id": 1,
   "eos_token_id": 2,
   "hidden_act": "silu",
   "hidden_size": 1024,
   "initializer_range": 0.02,
   "intermediate_size": 4096,
   "max_position_embeddings": 2048,
   "mlp_bias": false,
   "model_type": "llama",
   "num_attention_heads": 16,
   "num_hidden_layers": 8,
   "num_key_value_heads": 4,
   "pretraining_tp": 1,
   "rms_norm_eps": 1e-06,
   "rope_scaling": null,
   "rope_theta": 10000.0,
   "tie_word_embeddings": false,
   "transformers_version": "4.44.2",
   "use_cache": true,
   "vocab_size": 40195
 })

In [14]:
# Wrap the streaming dataset and batch it for training.
dset = MusicData(tokenizer)
mini_train_loader = DataLoader(dataset=dset, batch_size=data_configs.batch_size)

# Pull one batch as a smoke test of the whole encode -> tokenize pipeline.
x_sample = next(iter(mini_train_loader))
# x_sample/

audio_codes.shape =torch.Size([1, 1, 4, 250])
audio_codes.shape =torch.Size([1, 1, 4, 250])
audio_codes.shape =torch.Size([1, 1, 4, 250])
audio_codes.shape =torch.Size([1, 1, 4, 250])


In [16]:
# training definitions
model = tiny_llama

loss_fn = nn.CrossEntropyLoss(reduction="none", ignore_index=tokenizer.pad_token_id)  # loss function
optimizer = optim.AdamW(model.parameters(), lr=train_configs.lr)
scheduler = lr_scheduler.CosineAnnealingWarmRestarts(
    optimizer,
    T_0=1000,  # restart every 1000 steps
    T_mult=1
)

scaler = GradScaler()

# configure accelerate
# accelerator = Accelerator()
# model, mini_train_loader, optimizer, scheduler = accelerator.prepare(
#     # cofnigure modules for training
#     model,
#     mini_train_loader,
#     optimizer,
#     scheduler,
# )

  scaler = GradScaler()


In [17]:
# Sampling settings handed to `model.generate` by `bird_call`.
gen_configs = {
    "temperature": 1,
    "top_p": 1.0,
    "top_k": 200,
    "do_sample": True,
    "max_new_tokens": 2000,
    "min_new_tokens": 10,
    "repetition_penalty": 1.15,
}


def _postprocess(input):
    """
    Convert generated text back into an audio waveform.

    Steps: extract the span between the music tags, parse it into codec
    codes, decode the codes, then normalise the result to a 2-D
    ``(channels, samples)`` CPU tensor, the layout `torchaudio.save` expects.
    The previous extra ``[0].squeeze(0)`` collapsed the waveform to 1-D.

    Args:
        input: decoded model output string.

    Returns:
        2-D CPU tensor of audio samples.
    """
    extract = extractor2(input)
    print('extract')
    print(extract)
    reconstruct_codes = content2rvq_codes(extract)
    print(f'recoded {reconstruct_codes.shape}')
    waveform = decode_music(reconstruct_codes).detach().cpu()

    # drop any leading batch-like axes, but never go below (channels, samples)
    while waveform.dim() > 2:
        waveform = waveform[0]
    if waveform.dim() < 2:
        waveform = waveform.reshape(1, -1)

    return waveform


@torch.no_grad()
def bird_call(
    prompt, model, tokenizer
):  # prompt might be just a class/single word/description for v1
    """
    Generate audio for a text prompt.

    The prompt is tokenized, continued by `model.generate` using the sampling
    settings in `gen_configs`, decoded back to text and post-processed into a
    waveform.

    Fixes over the previous version:
      * the tokenizer is called directly so both ``input_ids`` and
        ``attention_mask`` are available (``tokenizer.encode`` returns a bare
        tensor with no ``.input_ids`` attribute);
      * the encoded inputs (a mapping) are unpacked with ``**`` instead of a
        tensor;
      * the sampling settings are passed as keyword arguments rather than as a
        plain dict in ``generation_config``;
      * the result of ``generate`` is accepted both as a bare tensor and as an
        output object exposing ``.sequences``.

    NOTE(review): `generate` needs a model with a language-modelling head; the
    notebook builds a bare `LlamaModel` — confirm it can generate.

    Args:
        prompt: text prompt.
        model: model exposing ``generate``.
        tokenizer: tokenizer matching `model`.

    Returns:
        Waveform tensor produced by `_postprocess`.
    """
    encoded = tokenizer(prompt, return_tensors="pt").to(config.device)
    gen_tokens = model.generate(**encoded, **gen_configs)
    sequences = getattr(gen_tokens, "sequences", gen_tokens)
    tokens = tokenizer.batch_decode(sequences.cpu(), skip_special_tokens=True)

    output = _postprocess(tokens[0])
    print(f'postprocessed: {output}')
    return output

In [34]:
# model.set_output_embedding(model.lm_head)

model

LlamaModel(
  (embed_tokens): Embedding(40195, 1024)
  (layers): ModuleList(
    (0-7): 8 x LlamaDecoderLayer(
      (self_attn): LlamaSdpaAttention(
        (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
        (k_proj): Linear(in_features=1024, out_features=256, bias=False)
        (v_proj): Linear(in_features=1024, out_features=256, bias=False)
        (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
        (rotary_emb): LlamaRotaryEmbedding()
      )
      (mlp): LlamaMLP(
        (gate_proj): Linear(in_features=1024, out_features=4096, bias=False)
        (up_proj): Linear(in_features=1024, out_features=4096, bias=False)
        (down_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
      (post_attention_layernorm): LlamaRMSNorm((1024,), eps=1e-06)
    )
  )
  (norm): LlamaRMSNorm((1024,), eps=1e-06)
  (rotary_emb): LlamaRotaryEmbedding()
 

In [19]:
def count_params(model: nn.Module):
    """Return how many scalar parameters of `model` require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total


print(f"model parameters (training) = {count_params(model)}")

def clearmem():
    """
    Release unused host and CUDA memory.

    Garbage collection runs first so tensors held only by unreachable objects
    are freed before ``torch.cuda.empty_cache()`` releases unoccupied cached
    blocks; the reverse order leaves those blocks cached.
    """
    gc.collect()
    torch.cuda.empty_cache()

def logger(model) -> None:
    """Log in to Weights & Biases with `wkey`, start the run and watch `model`."""
    wandb.login(key=wkey)
    wandb.init(project="kaminari_v1", name="audiogen-1-sandbox-8k")
    wandb.watch(model)

# Start experiment tracking for the model defined above.
logger(model)


@torch.no_grad()
def epoch_sample(model: LlamaModel = model, prompt_class="classical"):
    """
    Generate one audio sample and write it to ``config.outpath`` as a .wav.

    Fixes over the previous version:
      * the prompt passed to `bird_call` is `prompt_class` (the old code used
        an undefined name `prompt`);
      * the output directory is created if missing;
      * the full path of the saved file is returned, so callers can open it
        (previously only the bare filename was returned although the file is
        written inside ``config.outpath``).

    Args:
        model: model used for generation (default bound at definition time).
        prompt_class: text prompt for generation.

    Returns:
        Path of the written .wav file.
    """
    sample_tokens = bird_call(prompt_class, model, tokenizer)

    os.makedirs(config.outpath, exist_ok=True)
    filename = datetime.datetime.now().strftime("%m%d_%H%M%S") + ".wav"
    file_name = os.path.join(config.outpath, filename)

    torchaudio.save(file_name, sample_tokens, data_configs.sample_rate)
    print("saved: ", file_name)

    return file_name


model parameters (training) = 203971584


In [20]:
clearmem()

In [23]:
# NOTE(review): these two lines bind ordinary Python variables; they do not
# write to os.environ. If the intent was to set the environment variables of
# the same names, that is not what happens here — confirm.
CUDA_LAUNCH_BLOCKING=1
TORCH_USE_CUDA_DSA=True

# Flip TorchDynamo's `suppress_errors` flag; no `torch.compile` call appears in
# the visible cells, so its effect here is presumably nil — verify.
import torch._dynamo
torch._dynamo.config.suppress_errors = True

In [24]:
# model=model.to(train_configs.device)

In [25]:
# xc = torch.randn(4, 1, 1024).to(config.device)

# out = model(input_ids=x_sample['input_ids'].long().squeeze().to(train_configs.device))
# out

In [26]:
# Sanity check: vocabulary size recorded in the model config.
print(f"Vocabulary size: {model.config.vocab_size}")

Vocabulary size: 40195


In [35]:
# Re-sync the embedding table and config with the tokenizer. The embeddings
# were already resized when the tokenizer was extended, so this repeats that step.
model.resize_token_embeddings(len(tokenizer))
model.config.vocab_size = len(tokenizer)

len(tokenizer)

40195

In [28]:
tokenizer.pad_token_id

40194

In [37]:
# Report the three vocabulary sizes that must agree: tokenizer, embedding table, config.
print(f"Tokenizer vocabulary size: {len(tokenizer)}")
print(f"Model embedding size: {model.embed_tokens.num_embeddings}")
print(f"Model config vocab size: {model.config.vocab_size}")

Tokenizer vocabulary size: 40195
Model embedding size: 40195
Model config vocab size: 40195


In [29]:
# Fail fast if the tokenizer and the model config disagree on vocabulary size.
print(f"Tokenizer vocabulary size: {len(tokenizer)}")
print(f"Model vocabulary size: {model.config.vocab_size}")
assert len(tokenizer) == model.config.vocab_size, "Tokenizer and model vocabulary sizes should match"

Tokenizer vocabulary size: 40195
Model vocabulary size: 40195


In [54]:
def trainer(
    model=model, train_loader=mini_train_loader, epoch_count=train_configs.epochs
):
    """
    Train `model` on `train_loader` with fp16 autocast and gradient accumulation.

    Each micro-batch does a forward pass, a next-token cross-entropy loss
    averaged over attended tokens, and a scaled backward pass. The optimizer
    steps every ``train_configs.grad_steps`` micro-batches. Every 5 steps the
    loss is logged to W&B; every 100 steps a checkpoint is written and an
    audio sample is generated and logged (failures there are printed, not
    raised). The final weights are saved as safetensors and as a state dict.

    Fixes over the previous version:
      * gradients are no longer zeroed at the start of every micro-batch,
        which discarded everything accumulated since the last optimizer step
        and made ``grad_steps`` ineffective;
      * the singleton axis is removed with ``squeeze(1)`` instead of a bare
        ``squeeze()``, so a batch of size 1 keeps its batch axis.

    NOTE(review): `scheduler.step()` runs once per epoch while the scheduler
    was created with ``T_0=1000``; confirm whether per-optimizer-step
    scheduling was intended.

    Args:
        model: model to train; must expose ``lm_head``. Default bound at
            definition time.
        train_loader: iterable of dicts with "input_ids" and "attention_mask".
        epoch_count: number of passes over `train_loader`.
    """
    model.train()
    model.to(config.device)

    train_loss = 0.0
    # training loop
    for epoch in tqdm(range(epoch_count)):
        print(f'training for epoch {epoch+1}')
        start_time = time()
        optimizer.zero_grad()  # start every epoch with empty gradient buffers

        for step, batch in tqdm(enumerate(train_loader)):
            # Gradients are NOT cleared here: they accumulate across
            # `grad_steps` micro-batches and are reset after each optimizer step.
            input_tokens = batch["input_ids"].to(config.device)
            attn_mask = batch["attention_mask"].to(config.device)

            assert input_tokens.max() < model.config.vocab_size, f"Input contains token ID {input_tokens.max().item()} which is >= vocab size {model.config.vocab_size}"
            # Mixed precision training
            with torch.autocast(device_type="cuda", dtype=torch.float16):
                # squeeze(1) drops only the per-sample singleton axis, never the batch axis
                outputs = model(
                    input_ids=input_tokens.long().squeeze(1),
                    attention_mask=attn_mask.long().squeeze(1),
                )[0]
                outputs = model.lm_head(outputs)

                # clear memory
                clearmem()

                # slice tensors, due to 'next-token prediction' objective
                # all except last token
                output_tensor = outputs[..., :-1, :].contiguous()
                # all except the first token
                targets = input_tokens[..., 1:].contiguous()
                shift_mask = attn_mask[..., 1:].contiguous()

                model_output = output_tensor.view(-1, output_tensor.size(-1))
                targets = targets.view(-1)

                # per-token loss (pad positions contribute zero via ignore_index)
                step_loss = loss_fn(model_output, targets)
                clearmem()

                # average over attended tokens only
                total_tokens = shift_mask.sum()
                step_loss = step_loss.sum() / (total_tokens + 1e-8)

                # Scale loss by accumulation steps
                train_loss = step_loss / train_configs.grad_steps  # Normalize the loss

                print(f"step {step}: loss {step_loss:.4f}")
                clearmem()

            # Scales loss. Calls backward() on scaled loss to create scaled gradients.
            scaler.scale(train_loss).backward()

            clearmem()

            if (step + 1) % train_configs.grad_steps == 0:
                # Unscales the gradients of optimizer's assigned params in-place
                scaler.step(optimizer)
                # Updates the scale for next iteration
                scaler.update()
                optimizer.zero_grad()

            if step % 5 == 0:
                wandb.log({"train_loss": train_loss})

            if (step % 100) == 0:
                checkpoint = {
                    "epoch": epoch,
                    "model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict(),
                    "scheduler_state": scheduler.state_dict(),
                    "loss": train_loss,
                }

                # save checkpoint
                torch.save(checkpoint, f"kaminari_mini_check_{epoch}.pth")
                # log audio sample to WandB
                try:
                    test_sample_file = epoch_sample(model)
                    wandb.log(
                        {
                            "audio_sample": wandb.Audio(
                                test_sample_file,
                                caption=f"test_audio_track_{step}",
                                sample_rate=data_configs.sample_rate,
                            )
                        }
                    )
                except Exception as e:
                    # sampling is best-effort; never let it abort training
                    print(f'error logging sample: {e}')

        scheduler.step()

        gc.collect()
        epoch_time = time() - start_time

        print(f"Epoch {epoch} of {epoch_count}, train_loss: {train_loss:.4f}")

        print(f"Epoch @ {epoch} complete in {epoch_time}!")

    print(f"End metrics for run of {epoch_count}, train_loss: {train_loss:.4f}")

    save_model(model, train_configs.sft_file)  # save to .safetensors file

    torch.save(model.state_dict(), f"{train_configs.model_file}")

trainer()

  0%|          | 0/2 [00:00<?, ?it/s]

training for epoch 1


0it [00:00, ?it/s]

audio_codes.shape =torch.Size([1, 1, 4, 250])
audio_codes.shape =torch.Size([1, 1, 4, 250])
audio_codes.shape =torch.Size([1, 1, 4, 250])
audio_codes.shape =torch.Size([1, 1, 4, 250])
step 0: loss 10.696185111999512
audio_codes.shape =torch.Size([1, 1, 4, 250])
audio_codes.shape =torch.Size([1, 1, 4, 250])
audio_codes.shape =torch.Size([1, 1, 4, 250])
audio_codes.shape =torch.Size([1, 1, 4, 250])
step 1: loss 10.750092506408691
audio_codes.shape =torch.Size([1, 1, 4, 250])
audio_codes.shape =torch.Size([1, 1, 4, 250])
audio_codes.shape =torch.Size([1, 1, 4, 250])
audio_codes.shape =torch.Size([1, 1, 4, 250])
step 2: loss 10.599929809570312
audio_codes.shape =torch.Size([1, 1, 4, 250])
audio_codes.shape =torch.Size([1, 1, 4, 250])
audio_codes.shape =torch.Size([1, 1, 4, 250])
audio_codes.shape =torch.Size([1, 1, 4, 250])
step 3: loss 10.716127395629883
audio_codes.shape =torch.Size([1, 1, 4, 250])
audio_codes.shape =torch.Size([1, 1, 4, 250])
audio_codes.shape =torch.Size([1, 1, 4, 250]

KeyboardInterrupt: 

In [None]:

# def trainer_wrapper(train_function=):
#     train_function()


# notebook_launcher(trainer_wrapper, num_processes=2)

In [51]:
clearmem()