In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
from pathlib import Path
from typing import Optional

from mlc_llm.testing.debug_chat import DebugChat
import torch

### MLC LLM basics

In [None]:
# This is how you would normally use MLC LLM with MLCEngine

from mlc_llm import MLCEngine

# Create engine
model = "HF://mlc-ai/Llama-3-8B-Instruct-q4f16_1-MLC"
engine = MLCEngine(model)

try:
    # Run chat completion in OpenAI API.
    for response in engine.chat.completions.create(
        messages=[{"role": "user", "content": "What is the meaning of life?"}],
        model=model,
        stream=True,
    ):
        # Each streamed response carries incremental text in choice.delta.content.
        for choice in response.choices:
            print(choice.delta.content, end="", flush=True)
    print("\n")
finally:
    # Always call terminate(), even if streaming raises or the cell is
    # interrupted part-way through.
    engine.terminate()

### DebugChat API

In [72]:
# Since we want access to logits, we need to hack DebugChat a bit.

# First create a dummy debugging callback to avoid writing a bunch of files
# that are otherwise written by default. This is a partial workaround, as _sample_token_from_logits()
# in debug_chat.py still writes logits to the debug_dir given to generate().

class DummyDebugInstrument:
    """No-op stand-in for a debugging callback.

    The instance remembers the output directory it was constructed with and
    otherwise does nothing: ``reset`` and ``__call__`` ignore every argument
    and return ``None``.
    """

    def __init__(self, debug_out: Path):
        """Store ``debug_out`` on the instance as ``self.debug_out``."""
        self.debug_out = debug_out

    def reset(self, debug_out: Path):
        """Ignore ``debug_out``; ``self.debug_out`` keeps its constructor value."""
        return None

    def __call__(self, func, name, before_run, ret_val, *args):
        """Accept the callback arguments and do nothing with them."""
        return None


In [None]:
# Then DebugChat works just like MLCChat

# Build the cache locations from the current user's home directory instead of
# hardcoding one machine's absolute path.
MLC_CACHE_DIR = Path.home() / ".cache" / "mlc_llm"
LLAMA_DEBUG_DIR = Path("./debug-llama-2")

dc = DebugChat(
    model=str(
        MLC_CACHE_DIR / "model_weights" / "hf" / "mlc-ai" / "Llama-3-8B-Instruct-q4f16_1-MLC"
    ),
    debug_dir=LLAMA_DEBUG_DIR,
    # NOTE(review): this library file name looks machine-generated; confirm it
    # matches the file present in your own model_lib cache folder.
    model_lib=str(MLC_CACHE_DIR / "model_lib" / "b571abb2b761fd7ab22fc51747ece6d7.dylib"),
    # The instrument is given the same directory that DebugChat receives.
    debug_instrument=DummyDebugInstrument(LLAMA_DEBUG_DIR),
)
dc.generate("", 15)

### Let's make music 

We first have to convert the Anticipatory Music Transformer (AMT) model weights into MLC format. As of 6/20, this is how you do it ([following these docs](https://llm.mlc.ai/docs/compilation/convert_weights.html#convert-weights-via-mlc)):

**Note: q0f32 quantization seems to be unsupported by the kernel (see https://github.com/mlc-ai/mlc-llm/issues/2598), so q0f16 is used below.**

**Note: See an example model produced from the procedure below here: https://huggingface.co/caenopy/music-medium-800k-mlc-q0f16/. Similarly, the GPT-2 model used for debugging is here: https://huggingface.co/mlc-ai/gpt2-q0f16-MLC**

0. git lfs install
1. Clone the AMT weights https://huggingface.co/stanford-crfm/music-medium-800k
2. Convert weights with a command like: `mlc_llm convert_weight ./models/music-medium-800k --quantization q0f16 -o ./music-medium-800k-q0f16-06262024`
3. Copy [this tokenizer](https://huggingface.co/mlc-ai/mlc-chat-stanford-crfm-music-medium-800k-q0f32-MLC/blob/main/tokenizer.json) into the output folder created by the `convert_weight` command.
4. Instead of using MLC's config utility, you can just create a file called `mlc-chat-config.json` in the same output folder. Write the following to the file, changing quantization or other vocabulary attributes where appropriate ([ref](https://huggingface.co/mlc-ai/mlc-chat-stanford-crfm-music-medium-800k-q0f32-MLC/blob/main/mlc-chat-config.json)):

```
{
  "model_type": "gpt2",
  "quantization": "q0f16",
  "model_config": {
    "vocab_size": 55028,
    "n_embd": 1024,
    "n_layer": 24,
    "n_head": 16,
    "layer_norm_epsilon": 1e-05,
    "n_inner": 4096,
    "context_window_size": 1024,
    "prefill_chunk_size": 1024,
    "scale_attn_by_inverse_layer_idx": true,
    "tensor_parallel_shards": 1,
    "head_dim": 64,
    "max_batch_size": 80
  },
  "vocab_size": 55028,
  "context_window_size": 1024,
  "sliding_window_size": -1,
  "prefill_chunk_size": 1024,
  "attention_sink_size": -1,
  "tensor_parallel_shards": 1,
  "mean_gen_len": 128,
  "max_gen_len": 512,
  "shift_fill_factor": 0.3,
  "temperature": 1.0,
  "presence_penalty": 0.0,
  "frequency_penalty": 0.0,
  "repetition_penalty": 1.0,
  "top_p": 1.0,
  "conv_template": {
    "name": "LM",
    "system_template": "{system_message}",
    "system_message": "",
    "system_prefix_token_ids": [
      1
    ],
    "add_role_after_system_message": true,
    "roles": {
      "user": "",
      "assistant": ""
    },
    "role_templates": {
      "user": "{user_message}",
      "assistant": "{assistant_message}",
      "tool": "{tool_message}"
    },
    "messages": [],
    "seps": [
      ""
    ],
    "role_content_sep": "",
    "role_empty_sep": "",
    "stop_str": [
      ""
    ],
    "stop_token_ids": [
      2
    ],
    "function_string": "",
    "use_function_calling": false
  },
  "pad_token_id": 0,
  "bos_token_id": 55025,
  "eos_token_id": 55025,
  "tokenizer_files": [
    "tokenizer.json"
  ],
  "version": "0.1.0"
}
```

Finally, generate the model library. Model libraries are platform-specific (cuda, webgpu, metal, etc.), so verify the correct commands [here](https://llm.mlc.ai/docs/compilation/compile_models.html). At the time of writing, this is how you would do it for metal:

5. `mlc_llm compile ./music-medium-800k-q0f16-06262024/mlc-chat-config.json \
    --device metal -o ./music-medium-800k-q0f16-06262024/music-medium-800k-q0f16-metal.so`

In [None]:
# The converted weights and the compiled library sit in the same folder, and
# the debug directory is handed to both DebugChat and the no-op instrument.
amt_model_dir = "./mlc_music_models/music-medium-800k-q0f16-06262024"
amt_debug_dir = Path("./debug-anticipation")

dc_amt = DebugChat(
    model=amt_model_dir,
    debug_dir=amt_debug_dir,
    model_lib=f"{amt_model_dir}/music-medium-800k-q0f16-metal.so",
    debug_instrument=DummyDebugInstrument(amt_debug_dir),
)

# Let's use GPT2 for debugging...

# dc_gpt2 = DebugChat(
#     model="./mlc_music_models/gpt2-q0f16-MLC",
#     debug_dir=Path("./debug-anticipation"),
#     model_lib="./mlc_music_models/gpt2-q0f16-MLC/gpt2-q0f16-metal.so",
#     debug_instrument=DummyDebugInstrument(Path("./debug-anticipation")),
# )

In [None]:
# Custom generate method for music DebugChat

from typing import List
import tvm
import numpy as np

def generate(
    dc: DebugChat,
    input_tokens: List[int],
    generate_length: int,
    temperature: float = 1.0,
    top_p: float = 1.0
):
    """Sample ``generate_length`` tokens from ``dc`` given a prompt of token ids.

    The prompt is embedded and prefilled once, which yields the first sampled
    token; every further token comes from one decode step. For example, a
    generation length of 3 will include 1 prefill step and 2 decode steps.

    Parameters
    ----------
    dc : DebugChat
        The DebugChat object that contains the model used for generation.

    input_tokens : List[int]
        Prompt token ids. Anything ``np.array`` can convert to an int32 array
        is accepted.

    generate_length : int
        How many tokens to generate. Values <= 0 return an empty list without
        running the model.

    temperature : float
        Softmax temperature for sampling. Applied to every sampled token.

    top_p : float
        Nucleus sampling parameter. Applied to every sampled token.

    Returns
    -------
    list
        The sampled tokens in generation order, exactly as returned by
        ``dc._sample_token_from_logits``.
    """
    # Nothing was requested, so skip the prefill entirely.
    if generate_length <= 0:
        return []

    prompt = tvm.nd.array(np.array(input_tokens).astype("int32"), device=dc.device)
    embedding, input_len = dc._embed(prompt)
    logits, kv_caches = dc._prefill(embedding, input_len)
    next_token = dc._sample_token_from_logits(logits, temperature=temperature, top_p=top_p)
    out_tokens = [next_token]

    for _ in range(generate_length - 1):
        logits = dc._decode(next_token, kv_caches)
        # Use the same sampling settings as the prefill step; previously the
        # decode steps silently fell back to the sampler's defaults.
        next_token = dc._sample_token_from_logits(logits, temperature=temperature, top_p=top_p)
        out_tokens.append(next_token)

    return out_tokens


In [None]:
# Finally, here's a stripped down forward method to return logits. 

from typing import List
import tvm
import numpy as np

def debugchat_forward(
    dc: DebugChat,
    input_tokens: List[int],
    kv_caches: Optional[List[tvm.nd.NDArray]] = None
):
    """Run a single forward step on ``dc`` and return the raw logits.

    Parameters
    ----------
    dc : DebugChat
        The DebugChat object that contains the model.

    input_tokens : List[int]
        Either the full prompt (token ids) when ``kv_caches`` is None, or a
        single-element sequence holding the last token when a cache is given.

    kv_caches : Optional[List[tvm.nd.NDArray]]
        Cache returned by a previous call. Pass None (the default) to start a
        new sequence with a prefill over ``input_tokens``.

    Returns
    -------
    logits
        Result of calling ``.numpy()`` on the logits produced by the step.

    kv_caches
        The cache to pass to the next call: the one created by the prefill, or
        the same object that was passed in for a decode step.

    Raises
    ------
    AssertionError
        If ``kv_caches`` is given and ``input_tokens`` does not hold exactly
        one token.
    """
    assert kv_caches is None or len(input_tokens) == 1, (
        "pass exactly one token when kv_caches is provided"
    )

    if kv_caches is None:
        prompt = tvm.nd.array(np.array(input_tokens).astype("int32"), device=dc.device)
        embedding, input_len = dc._embed(prompt)
        logits, kv_caches = dc._prefill(embedding, input_len)
    else:
        # Coerce to a plain int so tensor or numpy scalars are handled the same
        # way as Python ints by the decode step.
        last_token = int(input_tokens[-1])
        logits = dc._decode(last_token, kv_caches)

    return logits.numpy(), kv_caches

In [None]:
# First call: no cache exists yet, so pass None to run a prefill over the
# prompt. Feed the returned kv_caches back in for subsequent single-token steps.
logits, kv_caches = debugchat_forward(dc_amt, torch.tensor([0]), None)

In [None]:
# The prompt is a single control token: AUTOREGRESS (55026) or ANTICIPATE (55027).
start_token = 55026  # AUTOREGRESS
output = generate(dc_amt, torch.tensor([start_token]), 600)

In [None]:
import midi2audio
from IPython.display import Audio
from anticipation.convert import events_to_midi

# MIDI synthesizer, constructed from a SoundFont file path
soundfont_path = '/usr/share/sounds/sf2/FluidR3_GM.sf2'
fs = midi2audio.FluidSynth(soundfont_path)

# the MIDI synthesis script
def synthesize(fs, tokens, midi_path='tmp.mid', wav_path='tmp.wav'):
    """Convert event tokens to MIDI, render the MIDI to audio, return the audio path.

    Parameters
    ----------
    fs
        Synthesizer object providing ``midi_to_audio(midi_path, wav_path)``.

    tokens
        Event tokens passed to ``events_to_midi``.

    midi_path : str
        Where the intermediate MIDI file is saved. Defaults to ``'tmp.mid'``.

    wav_path : str
        Where the rendered audio is written. Defaults to ``'tmp.wav'``.

    Returns
    -------
    str
        ``wav_path``, so the result can be passed straight to an audio player.
    """
    mid = events_to_midi(tokens)
    mid.save(midi_path)
    fs.midi_to_audio(midi_path, wav_path)
    return wav_path

In [None]:
# Render the generated tokens to a WAV file, then display it as the cell output.
wav_path = synthesize(fs, output)
Audio(wav_path)