In [1]:
device = "cuda"

# Quantization settings
quantization_enabled = True
use_4bit = True
bnb_4bit_compute_dtype = "float16"
bnb_4bit_quant_type = "nf4"
use_nested_quant = True

# Model
model_name='mistralai/Mistral-7B-Instruct-v0.1'

In [2]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
print(f"{torch.__version__=}")
print(f"{torch.version.cuda=}")
print(f"{torch.cuda.is_available()=}")
print(f"{torch.cuda.device_count()=}")

if "cuda" in device:
    assert torch.cuda.is_available(), "CUDA is not available"

torch.__version__='2.1.1+cu118'
torch.version.cuda='11.8'
torch.cuda.is_available()=True
torch.cuda.device_count()=1


In [4]:
!nvidia-smi

Wed Dec 13 00:28:46 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:07:00.0 Off |                  N/A |
|  0%   39C    P8              19W / 420W |    119MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [5]:
# Quantization config for bitsandbytes
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

In [6]:
# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, device_map="auto")
tokenizer.pad_token_id = tokenizer.unk_token_id

if quantization_enabled:
    model_kwargs = {"quantization_config": bnb_config}
else:
    model_kwargs = {"torch_dtype": torch.float16}

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    **model_kwargs,
)

Downloading shards: 100%|██████████| 2/2 [00:00<00:00, 11.63it/s]
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.07s/it]


In [7]:
!nvidia-smi

Wed Dec 13 00:28:54 2023       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.129.03             Driver Version: 535.129.03   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3090        Off | 00000000:07:00.0 Off |                  N/A |
|  0%   43C    P2             118W / 420W |   5002MiB / 24576MiB |      8%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [8]:
result = tokenizer.encode("Hello world")

# Note how the BOS token is automatically added
assert result[0] == tokenizer.bos_token_id

# However, the EOS token is not automatically added
assert result[-1] != tokenizer.eos_token_id

# Consequently, if we want an EOS token, we need to add it ourselves
result = tokenizer.encode("Hello world</s>")
assert result[-1] == tokenizer.eos_token_id

tokenizer.encode("Hello world</s>")

[1, 22557, 1526, 2]

In [9]:
# Mistral Instruct encloses instructions in [INST] and [/INST] tags, but they
# are not special tokens in the tokenizer
assert tokenizer.convert_tokens_to_ids(["[INST]"])[0] == tokenizer.unk_token_id

In [10]:
def print_tokens_with_scores(tokens, scores):
    # This prefix used for tokens makes the supposedly monospace font in VS code
    # not be monospaced, so we remove it
    tokens = [token.removeprefix("▁") for token in tokens]

    # It's common to be passed in tokens that are not just for the generated
    # text, but also for the prompt that preceeds it. We only look at the tokens
    # that are for the generated text (the same ones we have scores for).
    tokens = tokens[-len(scores):]

    # We find the max token length so that we know how to space out the output
    # for displaying scores underneath. We also add the length of the score
    # itself, which is always 5 characters (e.g. "99.99")
    lengths = [len(token) for token in tokens]
    max_length = max(lengths + [len("99.99")])

    # Print the tokens
    for token in tokens:
        print(token.ljust(max_length), end=" ")
    print()

    # Print the scores underneath their respective tokens
    for score in scores:
        print(f"{score:.2f}".ljust(max_length), end=" ")
    print()

In [11]:
prompt = tokenizer.encode("[INST] What is the capital of the United States? [/INST]", return_tensors="pt").to(device)
result = model.generate(prompt,  temperature=0.7, max_new_tokens=1000, return_dict_in_generate=True, output_scores=True)
scores = model.compute_transition_scores(result.sequences, result.scores)
print_tokens_with_scores(tokenizer.convert_ids_to_tokens(result.sequences[0]), scores[0])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


The        capital    of         the        United     States     is         Washington ,          D          .          C          .          </s>       
19.03      23.44      21.75      20.16      22.45      25.83      20.12      18.80      20.75      23.73      22.33      25.97      21.20      18.38      


In [12]:
messages = [
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
    {"role": "user", "content": "I'd like to show off how chat templating works!"},
]

tokenizer.apply_chat_template(
    messages,
    tokenize=False
)

"<s>[INST] Hello, how are you? [/INST]I'm doing great. How can I help you today?</s> [INST] I'd like to show off how chat templating works! [/INST]"

In [13]:
prompt_ids = tokenizer.apply_chat_template(
    messages,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model.device)
result = model.generate(prompt_ids, do_sample=True, temperature=0.4, max_new_tokens=1000, return_dict_in_generate=True, output_scores=True, num_return_sequences=3)
result = [r[len(prompt_ids[0]):] for r in result['sequences']]
result = tokenizer.batch_decode(result, skip_special_tokens=True)

print("\n\n---\n\n".join(result))

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Sure, I'd love to see it! Please provide me with the template you'd like to use.

---

Sure, I'd be happy to see how chat templating works. What is chat templating?

---

Sure, I'd be happy to see what you've got!


In [14]:
from transformers import pipeline
from pprint import pprint

messages = [
    {"role": "user", "content": "Hello, how are you?"},
    {"role": "assistant", "content": "I'm doing great. How can I help you today?"},
    {"role": "user", "content": "I'd like to show off how chat templating works!"},
]

# https://huggingface.co/blog/how-to-generate
generate_kwargs = {
    "do_sample": True,
    # "top_k": 0,
    # "top_p": 0.92,
    # "temperature": 0.7,
    "num_return_sequences": 4
}

pipe = pipeline(
    "conversational",
    model=model,
    tokenizer=tokenizer,
    max_new_tokens=1000,
    **generate_kwargs,
)

result = pipe(messages[:], num_return_sequences=4)
pprint(result)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Conversation id: fc44916f-a3a3-4353-9897-275ace24ce46
user: Hello, how are you?
assistant: I'm doing great. How can I help you today?
user: I'd like to show off how chat templating works!
assistant: Sure, I'd be happy to help you showcase chat templating! Can you tell me a bit more about what you would like to do with it?



In [15]:
# Print tokenizer class heirarchy
print(tokenizer.__class__.__mro__)

(<class 'transformers.models.llama.tokenization_llama_fast.LlamaTokenizerFast'>, <class 'transformers.tokenization_utils_fast.PreTrainedTokenizerFast'>, <class 'transformers.tokenization_utils_base.PreTrainedTokenizerBase'>, <class 'transformers.tokenization_utils_base.SpecialTokensMixin'>, <class 'transformers.utils.hub.PushToHubMixin'>, <class 'object'>)


In [21]:
from typing import List, Optional, Any, Dict
from langchain.chat_models.base import SimpleChatModel
from langchain.schema import BaseMessage, AIMessage, ChatGeneration, ChatResult
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.utils import enforce_stop_tokens
from transformers import PreTrainedModel, PreTrainedTokenizerBase


class HuggingFaceChatModel(SimpleChatModel):
    model: PreTrainedModel
    tokenizer: PreTrainedTokenizerBase
    generate_kwargs: Dict[str, Any]

    @property
    def _llm_type(self) -> str:
        return "hf-chat"
    
    def _generate(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> ChatResult:
        sequences = self._call(messages, stop=stop, run_manager=run_manager, **kwargs)
        generations = [ChatGeneration(message=AIMessage(content=sequence)) for sequence in sequences]
        return ChatResult(generations=generations)

    def _call(
        self,
        messages: List[BaseMessage],
        stop: Optional[List[str]] = None,
        run_manager: Optional[CallbackManagerForLLMRun] = None,
        **kwargs: Any,
    ) -> str:
        
        type_to_role = {
            "ai": "assistant",
            "human": "user",
            "system": "system"
        }

        messages_hf = [
            {
                "role": type_to_role[message.type],
                "content": message.content,
            }
            for message in messages
        ]

        input_ids = self.tokenizer.apply_chat_template(
            messages_hf,
            add_generation_prompt=True,
            return_tensors="pt",
        ).to(self.model.device)

        generate_kwargs = {**self.generate_kwargs, **kwargs, "return_dict_in_generate": True}

        results = self.model.generate(input_ids, **generate_kwargs)

        sequences = [s[len(input_ids[0]):] for s in results["sequences"]]
        sequences = self.tokenizer.batch_decode(sequences, skip_special_tokens=True)
        if stop:
            sequences = [enforce_stop_tokens(s, stop) for s in sequences]

        return sequences
            

In [22]:
from langchain.schema import BaseMessage, AIMessage, HumanMessage

messages = [
    HumanMessage(content="Hello, how are you?"),
    AIMessage(content="I'm doing great. How can I help you today?"),
    HumanMessage(content="I'd like to show off how chat templating works!"),
]

chat_model = HuggingFaceChatModel(
    model=model,
    tokenizer=tokenizer,
    generate_kwargs=dict(
        do_sample=True, 
        temperature=0.4, 
        top_k=0,
        # top_p=0.92,
        max_new_tokens=1000,
        output_scores=True, 
        # num_return_sequences=3
    )
)
result = chat_model.invoke(messages)

print(result.content)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Sure, I'd love to see it! What's an example of a chat template you'd like to show off?


In [18]:
import transformers
import langchain
print(f"{transformers.__version__=}")
print(f"{langchain.__version__=}")

transformers.__version__='4.36.0'
langchain.__version__='0.0.349rc2'
