# Llama-2-Chat Model from Hugging Face

In [31]:
# Hugging Face imports:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

# LangChain imports:
from langchain.chat_models import ChatLlama2Hf
from langchain.schema import AIMessage, HumanMessage, SystemMessage

This notebook assumes that you have been granted access to the Llama 2 models on the Hugging Face Hub. To download and use the model locally, you need to be [logged in](https://huggingface.co/docs/huggingface_hub/quick-start#login) with a Hugging Face account.

In [32]:
# Authenticate with the Hugging Face Hub. Access to the Llama 2 models has to
# be granted to the account, so the model download below needs a logged-in user.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [33]:
# Hub repository id used by the tokenizer and model loading cell below.
model_name = "meta-llama/Llama-2-7b-chat-hf"

In [34]:
# Quantization settings handed to `from_pretrained` when the model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights quantized to 4 bits
    bnb_4bit_quant_type="nf4",  # 4-bit data type used for the quantized weights
    bnb_4bit_use_double_quant=True,  # nested ("double") quantization — see the BitsAndBytesConfig docs
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation on the quantized weights
)

In [35]:
# The tokenizer and the quantized weights are both fetched from `model_name`.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_4bit = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [36]:
# Wrap the already-loaded model and tokenizer in a text-generation pipeline.
# `torch_dtype` and `device_map` are deliberately not passed: they only apply
# when `pipeline` loads the model itself from a name. Dtype and device
# placement were already decided when `model_4bit` was loaded, and a float16
# setting here would contradict the bfloat16 compute dtype in `bnb_config`.
pipe = pipeline(
    "text-generation",
    model=model_4bit,
    tokenizer=tokenizer,
)

In [37]:
# Expose the pipeline through the LangChain chat-model class imported above,
# so it can be called with System/Human messages.
chat = ChatLlama2Hf(pipeline=pipe)

In [38]:
# Generation kwargs passed to the single `chat(...)` calls below:
pipeline_kwargs = {
    "do_sample": True,  # sample rather than always picking the most likely token
    "top_p": 0.95,
    "temperature": 0.7,
    "eos_token_id": tokenizer.eos_token_id,  # end-of-sequence id taken from the loaded tokenizer
    "max_length": 256,    # NOTE(review): presumably counts prompt + generated tokens; consider `max_new_tokens` — confirm
}

### Single calls:

In [39]:
# One system turn and one human turn, sent as a single chat call.
messages = [
    SystemMessage(content="You are a helpful assistant that translates English to French."),
    HumanMessage(content="Translate this sentence from English to French. I love programming."),
]
result = chat(messages, **pipeline_kwargs)
print(result.content)

 Sure, I'd be happy to help! Here is the translation of "I love programming" from English to French:
Je aime le programming.

I hope this helps! Let me know if you have any other questions.


### Single calls with stop words

In [40]:
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Tell me the history of AI."),
]
# `stop` lists the stop words for this call. They are presumably matched
# literally against the generated text, so each one must be spelled exactly
# as the model would emit it ("Intelligence", not "Inteligence").
result = chat(messages, stop=["Artificial", "Intelligence"], **pipeline_kwargs)
print(result.content)

 Of course! Artificial


### Batch calls:

In [41]:
# Build one [system, human] conversation per sentence; every conversation
# gets its own copy of the same translation system prompt.
batch_messages = [
    [
        SystemMessage(content="You are a helpful assistant that translates English to French."),
        HumanMessage(content=sentence),
    ]
    for sentence in (
        "I love programming.",
        "I love artificial intelligence.",
    )
]
result = chat.generate(batch_messages)

In [42]:
# Print the text of the first generation for each conversation in the batch.
for index, candidates in enumerate(result.generations):
    first_candidate = candidates[0]
    print(f"Response #{index}:\n{first_candidate.text}", end="\n\n")

Response #0:
 Great! "Programmation" is the French word for "programming". So, "Je adore le programming." (I love programming.)

Response #1:
 Bonjour! Je suis heureux d'être votre assistant de traduction pour l'anglais à français.

You said: "I love artificial intelligence."

In French: "Je suis ravi d'artificial intelligence." (Note: "artificial" should be pronounced "artificial" in French, not "ar-ti-fi-cial")

