# Llama-2-Chat Model from Hugging Face

In [2]:
# Enable autoreload so edits to imported source modules are picked up live,
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Hugging Face imports:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)

# LangChain imports:
from langchain.chat_models import ChatLlama2Hf
from langchain.schema import AIMessage, HumanMessage, SystemMessage

This notebook assumes that you have been granted access to the Llama 2 models on the Hugging Face model hub. To use the model locally, you need to be [logged in](https://huggingface.co/docs/huggingface_hub/quick-start#login) with a Hugging Face account.

In [4]:
# Log in to the Hugging Face Hub; the Llama 2 checkpoints loaded below require
# an account that has been granted access.
# NOTE(review): this import sits outside the main import cell — consider moving it there.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [5]:
# Hub identifier of the Llama 2 chat checkpoint from which both the tokenizer
# and the model are loaded.
model_name = "meta-llama/Llama-2-7b-chat-hf"

In [6]:
# Quantization settings used when the model is loaded.
# The options are gathered in a plain mapping (keys are the BitsAndBytesConfig
# keyword names) and then unpacked into the config object.
quantization_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": torch.bfloat16,
}
bnb_config = BitsAndBytesConfig(**quantization_options)

In [7]:
# Tokenizer and model are both resolved from the same checkpoint (`model_name`).
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Load the causal LM with the quantization config defined above; device
# placement is delegated via `device_map="auto"`.
model_4bit = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
# Build the text-generation pipeline around the already-loaded model.
# `model_4bit` was instantiated by `from_pretrained` with
# `quantization_config=bnb_config` and `device_map="auto"`, so model-loading
# options are not repeated here: `torch_dtype` / `device_map` only matter when
# the pipeline has to load a checkpoint itself, and the previous
# `torch_dtype=torch.float16` contradicted the bfloat16 compute dtype set in
# `bnb_config`.
pipe = pipeline(
    "text-generation",
    model=model_4bit,
    tokenizer=tokenizer,
)

In [9]:
# Wrap the text-generation pipeline in the LangChain chat-model class so it can
# be called with System/Human message objects.
chat = ChatLlama2Hf(pipeline=pipe)

In [10]:
# Generation kwargs:
# Sampling settings shared by the single-call examples, which unpack this
# mapping into the chat call as keyword arguments.
# NOTE(review): if these are forwarded to `generate`, `max_length` presumably
# counts the prompt tokens as well — consider `max_new_tokens`; confirm.
pipeline_kwargs = dict(
    do_sample=True,
    top_p=0.95,
    temperature=0.7,
    eos_token_id=tokenizer.eos_token_id,
    max_length=256,
)

### Single calls:

In [11]:
# One conversation: a system instruction followed by a single user turn.
messages = [
    SystemMessage(content="You are a helpful assistant that translates English to French."),
    HumanMessage(content="Translate this sentence from English to French. I love programming."),
]
# Call the chat model directly; the generation settings are passed as keyword
# arguments. Kept as the last expression so the reply is displayed.
chat(messages, **pipeline_kwargs)

AIMessage(content=' Sure, I\'d be happy to help! The French translation of "I love programming" is "J\'aime le programming."', additional_kwargs={}, example=False)

### Single calls with stop words

In [12]:
# One conversation again, this time called with a `stop` list.
messages = [
    SystemMessage(content="You are a helpful assistant."),
    HumanMessage(content="Tell me the history of AI."),
]
# In the recorded output the reply ends at the stop string "Artificial".
chat(messages, stop=["Artificial"], **pipeline_kwargs)

AIMessage(content=" Of course, I'd be happy to help! The history of Artificial", additional_kwargs={}, example=False)

### Batch calls:

In [13]:
# Every conversation in the batch uses the same system instruction and differs
# only in the user turn, so the batch is built from the list of user texts.
translator_system_prompt = "You are a helpful assistant that translates English to French."
batch_messages = [
    [
        SystemMessage(content=translator_system_prompt),
        HumanMessage(content=user_text),
    ]
    for user_text in ("I love programming.", "I love artificial intelligence.")
]
# `generate` takes the list of conversations; no generation kwargs are passed
# here, unlike the single-call examples.
result = chat.generate(batch_messages)
result

LLMResult(generations=[[ChatGeneration(text=' Great! "Programmation" is the French word for "programming".\n\nSo, you love programmation? (You love programming)', generation_info=None, message=AIMessage(content=' Great! "Programmation" is the French word for "programming".\n\nSo, you love programmation? (You love programming)', additional_kwargs={}, example=False))], [ChatGeneration(text=" Bonjour! Je suis heureux d'être votre assistante de traduction pour les phrases en anglais et en français.\nMerci de me dire que vous aimez l'intelligence artificielle. C'est un sujet très intéressant et en constante évolution. Les avancées dans le domaine de l'IA ont des applications nombreuses dans différents secteurs, tels que la robotique, les systems embarqués, les réseaux de communication, les applications mobiles, les sistèmes de recommendation, les centres de données, etc.\nVous pouvez me poser des questions ou des questions sur l'IA, et je ferai de mon mieux pour vous fournir des information