In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import os

# Set up the environment variable for HuggingFace and initialize the desired model.
HF_TOKEN = os.getenv("HUGGINGFACEHUB_API_TOKEN")

conversation_model_name =  "meta-llama/Llama-3.1-8B-Instruct" # "meta-llama/Meta-Llama-3-8B" # "microsoft/DialoGPT-medium" # "bigscience/bloom" # 
conversation_tokenizer = AutoTokenizer.from_pretrained(conversation_model_name, token=HF_TOKEN)
conversation_model = AutoModelForCausalLM.from_pretrained(conversation_model_name, token=HF_TOKEN)

  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 4/4 [02:59<00:00, 44.93s/it]


In [2]:
def get_conversation_response(prompt, history):
    # Encode the input with an attention mask
    input_ids = conversation_tokenizer.encode(prompt + conversation_tokenizer.eos_token, return_tensors="pt")
    attention_mask = torch.ones_like(input_ids)

    # Combine with history if available
    if history is not None:
        bot_input_ids = torch.cat([history, input_ids], dim=-1)
        attention_mask = torch.cat([torch.ones_like(history), attention_mask], dim=-1)
    else:
        bot_input_ids = input_ids

    # Generate the response
    response = conversation_model.generate(
        bot_input_ids, 
        attention_mask=attention_mask, 
        max_length=1000, 
        pad_token_id=conversation_tokenizer.eos_token_id
    )

    # Decode and return the response and history
    return conversation_tokenizer.decode(response[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True), bot_input_ids


In [3]:
get_conversation_response('Hello', None)

('Hello! How can I assist you today?', tensor([[128000,   9906, 128009]]))