In [1]:
import os
from langchain_huggingface import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain.prompts import PromptTemplate
from langchain.schema.output_parser import StrOutputParser
import torch
from dotenv import load_dotenv

# Load variables from a .env file into the environment, then read the
# Hugging Face token. HF_TOKEN is None when the variable is not set.
load_dotenv()
HF_TOKEN = os.environ.get("HF_TOKEN")

# Device index: 0 when CUDA is available, otherwise -1.
if torch.cuda.is_available():
    device = 0
else:
    device = -1

# Directory where the model and tokenizer are saved.
# Raw string on purpose: in a normal string literal "\L" and "\m" are invalid
# escape sequences (SyntaxWarning on Python 3.12+); the raw form keeps the
# backslashes literally and yields the same path value.
saved_model_directory = r"F:\LLM\model"  # Replace this with your model's saved path

# Load the tokenizer from the saved directory
tokenizer = AutoTokenizer.from_pretrained(saved_model_directory)

# Some causal LM tokenizers ship without a padding token; reuse the
# end-of-sequence token so a pad token/id is always defined.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.pad_token_id = tokenizer.eos_token_id

# Evaluate CUDA availability once so dtype and device placement stay consistent.
cuda_available = torch.cuda.is_available()

# Load the model from the saved directory.
# Half precision is only requested when a GPU is present; on a CPU-only
# machine float32 is used instead, because float16 CPU kernels are missing
# or very slow for many operations.
model = AutoModelForCausalLM.from_pretrained(
    saved_model_directory,
    torch_dtype=torch.float16 if cuda_available else torch.float32,
    low_cpu_mem_usage=True,
    device_map="auto" if cuda_available else None,
)

# Keyword arguments forwarded to the text-generation pipeline, grouped in one
# place so the sampling and length settings are easy to review and adjust.
generation_settings = {
    "max_new_tokens": 500,
    "min_new_tokens": 30,
    "do_sample": True,
    "temperature": 0.7,
    "top_p": 0.9,
    "pad_token_id": tokenizer.pad_token_id,
    "eos_token_id": tokenizer.eos_token_id,
    "truncation": True,
    "padding": True,
}

# Build the text-generation pipeline around the already-loaded model/tokenizer
text_generation_pipeline = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    **generation_settings,
)

# Wrap the transformers pipeline so LangChain can use it as an LLM
llm = HuggingFacePipeline(pipeline=text_generation_pipeline)

# Prompt text with a single {question} placeholder
question_template = """Question: {question}
Answer: Let me provide a detailed response:"""
prompt = PromptTemplate.from_template(question_template)

# Parser that turns the LLM result into a plain string
output_parser = StrOutputParser()

# Compose the steps: fill the prompt -> run the model -> parse to str
chain = prompt | llm | output_parser

# Invoke the chain once and print the result. Any exception raised while
# invoking or printing is reported on stdout instead of propagating.
try:
    question = "My name is Clara and I am"
    chain_input = {"question": question}
    response = chain.invoke(chain_input)
    print(response)
except Exception as exc:
    print(f"Error during inference: {exc!s}")


Some parameters are on the meta device because they were offloaded to the cpu.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


Question: My name is Clara and I am
Answer: Let me provide a detailed response: Clara is a girl's name. It is of Latin origin and means "clear". Clara is derived from the Latin word clara which means "clear". Clara is also a diminutive of the Latin word clarus meaning "clear". Clara is a short form of the names Clarissa, Clarice, and Claribel. Clara is a very popular name in the United States and is used for both girls and boys. Clara is also a very popular name in other countries such as Canada and Spain. Clara is a very popular name in the United States and is used for both girls and boys. Clara is also a very popular name in other countries such as Canada and Spain. Clara is a very popular name in the United States and is used for both girls and boys. Clara is also a very popular name in other countries such as Canada and Spain. Clara is a very popular name in the United States and is used for both girls and boys. Clara is also a very popular name in other countries such as Canada a