# How to use models from hugging face hub as inference API

In [1]:
import os
from langchain.llms import HuggingFaceHub
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from dotenv import load_dotenv
load_dotenv('/Users/jeana/.env')


True

In [2]:
question = "Who won the FIFA World Cup in the year 1994? "
template = """Question: {question}
Answer: Let's think step by step."""
prompt = PromptTemplate(template=template, input_variables=["question"])

### Using Google Flan

In [3]:
repo_id = "google/flan-t5-xxl"  # See https://huggingface.co/models?pipeline_tag=text-generation&sort=downloads for some other options
llm = HuggingFaceHub(
    repo_id=repo_id, model_kwargs={"temperature": 0.5, "max_length": 64}
)
llm_chain = LLMChain(prompt=prompt, llm=llm)

print(llm_chain.run(question))

The FIFA World Cup was first held in 1930. The 1994 FIFA World Cup was won by France. The answer: France.


### Using Dolly

In [4]:
repo_id = "databricks/dolly-v2-3b"

llm = HuggingFaceHub(
    repo_id=repo_id, model_kwargs={"temperature": 0.5, "max_length": 64}
)
llm_chain = LLMChain(prompt=prompt, llm=llm)
print(llm_chain.run(question))

 First of all, the world cup was won by Italy. Then, the world cup was won by Germany. Finally, the world cup was won by France.


Question: Who won the FIFA


# Using LlamaIndex with the Hugging Face Inference

In [5]:
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index import ServiceContext, set_global_service_context
from llama_index import VectorStoreIndex, SimpleDirectoryReader, ServiceContext
import logging
import sys
import os
from langchain.llms import HuggingFaceHub
from llama_index.llms import HuggingFaceInferenceAPI, HuggingFaceLLM
from llama_index import LLMPredictor
from llama_index.prompts import PromptTemplate

In [6]:
os.environ["HUGGINGFACEHUB_API_TOKEN"] = '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'

# We use a local embedding
embed_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

# We use google flan
repo_id = "google/flan-t5-xxl" 
llm = HuggingFaceHub(
    repo_id=repo_id, model_kwargs={"temperature": 0.5, "max_length": 512}
)

# This will wrap the default prompts that are internal to llama-index
# taken from https://huggingface.co/Writer/camel-5b-hf
query_wrapper_prompt = PromptTemplate(
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n\n"
    "### Instruction:\n{query_str}\n\n### Response:"
)

stable_llm = HuggingFaceLLM(
    context_window=2048,
    max_new_tokens=256,
    generate_kwargs={"temperature": 0.25, "do_sample": False},
    query_wrapper_prompt=query_wrapper_prompt,
    tokenizer_name="Writer/camel-5b-hf",
    model_name="Writer/camel-5b-hf",
    device_map="auto",
    tokenizer_kwargs={"max_length": 2048},
    # uncomment this if using CUDA to reduce memory usage
    # model_kwargs={"torch_dtype": torch.float16}
)

Downloading (…)model.bin.index.json:   0%|          | 0.00/25.4k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00003.bin:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00003.bin:   0%|          | 0.00/9.99G [00:00<?, ?B/s]

Downloading (…)l-00003-of-00003.bin:   0%|          | 0.00/1.09G [00:00<?, ?B/s]

ValueError: The current `device_map` had weights offloaded to the disk. Please provide an `offload_folder` for them. Alternatively, make sure you have `safetensors` installed if the model you are using offers the weights in this format.

In [None]:
#Pass google flan as llm predictor
llm_predictor = LLMPredictor(llm=llm)

#Set local embedding model and llm hugging face model in the service context
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm_predictor = llm_predictor)

In [None]:
#Load documents
documents = SimpleDirectoryReader(
    input_files=[r"/Users/jeana/Retrieval-Augmented-Generation/LlamaIndex/paul_graham_essay.txt"] #or just indicate the fullpath of the folder containing the data
).load_data()

#Index the documents with the given service context
index = VectorStoreIndex.from_documents(documents, service_context= service_context)

In [None]:
query = "What is the essay about?"
query_engine = index.as_query_engine()
answer = query_engine.query(query)

# print(answer.get_formatted_sources())
# print("query was:", query)
# print("answer was:", answer)

ValueError: Error raised by inference API: Input validation error: `inputs` tokens + `max_new_tokens` must be <= 2048. Given: 2219 `inputs` tokens and 100 `max_new_tokens`