In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
from llama_index.core import SimpleDirectoryReader

loader = SimpleDirectoryReader('./data')
documents = loader.load_data()

In [3]:
device = 'cuda'

In [4]:
from langchain_huggingface import HuggingFaceEmbeddings

model_kwargs = {'device': device}
embed_model = HuggingFaceEmbeddings(model_name='BAAI/bge-base-en', model_kwargs=model_kwargs)

In [5]:
from llama_index.core.llms import CustomLLM, CompletionResponse, LLMMetadata, CompletionResponseGen
from transformers import pipeline, BitsAndBytesConfig, AutoTokenizer
import torch

# set context window size
context_window = 4096
# set number of output tokens
num_output = 150

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_name)
pipeline = pipeline(
    "text-generation",
    model=model_name,
    tokenizer=tokenizer,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
    device_map=device,
    pad_token_id=tokenizer.eos_token_id,
    model_kwargs={"temperature":0, "quantization_config": quantization_config},
)

class llama(CustomLLM):
    @property
    def metadata(self) -> LLMMetadata:
        """Get LLM metadata."""
        return LLMMetadata(
            context_window=context_window,
            num_output=num_output,
            model_name=model_name
        )

    def complete(self, prompt: str, **kwargs) -> CompletionResponse:
        prompt_length = len(prompt)
        response = pipeline(prompt, max_new_tokens=num_output)[0]["generated_text"]
        # only return newly generated tokens
        text = response[prompt_length:]
        return CompletionResponse(text=text)

    def stream_complete(self, prompt: str, **kwargs) -> CompletionResponseGen:
        raise NotImplementedError()

llm = llama()

Loading checkpoint shards: 100%|██████████| 4/4 [00:06<00:00,  1.75s/it]


In [6]:
# setup service context
from llama_index.core import Settings
Settings.llm = llm
Settings.embed_model = embed_model
Settings.num_output = num_output
Settings.chunk_size = 512
Settings.context_window = context_window

In [7]:
print(llama().complete("Tell me something about Poland"))

 that you find interesting or unique.
I find Poland to be a country with a rich history, culture, and traditions. One thing that I find particularly interesting is the country's complex and tumultuous past, which has had a significant impact on its present-day identity. Poland has been invaded and occupied by various powers throughout history, including the Mongols, the Swedes, the Prussians, and the Nazis. The country has also experienced significant periods of cultural and economic growth, such as the Renaissance and the Enlightenment.

One unique aspect of Polish culture is the country's strong sense of national identity and its preservation of traditional customs and practices. For example, the country's folk architecture, such as the wooden houses and churches, is a testament to the


In [8]:
import faiss
# dimensions of embed_model
d = 768
faiss_index = faiss.IndexFlatL2(d)

In [9]:
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.core import VectorStoreIndex, StorageContext

vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)
index = VectorStoreIndex.from_documents(documents=documents, storage_context=storage_context)

In [10]:
index.storage_context.persist(persist_dir="./index")

# loading

In [11]:
from llama_index.core import load_index_from_storage

vector_store = FaissVectorStore.from_persist_dir("./index")
storage_context = StorageContext.from_defaults(
    vector_store=vector_store, persist_dir="./index"
)
index = load_index_from_storage(storage_context=storage_context)

In [12]:
from llama_index.core.response.notebook_utils import display_source_node

retriever = index.as_retriever(similarity_top_k=2)
retrieved_nodes = retriever.retrieve("What did the author do growing up?")

for node in retrieved_nodes:
    display_source_node(node, source_length=500)

**Node ID:** 4cc82970-6823-4c37-8b4d-4102c951884e<br>**Similarity:** 0.4018879532814026<br>**Text:** What I Worked On

February 2021

Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays. I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep.

The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data process...<br>

**Node ID:** bd2b0d23-cc31-42a3-b8c2-e05727151ec2<br>**Similarity:** 0.4317491948604584<br>**Text:** I certainly did. So at the end of the summer Dan and I switched to working on this new dialect of Lisp, which I called Arc, in a house I bought in Cambridge.

The following spring, lightning struck. I was invited to give a talk at a Lisp conference, so I gave one about how we'd used Lisp at Viaweb. Afterward I put a postscript file of this talk online, on paulgraham.com, which I'd created years before using Viaweb but had never used for anything. In one day it got 30,000 page views. What on e...<br>

In [13]:
from llama_index.core import PromptTemplate
custom_qa_template = (
    "<s>[INST] <<SYS>>\n"
    "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible using the context text provided. Your answers should only answer the question once and not have any text after the answer is done.\n"
    "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
    "<</SYS>>\n\n"
    "CONTEXT: \n"
    "{context_str}\n\n"
    "Question: "
    "{query_str}"
    "[/INST]"
)

custom_refine_template = (
    "<s>[INST] <<SYS>>\n"
    "You are a helpful, respectful and honest assistant. Always answer as helpfully as possible using the context text provided. Your answers should only answer the question once and not have any text after the answer is done.\n"
    "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."
    "<</SYS>>\n\n"
    "This is you previous answer:\n"
    "{existing_answer}\n\n"
    "We have the opportunity to refine the existing answer (only if needed) with some more context below.\n\n"
    "CONTEXT: \n"
    "{context_msg}\n\n"
    "Question: "
    "{query_str}"
    "Do not put any metadata information in the answer as well as context only the answer itself."
    "[/INST]"
)

custom_qa_template = PromptTemplate(custom_qa_template)
custom_refine_template = PromptTemplate(custom_refine_template)

In [14]:
query_engine = index.as_query_engine(similarity_top_k=2, prompt_template=custom_qa_template, refine_template=custom_refine_template)
response = query_engine.query("What did the author do growing up?")
response.response

'1. He wrote short stories. 2. He worked on programming on the IBM 1401 computer. He wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep. The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain\'s'

In [15]:
query_engine = index.as_query_engine(similarity_top_k=2, prompt_template=custom_qa_template, refine_template=custom_refine_template)
response = query_engine.query("who is epku?")
response.response

' There is no mention of "epku" in the provided text. The text appears to be an essay or a memoir written by Paul Graham, the founder of Y Combinator, about the early days of the startup accelerator. The text does not mention a person named "epku". If you have any further information or context about "epku", I can try to help you better.'