### OpenAI Embeddings

In [None]:
!pip install llama_index

In [34]:
from llama_index import ServiceContext
from llama_index.embeddings import OpenAIEmedding

embed_model = OpenAIEmbedding()
service_context = ServiceContext.from_defaults(embed_model=embed_model)
# service_context = ServiceContext.from_defaults(embed_model="local")

# Optionally set a global service context to avoid passing it into other objects every time
from llama_index import set_global_service_context

set_global_service_context(service_context)

documents = SimpleDirectoryReader("./data").load_data()

index = VectorStoreIndex.from_documents(documents)


ImportError: cannot import name 'OpenAIEmedding' from 'llama_index.embeddings' (/usr/local/lib/python3.10/dist-packages/llama_index/embeddings/__init__.py)

In [None]:
query_engine = index.as_query_engine()

response = query_engine.query("query string")

### Langchain Integration

In [None]:
!pip install langchain
!pip install sentence_transformers

In [35]:
#Langchain integrations
from langchain.embeddings.huggingface import HuggingFaceBgeEmbeddings
from llama_index import ServiceContext

embed_model = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en")

service_context = ServiceContext.from_defaults(embed_model=embed_model)

### HuggingFace Optimum ONNX Embeddings

In [None]:
#HuggingFace Optimum ONNX Embeddings
!pip install transformers optimum[exporters]

In [None]:
from llama_index.embeddings import OptimumEmbedding

OptimumEmbedding.create_and_save_optimum_model(
    "BAAI/bge-small-en-v1.5", "./bge_onnx"
)

In [None]:
embed_model = OptimumEmbedding(folder_name="./bge_onnx")
service_context = ServiceContext.from_defaults(embed_model=embed_model)

### IMPORT DATA

In [42]:
def read_titles_and_summaries(filename):
    combined_texts = []
    with open(filename, "r") as file:
        content = file.read().split("|||")
        for entry in content:
            if entry.strip():
                parts = entry.split('|')
                if len(parts) >= 4:
                    title, _, summary, _ = parts[:4]
                    combined_text = title + " " + summary  # Concatenating title and summary
                    combined_texts.append(combined_text)
    return combined_texts

# Replace with combined titles and summaries from your file
combined_texts = read_titles_and_summaries("./data/research_papers/user_inputs.txt")

### Sentence Transformers

In [39]:
#SENTENCE TRANSFORMERS
from sentence_transformers import SentenceTransformer

# Initialize the model, 'all-MiniLM-L6-v2' is a good general-purpose model for embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

#REPLACE WITH TITLES OF RESEARCH PAPERS
# Your list of sentences / queries
sentences = [
    "How to bake a cake",
    "Ways to prepare for a marathon",
    "Best practices in software development",
    # Add more sentences as needed
]

# Generate embeddings for all sentences
embeddings = model.encode(sentences)

# 'embeddings' now contains the embeddings for each sentence in the list
print(embeddings)


[[ 0.07309512  0.05413244 -0.03563    ...  0.0762304   0.02083809
  -0.11177988]
 [ 0.01857957  0.03129777  0.01264536 ... -0.00444165 -0.0810978
   0.00860157]
 [-0.0088776   0.01615504 -0.03355166 ...  0.03692413  0.00147551
   0.05038412]]


### BM25 Retrieval (SETUP)




In [29]:
!pip install llama-index

In [67]:
import os
import openai
os.environ["OPENAI_API_KEY"] = ""
openai.api_key = ["OPENAI_API_KEY"]

In [68]:
import logging
import sys

logging.basicConfig(stream=sys.stdout, level=logging.INFO)
logging.getLogger().handlers = []
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

from llama_index import (
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
    VectorStoreIndex,
)
from llama_index.retrievers import BM25Retriever
from llama_index.indices.vector_store.retrievers.retriever import (
    VectorIndexRetriever,
)
from llama_index.llms import OpenAI

In [None]:
# #REPLACE WITH OUR DATASET/API
# !mkdir -p 'data/paul_graham/'
# !wget 'https://raw.githubusercontent.com/run-llama/llama_index/main/docs/examples/data/paul_graham/paul_graham_essay.txt' -O 'data/paul_graham/paul_graham_essay.txt'


In [103]:
# load documents
#REPLACE WITH OUR DATASET/API
documents = SimpleDirectoryReader("./data/research_papers").load_data()
print(documents)


[Document(id_='da9e4c3b-6903-46e6-861e-c3fdaa0593c1', embedding=None, metadata={'page_label': '59', 'file_name': '1-s2.0-S0007681309001232-main.pdf', 'file_path': 'data/research_papers/1-s2.0-S0007681309001232-main.pdf', 'file_type': 'application/pdf', 'file_size': 182136, 'creation_date': '2024-01-27', 'last_modified_date': '2024-01-27', 'last_accessed_date': '2024-01-27'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='Users of the world, unite! The challenges and\nopportunities of Social Media\nAndreas M. Kaplan *, Michael Haenlein\nESCP Europe, 79 Avenue de la Re ´publique, F-75011 Paris, France\n1. The specter of Social Media\nAs of January 2009, the online social networking\napplication Facebook registered more than 175million active users. To put

In [97]:
# initialize service context (set chunk size)
llm = OpenAI(model="gpt-4")
service_context = ServiceContext.from_defaults(chunk_size=1024, llm=llm)
nodes = service_context.node_parser.get_nodes_from_documents(documents)

print(nodes)

[TextNode(id_='51afdf5d-6fee-4980-aaa7-2fe209ae8cd1', embedding=None, metadata={'page_label': '59', 'file_name': '1-s2.0-S0007681309001232-main.pdf', 'file_path': 'data/research_papers/1-s2.0-S0007681309001232-main.pdf', 'file_type': 'application/pdf', 'file_size': 182136, 'creation_date': '2024-01-27', 'last_modified_date': '2024-01-27', 'last_accessed_date': '2024-01-27'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='316059c4-9851-401f-b9eb-01a21cba8118', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '59', 'file_name': '1-s2.0-S0007681309001232-main.pdf', 'file_path': 'data/research_papers/1-s2.0-S0007681309001232-main.pdf', 'file_type': 'application/pdf', 'file_size': 182136, 'creation

In [71]:
# initialize storage context (by default it's in-memory)
storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)


In [72]:
index = VectorStoreIndex(
    nodes=nodes,
    storage_context=storage_context,
    service_context=service_context,
)

### BM25 Retriever

In [40]:
!pip install rank_bm25

Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


In [73]:
# We can pass in the index, doctore, or list of nodes to create the retriever
retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=2)


In [None]:
from llama_index.response.notebook_utils import display_source_node

# will retrieve context from specific companies
nodes = retriever.retrieve("How much sleep do we need?")
for node in nodes:
    display_source_node(node)


In [None]:
nodes = retriever.retrieve("What's the?")
for node in nodes:
    display_source_node(node)


###  Router Retriever with bm25 method

In [76]:
from llama_index.tools import RetrieverTool

vector_retriever = VectorIndexRetriever(index)
bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=2)

retriever_tools = [
    RetrieverTool.from_defaults(
        retriever=vector_retriever,
        description="Useful in most cases",
    ),
    RetrieverTool.from_defaults(
        retriever=bm25_retriever,
        description="Useful if searching about specific information",
    ),
]


In [77]:
from llama_index.retrievers import RouterRetriever

retriever = RouterRetriever.from_defaults(
    retriever_tools=retriever_tools,
    service_context=service_context,
    select_multi=True,
)


In [None]:
# will retrieve all context from the author's life
nodes = retriever.retrieve(
    "What's the percentage of injuries or deaths caused by sleepiness?"
)
for node in nodes:
    display_source_node(node)


In [None]:
!curl https://www.ipcc.ch/report/ar6/wg2/downloads/report/IPCC_AR6_WGII_Chapter03.pdf --output IPCC_AR6_WGII_Chapter03.pdf


In [None]:
!pip install pypdf


In [87]:
from llama_index import (
    VectorStoreIndex,
    ServiceContext,
    StorageContext,
    SimpleDirectoryReader,
)
from llama_index.llms import OpenAI

# load documents

documents = SimpleDirectoryReader("./data/research_papers").load_data()

# documents = SimpleDirectoryReader(
#     input_files=["AP Research 3.35 (1).pdf"]
# ).load_data()

# initialize service context (set chunk size)
# -- here, we set a smaller chunk size, to allow for more effective re-ranking
llm = OpenAI(model="gpt-3.5-turbo")
service_context = ServiceContext.from_defaults(chunk_size=256, llm=llm)
nodes = service_context.node_parser.get_nodes_from_documents(documents)

# initialize storage context (by default it's in-memory)
storage_context = StorageContext.from_defaults()
storage_context.docstore.add_documents(nodes)


In [88]:
index = VectorStoreIndex(
    nodes, storage_context=storage_context, service_context=service_context
)

In [89]:
from llama_index.retrievers import BM25Retriever

# retireve the top 10 most similar nodes using embeddings
vector_retriever = index.as_retriever(similarity_top_k=10)

# retireve the top 10 most similar nodes using bm25
bm25_retriever = BM25Retriever.from_defaults(nodes=nodes, similarity_top_k=10)


### Custom Retriever Implementation

In [90]:
from llama_index.retrievers import BaseRetriever


class HybridRetriever(BaseRetriever):
    def __init__(self, vector_retriever, bm25_retriever):
        self.vector_retriever = vector_retriever
        self.bm25_retriever = bm25_retriever
        super().__init__()

    def _retrieve(self, query, **kwargs):
        bm25_nodes = self.bm25_retriever.retrieve(query, **kwargs)
        vector_nodes = self.vector_retriever.retrieve(query, **kwargs)

        # combine the two lists of nodes
        all_nodes = []
        node_ids = set()
        for n in bm25_nodes + vector_nodes:
            if n.node.node_id not in node_ids:
                all_nodes.append(n)
                node_ids.add(n.node.node_id)
        return all_nodes


In [91]:
index.as_retriever(similarity_top_k=5)

hybrid_retriever = HybridRetriever(vector_retriever, bm25_retriever)


### Re ranker Setup

In [92]:
from llama_index.postprocessor import SentenceTransformerRerank

reranker = SentenceTransformerRerank(top_n=4, model="BAAI/bge-reranker-base")

### Retrieve

In [None]:
from llama_index import QueryBundle

nodes = hybrid_retriever.retrieve(
    "What is the impact of preconditioning on MSCs?"
)
reranked_nodes = reranker.postprocess_nodes(
    nodes,
    query_bundle=QueryBundle(
        "What is the impact of preconditioning on MSCs?"
    ),
)

print("Initial retrieval: ", len(nodes), " nodes")
print("Re-ranked retrieval: ", len(reranked_nodes), " nodes")

In [None]:
from llama_index.response.notebook_utils import display_source_node

for node in reranked_nodes:
    print(node)



### Full Query Engine

In [None]:
from llama_index.query_engine import RetrieverQueryEngine

query_engine = RetrieverQueryEngine.from_args(
    retriever=hybrid_retriever,
    node_postprocessors=[reranker],
    service_context=service_context,
)

response = query_engine.query(
    "What is the impact of preconditioning on MSCs?"
)


In [None]:
from llama_index.response.notebook_utils import display_response

display_response(response)
