In [None]:
!pip install langchain chromadb faiss-cpu huggingface tiktoken langchain_huggingface langchain-community wikipedia

Collecting chromadb
  Downloading chromadb-1.1.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Collecting huggingface
  Downloading huggingface-0.0.1-py3-none-any.whl.metadata (2.9 kB)
Collecting langchain_huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Collecting langchain-community
  Downloading langchain_community-0.3.30-py3-none-any.whl.metadata (3.0 kB)
Collecting wikipedia
  Downloading wikipedia-1.4.0.tar.gz (27 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.2-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl.metadata (8.7 kB)
Collecting posthog<6.0.0,>=2.4.0 (from chromadb)
  Downloading posthog-5.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting onnxru

In [None]:
from langchain_community.retrievers import WikipediaRetriever

In [None]:
# Wikipedia-backed retriever limited to the top 2 English-language pages.
# The keyword is `top_k_results`; the earlier `top_l_result` spelling is not a
# WikipediaRetriever field, so the intended limit of 2 was not being applied.
retriever = WikipediaRetriever(top_k_results=2, lang='en')

In [None]:
# Free-text search phrase handed to the Wikipedia retriever.
query = 'The geopolitics of the India'
# Run the search and keep the returned documents for display.
docs = retriever.invoke(input=query)

In [None]:
# Print every retrieved page under a 1-based "Document N" heading.
for position, page in enumerate(docs, start=1):
    print(f'Document {position}', page.page_content, sep='\n')

Document 1
The India-Middle East-Europe Economic Corridor (commonly abbreviated as IMEC; sometimes also referred to as IMEEC) is a planned economic corridor that aims to bolster economic development by fostering connectivity and economic integration between Asia, the Persian Gulf and Europe. The corridor is a proposed route from India to Europe through the United Arab Emirates, Saudi Arabia, Israel, and Italy (or Greece).
The route draws on what is now described as the ancient Golden Road.


== Signing date and place ==
On 9 September 2023 the Memorandum of Understanding (MoU) was signed during the 2023 G20 New Delhi summit by the governments of India, United States, United Arab Emirates, Saudi Arabia, France, Germany, Italy and the European Union.


== Details ==

The project was launched to bolster transportation and communication links between Europe and Asia through rail and shipping networks and is seen as a counter to China's Belt and Road Initiative. The memorandum of understand

In [None]:
from langchain_community.vectorstores import Chroma
from langchain_huggingface.embeddings import HuggingFaceEmbeddings
from langchain_core.documents import Document

In [None]:
# Hugging Face checkpoint used to embed text for both vector stores below.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
embedding = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Small hand-written corpus for the Chroma demo; each sentence becomes one Document.
corpus_texts = (
    "LangChain helps developers build LLM applications easily.",
    "Chroma is a vector database optimized for LLM-based search.",
    "Embeddings convert text into high-dimensional vectors.",
    "OpenAI provides powerful embedding models.",
)
documents = [Document(page_content=text) for text in corpus_texts]

In [None]:
# Create Chroma vector store in memory (no persist_directory is given).
# Deterministic ids keep this cell idempotent: entries are written by id, so
# re-running the cell in the same kernel overwrites the same entries instead of
# adding duplicate copies of every document to "my_collection" under new
# random ids (duplicates would then crowd the top-k similarity results).
vectorstore = Chroma.from_documents(
    documents=documents,
    embedding=embedding,
    ids=[f"doc-{index}" for index in range(len(documents))],
    collection_name="my_collection",
)

In [None]:
# Expose the Chroma store through the retriever interface, asking for 2 hits per query.
retriever = vectorstore.as_retriever(search_kwargs=dict(k=2))

In [None]:
# Question to look up in the Chroma-backed retriever.
query = "What is Chroma used for?"
# Keep the retrieved documents so the next cell can print them.
results = retriever.invoke(input=query)

In [None]:
# List the retrieved documents, each under a 1-based "Result N" banner.
for rank, hit in enumerate(results, start=1):
    print(f"\n--- Result {rank} ---", hit.page_content, sep="\n")



--- Result 1 ---
Chroma is a vector database optimized for LLM-based search.

--- Result 2 ---
LangChain helps developers build LLM applications easily.


# ***MMR*** (Maximal Marginal Relevance)

In [None]:
# Sample corpus for the MMR demo: several sentences overlap on LangChain so that
# plain similarity search would return near-duplicate hits.
mmr_texts = (
    "LangChain makes it easy to work with LLMs.",
    "LangChain is used to build LLM based applications.",
    "Chroma is used to store and search document embeddings.",
    "Embeddings are vector representations of text.",
    "MMR helps you get diverse results when doing similarity search.",
    "LangChain supports Chroma, FAISS, Pinecone, and more.",
)
docs = [Document(page_content=text) for text in mmr_texts]

In [None]:
from langchain_community.vectorstores import FAISS


# Build a FAISS index over the sample documents using the shared embedding model.
vectorstore = FAISS.from_documents(docs, embedding)

In [None]:
# Build an MMR (maximal marginal relevance) retriever over the FAISS index.
#   k           - number of documents returned
#   fetch_k     - number of nearest candidates fetched before MMR re-ranking
#   lambda_mult - relevance/diversity balance (1 = least diverse, 0 = most
#                 diverse); 0.5 is the library default, spelled out here so the
#                 setting is visible and easy to tune
retriever = vectorstore.as_retriever(
    search_type="mmr",  # <-- This enables MMR
    search_kwargs={"k": 2, "fetch_k": 3, "lambda_mult": 0.5},
)

In [None]:
# Question to put to the MMR retriever.
query = "What is langchain?"
# Keep the re-ranked documents so the next cell can print them.
results = retriever.invoke(input=query)

In [None]:
# Show the MMR hits, each under a 1-based "Result N" banner.
for rank, hit in enumerate(results, start=1):
    banner = f"\n--- Result {rank} ---"
    print(banner)
    print(hit.page_content)


--- Result 1 ---
LangChain supports Chroma, FAISS, Pinecone, and more.

--- Result 2 ---
LangChain is used to build LLM based applications.
