## RAG example with Langchain, PostgreSQL+pgvector, and HFTGI

Requirements:
- A PostgreSQL cluster with the pgvector extension installed (https://github.com/pgvector/pgvector)
- A Database created in the cluster with the extension enabled (in this example, the database is named `vectordb`. Run the following command in the database as a superuser:
`CREATE EXTENSION vector;`
- All the information to connect to the database

### Needed packages

In [1]:
!pip install -q pgvector
!pip install langchain
!pip install pypdf
!pip install sentence-transformers
!pip install text_generation


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Collecting langchain
  Downloading langchain-0.1.16-py3-none-any.whl (817 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m817.7/817.7 kB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
Collecting langchain-text-splitters<0.1,>=0.0.1
  Downloading langchain_text_splitters-0.0.1-py3-none-any.whl (21 kB)
Collecting langsmith<0.2.0,>=0.1.17
  Downloading langsmith-0.1.45-py3-none-any.whl (104 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.2/104.2 kB[0m [31m225.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting dataclasses-json<0.7,>=0.5.7
  Downloading dataclasses_json-0.6.4-py3-none-any.whl (28 kB)
Collecting langchain-core<0.2.0,>=0.1.42
  Downloading langchain_core-0.1.42-py3-none-any.whl (287 kB)
[2K     

#### Bases parameters, Inference server and PostgreSQL info

In [2]:
#inference_server_url = "http://hf-tgi.llm-hosting.svc.cluster.local:3000/"
inference_server_url = "https://ollama-models-irs-rag-demo.apps.cluster-n4hd4.dynamic.redhatworkshops.io"
#CONNECTION_STRING = "postgresql+psycopg://user:password@postgresql-server:5432/vectordb"
CONNECTION_STRING = "postgresql+psycopg://vectordb:vectordb@postgresql.pgvector.svc.cluster.local:5432/vectordb"
#COLLECTION_NAME = "documents_test"
COLLECTION_NAME = "rhoai-doc-2.6"

#### Imports

In [3]:
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from langchain.vectorstores.pgvector import PGVector
from langchain.chains import RetrievalQA
from langchain.llms import HuggingFaceTextGenInference
from langchain.llms import Ollama
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.prompts import PromptTemplate

#### Initialize the connection

In [4]:
embeddings = HuggingFaceEmbeddings()
store = PGVector(
    connection_string=CONNECTION_STRING,
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings)

  warn_deprecated(


#### Initialize query chain

In [5]:
!pip install text_generation
# NOTE: This template syntax is specific to Llama2
import os
template="""<s>[INST] <<SYS>>
You are a helpful, respectful and honest assistant.
You will be given a question you need to answer, and a context to provide you with information. You must answer the question based as much as possible on this context.
Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information.
<</SYS>>

Question: {question}
Context: {context} [/INST]
"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)
#os.environ["TOKENIZERS_PARALLELISM"] = 'false'

# llm = HuggingFaceTextGenInference(
#     inference_server_url=inference_server_url,
#     max_new_tokens=512,
#     top_k=10,
#     top_p=0.95,
#     typical_p=0.95,
#     temperature=0.1,
#     repetition_penalty=1.175,
#     streaming=True,
#     callbacks=[StreamingStdOutCallbackHandler()]
# )

llm = Ollama(
    base_url=inference_server_url,
    model="mistral",
    top_p=0.92,
    temperature=0.01,
    num_predict=512,
    repeat_penalty=1.03,
    callbacks=[StreamingStdOutCallbackHandler()]
)

# llm = Ollama(
#     base_url=inference_server_url,
#     model="tinyllama",
#     callbacks=[StreamingStdOutCallbackHandler()]
# )

qa_chain = RetrievalQA.from_chain_type(llm,
                                       retriever=store.as_retriever(search_type="similarity_score_threshold", search_kwargs={"k": 4, "score_threshold": 0.2 }),
                                       chain_type_kwargs={"prompt": QA_CHAIN_PROMPT},
                                       return_source_documents=True)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


#### Query example

In [6]:
!pip install gradio
import gradio as gr
# Gradio Interface
def answer_question(question):
    #result = chain.run(question)
    result = qa_chain({"query": question})
    # print('Result Items= ' + result.items)
    return result

iface = gr.Interface(answer_question, "textbox", "textbox")
iface.launch(share=True)

# question = "How can I work with GPU and taints in OpenShift Data Science?"
# #question = "What is RHOAI in 100 words or less?"
# result = qa_chain({"query": question})

Collecting gradio
  Downloading gradio-4.26.0-py3-none-any.whl (17.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m155.4 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting httpx>=0.24.1
  Downloading httpx-0.27.0-py3-none-any.whl (75 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m75.6/75.6 kB[0m [31m160.5 MB/s[0m eta [36m0:00:00[0m
Collecting ruff>=0.2.2
  Downloading ruff-0.3.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m146.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tomlkit==0.12.0
  Downloading tomlkit-0.12.0-py3-none-any.whl (37 kB)
Collecting uvicorn>=0.14.0
  Downloading uvicorn-0.29.0-py3-none-any.whl (60 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.8/60.8 kB[0m [31m147.9 MB/s[0m eta [36m0:00:00[0m
Collecting fastapi
  Downloading fastapi-0.110.1-py3-non



Traceback (most recent call last):
  File "/opt/app-root/lib64/python3.9/site-packages/gradio/queueing.py", line 527, in process_events
    response = await route_utils.call_process_api(
  File "/opt/app-root/lib64/python3.9/site-packages/gradio/route_utils.py", line 261, in call_process_api
    output = await app.get_blocks().process_api(
  File "/opt/app-root/lib64/python3.9/site-packages/gradio/blocks.py", line 1786, in process_api
    result = await self.call_function(
  File "/opt/app-root/lib64/python3.9/site-packages/gradio/blocks.py", line 1338, in call_function
    prediction = await anyio.to_thread.run_sync(
  File "/opt/app-root/lib64/python3.9/site-packages/anyio/to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "/opt/app-root/lib64/python3.9/site-packages/anyio/_backends/_asyncio.py", line 2144, in run_sync_in_worker_thread
    return await future
  File "/opt/app-root/lib64/python3.9/site-packages/anyio/_backends/_a

#### Retrieve source

In [7]:
def remove_duplicates(input_list):
    unique_list = []
    for item in input_list:
        if item.metadata['source'] not in unique_list:
            unique_list.append(item.metadata['source'])
    return unique_list

results = remove_duplicates(result['source_documents'])

for s in results:
    print(s)

NameError: name 'result' is not defined