# Project: RAG - Q&A on Private Documents using LangChain, OpenAI and Pinecone




In [3]:
pip install -q -r ./requirements.txt

  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.4/63.4 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.5/2.5 MB[0m [31m28.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m516.6/516.6 kB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m303.4/303.4 kB[0m [31m29.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m438.3/438.3 kB[0m [31m38.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.4/44.4 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.9/50.9 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone


In [4]:
!pip install -y pinecone-client


Usage:   
  pip3 install [options] <requirement specifier> [package-index-options] ...
  pip3 install [options] -r <requirements file> [package-index-options] ...
  pip3 install [options] [-e] <vcs project url> ...
  pip3 install [options] [-e] <local project path> ...
  pip3 install [options] <archive url/path> ...

no such option: -y


In [5]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [6]:
# loading PDF, DOCX and TXT files as LangChain Documents
def load_document(file):
    import os
    name, extension = os.path.splitext(file)

    if extension == '.pdf':
        from langchain.document_loaders import PyPDFLoader
        print(f'Loading {file}')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain.document_loaders import Docx2txtLoader
        print(f'Loading {file}')
        loader = Docx2txtLoader(file)
    elif extension == '.txt':
        from langchain.document_loaders import TextLoader
        loader = TextLoader(file)
    else:
        print('Document format is not supported!')
        return None

    data = loader.load()
    return data


In [7]:
# wikipedia
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    from langchain.document_loaders import WikipediaLoader
    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    data = loader.load()
    return data


In [8]:
#creating chunks
def chunk_data(data, chunk_size=256):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=0)
    chunks = text_splitter.split_documents(data)
    return chunks


In [9]:
#calculating cost
def print_embedding_cost(texts):
    import tiktoken
    enc = tiktoken.encoding_for_model('text-embedding-3-small')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    # check prices here: https://openai.com/pricing
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * 0.00002:.6f}')

In [10]:
!pip install -q langchain_pinecone

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m421.9/421.9 kB[0m [31m34.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m52.2/52.2 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [11]:
#embedding and uploading to vectordatabase (pinecone)
def insert_or_fetch_embeddings(index_name, chunks):
    # importing the necessary libraries and initializing the Pinecone client
    import pinecone
    from pinecone import ServerlessSpec
    # Use langchain_pinecone instead of langchain_community.vectorstores
    from langchain_pinecone import PineconeVectorStore
    from langchain_openai import OpenAIEmbeddings
    import os

    # Initialize Pinecone client
    pc = pinecone.Pinecone(api_key=os.getenv("PINECONE_API_KEY"))

    # Initialize embeddings
    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)

    # Check if index already exists
    if index_name in pc.list_indexes().names():
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        # Create vector store from existing index
        vector_store = PineconeVectorStore(
            pinecone_api_key=os.getenv("PINECONE_API_KEY"),
            index_name=index_name,
            embedding=embeddings
        )
        print('Ok')
    else:
        # Creating a new index
        print(f'Creating index {index_name} and embeddings ...', end='')

        pc.create_index(
            name=index_name,
            dimension=1536,
            metric='cosine',
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )

        # Create vector store and add documents
        vector_store = PineconeVectorStore(
            pinecone_api_key=os.getenv("PINECONE_API_KEY"),
            index_name=index_name,
            embedding=embeddings
        )

        # Add documents to the vector store
        vector_store.add_documents(chunks)
        print('Ok')

    return vector_store

# You'll also need to update the ask_and_get_answer function
def ask_and_get_answer(vector_store, q, k=3):
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

    retriever = vector_store.as_retriever(search_kwargs={'k': k})

    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    answer = chain.invoke(q)
    return answer

In [None]:
#deleting previous pinecone instances
def delete_pinecone_index(index_name='all'):
    import pinecone
    pc = pinecone.Pinecone()

    if index_name == 'all':
        indexes = pc.list_indexes().names()
        print('Deleting all indexes ... ')
        for index in indexes:
            pc.delete_index(index)
        print('Ok')
    else:
        print(f'Deleting index {index_name} ...', end='')
        pc.delete_index(index_name)
        print('Ok')


### Asking and Getting Answers

In [12]:
#calling api and creating responses
def ask_and_get_answer(vector_store, q, k=3):
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})

    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)

    answer = chain.invoke(q)
    return answer


In [None]:
# import warnings
# warnings.filterwarnings('ignore')

In [13]:
#loading private document
data = load_document('/content/21BCT0180_18967_REPORT.pdf')
# print(data[1].page_content)
# print(data[10].metadata)

print(f'You have {len(data)} pages in your data')
print(f'There are {len(data[20].page_content)} characters in the page')

Loading /content/21BCT0180_18967_REPORT.pdf
You have 58 pages in your data
There are 803 characters in the page


In [None]:
# data = load_document('files/the_great_gatsby.docx')
# print(data[0].page_content)

In [None]:
# data = load_from_wikipedia('GPT-4', 'de')
# print(data[0].page_content)

In [14]:
chunks = chunk_data(data)
print(len(chunks))
# print(chunks[10].page_content)

324


In [15]:
print_embedding_cost(chunks)

Total Tokens: 14346
Embedding Cost in USD: 0.000287


In [17]:
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name=index_name, chunks=chunks)

Index askadocument already exists. Loading embeddings ... Ok


In [18]:
q = 'What is the title of the document?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'What is the title of the document?', 'result': 'The title of the document appears to be "Documentation and Final Report" based on the context provided.'}


In [19]:
#back and forth interaction
import time
i = 1
print('Write Quit or Exit to quit.')
while True:
    q = input(f'Question #{i}: ')
    i = i + 1
    if q.lower() in ['quit', 'exit']:
        print('Quitting ... bye bye!')
        time.sleep(2)
        break

    answer = ask_and_get_answer(vector_store, q)
    print(f'\nAnswer: {answer}')
    print(f'\n {"-" * 50} \n')



Write Quit or Exit to quit.
Question #1: what is the document about?

Answer: {'query': 'what is the document about?', 'result': 'The document is about the documentation and final report for a project. It includes detailed information on the system design, methodology, results, key findings, potential areas for further research, model improvements, and real-world applications. The document likely provides a comprehensive overview and analysis of the project from start to finish.'}

 -------------------------------------------------- 

Question #2: what project is being referred to?

Answer: {'query': 'what project is being referred to?', 'result': 'The project being referred to is a project on video upscaling using GANs, specifically focusing on a comparison of SRGAN, ESRGAN, and Real-ESRGAN. The project aims to enhance the quality of realistic videos through these different models.'}

 -------------------------------------------------- 

Question #3: bye

Answer: {'query': 'bye', 'res

#### Ask Wikipedia

In [None]:
delete_pinecone_index()

Deleting all indexes ... 
Ok


In [None]:
#loading data from online source
data = load_from_wikipedia('Google Gemini', 'de')
chunks = chunk_data(data)

In [None]:
chunks

[Document(page_content='Google Gemini (ehemals Google Bard) ist ein von Google entwickelter KI-basierter, multimodaler Chatbot. Er wurde als direkte Reaktion auf den Erfolg von ChatGPT entwickelt und im März 2023 in eingeschränkter Kapazität veröffentlicht, bevor er im Laufe des', metadata={'title': 'Google Gemini', 'summary': 'Google Gemini (ehemals Google Bard) ist ein von Google entwickelter KI-basierter, multimodaler Chatbot. Er wurde als direkte Reaktion auf den Erfolg von ChatGPT entwickelt und im März 2023 in eingeschränkter Kapazität veröffentlicht, bevor er im Laufe des Sommers in weiteren Ländern verfügbar wurde. Google Gemini ist in 40 Sprachen verfügbar.', 'source': 'https://de.wikipedia.org/wiki/Google_Gemini'}),
 Document(page_content='Sommers in weiteren Ländern verfügbar wurde. Google Gemini ist in 40 Sprachen verfügbar.', metadata={'title': 'Google Gemini', 'summary': 'Google Gemini (ehemals Google Bard) ist ein von Google entwickelter KI-basierter, multimodaler Chatbo

In [None]:
index_name = 'gemini'
vector_store = insert_or_fetch_embeddings(index_name=index_name, chunks=chunks)

Creating index gemini and embeddings ...Ok


In [None]:
q = 'Was ist Google Gemini?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'Was ist Google Gemini?', 'result': 'Google Gemini (ehemals Google Bard) ist ein von Google entwickelter KI-basierter, multimodaler Chatbot. Es wurde entwickelt als Reaktion auf den Erfolg von ChatGPT und bietet eine kostenlose Version sowie eine kostenpflichtige Version namens Gemini Advanced an. Es ist in 40 Sprachen verfügbar.'}
