# RAG Project - Q&A on Private Documents using LangChain, OpenAI and Pinecone

This notebook uses **the latest versions** of the OpenAI, LangChain, and Pinecone libraries.

In [26]:
pip install -q -r ./requirements.txt

Note: you may need to restart the kernel to use updated packages.


In [27]:
import os
from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv(), override=True)

True

In [3]:
pip install docx2txt -q

Note: you may need to restart the kernel to use updated packages.


### Loading Documents

In [4]:
# loading PDF, DOCX and TXT files as LangChain Documents
def load_document(file):
    """Load a PDF, DOCX or TXT file through the matching LangChain loader.

    The loader is selected from the file extension, compared
    case-insensitively so that ``report.PDF`` is treated like ``report.pdf``.

    Args:
        file: Path of the file to load.

    Returns:
        The result of the selected loader's ``load()`` call, or ``None``
        (after printing a message) when the extension is not supported.
    """
    import os

    _, extension = os.path.splitext(file)
    extension = extension.lower()

    # Loaders are imported lazily so only the one that is actually needed gets loaded.
    if extension == '.pdf':
        from langchain_classic.document_loaders import PyPDFLoader as loader_cls
    elif extension == '.docx':
        from langchain_classic.document_loaders import Docx2txtLoader as loader_cls
    elif extension == '.txt':
        from langchain_classic.document_loaders import TextLoader as loader_cls
    else:
        print('Document format is not supported!')
        return None

    # Same progress message for every supported format.
    print(f'Loading {file}')
    loader = loader_cls(file)
    return loader.load()
  

In [30]:
# loading Wikipedia articles as LangChain Documents
def load_from_wikipedia(query, lang='en', load_max_docs=2):
    """Load Wikipedia content for a search query via ``WikipediaLoader``.

    Args:
        query: Search text passed to the loader.
        lang: Wikipedia language code passed to the loader (default ``'en'``).
        load_max_docs: Upper bound on the number of documents passed to the loader.

    Returns:
        The result of the loader's ``load()`` call.
    """
    # WikipediaLoader is provided by langchain_community; langchain_core.document_loaders
    # only exposes the base loader/blob classes, so importing it from there fails.
    from langchain_community.document_loaders import WikipediaLoader

    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    return loader.load()


### Chunking Data

In [31]:
def chunk_data(data, chunk_size=256, chunk_overlap=0):
    """Split documents into chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        data: Documents to split (e.g. the value returned by ``load_document``).
        chunk_size: ``chunk_size`` passed to the splitter (default 256).
        chunk_overlap: ``chunk_overlap`` passed to the splitter (default 0,
            the value that was previously hard-coded).

    Returns:
        The chunks produced by ``split_documents``; an empty list when
        ``data`` is an empty list or tuple.

    Raises:
        TypeError: If ``data`` is ``None``, which is what ``load_document``
            returns for an unsupported file format.
    """
    if data is None:
        # Fail early with a readable message instead of an opaque
        # "'NoneType' object is not iterable" from inside the splitter.
        raise TypeError(
            'chunk_data() expects a collection of documents but got None '
            '(was the document format supported?)'
        )
    if isinstance(data, (list, tuple)) and not data:
        # Nothing to split.
        return []

    from langchain_classic.text_splitter import RecursiveCharacterTextSplitter

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(data)
    

### Calculating Cost

In [32]:
def print_embedding_cost(texts):
    """Print the total token count of ``texts`` and the estimated embedding cost.

    Tokens are counted with the tiktoken encoding for ``text-embedding-3-small``
    over each item's ``page_content``. The cost is computed at a rate of
    0.00002 USD per 1,000 tokens.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string.
    """
    import tiktoken

    encoder = tiktoken.encoding_for_model('text-embedding-3-small')

    total_tokens = 0
    for doc in texts:
        total_tokens += len(encoder.encode(doc.page_content))

    # check prices here: https://openai.com/pricing
    cost_usd = total_tokens / 1000 * 0.00002

    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {cost_usd:.6f}')

### Embedding and Uploading to a Vector Database (Pinecone)

In [34]:
def insert_or_fetch_embeddings(index_name, chunks, dimensions=1536):
    """Return a Pinecone vector store for ``index_name``, creating it if needed.

    If an index called ``index_name`` already exists it is reused as-is and
    ``chunks`` is NOT uploaded. Otherwise a new serverless index (cosine
    metric, AWS ``us-east-1``) is created and ``chunks`` are embedded and
    inserted into it.

    Args:
        index_name: Name of the Pinecone index to load or create.
        chunks: Documents to embed; only used when the index is created.
        dimensions: Embedding size used for BOTH the OpenAI embeddings and the
            new index, so the two can never disagree (default 1536; 512 works
            as well). NOTE(review): an existing index presumably has to have
            been created with the same size - confirm before reusing an index
            with a different value.

    Returns:
        The vector store returned by ``Pinecone.from_existing_index`` or
        ``Pinecone.from_documents``.
    """
    # importing the necessary libraries and initializing the Pinecone client
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import ServerlessSpec

    # No explicit credentials are passed; presumably they come from the
    # environment loaded earlier in the notebook - TODO confirm.
    pc = pinecone.Pinecone()

    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=dimensions)

    if index_name in pc.list_indexes().names():
        # loading from existing index
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        # creating the index and embedding the chunks into the index
        print(f'Creating index {index_name} and embeddings ...', end='')

        # The index dimension must match the embedding size, hence the shared value.
        pc.create_index(
            name=index_name,
            dimension=dimensions,
            metric='cosine',
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1",
            ),
        )

        # processing the input documents, generating embeddings using the provided
        # `OpenAIEmbeddings` instance, inserting the embeddings into the index and
        # returning a new Pinecone vector store object.
        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')

    return vector_store
    

In [35]:
def delete_pinecone_index(index_name='all'):
    """Delete one Pinecone index, or every index when ``index_name`` is ``'all'``.

    Args:
        index_name: Name of the index to delete. The default ``'all'`` deletes
            every index reported by ``list_indexes()``.
    """
    import pinecone

    client = pinecone.Pinecone()

    # Single-index case first; everything after it handles the 'all' sentinel.
    if index_name != 'all':
        print(f'Deleting index {index_name} ...', end='')
        client.delete_index(index_name)
        print('Ok')
        return

    existing_names = client.list_indexes().names()
    print('Deleting all indexes ... ')
    for existing_name in existing_names:
        client.delete_index(existing_name)
    print('Ok')
    

### Asking and Getting Answers

In [9]:
def ask_and_get_answer(vector_store, q, k=3):
    """Answer question ``q`` with a ``RetrievalQA`` chain over ``vector_store``.

    Args:
        vector_store: Object providing ``as_retriever``; used for similarity search.
        q: The question passed to the chain.
        k: Value forwarded to the retriever as ``search_kwargs['k']`` (default 3).

    Returns:
        Whatever ``chain.invoke(q)`` returns.
    """
    from langchain_classic.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    chat_model = ChatOpenAI(model='gpt-4o-mini', temperature=1)

    similarity_retriever = vector_store.as_retriever(
        search_type='similarity',
        search_kwargs=dict(k=k),
    )

    # "stuff" chain type, built from the chat model and the retriever above.
    qa_chain = RetrievalQA.from_chain_type(
        llm=chat_model,
        chain_type="stuff",
        retriever=similarity_retriever,
    )

    return qa_chain.invoke(q)


### Running Code

In [10]:
# import warnings
# warnings.filterwarnings('ignore')

#### Ask a PDF

In [8]:
# Load the PDF and inspect the first returned document: its text, its metadata,
# the number of documents returned and the length of the first one.
data = load_document('files/us_constitution.pdf')
# print(data[1].page_content)
# print(data[10].metadata)
print(data[0].page_content)
print(data[0].metadata)
print(f'There are {len(data)} pages in the data')
print(f'There are {len(data[0].page_content)} characters on the first page')

Loading files/us_constitution.pdf
TheUnitedStatesConstitution
WethePeopleoftheUnitedStates,inOrdertoformamoreperfect
Union,establishJustice,insuredomesticTranquility, provideforthe
commondefence,promotethegeneralWelfare,andsecurethe
BlessingsofLibertytoourselvesandourPosterity, doordainand
establishthisConstitutionfortheUnitedStatesofAmerica.
TheConstitutionalConvention
ArticleI
Section1:Congress
AlllegislativePowershereingrantedshallbevestedinaCongressof
theUnitedStates,whichshallconsistofaSenateandHouseof
Representatives.
Section2:TheHouseofRepresentatives
{'producer': 'Skia/PDF m115 Google Docs Renderer', 'creator': 'PyPDF', 'creationdate': '', 'title': 'us_constitution', 'source': 'files/us_constitution.pdf', 'total_pages': 41, 'page': 0, 'page_label': '1'}
There are 41 pages in the data
There are 530 characters on the first page


In [15]:
# DOCX loading demo. The result is kept under its own name so it does not overwrite
# `data` (the PDF loaded above), which is what the following cells chunk and index.
gatsby_data = load_document('files/the_great_gatsby.docx')
# print(gatsby_data[0].page_content)
print(gatsby_data[0].metadata)
print(f'There are {len(gatsby_data)} pages in the data')

Loading files/the_great_gatsby.docx
{'source': 'files/the_great_gatsby.docx'}
There are 1 pages in the data


In [13]:
# data = load_from_wikipedia('GPT-4', 'de')
# print(data[0].page_content)

In [14]:
# Split the loaded documents using chunk_data's default chunk size and show how many chunks resulted.
chunks = chunk_data(data)
print(len(chunks))
# print(chunks[10].page_content)

190


In [15]:
# Print the token count and the estimated embedding cost for the chunks before uploading them.
print_embedding_cost(chunks)

Total Tokens: 16711
Embedding Cost in USD: 0.000334


In [16]:
# WARNING: called without arguments, index_name defaults to 'all', so this deletes
# every index returned by list_indexes(), not just the ones created by this notebook.
delete_pinecone_index()

Deleting all indexes ... 
Ok


In [17]:
# Create the index and upload the chunk embeddings, or load the index if it already exists
# (in that case the chunks are not uploaded again).
index_name = 'askadocument'
vector_store = insert_or_fetch_embeddings(index_name=index_name, chunks=chunks)

Creating index askadocument and embeddings ...Ok


In [18]:
# Ask a single question against the vector store; the printed answer is the chain's
# full return value (a dict with 'query' and 'result' in the output below).
q = 'What is the Bill of Rights?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'What is the Bill of Rights?', 'result': 'The Bill of Rights is the first ten amendments to the United States Constitution. Ratified in 1791, these amendments were created to guarantee individual liberties and limit the powers of the federal government. The Bill of Rights includes protections for freedom of speech, religion, and assembly, the right to bear arms, protection against unreasonable searches and seizures, the right to a fair trial, and other essential rights.'}


#### While Loop for Asking Questions

In [19]:
import time

# Interactive question loop: keeps asking until the user types "quit" or "exit".
i = 1
print('Write Quit or Exit to quit.')
while True:
    # Strip surrounding whitespace so inputs such as ' quit ' are still recognised.
    q = input(f'Question #{i}: ').strip()

    if q.lower() in ['quit', 'exit']:
        print('Quitting ... bye bye!')
        time.sleep(2)
        break

    if not q:
        # Nothing was typed: ask again without spending an LLM call or a question number.
        continue

    i = i + 1

    answer = ask_and_get_answer(vector_store, q)
    print(f'\nAnswer: {answer}')
    print(f'\n {"-" * 50} \n')

    

Write Quit or Exit to quit.


Question #1:  What is the 2nd amendment? Answer from the provided context only.



Answer: {'query': 'What is the 2nd amendment? Answer from the provided context only.', 'result': 'The Second Amendment states: "A well regulated Militia, being necessary to the security of a free State, the right of the people to keep and bear Arms, shall not be infringed."'}

 -------------------------------------------------- 



Question #2:  quit


Quitting ... bye bye!


#### Ask Wikipedia

In [20]:
# WARNING: called without arguments, index_name defaults to 'all', so this deletes
# every index returned by list_indexes(), including the one created for the PDF above.
delete_pinecone_index()

Deleting all indexes ... 
Ok


In [26]:
# Load Wikipedia content for the query with lang='de', then split it using the default chunk size.
data = load_from_wikipedia('DeepSeek R1', 'de')
chunks = chunk_data(data)

In [27]:
# Preview only the first few chunks instead of dumping the whole list into the output.
chunks[:3]

[Document(metadata={'title': 'DeepSeek', 'summary': 'DeepSeek ist ein chinesisches Startup, das sich auf die Entwicklung fortschrittlicher Sprachmodelle und künstlicher Intelligenz spezialisiert hat. Das Unternehmen gewann internationale Aufmerksamkeit mit der Veröffentlichung seines im Januar 2025 vorgestellten Modells DeepSeek R1, das mit etablierten KI-Systemen wie ChatGPT von OpenAI und Claude von Anthropic konkurriert. Das Unternehmen wird ausschließlich vom chinesischen Hedgefonds High-Flyer finanziert. Beide Unternehmen haben ihren Sitz in Hangzhou, Zhejiang.', 'source': 'https://de.wikipedia.org/wiki/DeepSeek'}, page_content='DeepSeek ist ein chinesisches Startup, das sich auf die Entwicklung fortschrittlicher Sprachmodelle und künstlicher Intelligenz spezialisiert hat. Das Unternehmen gewann internationale Aufmerksamkeit mit der Veröffentlichung seines im Januar 2025'),
 Document(metadata={'title': 'DeepSeek', 'summary': 'DeepSeek ist ein chinesisches Startup, das sich auf die

In [28]:
# Create the index and upload the chunk embeddings, or load the index if it already exists
# (in that case the chunks are not uploaded again).
index_name = 'deepseek'
vector_store = insert_or_fetch_embeddings(index_name=index_name, chunks=chunks)

Creating index deepseek and embeddings ...Ok


In [30]:
# Ask a question in German against the vector store and print the chain's full return value.
q = 'Was ist DeepSeek?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'Was ist DeepSeek?', 'result': 'DeepSeek ist ein chinesisches Startup, das sich auf die Entwicklung fortschrittlicher Sprachmodelle und künstlicher Intelligenz spezialisiert hat. Es wurde im April 2023 von Liang Wenfeng als unabhängiges Unternehmen für künstliche Intelligenz gegründet, mit dem Ziel, eine universelle künstliche Intelligenz zu entwickeln und zu vermarkten. DeepSeek gewann internationale Aufmerksamkeit mit der Veröffentlichung seines Produkts im Januar 2025.'}
