# Project: Question-Answering on Private Documents

## Here is all the package needed

In [1]:
!pip install pypdf -q
#Once install you don't need to explicitly import pypdf after installing it, because it's already there ready to use

In [2]:
!pip install docx2txt -q
#Once install you don't need to explicitly import pypdf after installing it, because it's already there ready to use

In [7]:
import os
from dotenv import load_dotenv, find_dotenv
import docx2txt
from langchain_community.document_loaders import TextLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

load_dotenv(find_dotenv(), override=True)

# You can add other package here if needed


True

### Loading Documents

In [8]:
# The goal is to loading PDF, DOCX and TXT files as LangChain Documents
def load_document(file):

    name, extension = os.path.splitext(file) # check splitext()

    if extension == '.pdf':
        print(f'Loading {file}')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        print(f'Loading {file}')
        loader = Docx2txtLoader(file)
    elif extension == '.txt':
        loader = TextLoader(file)
    else:
        print('Document format is not supported!')
        return None

    data = loader.load()

    # Verification
    if len(data) != 0:
        return data
    else:
        print('Document is empty')
        return None
  

#### Verification 

In [55]:
pdf_file="./documents/churchill_speech.pdf"
docx_file="./documents/churchill_speech.docx"
text_file="./documents/churchill_speech.txt"

print(load_document(pdf_file))
print(load_document(docx_file))
print(load_document(text_file))

Loading ./documents/churchill_speech.pdf
[Document(page_content="Winston Churchill Speech - We Shall Fight on the Beaches We Shall Fight on the Beaches June 4, 1940 House of Commons From the moment that the French defenses at Sedan and on the Meuse were broken at the end of the second week of May, only a rapid retreat to Amiens and the south could have saved the BriGsh and French Armies who had entered Belgium at the appeal of the Belgian King; but this strategic fact was not immediately realized. The French High Command hoped they would be able to close the gap, and the Armies of the north were under their orders. Moreover, a reGrement of this kind would have involved almost certainly the destrucGon of the ﬁne Belgian Army of over 20 divisions and the abandonment of the whole of Belgium. Therefore, when the force and scope of the German penetraGon were realized and when a new French Generalissimo, General Weygand, assumed command in place of General Gamelin, an eﬀort was made by the F

### Chunking Data

In [53]:
def chunk_data(data):
    # Find how to load a built-in document transformers that make it easy to split with https://python.langchain.com/docs/modules/data_connection/document_transformers/
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=256, # maximum size of text chunk in number of characters
        chunk_overlap=20, # pecifies the number of overlapping characters between adjacent chunks.
                          # if chunk 1 ends at character 100, chunk 2 will start at character 80
    )
    chunks = text_splitter.create_documents([data])
    # Verification
    if len(chunks) != 0:
        return chunks
    else:
        print('Document is not split')
        return None
    

#### Verification

In [26]:
chunks = chunk_data(str(load_document(text_file)))
print(len(chunks))


92


### Embedding and Uploading to a Vector Database (Pinecone)

In [80]:
def insert_or_fetch_embeddings(index_name, chunks):
    from pinecone import Pinecone, PodSpec
    from langchain.vectorstores import Pinecone as Pinecone_langchain
    from langchain.embeddings import CohereEmbeddings
    # Find how to create embeddings instance 
    embeddings = CohereEmbeddings()

    # Initialize Pinecone  with API key 
    pinecone = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))
    if index_name in pinecone.list_indexes().names():
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Ok')
    else:
        print(f'Creating index {index_name} and embeddings ...', end='')
        # Create index
        # TO DO
        # Index the text chunks into Pinecone
        # TO DO - use chunks/embeddings/index_name
        pinecone.create_index(index_name, dimension=4096, metric='cosine',spec=PodSpec(environment="gcp-starter"))
        vector_store = Pinecone_langchain.from_documents(chunks, embeddings, index_name=index_name)
        print('Ok')
        
    return vector_store
    

#### Verification

In [83]:
index_name="hello world"

vectorstore = insert_or_fetch_embeddings(index_name, chunks)
if index_name in pinecone.list_indexes().names():
        print(f'Index {index_name} already exists. Loading embeddings ... ', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('Verification complete')
else:
    print('Verification incomplete')


Creating index hello world and embeddings ...

PineconeApiException: (400)
Reason: Bad Request
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'X-Cloud-Trace-Context': 'b647f04cec8e36b1d4d0da6815dc6195', 'Date': 'Mon, 25 Mar 2024 22:05:06 GMT', 'Server': 'Google Frontend', 'Content-Length': '125', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"INVALID_ARGUMENT","message":"Name must consist of lower case alphanumeric characters or '-'"},"status":400}


In [43]:
def delete_pinecone_index(index_name='all', client=None):
    if client is None:
        raise ValueError("You need to provide a Pinecone client instance.")

    if index_name == 'all':
        indexes = client.list_indexes().names()
        print('Deleting all indexes ... ')
        for index in indexes:
            client.delete_index(index)
        print('Ok')
    else:
        print(f'Deleting index {index_name} ...', end='')
        client.delete_index(index_name)
        print('Ok')

    

In [82]:
from pinecone import Pinecone

# Remplacez 'YOUR_API_KEY' par votre propre clé API Pinecone
client = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))

# Appelez la fonction en passant l'instance du client Pinecone
delete_pinecone_index(client=client)

Deleting all indexes ... 
Ok


### Asking and Getting Answers

In [76]:
def ask_and_get_answer(vector_store, q):
    from langchain.chains import RetrievalQA
    from langchain.chat_models import ChatCohere
    from langchain_community.llms import Cohere
    # llm = ?
    llm = Cohere(temperature=0.75, cohere_api_key=os.environ.get('COHERE_API_KEY'))
    # retriever = ?
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': 10}) 
    
    # chain = ?
    chain = RetrievalQA.from_chain_type(llm=llm,chain_type="stuff", retriever=retriever)
    
    answer = chain.invoke(q)
    return answer

### Running Code in order to complete this project

In [51]:
data = load_document(text_file)

print(f'You have {len(data)} pages in your data')
if len(data) > 0:
    print(f'There are {len(data[0].page_content)} characters in the page')
else:
    print("No pages found in your data")

You have 1 pages in your data
There are 21271 characters in the page


In [61]:
# Assurez-vous que 'data' est une liste de chaînes de caractères
data = load_document(text_file)
type(data)
# Si 'data' est une liste de chaînes de caractères, vous pouvez continuer
#chunks = chunk_data(data)
#print(type(data))

#print(len(chunks))
#print(chunks[10].page_content)



list

In [68]:
from pinecone import Pinecone

# Remplacez 'YOUR_API_KEY' par votre propre clé API Pinecone
client = Pinecone(api_key=os.environ.get('PINECONE_API_KEY'))

# Appelez la fonction en passant l'instance du client Pinecone
delete_pinecone_index(client=client)

Deleting all indexes ... 
Ok


In [69]:
index_name = 'meriam-index'
vector_store = insert_or_fetch_embeddings(index_name,chunks)

Creating index meriam-index and embeddings ...Ok


In [78]:
q = 'What is the whole document about?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'What is the whole document about?', 'result': ' The document is a transcript of a speech given by Winston Churchill, former Prime Minister of the United Kingdom, to the House of Commons in 1940. \n\nIn his speech, Churchill acknowledges the dire situation that the Allied Forces were facing with the rapid advancement of the Axis Powers, specifically the fall of France at the hands of Germany. He asserts the determination of the British people and the British Empire in continuing to fight, regardless of the circumstances. \n\nHe also acknowledges the role of the French in the fight and expresses his hope that, through their combined efforts, they can liberate France from the oppressive regime of the Axis Powers. \n\nThe speech was well-received and is remembered for its inspiring message, promising that the British would continue to fight against the Axis Powers, wherever that may be necessary. \n\nThroughout the speech, Churchill employs powerful and evocative language to sti

In [79]:
import time
i = 1
print('Write Quit or Exit to quit.')
while True:
    q = input(f'Question #{i}: ')
    i = i + 1
    if q.lower() in ['quit', 'exit']:
        print('Quitting ... bye bye!')
        time.sleep(2)
        break
    
    answer = ask_and_get_answer(vector_store, q)
    print(f'\nAnswer: {answer}')
    print(f'\n {"-" * 50} \n')

    

Write Quit or Exit to quit.


Question #1:  c'est quoi la capitale de la france



Answer: {'query': "c'est quoi la capitale de la france", 'result': ' The capital of France is Paris and has been since the medieval ages in 1792. \nIt is known for its history of culture, art, food and fashion. \nWould you like to know more about the city of Paris? '}

 -------------------------------------------------- 



Question #2:  Exit


Quitting ... bye bye!


In [None]:
delete_pinecone_index()