In [2]:
from dotenv import load_dotenv

load_dotenv()

True

In [7]:
def load_document(file):
    from langchain_community.document_loaders import PyPDFLoader
    print(f'Loading {file}')
    loader = PyPDFLoader(file)
    data = loader.load()
    return data

#### Below text parses through extensions of files

In [15]:
def load_document(file):
    import os
    name, extension = os.path.splitext(file)

    if extension == '.pdf':
        from langchain_community.document_loaders import PyPDFLoader
        print(f'Loading {file}')
        loader = PyPDFLoader(file)
    elif extension == '.docx':
        from langchain_community.document_loaders import Docx2txtLoader
        print(f'Loading {file}')
        loader = Docx2txtLoader(file)
    else:
        print('Document format not supported')
        return None


    data = loader.load()
    return data


def load_from_wikipedia(query, lang='en', load_max_docs=2):
    from langchain_community.document_loaders import WikipediaLoader
    print(f'Loading {query} from Wikipedia')
    loader = WikipediaLoader(query=query, lang=lang, load_max_docs=load_max_docs)
    data = loader.load()
    return data


In [50]:
def chunk_data(data, chunk_size=256):
    from langchain.text_splitter import RecursiveCharacterTextSplitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=0
    )
    chunks = text_splitter.split_documents(data)
    return chunks




In [26]:
def print_embeddings_costs(texts):
    import tiktoken
    #enc = tiktoken.encoding_for_model('text-embedding-ada-002')
    enc = tiktoken.encoding_for_model('text-embedding-3-small')
    #enc = tiktoken.encoding_for_model('text-embedding-3-large')
    total_tokens = sum([len(enc.encode(page.page_content)) for page in texts])
    print(f'Total tokens: {total_tokens}')
    print(f'Total Embeddings cost in USD: {total_tokens / 1000 * 0.0004:.6f}')
# print(embeddings_costs(chunks))

### Embedding and Uploading to a Vector Database (Pinecone)

In [51]:
def insert_of_fetch_embeddings(index_name):
    import pinecone
    from langchain_community.vectorstores import Pinecone
    from langchain_openai import OpenAIEmbeddings
    from pinecone import PodSpec


    embeddings = OpenAIEmbeddings(model='text-embedding-3-small', dimensions=1536)
    pc = pinecone.Pinecone()

    if index_name in pc.list_indexes():
        print(f'Index {index_name} already exists.  Loading embeddings...', end='')
        vector_store = Pinecone.from_existing_index(index_name, embeddings)
        print('OK')
    else:
        print(f'Creating index {index_name} and embeddigns...', end='')

        # create a new index
        pc.create_index(
            name=index_name,
            metric='cosine',
            dimension=1536,
            spec=PodSpec(
                environment='gcp-starter'
            )
        )
        vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
        print('OK')

    return vector_store    

In [47]:
def delete_pineconde_index(index_name='all'):
    import pinecone
    import os
    #pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))    
    pc = pinecone.Pinecone()

    if index_name == 'all':
        indexes = pc.list_indexes().names()
        for index in indexes:
            print(f'Deleting all indexes...')
            pc.delete_index(index)
            print('OK')
    else:
        print(f'Deleting index {index_name}...', end='')
        pc.delete_index(index_name)
        print('OK')
        



## Running Code

In [22]:
data = load_document('resources/files/us_constitution.pdf')
#print(data[1].page_content)
#print(data[10].metadata)

print(f'You have {len(data)} pages in the document')
print(f'You have {len(data[20].page_content)} characters in page 20')

Loading resources/files/us_constitution.pdf
You have 41 pages in the document
You have 1137 characters in page 20


In [None]:
# data = load_document('resources/files/the_great_gatsby.docx')
# print(data[0].page_content)

In [None]:
#data = load_from_wikipedia('GPT-4')
# data = load_from_wikipedia('GPT-4', lang='fr')  # French ouptput
# print(data[0].page_content)

In [None]:
chunks = chunk_data(data)
print(len(chunks))
print(chunks[10].page_content)

In [27]:
print_embeddings_costs(chunks)

Total tokens: 16711
Total Embeddings cost in USD: 0.006684


In [48]:
delete_pineconde_index()

In [52]:
index_name = 'askadocument'
vector_store = insert_of_fetch_embeddings(index_name)


Creating index askadocument and embeddigns...OK


### Asking and Getting Answers

In [54]:
def ask_and_get_answer(vector_store, q, k=3):
    from langchain.chains import RetrievalQA
    from langchain_openai import ChatOpenAI

    llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})

    chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever)
    
    answer = chain.invoke(q)
    return answer

def ask_with_memory(vector_store, question, chat_history=[], k=3):
    from langchain.chains import ConversationalRetrievalChain
    from langchain_openai import ChatOpenAI
    
    llm = ChatOpenAI(temperature=1)
    retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k': k})
    
    crc = ConversationalRetrievalChain.from_llm(llm, retriever)
    result = crc.invoke({'question': question, 'chat_history': chat_history})
    chat_history.append((question, result['answer']))
    
    return result, chat_history
    

In [57]:
q = 'What is the whole document about?'
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'What is the whole document about?', 'result': 'The excerpts provided are from the United States Constitution, which is the supreme law of the United States. It outlines the framework for the government, the rights of citizens, and the relationship between the federal government and the states. It establishes the three branches of government: the legislative branch, the executive branch, and the judicial branch, as well as their powers and responsibilities. The Constitution ensures the protection of individual liberties and sets the foundation for the United States government.'}


In [61]:
import time
i = 1
print('Write Quit or Exit to quit!')
while True:
    q = input(f'Ask Question #{1}: ')
    i += 1
    if q.lower() in ['quit', 'exit']:
        print('Quitting....')
        time.sleep(2)
        break
    answer = ask_and_get_answer(vector_store, q)
    print(f'Answer: {answer}')
    print(f'\n {"-"*50} \n')




Write Quit or Exit to quit!
Answer: {'query': 'what is first amendment of constitution?', 'result': 'The First Amendment of the Constitution of the United States ensures the protection of freedoms such as freedom of religion, freedom of speech, freedom of the press, and the right to peaceably assemble and petition the government.'}

 -------------------------------------------------- 

Answer: {'query': 'explain the concept of federalism as presented in US consitution?', 'result': 'Federalism, as presented in the US Constitution, is the principle of shared power between the national government and state governments. The US Constitution divides governmental authority between the federal government (national government) and state governments. Both levels of government have their own separate powers and responsibilities, and this division of power helps to prevent any one level of government from becoming too powerful. This is an important aspect of the US Constitution and helps maintain 

In [62]:
delete_pineconde_index()

Deleting all indexes...
OK


In [63]:
data = load_from_wikipedia('ChatGPT', 'ro')
chunks = chunk_data(data)
index_name = 'chatgpt'
vector_store = insert_of_fetch_embeddings(index_name)

Loading ChatGPT from Wikipedia
Creating index chatgpt and embeddigns...OK


In [64]:
q = "Ce este ChatGPT?"
answer = ask_and_get_answer(vector_store, q)
print(answer)

{'query': 'Ce este ChatGPT?', 'result': 'ChatGPT este un membru al familiei de modele de limbaj generative pre-antrenate (GPT). A fost reglat fin peste o versiune îmbunătățită a GPT-3 a OpenAI cunoscută sub numele de „GPT-3.5”. Este utilizat pentru a genera texte pe baza input-ului furnizat de utilizatori și poate servi diverse scopuri, inclusiv educația.'}
