In [2]:
import os
import warnings

# Respect a key that is already exported in the environment instead of
# overwriting it with an empty placeholder. Never paste a real key here.
if not os.environ.setdefault("OPENAI_API_KEY", ""):
    warnings.warn(
        "OPENAI_API_KEY is empty; export it before starting Jupyter, "
        "otherwise the OpenAI calls in this notebook are expected to fail."
    )

In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.embeddings  import OpenAIEmbeddings
from langchain.vectorstores import FAISS
from langchain.prompts  import PromptTemplate
from langchain_openai  import  ChatOpenAI

In [4]:
def process_document(path, chunk_size=1000, chunk_overlap=200, embeddings=None):
    """Load a PDF, split it into overlapping chunks, and index them in FAISS.

    Args:
        path: Location of the PDF file handed to ``PyPDFLoader``.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, measured with ``len``.
        embeddings: Optional embedding model used to build the index. When
            omitted, a new ``OpenAIEmbeddings()`` is created, matching the
            previous behaviour.

    Returns:
        The FAISS vector store built from the chunked document.

    Raises:
        ValueError: If splitting the loaded PDF produces no chunks, so there
            is nothing to index.
    """
    # Load PDF documents
    documents = PyPDFLoader(path).load()

    # Split documents into chunks
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap, length_function=len
    )
    texts = text_splitter.split_documents(documents)
    if not texts:
        # Fail early with a readable message instead of handing an empty
        # list to the vector store.
        raise ValueError(f"No text chunks could be produced from {path!r}; nothing to index.")

    # Create embeddings (unless the caller supplied a model) and the vector store
    if embeddings is None:
        embeddings = OpenAIEmbeddings()
    return FAISS.from_documents(texts, embeddings)

In [5]:
# PDF to index. Set the HYDE_PDF_PATH environment variable to point at a
# different file without editing the notebook; the original location is the default.
path = os.environ.get("HYDE_PDF_PATH", "/home/erginous/Desktop/attention.pdf")

In [6]:
# Build the FAISS index for the configured PDF using the default chunking.
vectorstore = process_document(path=path)

  embeddings = OpenAIEmbeddings()


In [9]:
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
embeddings = OpenAIEmbeddings()

# Length, in characters, requested from the model for the hypothetical document.
document_len = 1000

# HyDE prompt: ask the model to write a passage that answers the question.
# The generated passage, not the short question, is what gets searched with.
hyde_prompt = PromptTemplate(
    input_variables=["query", "document_len"],
    template="""Given the question '{query}', generate a hypothetical document that directly answers this question. The document should be detailed and in-depth.
            the document size has to be exactly {document_len} characters.""",
)

# Pipe the filled-in prompt straight into the chat model.
hyde_chain = hyde_prompt | llm

query = "What is positional encoding ?"
hypothetical_docs = hyde_chain.invoke({"query": query, "document_len": document_len})

In [17]:
# Display the reply object returned by the HyDE chain.
hypothetical_docs

AIMessage(content='**Positional Encoding: An In-Depth Overview**\n\nPositional encoding is a technique used in neural networks, particularly in transformer architectures, to provide information about the position of tokens in a sequence. Unlike recurrent neural networks (RNNs), which inherently process data in order, transformers process all tokens simultaneously. This parallel processing necessitates a method to retain the sequential order of the input data.\n\nPositional encodings are added to the input embeddings of tokens to inject this positional information. Typically, these encodings are generated using sine and cosine functions of varying frequencies. The formula for the positional encoding for a position \\( pos \\) and dimension \\( i \\) is:\n\n\\[\nPE(pos, 2i) = \\sin\\left(\\frac{pos}{10000^{\\frac{2i}{d_{model}}}}\\right)\n\\]\n\\[\nPE(pos, 2i+1) = \\cos\\left(\\frac{pos}{10000^{\\frac{2i}{d_{model}}}}\\right)\n\\]\n\nHere, \\( d_{model} \\) is the dimensionality of the e

In [13]:
# Search the index with the generated passage text and keep the top 3 matches.
context_docs = vectorstore.similarity_search(
    hypothetical_docs.content,
    k=3,
)

In [15]:
# Number of documents returned by the similarity search.
len(context_docs)

3

In [16]:
# Show the retrieved documents (metadata and page content).
context_docs

[Document(metadata={'source': '/home/erginous/Desktop/attention.pdf', 'page': 5}, page_content='tokens in the sequence. To this end, we add "positional encodings" to the input embeddings at the\nbottoms of the encoder and decoder stacks. The positional encodings have the same dimension dmodel\nas the embeddings, so that the two can be summed. There are many choices of positional encodings,\nlearned and fixed [9].\nIn this work, we use sine and cosine functions of different frequencies:\nPE(pos,2i)=sin(pos/100002i/d model)\nPE(pos,2i+1)=cos(pos/100002i/d model)\nwhere posis the position and iis the dimension. That is, each dimension of the positional encoding\ncorresponds to a sinusoid. The wavelengths form a geometric progression from 2πto10000 ·2π. We\nchose this function because we hypothesized it would allow the model to easily learn to attend by\nrelative positions, since for any fixed offset k,PEpos+kcan be represented as a linear function of\nPEpos.\nWe also experimented with usi