In [None]:
## Data Ingestion
from langchain_community.document_loaders import TextLoader

# Relative path: resolved against the kernel's current working directory.
speech_path = "speech.txt"

loader = TextLoader(speech_path)
text_documents = loader.load()

# Bare last expression so the notebook renders what was loaded.
text_documents

In [7]:
import os
from dotenv import load_dotenv

# Read variables from a .env file before looking the key up.
load_dotenv()

# Fetch the OpenAI API key from the process environment and fail fast
# when it is missing or empty, so later cells do not fail with a less
# obvious authentication error.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
if OPENAI_API_KEY is None or OPENAI_API_KEY == "":
    raise ValueError("OPENAI_API_KEY is not set in the .env file")

# Write the value back under the same name. It was just read from the
# environment, so this keeps the variable explicitly exported.
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

In [8]:
# web based loader
import os

# USER_AGENT has to be in the environment *before* WebBaseLoader is imported.
# NOTE(review): in current langchain_community releases the web loader module
# builds its default request headers (and emits the "USER_AGENT environment
# variable not set" warning) at import time, so assigning it after the import
# has no effect on the headers actually sent -- confirm against the installed
# version.
os.environ["USER_AGENT"] = "MyScript/1.0"

from langchain_community.document_loaders import WebBaseLoader
import bs4

## Load the HTML page. Parsing is restricted to elements whose CSS class is
## one of post-title / post-content / post-header, so everything else on the
## page is skipped. This cell only loads; chunking and indexing happen in
## other cells.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-title", "post-content", "post-header")
        )
    ),
)

# NOTE: this rebinds `text_documents`, replacing whatever an earlier cell
# stored under that name.
text_documents = loader.load()

In [None]:
# Display the current value of `text_documents` (the complete list, which can
# produce a long output). When cells are run top to bottom this is the result
# of the web loader cell directly above.
text_documents

In [10]:
## Pdf reader
from langchain_community.document_loaders import PyPDFLoader

# Relative path: resolved against the kernel's current working directory.
pdf_path = "attention.pdf"

# `loader` is rebound here to a PDF loader; `docs` holds what it returns.
loader = PyPDFLoader(pdf_path)
docs = loader.load()

In [None]:
# Display the current value of `docs` (the complete list returned by the PDF
# loader when cells are run in order; the output can be long).
docs

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter settings collected in one place: target chunk size of 1000 with an
# overlap of 200 between consecutive chunks.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}

text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
documents = text_splitter.split_documents(docs)

# Preview only the first five chunks rather than dumping the whole list.
documents[:5]

In [None]:
## Vector Embedding And Vector Store
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Only the first 20 chunks are indexed, not the whole document set.
chroma_chunks = documents[:20]
chroma_embeddings = OpenAIEmbeddings()

# `db` is the Chroma-backed store queried by the following cell.
db = Chroma.from_documents(chroma_chunks, chroma_embeddings)

In [None]:
# Run a similarity search against whatever store `db` currently refers to and
# print the text of the first result returned.
query = "Who are the authors of attention is all you need?"

# The variable name is misspelled; it is kept unchanged because it is a
# notebook-global that other cells may read.
retireved_results = db.similarity_search(query)

top_hit = retireved_results[0]
print(top_hit.page_content)

In [16]:
## FAISS Vector Database
from langchain_community.vectorstores import FAISS

# Only the first 15 chunks are indexed here.
faiss_chunks = documents[:15]
faiss_embeddings = OpenAIEmbeddings()

# NOTE: this rebinds `db`; from here on it refers to the FAISS index, not the
# store created in an earlier cell.
db = FAISS.from_documents(faiss_chunks, faiss_embeddings)

In [None]:
# Same question as before, asked of whatever store `db` currently refers to
# (the FAISS index when cells are run in order); prints the first result's text.
query = "Who are the authors of attention is all you need?"

# Misspelled name kept unchanged: it is a notebook-global other cells may read.
retireved_results = db.similarity_search(query)

best_match = retireved_results[0]
print(best_match.page_content)