In [1]:
## Data ingestion: read a local plain-text file into LangChain documents
from langchain_community.document_loaders import TextLoader

# Point the loader at the text file, then read it.
loader = TextLoader(file_path="speech.txt")
text_documents = loader.load()


In [2]:
import os
from dotenv import load_dotenv

# Load variables from a .env file into the process environment.
load_dotenv()

# os.environ only accepts str values, so assigning os.getenv(...) directly
# raises an opaque "str expected, not NoneType" TypeError when the key is
# missing. Check first and fail with an actionable message instead.
openai_api_key = os.getenv("OPENAI_API_KEY")
if openai_api_key is None:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; define it in your .env file or in the environment."
    )
os.environ["OPENAI_API_KEY"] = openai_api_key

In [3]:
# Web page loader
from langchain_community.document_loaders import WebBaseLoader
import bs4

## Load the HTML page, keeping only the post's title, header and body.
# The class names must be hyphenated ("post-content", "post-header") to match
# the page's markup; the underscore spellings used previously would not match
# those elements, so the article body would be filtered out by the strainer.
loader = WebBaseLoader(
    web_paths=("https://lilianweng.github.io/posts/2023-06-23-agent/",),
    bs_kwargs=dict(
        parse_only=bs4.SoupStrainer(
            class_=("post-title", "post-content", "post-header")
        )
    ),
)
text_documents = loader.load()

In [4]:
# PDF ingestion: read a local PDF into LangChain documents
from langchain_community.document_loaders import PyPDFLoader

# Replaces whatever text_documents held before this cell ran.
loader = PyPDFLoader(file_path="attention.pdf")
text_documents = loader.load()

In [5]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the currently loaded documents into overlapping chunks.
# Note: this uses whichever loader last assigned text_documents.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
)
documents = text_splitter.split_documents(documents=text_documents)

In [6]:
## Embed the chunks and index them in a Chroma vector store
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

# Build the Chroma index from the split documents, embedding them with the
# Ollama "mxbai-embed-large" model.
db1 = Chroma.from_documents(
    documents=documents,
    embedding=OllamaEmbeddings(model="mxbai-embed-large"),
)

# Run a similarity search against the index using a full sentence as the query.
query = "We trained our models on one machine with 8 NVIDIA P100 GPUs."
result = db1.similarity_search(query=query)

In [12]:
## Index the same chunks in a FAISS vector store
from langchain_community.vectorstores import FAISS

# OllamaEmbeddings and documents come from earlier cells in the notebook.
db2 = FAISS.from_documents(
    documents=documents,
    embedding=OllamaEmbeddings(model="mxbai-embed-large"),
)

# Same query sentence as the Chroma cell; this overwrites `result`.
query = "We trained our models on one machine with 8 NVIDIA P100 GPUs."
result = db2.similarity_search(query=query)

In [13]:
# Display the hits of the most recent similarity search
# (the FAISS query when the cells are run in order).
result

[Document(page_content='We trained our models on one machine with 8 NVIDIA P100 GPUs. For our base models using\nthe hyperparameters described throughout the paper, each training step took about 0.4 seconds. We\ntrained the base models for a total of 100,000 steps or 12 hours. For our big models,(described on the\nbottom line of table 3), step time was 1.0 seconds. The big models were trained for 300,000 steps\n(3.5 days).\n5.3 Optimizer\nWe used the Adam optimizer [ 20] with β1= 0.9,β2= 0.98andϵ= 10−9. We varied the learning\nrate over the course of training, according to the formula:\nlrate =d−0.5\nmodel·min(step_num−0.5, step _num·warmup _steps−1.5) (3)\nThis corresponds to increasing the learning rate linearly for the first warmup _steps training steps,\nand decreasing it thereafter proportionally to the inverse square root of the step number. We used\nwarmup _steps = 4000 .\n5.4 Regularization\nWe employ three types of regularization during training:\n7', metadata={'source': 'atte