In [None]:
import os
from dotenv import find_dotenv, load_dotenv

# Locate a .env file with find_dotenv() and load it; the return value is
# bound to `_` only so that the cell shows no output.
_ = load_dotenv(find_dotenv())

# Indexing os.environ directly raises KeyError when GROQ_API_KEY is not set.
groq_api_key = os.environ["GROQ_API_KEY"]

In [None]:
from langchain_community.document_loaders import TextLoader

# Create the loader for the essay file, read it, and print the loaded result
# in full. Later cells index the result as loaded_data[0].page_content.
loader = TextLoader("data/be-good.txt")
loaded_data = loader.load()
print(loaded_data)

## Character Text Splitter


In [None]:
from langchain_text_splitters import CharacterTextSplitter

# Splitter configured with a literal (non-regex) blank-line separator,
# chunk_size=1000 and chunk_overlap=200, with lengths measured by len().
text_splitter = CharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separator="\n\n",
    is_separator_regex=False,
    length_function=len,
)

# Feed the raw text of the first loaded document to the splitter and show
# the resulting chunk documents as the cell output.
texts = text_splitter.create_documents([loaded_data[0].page_content])
texts


In [None]:
# Count the entries currently held in `texts`.
len(texts)

In [None]:
# Display the first entry of `texts`.
texts[0]

In [None]:
# Display the second entry of `texts`.
texts[1]

## RecursiveCharacterTextSplitter


In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Recursive splitter with very small settings: chunk_size=26, chunk_overlap=4.
recursive_splitter = RecursiveCharacterTextSplitter(chunk_overlap=4, chunk_size=26)

# NOTE: this rebinds `texts`, replacing whatever an earlier cell stored there.
texts = recursive_splitter.split_text(loaded_data[0].page_content)
texts




## Embeddings
- Transform small chunks of text into numbers (vectors) that vector databases can easily store and search.

In [None]:
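# Alternative (disabled): hosted OpenAI embeddings instead of the local Hugging Face model used below.
# from langchain_openai import OpenAIEmbeddings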
# embedding_model=OpenAIEmbeddings()
# chunks_of_text =     [
#         "Hi there!",
#         "Hello!",
#         "What's your name?",
#         "Bond, James Bond",
#         "Hello Bond!"
#     ]
# embeddings=embedding_model.embed_documents(chunks_of_text)
# embeddings

In [None]:
 chunks_of_text =[
        "Hi there!",
        "Hello!",
        "What's your name?",
        "Bond, James Bond",
        "Hello Bond!"
    ]

In [None]:
# Requires: pip install sentence-transformers
# HuggingFaceEmbeddings is imported from langchain_community, which this
# notebook already depends on. The legacy `langchain.embeddings` re-export is
# deprecated and is no longer available in recent LangChain releases.
from langchain_community.embeddings import HuggingFaceEmbeddings

# Embed every sample sentence with the all-MiniLM-L6-v2 sentence-transformers
# model; `embeddings` holds the result of embed_documents for chunks_of_text.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
embeddings = embedding_model.embed_documents(chunks_of_text)


In [None]:
# Display the full contents of `embeddings` as the cell output.
embeddings

In [None]:
# Prints the number of embedding vectors (one per text chunk).
print(len(embeddings))
# Prints the length of the first embedding vector (its number of dimensions/features).
print(len(embeddings[0]))


In [None]:
# Print the first five values of the first embedding vector.
print(embeddings[0][:5])


## Vector Stores (a.k.a. Vector Databases)
- Store embeddings in a database optimized for fast similarity search.


In [None]:
# Imports for the vector-store sections. HuggingFaceEmbeddings comes from
# langchain_community because the legacy `langchain.embeddings` re-export is
# deprecated and is no longer available in recent LangChain releases.
from langchain_chroma import Chroma
from langchain_community.document_loaders import TextLoader
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_text_splitters import CharacterTextSplitter



In [None]:
# Read the State of the Union text file and split it with chunk_size=1000 and
# no overlap; the final expression shows how many chunks were produced.
loaded_document = TextLoader("data/state_of_the_union.txt").load()
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
chunk_of_text = text_splitter.split_documents(loaded_document)
len(chunk_of_text)

In [None]:
# Build a Chroma vector store from the chunks, embedding them with a freshly
# constructed all-MiniLM-L6-v2 sentence-transformers model.
vector_db = Chroma.from_documents(
    chunk_of_text,
    HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2"),
)

In [None]:
# Display the repr of the current `vector_db` object.
vector_db

In [None]:
# Query the vector store with a natural-language question and print the text
# of the first result returned by similarity_search.
question = "What did the president say about the John Lewis Voting Rights Act?"
response = vector_db.similarity_search(question)
print(response[0].page_content)

## Vector Store as Retriever
- Retrieve the stored chunks whose embeddings are most similar to your question.

In [None]:
from langchain_community.vectorstores import FAISS


In [None]:
# Reload and re-split the State of the Union text for the FAISS example.
loaded_document = TextLoader("data/state_of_the_union.txt").load()
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
chunk_of_text = text_splitter.split_documents(loaded_document)

# Instantiate the embedding model once and pass that same instance to FAISS,
# so the model is not constructed (and loaded) a second time.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
vector_db = FAISS.from_documents(chunk_of_text, embeddings)

In [None]:
# Prerequisite for the FAISS cells: conda install -c conda-forge faiss-cpu


In [None]:
# Display the repr of the current `vector_db` object.
vector_db


In [None]:
# Expose the vector store through the retriever interface; search_kwargs
# {"k": 1} should cap each query at a single returned document.
retriever = vector_db.as_retriever(search_kwargs={"k": 1})

In [None]:
# Run the retriever on a question and show what it returned as the cell output.
response = retriever.invoke("what did he say about ketanji brown jackson?")
response

In [None]:
# Number of items in `response`.
len(response)