### Install all packages and libraries required

In [2]:
# One-time environment setup: uncomment the lines below and run this cell once
# in a fresh environment, then comment them out again.
# NOTE(review): versions are not pinned, so a later run may install releases
# that behave differently from the ones this notebook was written against.
# %pip install langchain
# %pip install huggingface_hub
# %pip install sentence_transformers
# %pip install faiss-cpu
# %pip install unstructured
# %pip install chromadb
# %pip install Cython
# %pip install tiktoken
# %pip install unstructured[local-inference]
# %pip install pypdf
# %pip install ipywidgets
# (unstructured is already listed above; the repeated entry is harmless)
# %pip install unstructured

### Set Hugging Face Access Token

In [3]:
import os
from getpass import getpass

# Name of the environment variable that holds the Hugging Face access token.
HF_TOKEN_ENV_VAR = "HUGGINGFACEHUB_API_TOKEN"

# Never hardcode a real token in a notebook: it ends up in version control and
# in every shared copy. Any token that was previously committed in this cell
# must be treated as leaked and revoked in the Hugging Face account settings.
# Use a token already exported in the environment; otherwise prompt for one
# without echoing it to the screen or storing it in the notebook.
if not os.environ.get(HF_TOKEN_ENV_VAR):
    os.environ[HF_TOKEN_ENV_VAR] = getpass("Hugging Face access token: ")

### Import Libraries

In [4]:
from langchain.document_loaders import TextLoader  # loader for plain-text files
from langchain.text_splitter import CharacterTextSplitter # splits documents into character-based chunks
from langchain.embeddings import HuggingFaceEmbeddings # embeddings backed by Hugging Face models
# Vectorstore: https://python.langchain.com/en/latest/modules/indexes/vectorstores.html
from langchain.vectorstores import FAISS  # Facebook's FAISS vector store
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub
from langchain.document_loaders import UnstructuredPDFLoader  # loads PDF files
from langchain.indexes import VectorstoreIndexCreator # builds a vector-store index (chromadb)
from langchain.chains import RetrievalQA
from langchain.document_loaders import UnstructuredURLLoader  # loads web pages (URLs) as documents

Could not import azure.core python package.


### Declare support function to print text

In [5]:
import textwrap
def wrap_text_preserve_newlines(text, width=110):
    """Wrap ``text`` to at most ``width`` columns, keeping its existing line breaks.

    The text is split on newline characters, each resulting line is wrapped on
    its own with ``textwrap.fill``, and the wrapped lines are joined back
    together with newline characters.

    Args:
        text: The string to wrap.
        width: Maximum line width passed to ``textwrap.fill`` (default 110).

    Returns:
        The wrapped text as a single string.
    """
    return '\n'.join(
        textwrap.fill(segment, width=width) for segment in text.split('\n')
    )

# Talk to PDF using a combination of FAISS and VectorstoreIndexCreator

### Load PDFs

We load the PDFs in two different ways, because each one yields a better result when building each of the two vector stores we are going to use.

In [6]:
# Load every PDF found in the "apostila_biologia/" folder with PyPDFDirectoryLoader.
# The resulting `docs` list is what the FAISS vector store is built from.
# Presumably the loader yields one document per PDF page — TODO confirm.
from langchain.document_loaders import PyPDFDirectoryLoader
loader = PyPDFDirectoryLoader("apostila_biologia/")
docs = loader.load()

In [7]:
# Text splitter: break the loaded documents into chunks of roughly 1000
# characters with a 10-character overlap between consecutive chunks.
# NOTE(review): `documents` is not referenced by any later cell visible in this
# notebook (the FAISS store is built from the unsplit `docs`) — confirm whether
# this cell is still needed.
from langchain.text_splitter import CharacterTextSplitter
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=10)
documents = text_splitter.split_documents(docs)

In [8]:
# Build one UnstructuredPDFLoader per PDF file -> This will be used by VectorstoreIndexCreator
import os
pdf_folder_path = 'apostila_biologia/'
# Keep only PDF files so stray directory entries (checkpoint folders, hidden
# files, ...) are not handed to the PDF loader, and sort the names because
# os.listdir returns entries in arbitrary order — sorting makes the index
# build reproducible across runs and machines.
pdf_file_names = sorted(
    fn for fn in os.listdir(pdf_folder_path) if fn.lower().endswith('.pdf')
)
loaders = [UnstructuredPDFLoader(os.path.join(pdf_folder_path, fn)) for fn in pdf_file_names]

### Load Embeddings

In [9]:
# Embeddings: create the Hugging Face embedding model with its default settings.
# `embeddings` is used below to build the FAISS vector store.
from langchain.embeddings import HuggingFaceEmbeddings
embeddings = HuggingFaceEmbeddings()

### Create Vector Stores

Running a FAISS similarity search on the PDFs first to get the document pages for the query, then using VectorstoreIndexCreator as the retriever in the chain, yields the best results.

In [10]:
# Build the FAISS vector store by embedding every document in `docs`.
# This should take some time (depending on pdf size)
from langchain.vectorstores import FAISS
db = FAISS.from_documents(docs, embeddings)

In [12]:
# Build the second vector store from the UnstructuredPDFLoader loaders.
# This should take some time (depending on pdf size)
# Chunk size warnings here can be ignored
# Reuse the `embeddings` object created earlier instead of instantiating a
# second, identical HuggingFaceEmbeddings(): both use the default settings, so
# loading the model twice only costs extra time and memory.
index = VectorstoreIndexCreator(
    embedding=embeddings,
    text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loaders)

Created a chunk of size 1138, which is longer than the specified 1000
Created a chunk of size 1897, which is longer than the specified 1000
Created a chunk of size 1715, which is longer than the specified 1000
Created a chunk of size 3771, which is longer than the specified 1000
Created a chunk of size 1297, which is longer than the specified 1000
Created a chunk of size 2006, which is longer than the specified 1000
Created a chunk of size 2777, which is longer than the specified 1000
Created a chunk of size 1752, which is longer than the specified 1000
Created a chunk of size 3268, which is longer than the specified 1000
Created a chunk of size 1054, which is longer than the specified 1000
Created a chunk of size 1011, which is longer than the specified 1000
Created a chunk of size 2214, which is longer than the specified 1000
Created a chunk of size 1146, which is longer than the specified 1000
Created a chunk of size 1127, which is longer than the specified 1000
Created a chunk of s

### Hugging Face LLM

In [13]:
# Set the Hugging Face model repository name.
# It must be a text2text-generation or text-generation model.
# Change this value to use a different LLM.
# Find candidates here: https://huggingface.co/models?pipeline_tag=text2text-generation&sort=downloads
# The model MUST have its Hosted Inference API active.
# The larger the model, the more likely the API call is to time out.

model_repo_name = "unicamp-dl/ptt5-base-portuguese-vocab"


In [14]:
from langchain import HuggingFaceHub
from langchain.chains import RetrievalQA
from langchain.chains.question_answering import load_qa_chain

# Settings forwarded to the hosted model selected in `model_repo_name`
generation_kwargs = {"temperature":0, "max_length":512}
llm = HuggingFaceHub(repo_id=model_repo_name, model_kwargs=generation_kwargs)

# Question-answering pipeline: a "stuff" RetrievalQA chain whose retriever is
# backed by the vector store built by VectorstoreIndexCreator; the chain reads
# the user's query from the "question" input key.
pdf_retriever = index.vectorstore.as_retriever()
chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=pdf_retriever,
    input_key="question",
)

### Ask Questions

In [19]:
# Look up the stored PDF documents most similar to the query in the FAISS store.
# This should return pages containing the answer to our query.
# (Query in English: "What is the difference between phloem and xylem?")
query = "Qual a diferença de floema e xilema?"
pages = db.similarity_search(query)

In [20]:
# Run the question-answering chain for the query, handing it the FAISS pages.
# The first query should take some time.
# NOTE(review): `chain` is a RetrievalQA chain with its own retriever, so the
# `input_documents=pages` argument may be ignored in favour of the retriever's
# results — confirm against the installed langchain version.
raw_answer_length_to_skip = 163  # leading characters left over from a langchain prompt template
answer = chain.run(input_documents=pages, question=query)
# Throw away the leftover prompt sentence, then wrap the rest for display
trimmed_answer = answer[raw_answer_length_to_skip:]
print(wrap_text_preserve_newlines(trimmed_answer))

Quando faltam elementos fl orais (também denominados verticilos), a fl or é chamada de incompleta. Quando um
dos verticilos férteis está ausente, a fl or é denominada díclina: ou possui apenas gineceu (fl or pistilada),
ou possui apenas androceu (fl or estaminada). A fl or que possui gineceu e androceu é denominada monóclina.
Flor monóclina Flores díclinas Pistilada e estaminada Flor pistilada Flor estaminada Possui estruturas
femininas (car- pelos) e masculinas (estames) Possui somente estruturas femininas (carpelos) Possui somente
estruturas masculinas (estames) Fonte: elaborada pela autora.
Fonte:https://commons.wikimedia.org/wiki/File:Flower_morphology_sex_staminate.png, https://commons.
wikimedia.org/wiki/File:Flower_morphology_attachment_pedicellate., https://commons.wikimedia.org/wiki/File:-
Flower_morphology_sex_pistillate.pngpng?uselang=pt-br Figura 69: Flor da família Magnoliaceae. Ao centro,
vários carpelos. Fonte: http://www.sxc.hu/photo/815609 Figura70: Flores da família L

# Talk to websites using VectorstoreIndexCreator

Here we will only use VectorstoreIndexCreator to show another way to use langchain, but this can easily be modified to use FAISS

### Load urls

In [None]:
# Portuguese Wikipedia articles used as the knowledge base for this section.
from langchain.document_loaders import UnstructuredURLLoader
urls = [
    "https://pt.wikipedia.org/wiki/Mitose",
    "https://pt.wikipedia.org/wiki/Meiose",
    "https://pt.wikipedia.org/wiki/Divis%C3%A3o_celular",
    "https://pt.wikipedia.org/wiki/Sistema_imunit%C3%A1rio",
    "https://pt.wikipedia.org/wiki/Mam%C3%ADferos",
    "https://pt.wikipedia.org/wiki/Floema",
    "https://pt.wikipedia.org/wiki/Xilema",
    "https://pt.wikipedia.org/wiki/Angiosperma",
]
# A single loader handles all URLs; it is wrapped in a list because it is
# passed to VectorstoreIndexCreator.from_loaders in the next cell.
loader_url = [UnstructuredURLLoader(urls=urls)]

### Create Vector Store using VectorstoreIndexCreator

In [None]:
# Build the vector store for the web pages.
# Reuse the `embeddings` object created in the PDF section instead of
# instantiating another identical HuggingFaceEmbeddings(): both use the default
# settings, so loading the model again only costs extra time and memory.
# Chunk size warnings printed here can be ignored.
index_url = VectorstoreIndexCreator(
    embedding=embeddings,
    text_splitter=CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)).from_loaders(loader_url)

Created a chunk of size 1096, which is longer than the specified 1000
Created a chunk of size 1717, which is longer than the specified 1000
Created a chunk of size 1860, which is longer than the specified 1000
Using embedded DuckDB without persistence: data will be transient


### Hugging Face LLM

In [None]:
# Set the Hugging Face model repository name for the website section.
# It must be a text2text-generation or text-generation model.
# Change this value to use a different LLM.
# Find candidates here: https://huggingface.co/models?pipeline_tag=text2text-generation&sort=downloads
# The model MUST have its Hosted Inference API active.
# The larger the model, the more likely the API call is to time out.

model_repo_name_url = "unicamp-dl/ptt5-base-portuguese-vocab"

In [None]:
from langchain.chains import RetrievalQA

# Load the hosted model selected in `model_repo_name_url`
generation_kwargs_url = {"temperature":0, "max_length":512}
llm_url = HuggingFaceHub(repo_id=model_repo_name_url, model_kwargs=generation_kwargs_url)

# Create the chain: a "stuff" RetrievalQA chain whose retriever is backed by
# the web-page vector store; the query is read from the "question" input key.
url_retriever = index_url.vectorstore.as_retriever()
chain_url = RetrievalQA.from_chain_type(
    llm=llm_url,
    chain_type="stuff",
    retriever=url_retriever,
    input_key="question",
)

### Ask Questions

In [None]:
# Ask the website chain a question and print the wrapped answer.
# (Query in English: "What is the function of the phloem?")
query_url = "Qual a função do floema?"
prompt_echo_chars_url = 163  # leading characters left over from a langchain prompt template
answer_url = chain_url.run(query_url)
# Throw away the leftover prompt sentence, then wrap the rest for display
trimmed_answer_url = answer_url[prompt_echo_chars_url:]
print(wrap_text_preserve_newlines(trimmed_answer_url))

Os produtos transportados pelo floema são substâncias inorgânicas e orgânicas, como água, lipídios e
carboidratos, são transportados desde os órgãos da planta com capacidade fotossintética (ou produtores), como
folhas maduras, até outros que funcionam como consumidores dessas substâncias, para a formação de novos órgãos
ou para reserva, nomeadamente, os meristemas, as células do interior do caule, da raiz, das flores, dos frutos
e dos órgãos de reserva - que podem estar dispersos dentro do caule e da raiz, mas que podem estar
especializados, como os tubérculos e rizomas. Ocorrência[editar | editar código-fonte] O floema está presente
praticamente em toda fase da vida da planta, tanto estrutura primária, na qual a planta ainda está em sua
forma jovem, quanto em estrutura secundária na qual os órgãos adquirem uma certa resistência. Ocorre em todas
as partes da planta: caule, raiz, folha, partes florais etc. Normalmente, durante o crescimento primário (em
altura), o floema e o xilema se a

References:

https://www.python-engineer.com/posts/langchain-crash-course/

https://python.langchain.com/en/latest/modules/models/llms/integrations/huggingface_hub.html

https://python.langchain.com/en/latest/_modules/langchain/embeddings/huggingface.html

https://medium.com/the-techlife/using-huggingface-openai-and-cohere-models-with-langchain-db57af14ac5b

https://python.langchain.com/en/latest/reference/modules/embeddings.html

https://artificialcorner.com/answering-question-about-your-documents-using-langchain-and-not-openai-2f75b8d639ae
