# Document Loaders

### Tipos de Document Loaders no LangChain

![Document loaders](arquivos/loaders.png)

## Carregando PDFs

In [None]:
# Load the sample PDF from disk; the result is kept in `documentos` for the
# cells that follow.
from langchain_community.document_loaders.pdf import PyPDFLoader

caminho = 'arquivos/Explorando o Universo das IAs com Hugging Face.pdf'
loader = PyPDFLoader(file_path=caminho)
documentos = loader.load()

In [None]:
# Number of documents the loader returned.
len(documentos)

In [None]:
# Print the text of the fourth loaded document (index 3).
print(documentos[3].page_content)

In [None]:
# Metadata stored alongside the fourth loaded document (index 3).
documentos[3].metadata

### Fazendo perguntas para o arquivo

In [None]:
# Build the question-answering chain used below: an OpenAI chat model wrapped
# in a 'stuff' QA chain, with verbose output switched on.
from langchain.chains.question_answering import load_qa_chain
from langchain_openai.chat_models import ChatOpenAI

chat = ChatOpenAI(model='gpt-3.5-turbo-0125')
chain = load_qa_chain(
    chat,
    chain_type='stuff',
    verbose=True,
)

In [None]:
# Question to ask about the loaded documents.
pergunta = 'Quais assuntos são tratados no documento?'

# Only the first 10 loaded documents are passed to the chain, so the answer
# reflects those documents rather than the whole file.
# `Chain.run` is deprecated; `invoke` takes all inputs as one dict and returns
# a dict, whose 'output_text' entry is the answer string `run` used to return.
chain.invoke({'input_documents': documentos[:10], 'question': pergunta})['output_text']

## Carregando CSV

In [None]:
# Load the IMDB CSV file; the result replaces the previous contents of
# `documentos`.
from langchain_community.document_loaders.csv_loader import CSVLoader

caminho = 'arquivos/Top 1000 IMDB movies.csv'
loader = CSVLoader(file_path=caminho)
documentos = loader.load()

In [None]:
# Number of documents the CSV loader returned.
len(documentos)

In [None]:
# Print the text of the third loaded document (index 2).
print(documentos[2].page_content)

In [None]:
# Metadata stored alongside the third loaded document (index 2).
documentos[2].metadata

In [None]:
# Build the question-answering chain for this section: an OpenAI chat model
# wrapped in a 'stuff' QA chain, with verbose output switched on.
from langchain.chains.question_answering import load_qa_chain
from langchain_openai.chat_models import ChatOpenAI

chat = ChatOpenAI(model='gpt-3.5-turbo-0125')
chain = load_qa_chain(
    chat,
    chain_type='stuff',
    verbose=True,
)

In [None]:
# Question to ask about the loaded documents.
pergunta = 'Qual é o filme com maior metascore?'
# Only the first 10 loaded documents are passed to the chain, so the answer can
# only reflect those documents, not the entire file.
# `Chain.run` is deprecated; `invoke` takes all inputs as one dict and returns
# a dict, whose 'output_text' entry is the answer string `run` used to return.
chain.invoke({'input_documents': documentos[:10], 'question': pergunta})['output_text']

## Carregando da Internet

### YouTube

In [None]:
# Pieces combined in the next cell: a generic loader, a YouTube audio blob
# loader and the OpenAI Whisper parser.
from langchain_community.document_loaders.generic import GenericLoader
from langchain_community.document_loaders.blob_loaders.youtube_audio import YoutubeAudioLoader
# Imported from langchain_community like every other loader in this notebook;
# the `langchain.document_loaders` path is a deprecated re-export.
from langchain_community.document_loaders.parsers.audio import OpenAIWhisperParser

In [None]:
# Video to process and the directory handed to the audio loader
# (presumably where the fetched audio is written - confirm).
url = 'https://www.youtube.com/watch?v=rOjusRRO1EI'
save_dir = 'docs/youtube/'

# Pair the YouTube audio blob loader with the Whisper parser and load the
# resulting documents into `docs`.
loader = GenericLoader(
    blob_loader=YoutubeAudioLoader(urls=[url], save_dir=save_dir),
    blob_parser=OpenAIWhisperParser(),
)
docs = loader.load()

In [None]:
# Number of documents produced for the video.
len(docs)

In [None]:
# Print the first 500 characters of the first document's text.
print(docs[0].page_content[:500])

In [None]:
# Metadata of the first document. Index 0 is used because a single URL was
# loaded and nothing guarantees that a second document exists; indexing
# docs[1] would raise IndexError when only one document is returned.
docs[0].metadata

### URLs

In [None]:
# Load a web page by URL; the result replaces the previous contents of
# `documentos`.
from langchain_community.document_loaders.web_base import WebBaseLoader

url = 'https://hub.asimov.academy/blog/listas-em-python/'
loader = WebBaseLoader(web_path=url)
documentos = loader.load()

In [None]:
# Number of documents the web loader returned.
len(documentos)

In [None]:
# Print characters 1000 through 1999 of the first document's text.
print(documentos[0].page_content[1000:2000])

## Carregando do Notion

In [None]:
# Load the Notion directory under 'arquivos/notion_db'; the result replaces
# the previous contents of `documentos`.
from langchain_community.document_loaders.notion import NotionDirectoryLoader

caminho = 'arquivos/notion_db'
loader = NotionDirectoryLoader(path=caminho)
documentos = loader.load()

In [None]:
# Number of documents the Notion loader returned.
len(documentos)

In [None]:
# Print the full text of the first loaded document.
print(documentos[0].page_content)

In [None]:
# Metadata of the third loaded document (index 2).
# NOTE(review): this is a different document from the one printed with index 0,
# and it requires at least three loaded documents - confirm this is intended.
documentos[2].metadata