# Example 

Let's extract information from web pages using Unstructured.

https://python.langchain.com/docs/integrations/document_loaders/unstructured_file/#overview

In [2]:
import os
from dotenv import load_dotenv
# Load environment variables from a .env file; the returned flag is shown as the cell output.
# NOTE(review): presumably this provides the OpenAI API key needed by later cells -- confirm.
load_dotenv()

True

In [50]:
from langchain_unstructured import UnstructuredLoader

# Page to parse; Unstructured fetches it and splits it into elements
page_url = "https://python.langchain.com/docs/integrations/document_loaders/unstructured_file/#loading-web-pages"

loader = UnstructuredLoader(web_url=page_url)
docs = loader.load()

# Print every parsed element (content + metadata), each followed by a blank line
for element in docs:
    print(element, end="\n\n")

page_content='Open In Colab' metadata={'image_url': 'https://colab.research.google.com/assets/colab-badge.svg', 'link_texts': ['Open In Colab'], 'link_urls': ['https://colab.research.google.com/github/langchain-ai/langchain/blob/master/docs/docs/integrations/document_loaders/unstructured_file.ipynb'], 'languages': ['eng'], 'filetype': 'text/html', 'url': 'https://python.langchain.com/docs/integrations/document_loaders/unstructured_file/#loading-web-pages', 'category': 'Image', 'element_id': '76f10732f139a03f24ecf55613a5116a'}

page_content='Open on GitHub' metadata={'image_url': 'https://img.shields.io/badge/Open%20on%20GitHub-grey?logo=github&logoColor=white', 'link_texts': ['Open on GitHub'], 'link_urls': ['https://github.com/langchain-ai/langchain/blob/master/docs/docs/integrations/document_loaders/unstructured_file.ipynb'], 'languages': ['eng'], 'filetype': 'text/html', 'url': 'https://python.langchain.com/docs/integrations/document_loaders/unstructured_file/#loading-web-pages', 'c

In [54]:
# Chunk the loaded documents, embed the chunks, and index them in an in-memory vector store
from langchain_core.vectorstores import InMemoryVectorStore
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Step 1: split the documents into small overlapping chunks.
# Sizes are measured in characters; add_start_index tracks each chunk's
# index in the original document.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200, add_start_index=True)
all_splits = text_splitter.split_documents(docs)

# Step 2: pick the embedding model and build the vector store on top of it
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")
vector_store = InMemoryVectorStore(embeddings)

# Step 3: add the chunks to the vector store, keeping the returned ids
document_ids = vector_store.add_documents(all_splits)

INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


In [59]:
# Question used for the vector-store similarity search and inserted into the LLM prompt
question = "Show me an example of a python code of how to load a web-page in langchain."

In [60]:


from langchain.chat_models import init_chat_model

# Retrieve the chunks that best match the query; each hit is a (document, score) pair
search_results = vector_store.similarity_search_with_score(question, k=10)

# Combine the text of the retrieved chunks into one long string, separated by blank lines
chunk_texts = [hit[0].page_content for hit in search_results]
doc_content = "\n\n".join(chunk_texts)

# RAG prompt template (adjacent literals are concatenated into a single string)
prompt_template = (
    "You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.\n"
    "Question: {question} \n"
    "Context: {context} \n"
    "Answer:"
)

# Have the LLM read the filled-in prompt, analyse the retrieved context, and generate a response
llm = init_chat_model("gpt-4o-mini", model_provider="openai")
filled_prompt = prompt_template.format(question=question, context=doc_content)
response = llm.invoke(filled_prompt)

print(response.content)

INFO: HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO: HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


To load a web page using LangChain, you can use the following Python code:

```python
from langchain_unstructured import UnstructuredLoader

loader = UnstructuredLoader(web_url="https://www.example.com")
docs = loader.load()

for doc in docs:
    print(f"{doc}\n")
```

Make sure to install the required package with `pip install "langchain-unstructured[local]"`.


In [61]:
from IPython.display import Markdown
# Render the answer as Markdown so its fenced code block is formatted.
# This must remain the last expression in the cell for the notebook to display it.
Markdown(response.content)

To load a web page using LangChain, you can use the following Python code:

```python
from langchain_unstructured import UnstructuredLoader

loader = UnstructuredLoader(web_url="https://www.example.com")
docs = loader.load()

for doc in docs:
    print(f"{doc}\n")
```

Make sure to install the required package with `pip install "langchain-unstructured[local]"`.

# Exercise: Repeat the example above with

- a different question
- a different URL
- a different LLM
- a different embedding model
- a different vector store