# Working with Web pages

In [None]:
import os
import openai
from IPython.display import display, HTML, Markdown
from pprint import pprint

from dotenv import load_dotenv, find_dotenv
# Locate the nearest .env file with find_dotenv() and load its variables;
# the return value of load_dotenv is deliberately discarded via `_`.
_ = load_dotenv(find_dotenv()) # read local .env file
# Direct indexing raises KeyError if OPENAI_API_KEY is not set, so a missing
# key fails here rather than on the first API call.
openai.api_key = os.environ['OPENAI_API_KEY']

In [None]:
from langchain.callbacks import OpenAICallbackHandler

# One handler instance for the whole notebook: it is passed as a callback to
# the summarization chain call and printed again in the final cell.
# NOTE(review): presumably it accumulates token/cost totals across calls --
# confirm against the installed LangChain version.
totals_cb = OpenAICallbackHandler()

# Show the handler's state before any chain has run.
print(totals_cb)

In [None]:
from langchain.document_loaders import WebBaseLoader

# Candidate articles; the trailing note on each records the approximate
# token usage observed for it.
pages = [
    "https://yoshuabengio.org/2023/06/24/faq-on-catastrophic-ai-risks/",  # total tokens used - ca. 14k
    "https://openai.com/research/instruction-following",  # total tokens used - ca. 5k
    "https://blog.langchain.dev/announcing-langsmith/",  # total tokens used - ca. 3k
]

# Only the first URL is fetched; change the index to try another article.
selected_url = pages[0]
loader = WebBaseLoader(selected_url)
docs = loader.load()

# Report how many documents the loader produced and show the first one.
print(f"Document has {len(docs)} pages\n")
pprint(docs[0])

In [None]:
from langchain.chat_models import ChatOpenAI

# Default chat model; in this cell it is used only to count tokens.
llm = ChatOpenAI()

# Count tokens for each loaded document and keep a running total.
total_tokens = 0
for n, page in enumerate(docs):
    tokens = llm.get_num_tokens(page.page_content)
    total_tokens += tokens
    # A bare ">" alignment without a width pads nothing; give it a width so
    # the counts actually line up in a right-aligned column.
    print(f"Page {n+1:2d}: {tokens:>6}")

print(f"Total number of tokens in document: {total_tokens}")

In [None]:
from langchain.text_splitter import CharacterTextSplitter

# Split on newlines with a target chunk size of 3000 and no overlap between
# consecutive chunks.
# NOTE(review): chunk_size is presumably measured in characters for this
# splitter -- confirm against the installed LangChain version.
splitter_options = {
    "separator": "\n",
    "chunk_size": 3000,
    "chunk_overlap": 0,
}
text_splitter = CharacterTextSplitter(**splitter_options)
chunks = text_splitter.split_documents(docs)

# Report the number of chunks and show the first one in full.
print(f"Documents split into {len(chunks)} chunks\n")
pprint(chunks[0])

Check **LangChain**'s API reference on [text splitters](https://api.python.langchain.com/en/latest/api_reference.html#module-langchain.text_splitter)

In [None]:
from langchain.text_splitter import TokenTextSplitter

# Token-based alternative to the character splitter. Note that this rebinds
# `chunks`, replacing the result of the character-based split.
token_splitter = TokenTextSplitter(
    chunk_size=2000,
    chunk_overlap=50,
)
chunks = token_splitter.split_documents(docs)

print(f"Documents split into {len(chunks)} chunks\n")

In [None]:
# Inspect the raw text of the first chunk (without the Document wrapper).
print(chunks[0].page_content)

In [None]:
from langchain.chat_models import ChatOpenAI
from langchain.chains.summarize import load_summarize_chain

# Dedicated model instance for summarization with temperature pinned to 0.0
# (presumably for more repeatable output).
summary_llm = ChatOpenAI(temperature=0.0)

# Build a summarization chain of the "refine" type around that model.
summary_chain_refine = load_summarize_chain(summary_llm, chain_type="refine")

In [None]:
# Run the refine chain over the current `chunks`; totals_cb is attached as a
# callback for this call. The result is indexed by "output_text" below.
summary_refine = summary_chain_refine(chunks, callbacks=[totals_cb])

In [None]:
# Render the chain's "output_text" entry as Markdown in the notebook output.
display(Markdown(summary_refine["output_text"]))

In [None]:
# Show the callback handler's state again, now that the chain has run with it.
pprint(totals_cb)