https://github.com/AndreasFischer1985/code-snippets/blob/master/py/LangChain_HuggingFace_examples.py

In [1]:
!pip uninstall widgetsnbextension nbdev -y

Found existing installation: widgetsnbextension 4.0.9
Uninstalling widgetsnbextension-4.0.9:
  Successfully uninstalled widgetsnbextension-4.0.9
[0m

In [2]:
!pip install -Uqq ipywidgets

In [7]:
# the warnings in the libraries used can get old, do not show them
import warnings
warnings.filterwarnings('ignore')

In [8]:
from transformers import pipeline
model= pipeline(model="google/flan-t5-large") #'text2text-generation'
model.save_pretrained("flan-t5-large")

In [9]:
from langchain import PromptTemplate, LLMChain
from langchain.llms import HuggingFacePipeline
llm = HuggingFacePipeline.from_model_id(model_id="flan-t5-large", task="text2text-generation", model_kwargs={"temperature":1e-10})


In [10]:
llm

HuggingFacePipeline(pipeline=<transformers.pipelines.text2text_generation.Text2TextGenerationPipeline object at 0x7f52be6bea60>, model_id='flan-t5-large', model_kwargs={'temperature': 1e-10}, pipeline_kwargs={})

In [11]:
template = PromptTemplate(input_variables=["input"], template="{input}")
chain = LLMChain(llm=llm, verbose=True, prompt=template)
chain("What is the meaning of life?")



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mWhat is the meaning of life?[0m

[1m> Finished chain.[0m


{'input': 'What is the meaning of life?', 'text': 'love'}

In [12]:
from langchain import PromptTemplate, LLMChain
template = """Question: {question}
Answer: Let's think step by step."""
prompt = PromptTemplate(template=template, input_variables=["question"])
chain = LLMChain(llm=llm, verbose=True, prompt=prompt)
chain("What is the meaning of life?")




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mQuestion: What is the meaning of life?
Answer: Let's think step by step.[0m

[1m> Finished chain.[0m


{'question': 'What is the meaning of life?',
 'text': 'The meaning of life is to be happy. Happiness is the result of living a happy'}

In [17]:
!pip install -Uqq sentence-transformers

In [18]:
# https://api.python.langchain.com/en/latest/embeddings/langchain_community.embeddings.huggingface.HuggingFaceEmbeddings.html

from langchain_community.embeddings import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [20]:
!pip install -Uqq chromadb

In [22]:
from langchain.document_loaders import TextLoader
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain.llms.base import LLM
from typing import Optional, List, Mapping, Any
from langchain import PromptTemplate, LLMChain
from langchain.embeddings import HuggingFaceInstructEmbeddings 
from langchain.agents import initialize_agent, Tool

texts=["The meaning of life is to love","The meaning of vacation is to relax","Roses are red.","Hack the planet!"]

db = Chroma.from_texts(texts, hf, collection_name="my-collection", persist_directory="my_collection") #vs. from_documents

template = "Cite one of the following pieces of context as an answer to the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}Question: {question}\n\nAnswer:"
template = PromptTemplate(template=template, input_variables=["context", "question"])  
docsearcher = RetrievalQA.from_chain_type(
llm=llm, 
chain_type="stuff", #stuff, map_reduce, refine, map_rerank
chain_type_kwargs={"prompt": template},
return_source_documents=True,
retriever=db.as_retriever(search_type="similarity",search_kwargs={"k":1})) # similarity, mmr
result, sources = docsearcher.run("What is the meaning of life?")

ValueError: `run` not supported when there is not exactly one output key. Got ['result', 'source_documents'].

In [23]:
?docsearcher.run

[0;31mSignature:[0m
[0mdocsearcher[0m[0;34m.[0m[0mrun[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0margs[0m[0;34m:[0m [0mAny[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mcallbacks[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mlangchain_core[0m[0;34m.[0m[0mcallbacks[0m[0;34m.[0m[0mbase[0m[0;34m.[0m[0mBaseCallbackHandler[0m[0;34m][0m[0;34m,[0m [0mlangchain_core[0m[0;34m.[0m[0mcallbacks[0m[0;34m.[0m[0mbase[0m[0;34m.[0m[0mBaseCallbackManager[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mtags[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mmetadata[0m[0;34m:[0m [0mOptional[0m[0;34m[[0m[0mDict[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mAny[0m[0;34m][0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m

In [26]:
?Chain

Object `Chain` not found.


In [27]:
hf

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 384, 'do_lower_case': False}) with Transformer model: MPNetModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-mpnet-base-v2', cache_folder=None, model_kwargs={'device': 'cpu'}, encode_kwargs={'normalize_embeddings': False}, multi_process=False)

In [29]:
hf.embed_query("what is openshift")

[-0.04182109609246254,
 -0.013159789144992828,
 0.03801683336496353,
 -0.029535328969359398,
 -0.04847255349159241,
 -0.022125670686364174,
 -0.0012906419578939676,
 0.06128718703985214,
 0.008400006219744682,
 -0.028410136699676514,
 0.0005424160044640303,
 0.006250007078051567,
 0.02838965319097042,
 0.040865201503038406,
 -0.028696080669760704,
 -0.00023915059864521027,
 0.041187968105077744,
 0.004959468264132738,
 -0.15084506571292877,
 -0.02498084492981434,
 -0.010282588191330433,
 -0.03075416572391987,
 -0.017620496451854706,
 0.008729619905352592,
 0.0691290944814682,
 -0.02000194974243641,
 -0.016624964773654938,
 0.07622281461954117,
 0.04922495782375336,
 0.03286007046699524,
 -0.035471782088279724,
 0.011489353142678738,
 0.00887718703597784,
 0.03045700304210186,
 1.5997846958271111e-06,
 -0.006798353511840105,
 -0.010111760348081589,
 -0.008459136821329594,
 0.04957021400332451,
 -0.02109450288116932,
 0.011823740787804127,
 0.013927620835602283,
 -0.013388078659772873,
 

In [30]:
hf.embed_documents(["one", "two"])

[[0.01674247346818447,
  0.01106952503323555,
  0.02236040122807026,
  -0.0232512429356575,
  -0.015568011440336704,
  -0.030459711328148842,
  0.0196797214448452,
  0.004576556384563446,
  -0.017671067267656326,
  0.020458227023482323,
  0.11856047064065933,
  -0.024615507572889328,
  0.006804684177041054,
  0.012103755958378315,
  0.021607521921396255,
  0.023074032738804817,
  0.017425069585442543,
  -0.02573220804333687,
  0.016344256699085236,
  -0.021890532225370407,
  -0.026933802291750908,
  0.023309895768761635,
  0.03231760859489441,
  0.0021780524402856827,
  -0.030943863093852997,
  -0.005580266937613487,
  0.010307876393198967,
  -0.0219203419983387,
  -0.047083206474781036,
  0.0032627256587147713,
  -0.0076248603872954845,
  -0.04003312066197395,
  -0.007890126667916775,
  -0.046137735247612,
  2.3055965812091017e-06,
  0.004051618743687868,
  0.013616835698485374,
  0.001449541887268424,
  -0.022418003529310226,
  -0.021258141845464706,
  -0.043526168912649155,
  -0.028

In [31]:
texts=["The meaning of life is to love","The meaning of vacation is to relax","Roses are red.","Hack the planet!"]


In [33]:
db = Chroma.from_texts(texts, hf, collection_name="my-collection") #vs. from_documents

In [34]:
docsearcher = RetrievalQA.from_chain_type(
  llm=llm, 
  chain_type="stuff", #stuff, map_reduce, refine, map_rerank
  return_source_documents=False,
  retriever=db.as_retriever(search_type="similarity",search_kwargs={"k":1})) # similarity, mmr
docsearcher.run("What is the meaning of life?")

'to love'

Embedding is the vectorstore.  Load PDF and get into vectorstore...

In [35]:
!wget https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14/pdf/about/openshift_container_platform-4.14-about-en-us.pdf -O ocp-about.pdf

--2024-01-19 11:33:02--  https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14/pdf/about/openshift_container_platform-4.14-about-en-us.pdf
Resolving access.redhat.com (access.redhat.com)... 23.33.40.139, 23.33.40.146, 2600:1408:20::6870:eb6b, ...
Connecting to access.redhat.com (access.redhat.com)|23.33.40.139|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/pdf]
Saving to: ‘ocp-about.pdf’

ocp-about.pdf           [ <=>                ] 135.71K  --.-KB/s    in 0.1s    

2024-01-19 11:33:04 (1.36 MB/s) - ‘ocp-about.pdf’ saved [138966]



In [36]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("./ocp-about.pdf")
pages = loader.load_and_split()

In [37]:
pages

[Document(page_content='OpenShift Container Platform\n \n4.14\nAbout\nIntroduction to OpenShift Container Platform\nLast Updated: 2024-01-17', metadata={'source': './ocp-about.pdf', 'page': 0}),
 Document(page_content='OpenShift Container Platform\n \n4.14\n \nAbout\nIntroduction to OpenShift Container Platform', metadata={'source': './ocp-about.pdf', 'page': 2}),
 Document(page_content='Legal Notice\nCopyright \n©\n 2024 Red Hat, Inc.\nThe text of and illustrations in this document are licensed by Red Hat under a Creative Commons\nAttribution–Share Alike 3.0 Unported license ("CC-BY-SA"). An explanation of CC-BY-SA is\navailable at\nhttp://creativecommons.org/licenses/by-sa/3.0/\n. In accordance with CC-BY-SA, if you distribute this document or an adaptation of it, you must\nprovide the URL for the original version.\nRed Hat, as the licensor of this document, waives the right to enforce, and agrees not to assert,\nSection 4d of CC-BY-SA to the fullest extent permitted by applicable la

In [40]:
from langchain_community.vectorstores import FAISS

vectorestore = FAISS.from_documents(pages, hf)

In [42]:
docsearcher = RetrievalQA.from_chain_type(
  llm=llm, 
  chain_type="stuff", #stuff, map_reduce, refine, map_rerank
  return_source_documents=False,
  retriever=vectorestore.as_retriever(search_type="similarity",search_kwargs={"k":1})) # similarity, mmr
docsearcher.run("what is openshift")

'OpenShift Container Platform'

Yay, got something.  Now create a simple UI so I can play around.

In [43]:
def answer_question(txt):
    return docsearcher.run(txt)

In [45]:
answer_question("what is ocp")

'OpenShift Container Platform is a platform for developing and deploying containerized applications'

In [47]:
!pip install -Uqq "gradio==3.50"

In [51]:
import gradio as gr

intf = gr.Interface(fn=answer_question, inputs="textbox", outputs="textbox")
intf.launch(inline=False)

Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.




Get some more docs.

In [60]:
!mkdir -p openshift/
!rm -f openshift/*.pdf
!wget -P openshift/ https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14/pdf/about/openshift_container_platform-4.14-about-en-us.pdf
!wget -P openshift/ https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14/pdf/getting_started/openshift_container_platform-4.14-getting_started-en-us.pdf
!wget -P openshift/ https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14/pdf/security_and_compliance/openshift_container_platform-4.14-security_and_compliance-en-us.pdf
!wget -P openshift/ https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14/pdf/release_notes/openshift_container_platform-4.14-release_notes-en-us.pdf
!wget -P openshift/ https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14/pdf/architecture/openshift_container_platform-4.14-architecture-en-us.pdf
!wget -P openshift/ https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14/pdf/support/openshift_container_platform-4.14-support-en-us.pdf

--2024-01-19 12:47:02--  https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14/pdf/about/openshift_container_platform-4.14-about-en-us.pdf
Resolving access.redhat.com (access.redhat.com)... 23.33.40.139, 23.33.40.146, 2600:1408:20::6870:eb6b, ...
Connecting to access.redhat.com (access.redhat.com)|23.33.40.139|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [application/pdf]
Saving to: ‘openshift/openshift_container_platform-4.14-about-en-us.pdf’

openshift_container     [ <=>                ] 135.71K  --.-KB/s    in 0.1s    

2024-01-19 12:47:02 (1.37 MB/s) - ‘openshift/openshift_container_platform-4.14-about-en-us.pdf’ saved [138966]

--2024-01-19 12:47:03--  https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14/pdf/getting_started/openshift_container_platform-4.14-getting_started-en-us.pdf
Resolving access.redhat.com (access.redhat.com)... 23.33.40.139, 23.33.40.146, 2600:1408:20::6870:eb51, ..

In [61]:
from langchain_community.document_loaders import PyPDFDirectoryLoader 
?PyPDFDirectoryLoader

[0;31mInit signature:[0m
[0mPyPDFDirectoryLoader[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0mpath[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mglob[0m[0;34m:[0m [0mstr[0m [0;34m=[0m [0;34m'**/[!.]*.pdf'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0msilent_errors[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mload_hidden[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mrecursive[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mextract_images[0m[0;34m:[0m [0mbool[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m     
Load a directory with `PDF` files using `pypdf` and chunks at character level.

Loader also stores page numbers in metadata.
[0;31mFile:[0m           ~/notebooks/venv/lib64/python3.9/site-packages/l

In [62]:
# load all the pdfs

from langchain_community.document_loaders import PyPDFDirectoryLoader

path="openshift/"
loader = PyPDFDirectoryLoader(path)
pages = loader.load_and_split()

In [63]:
len(pages)

743

In [82]:
# https://api.python.langchain.com/en/latest/embeddings/langchain_community.embeddings.huggingface.HuggingFaceEmbeddings.html

from langchain_community.embeddings import HuggingFaceEmbeddings

model_name = "sentence-transformers/all-mpnet-base-v2"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': False}
hf = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

In [83]:
from langchain_community.vectorstores import FAISS

vectorestore = FAISS.from_documents(pages, hf)

RuntimeError: PyTorch is not linked with support for opengl devices

In [99]:
template = "Cite one of the following pieces of context as an answer to the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.\n\n{context}Question: {question}\n\nAnswer:"
template = PromptTemplate(template=template, input_variables=["context", "question"])  
docsearcher = RetrievalQA.from_chain_type(
    llm=llm, 
    chain_type="stuff", #stuff, map_reduce, refine, map_rerank
    chain_type_kwargs={"prompt": template},
    return_source_documents=True,
    retriever=vectorestore.as_retriever(search_type="similarity",search_kwargs={"k":1})) # similarity, mmr


In [107]:
answer = docsearcher.invoke({"query": "what is openshift?"})

In [108]:
answer

{'query': 'what is openshift?',
 'result': 'A managed RHEL OpenShift Container Platform offering on Amazon Web Services (AWS',
 'source_documents': [Document(page_content='An approach to writing software. Applications can be separated into the smallest components,\nindependent from each other by using microservices.\nmirror registry\nA registry that holds the mirror of OpenShift Container Platform images.\nmonolithic applications\nApplications that are self-contained, built, and packaged as a single piece.\nnamespaces\nA namespace isolates specific system resources that are visible to all processes. Inside a namespace,\nonly processes that are members of that namespace can see those resources.\nnetworking\nNetwork information of OpenShift Container Platform cluster.\nnode\nA worker machine in the OpenShift Container Platform cluster. A node is either a virtual machine\n(VM) or a physical machine.\nOpenShift Container Platform Update Service (OSUS)\nFor clusters with internet access, Re

In [116]:
doc=answer['source_documents']
type(doc)

list

In [117]:
doc[0]

Document(page_content='An approach to writing software. Applications can be separated into the smallest components,\nindependent from each other by using microservices.\nmirror registry\nA registry that holds the mirror of OpenShift Container Platform images.\nmonolithic applications\nApplications that are self-contained, built, and packaged as a single piece.\nnamespaces\nA namespace isolates specific system resources that are visible to all processes. Inside a namespace,\nonly processes that are members of that namespace can see those resources.\nnetworking\nNetwork information of OpenShift Container Platform cluster.\nnode\nA worker machine in the OpenShift Container Platform cluster. A node is either a virtual machine\n(VM) or a physical machine.\nOpenShift Container Platform Update Service (OSUS)\nFor clusters with internet access, Red Hat Enterprise Linux (RHEL) provides over-the-air updates by\nusing an OpenShift Container Platform update service as a hosted service located behi

In [118]:
doc[0].page_content

'An approach to writing software. Applications can be separated into the smallest components,\nindependent from each other by using microservices.\nmirror registry\nA registry that holds the mirror of OpenShift Container Platform images.\nmonolithic applications\nApplications that are self-contained, built, and packaged as a single piece.\nnamespaces\nA namespace isolates specific system resources that are visible to all processes. Inside a namespace,\nonly processes that are members of that namespace can see those resources.\nnetworking\nNetwork information of OpenShift Container Platform cluster.\nnode\nA worker machine in the OpenShift Container Platform cluster. A node is either a virtual machine\n(VM) or a physical machine.\nOpenShift Container Platform Update Service (OSUS)\nFor clusters with internet access, Red Hat Enterprise Linux (RHEL) provides over-the-air updates by\nusing an OpenShift Container Platform update service as a hosted service located behind public\nAPIs.\nOpen

In [123]:
{
        "query": answer['query'],
        "result": answer['result'],
        "metadata": answer['source_documents'][0].metadata,
        "page_content": answer['source_documents'][0].page_content,
    }

{'query': 'what is openshift?',
 'result': 'A managed RHEL OpenShift Container Platform offering on Amazon Web Services (AWS',
 'metadata': {'source': 'openshift/openshift_container_platform-4.14-architecture-en-us.pdf',
  'page': 10},
 'page_content': 'An approach to writing software. Applications can be separated into the smallest components,\nindependent from each other by using microservices.\nmirror registry\nA registry that holds the mirror of OpenShift Container Platform images.\nmonolithic applications\nApplications that are self-contained, built, and packaged as a single piece.\nnamespaces\nA namespace isolates specific system resources that are visible to all processes. Inside a namespace,\nonly processes that are members of that namespace can see those resources.\nnetworking\nNetwork information of OpenShift Container Platform cluster.\nnode\nA worker machine in the OpenShift Container Platform cluster. A node is either a virtual machine\n(VM) or a physical machine.\nOpenShi

In [124]:
import json

def answer_question(question):
    # {query,result,source_documents{page_content,metadata{source,page}}}
    answer = docsearcher.invoke({"query": question})
    return {
        "query": answer['query'],
        "result": answer['result'],
        "metadata": answer['source_documents'][0].metadata,
        "page_content": answer['source_documents'][0].page_content,
    }

In [125]:
answer_question("what is openshift?")

{'query': 'what is openshift?',
 'result': 'A managed RHEL OpenShift Container Platform offering on Amazon Web Services (AWS',
 'metadata': {'source': 'openshift/openshift_container_platform-4.14-architecture-en-us.pdf',
  'page': 10},
 'page_content': 'An approach to writing software. Applications can be separated into the smallest components,\nindependent from each other by using microservices.\nmirror registry\nA registry that holds the mirror of OpenShift Container Platform images.\nmonolithic applications\nApplications that are self-contained, built, and packaged as a single piece.\nnamespaces\nA namespace isolates specific system resources that are visible to all processes. Inside a namespace,\nonly processes that are members of that namespace can see those resources.\nnetworking\nNetwork information of OpenShift Container Platform cluster.\nnode\nA worker machine in the OpenShift Container Platform cluster. A node is either a virtual machine\n(VM) or a physical machine.\nOpenShi

In [126]:
import gradio as gr

intf = gr.Interface(fn=answer_question, inputs="textbox", outputs="textbox")
intf.launch(inline=False)

Running on local URL:  http://127.0.0.1:7866

To create a public link, set `share=True` in `launch()`.




In [139]:
# for cv2, needed to use url loaders
!pip install -Uqq scikit-build numpy opencv-python unstructured-inference pikepdf

In [151]:
!pip install -Uqq cookielib

[31mERROR: Could not find a version that satisfies the requirement cookielib (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for cookielib[0m[31m
[0m

In [152]:
!wget -O index.html "https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14"

--2024-01-19 14:35:23--  https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14
Resolving access.redhat.com (access.redhat.com)... 23.33.40.139, 23.33.40.146, 2600:1408:20::1722:f082, ...
Connecting to access.redhat.com (access.redhat.com)|23.33.40.139|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘index.html’

index.html              [  <=>               ] 130.04K   561KB/s    in 0.2s    

2024-01-19 14:35:23 (561 KB/s) - ‘index.html’ saved [133160]



In [160]:
import re

with open("index.html", "r") as f:
    html = f.read()

urls = []

for x in html.splitlines():
    if "Single-page HTML" in x:
        m = re.match(".*href=\"([^\"]*)\".*", x)
        urls.append(f"https://access.redhat.com{m.groups()[0]}")

urls

['https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14/html-single/about',
 'https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14/html-single/getting_started',
 'https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14/html-single/release_notes',
 'https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14/html-single/security_and_compliance',
 'https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14/html-single/architecture',
 'https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14/html-single/support',
 'https://access.redhat.com/documentation/en-us/openshift_container_platform/4.14/html-single/installing',
 'https://access.redhat.com/documentation/en-us/assisted_installer_for_openshift_container_platform/2024/html-single/installing_openshift_container_platform_with_the_assisted_installer',
 'https://access.redhat.com/documentation/en-us/o

In [161]:
from langchain_community.document_loaders import UnstructuredURLLoader

loader = UnstructuredURLLoader(urls=urls)

pages = loader.load_and_split()