In [None]:
from langchain_ollama import OllamaEmbeddings
from langchain_postgres import PGEngine, PGVectorStore

import os

# Database DSN. It can be overridden with the PG_CONNECTION_STRING environment
# variable so real credentials never have to be committed with the notebook;
# the fallback is the original local-development placeholder.
CONNECTION_STRING = os.environ.get(
    "PG_CONNECTION_STRING",
    "postgresql+psycopg://user:password@localhost:5432/vector_db",
)
COLLECTION_NAME = "documents"  # table that holds the embedded chunks
VECTOR_SIZE = 768  # Adjust based on the model's output vector size

# Embedding model served by a local Ollama instance.
embeddings = OllamaEmbeddings(
    base_url="http://localhost:11434",
    model="nomic-embed-text"
)

engine = PGEngine.from_connection_string(url=CONNECTION_STRING)

# One-time table creation. Left disabled because running it while the table
# already exists raises psycopg DuplicateTable ('relation "documents" already
# exists'). Uncomment for the very first run against an empty database.
# engine.init_vectorstore_table(
#     table_name=COLLECTION_NAME,
#     vector_size=VECTOR_SIZE,
# )

# Synchronous vector-store handle bound to the table above.
vector_store = PGVectorStore.create_sync(
    engine=engine,
    embedding_service=embeddings,
    table_name=COLLECTION_NAME,
)

# Add documents to the vector store
def add_documents_to_vector_store(documents):
    """Insert ``documents`` into the module-level ``vector_store`` under stable ids.

    Each document's natural key is ``<source_type>_<source>_<chunk_number>``
    (read from ``doc.metadata``). The table's ``langchain_id`` column is of type
    UUID, so the natural key itself cannot be stored there; it is mapped to a
    deterministic UUID (uuid5), which keeps the id reproducible across runs for
    the same document.

    Args:
        documents: list of objects exposing a ``metadata`` mapping with the
            keys ``source_type``, ``source`` and ``chunk_number``.

    Raises:
        KeyError: if a document's metadata lacks one of the three keys.
    """
    import uuid  # local import keeps this cell self-contained

    ids = []
    for doc in documents:
        meta = doc.metadata
        natural_key = f"{meta['source_type']}_{meta['source']}_{meta['chunk_number']}"
        ids.append(str(uuid.uuid5(uuid.NAMESPACE_URL, natural_key)))

    # Nothing to embed or insert for an empty batch; skip the store call.
    if documents:
        vector_store.add_documents(
            documents=documents,
            ids=ids,
        )

    print(f"Added {len(documents)} documents to the vector store.")

ProgrammingError: (psycopg.errors.DuplicateTable) relation "documents" already exists
[SQL: CREATE TABLE "public"."documents"(
            "langchain_id" UUID PRIMARY KEY,
            "content" TEXT NOT NULL,
            "embedding" vector(768) NOT NULL
            ,
"langchain_metadata" JSON
);]
(Background on this error at: https://sqlalche.me/e/20/f405)

In [None]:
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urlparse, urljoin
import time

# URL path suffixes the crawler never follows (images, documents, archives, video).
# Kept as a tuple so it can be passed straight to str.endswith().
SKIP_EXTENSIONS = (
    ".png", ".jpg", ".jpeg", ".gif", ".bmp", ".svg",
    ".pdf", ".zip", ".tar", ".gz", ".mp4", ".webp",
)

def is_valid(url, netloc):
    """Return True when ``url`` is an http(s) page on ``netloc`` that should be crawled.

    A URL qualifies when all of the following hold:
      * its scheme is ``http`` or ``https``;
      * its host is ``netloc`` itself or a subdomain of it (compared
        case-insensitively); an empty ``netloc`` disables the host filter;
      * its path does not end with one of ``SKIP_EXTENSIONS``.

    Args:
        url: absolute URL to test.
        netloc: host of the site being crawled, e.g. ``"rc.virginia.edu"``.
    """
    parsed = urlparse(url)
    if parsed.scheme not in {"http", "https"}:
        return False

    host = parsed.netloc.lower()
    site = netloc.lower()
    # Exact host or a true subdomain. A plain substring test would also accept
    # look-alike hosts such as "rc.virginia.edu.evil.example".
    if site and host != site and not host.endswith("." + site):
        return False

    return not parsed.path.lower().endswith(SKIP_EXTENSIONS)

def _extract_article(article, url):
    """Convert one ``<article>`` element into the record ``crawl`` stores for ``url``.

    The element is re-parsed into its own soup, so the edits made here do not
    alter the page soup that ``crawl`` scans for links afterwards.

    Returns:
        dict with the cleaned ``text`` (links and headings rewritten in a
        markdown-like form) and a ``metadata`` dict holding ``source_type``,
        ``source``, ``chunk_number``, ``tags`` and ``date_updated``.
    """
    article_soup = BeautifulSoup(str(article), "html.parser")

    # Remove images before text extraction
    for tag in article_soup.find_all("img"):
        tag.decompose()

    # Remove "« Return to [previous page]" link
    return_link = article_soup.find("a", string=re.compile(r"^« Return to"))
    if return_link:
        return_link.decompose()

    metadata_tag = article_soup.find("p", class_="blog-post-meta")

    # Tags come from the links inside the metadata paragraph when there is one,
    # otherwise from any link whose href matches the tag pattern.
    if metadata_tag:
        tag_links = metadata_tag.find_all("a")
    else:
        tag_links = article_soup.find_all("a", href=re.compile(r"^/tag/*"))
    tags = [label for label in (a_tag.get_text(strip=True) for a_tag in tag_links) if label]

    date_tag = metadata_tag.find("time") if metadata_tag else None
    # .get(): a <time> element without a datetime attribute yields None rather
    # than a KeyError that would discard the page and skip its links.
    date_updated = date_tag.get("datetime") if date_tag else None

    # Convert links to absolute URLs and convert to markdown format
    for a_tag in article_soup.find_all("a", href=True):
        href = a_tag['href']
        if not href.startswith("http"):
            href = urljoin(url, href)
        a_tag['href'] = href
        a_tag.string = f"[{a_tag.get_text(strip=True)}]({href})"

    # Convert blog-post-title class to markdown
    title_tag = article_soup.find("h2", class_="blog-post-title")
    if title_tag:
        title_text = title_tag.get_text(strip=True)
        title_tag.string = f"# {title_text}\n\n"

    # Convert h1 tags to markdown
    for h1_tag in article_soup.find_all("h1"):
        h1_text = h1_tag.get_text(strip=True)
        h1_tag.string = f"\n\n## {h1_text}\n"

    # Drop the metadata paragraph so tags/date do not end up in the body text
    if metadata_tag:
        metadata_tag.decompose()

    # Extract and clean text
    text = article_soup.get_text()
    text = re.sub(r"https?:\/\/\S+?\.png", "", text)  # remove .png URLs
    text = re.sub(r"\S+\.png", "", text)              # remove local refs
    text = re.sub(r"\n{3,}", "\n\n", text)            # collapse runs of blank lines
    text = re.sub(r"[ \t]+", " ", text)               # collapse runs of spaces/tabs

    return {
        "text": text.strip(),
        "metadata": {
            "source_type": "website",
            "source": url,
            "chunk_number": 0,  # Assuming single chunk for simplicity
            "tags": ["website", "blog-post"] + tags,
            "date_updated": date_updated,
        }
    }


def crawl(url, visited=None, documents=None, netloc=None):
    """Recursively crawl a site starting at ``url`` and collect article text.

    Every fetched HTML page that contains exactly one ``<article>`` element is
    stored in ``documents`` under its (post-redirect) URL. All same-site links
    on each fetched page are then followed depth-first.

    Args:
        url: page to fetch.
        visited: set of URLs already fetched; a fresh set is created when
            omitted. Recursive calls share the caller's set.
        documents: dict receiving ``url -> {"text", "metadata"}`` records; a
            fresh dict is created when omitted.
        netloc: host that links must stay on; defaults to the host of ``url``.

    Returns:
        The ``documents`` dict (the same object that was passed in, if any).

    Notes:
        Failures while fetching or parsing a page are printed and swallowed so
        one bad page does not stop the crawl. The crawl recurses one level per
        followed link, so a RecursionError on a very long link chain is caught
        by the same handler and reported as a failed crawl for that page.
    """
    # Fresh containers per top-level call. Default arguments are evaluated once,
    # so `visited=set()` / `documents={}` would be shared by every call and make
    # a second crawl start with the first crawl's state.
    if visited is None:
        visited = set()
    if documents is None:
        documents = {}

    if url in visited:
        return documents
    visited.add(url)

    netloc = netloc or urlparse(url).netloc

    try:
        response = requests.get(url, timeout=5)

        if response.url != url:
            print(f"Redirected from {url} to {response.url}")
            url = response.url
            if url in visited:
                return documents
            visited.add(url)

        content_type = response.headers.get("Content-Type", "")

        # Skip non-HTML content (e.g. image files served without .png in URL)
        if "text/html" not in content_type:
            print(f"Skipping non-HTML URL: {url}")
            return documents

        if response.status_code != 200:
            return documents

        soup = BeautifulSoup(response.text, "html.parser")

        articles = soup.find_all("article")
        if len(articles) == 1:
            documents[url] = _extract_article(articles[0], url)
            print(f"Extracted text from {url}")
        else:
            print(f"Skipping {url} as it does not contain exactly one article")

        # Crawl linked pages recursively
        for a_tag in soup.find_all("a", href=True):
            # Absolute links to a different host are never followed
            if a_tag['href'].startswith(("http://", "https://")) and not urlparse(a_tag['href']).netloc == netloc:
                continue

            next_url = urljoin(url, a_tag['href'])

            # if the url contains a #, remove it
            next_url = next_url.split('#')[0]

            if is_valid(next_url, netloc):
                crawl(next_url, visited, documents, netloc)

        time.sleep(0.1)

    except Exception as e:
        print(f"Failed to crawl {url}: {e}")

    return documents

In [40]:
# Crawl the main site; `documents` maps each extracted page's URL to its
# {"text", "metadata"} record as built by crawl().
documents = crawl("https://rc.virginia.edu/")

Skipping https://rc.virginia.edu/ as it does not contain exactly one article
Extracted text from https://rc.virginia.edu/userinfo/hpc/access/
Extracted text from https://rc.virginia.edu/userinfo/storage/
Skipping https://rc.virginia.edu/categories/ as it does not contain exactly one article
Extracted text from https://rc.virginia.edu/form/support-request/
Extracted text from https://rc.virginia.edu/userinfo/systems/
Extracted text from https://rc.virginia.edu/userinfo/user-guide/
Skipping https://rc.virginia.edu/tags/ as it does not contain exactly one article
Extracted text from https://rc.virginia.edu/about/mission/
Skipping https://rc.virginia.edu/about/people/ as it does not contain exactly one article
Extracted text from https://rc.virginia.edu/about/students/
Extracted text from https://rc.virginia.edu/service/high-performance-computing/
Extracted text from https://rc.virginia.edu/service/dac/
Extracted text from https://rc.virginia.edu/service/dtc/
Extracted text from https://rc

In [None]:
# Merge the learning site's pages into the same URL -> record mapping.
# crawl() returns a dict, and dicts do not support `+=` (TypeError), so the
# results are merged with update().
documents.update(crawl("https://learning.rc.virginia.edu/"))

Skipping https://learning.rc.virginia.edu/ as it does not contain exactly one article
Skipping https://learning.rc.virginia.edu/courses/ as it does not contain exactly one article
Skipping https://learning.rc.virginia.edu/tutorials/ as it does not contain exactly one article
Skipping https://learning.rc.virginia.edu/authors/ as it does not contain exactly one article
Skipping https://learning.rc.virginia.edu/author/ruoshi-sun/ as it does not contain exactly one article
Extracted text from https://learning.rc.virginia.edu/tutorials/building-running-c-cpp-fortran/
Skipping https://learning.rc.virginia.edu/author/katherine-holcomb/ as it does not contain exactly one article
Extracted text from https://learning.rc.virginia.edu/tutorials/hpc-intro/
Skipping https://learning.rc.virginia.edu/author/jacalyn-huband/ as it does not contain exactly one article
Extracted text from https://learning.rc.virginia.edu/tutorials/deep-learning-distributed/
Skipping https://learning.rc.virginia.edu/author

In [43]:
# Number of pages with extracted article text (entries in the URL -> record dict)
print(len(documents))

915


In [47]:
from langchain.schema import Document

# Wrap every crawled page that has non-blank text in a Document, copying the
# crawler's metadata (the tag list is stored in its string form).
website_documents = []
for url, record in documents.items():
    body = record["text"]
    page_meta = record["metadata"]

    if not body.strip():
        print(f"Skipping empty document for URL: {url}")
        continue

    website_documents.append(
        Document(
            page_content=body,
            metadata={
                "source_type": page_meta["source_type"],
                "source": page_meta["source"],
                "chunk_number": page_meta["chunk_number"],
                "tags": str(page_meta["tags"]),
                "date_updated": page_meta.get("date_updated"),
            },
        )
    )

In [None]:
# Alternative preview (disabled): source, tags and the first 100 characters
# of the first five documents.
# for doc in website_documents[:5]:
#     print(f"Document source: {doc.metadata['source']}, tags: {doc.metadata['tags']}")
#     print(f"Document text: {doc.page_content[:100]}...")  # Print first 100 characters of the text

# Spot-check the full text of one converted page. The index is hard-coded, so
# this raises IndexError when fewer than 501 documents were collected.
print(website_documents[500].page_content)


# [Mission](https://rc.virginia.edu/about/mission/)

Research Computing empowers UVA researchers to achieve more with cutting-edge computational resources. Our support team strives to create innovative solutions for researchers who need 
help solving complex optimization, parallelization, workflow, and data analysis issues. We build and maintain the University's best computing platforms while educating the next 
generation of researchers on the power of advanced computing.


In [8]:
def document_to_id(doc):
    """Build the storage key for ``doc`` from its metadata.

    The key is ``<source_type>_<source>_<chunk_number>`` with every ``/``,
    ``:``, ``?`` and ``#`` in the combined string replaced by ``_``.
    """
    meta = doc.metadata
    raw_key = f"{meta['source_type']}_{meta['source']}_{meta['chunk_number']}"
    # Single pass: map each of the four URL punctuation characters to "_"
    return raw_key.translate(str.maketrans("/:?#", "____"))

In [49]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain_community.storage import SQLStore
from langchain.storage._lc_store import create_kv_docstore

# Namespace under which the SQL key-value store keeps the parent documents.
DOCUMENT_STORE_NAMESPACE = "full_documents"

# Splitter for the small chunks handed to the retriever as `child_splitter`:
# up to 400 characters each (measured with len), overlapping by 200.
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=200,
    length_function=len,
)

# Larger splitter (1000 characters, 500 overlap). Currently unused: the
# `parent_splitter` argument of the retriever below is commented out.
parent_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=500,
    length_function=len,
)

# Key-value store in the same database as the vector store (same DSN).
sql_store = SQLStore(
    namespace=DOCUMENT_STORE_NAMESPACE,
    db_url=CONNECTION_STRING,
)
# Document-level view over the key-value store, used as the retriever's docstore.
doc_store = create_kv_docstore(sql_store)
# NOTE(review): presumably creates the backing table when it is missing — confirm.
sql_store.create_schema()

# Retriever combining the vector store (child chunks) with the docstore.
retriever = ParentDocumentRetriever(
    vectorstore=vector_store,
    docstore=doc_store,
    child_splitter=child_splitter,
    # parent_splitter=parent_splitter,
)

In [50]:
BATCH_SIZE = 10  # documents handed to the retriever per add_documents() call

# Find the documents that are not in the docstore yet. All candidate keys are
# looked up in ONE mget call instead of one query per document; mget returns
# one value per key, in key order, with None for keys that are not stored.
candidate_ids = [document_to_id(doc) for doc in website_documents]
stored_values = sql_store.mget(keys=candidate_ids) if candidate_ids else []
to_add = [doc for doc, stored in zip(website_documents, stored_values) if stored is None]
print(f"Documents to add: {len(to_add)}/{len(website_documents)} total documents")

# Ceiling division, computed once for the progress message
total_batches = (len(to_add) + BATCH_SIZE - 1) // BATCH_SIZE

# Add documents to the vector store in batches
for i in range(0, len(to_add), BATCH_SIZE):
    batch = to_add[i:i + BATCH_SIZE]
    # Disabled variant that pre-splits with parent_splitter:
    # retriever.add_documents(documents=batch, ids=[document_to_id(x) for doc in batch for x in parent_splitter.split_documents([doc])])
    retriever.add_documents(documents=batch, ids=[document_to_id(doc) for doc in batch])
    print(f"Processed batch {i // BATCH_SIZE + 1}/{total_batches}")

Documents to add: 540/915 total documents
Processed batch 1/54
Processed batch 2/54
Processed batch 3/54
Processed batch 4/54
Processed batch 5/54
Processed batch 6/54
Processed batch 7/54
Processed batch 8/54
Processed batch 9/54
Processed batch 10/54
Processed batch 11/54
Processed batch 12/54
Processed batch 13/54
Processed batch 14/54
Processed batch 15/54
Processed batch 16/54
Processed batch 17/54
Processed batch 18/54
Processed batch 19/54
Processed batch 20/54
Processed batch 21/54
Processed batch 22/54
Processed batch 23/54
Processed batch 24/54
Processed batch 25/54
Processed batch 26/54
Processed batch 27/54
Processed batch 28/54
Processed batch 29/54
Processed batch 30/54
Processed batch 31/54
Processed batch 32/54
Processed batch 33/54
Processed batch 34/54
Processed batch 35/54
Processed batch 36/54
Processed batch 37/54
Processed batch 38/54
Processed batch 39/54
Processed batch 40/54
Processed batch 41/54
Processed batch 42/54
Processed batch 43/54
Processed batch 44/54

In [26]:
from ollama import chat
# import PromptTemplate
from langchain.prompts import PromptTemplate

# Prompt that rewrites a follow-up question, given its chat history, into a
# standalone question for document retrieval.
REPHRASE_TEMPLATE = PromptTemplate.from_template(
"""
Task: Given a multi-turn conversation and a follow-up user question, rewrite the follow-up as a clear, detailed, and standalone question suitable for retrieving relevant documents from a vector database.

- Use the context from the full conversation to preserve intent and necessary background.
- The rewritten question should not reference the conversation explicitly (e.g., avoid “as mentioned before”).
- Ensure the standalone question includes all important entities, topics, and context implied in the follow-up.
- Favor clarity and completeness over brevity.

Input:

Chat History:
{chat_history}

Follow-Up Question:
{input}

Rewritten Standalone Question:
"""
)

# Step 1: have the chat model rewrite a sample question.
rephrase_prompt = REPHRASE_TEMPLATE.format(
    chat_history="No chat history available.",
    input="What are the features of Open OnDemand?"
)
chat_messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": rephrase_prompt},
]
response = chat(model="qwen3", messages=chat_messages, think=False)

rewritten_question = response['message']['content']
print("Rewritten question:", rewritten_question)

# Step 2: retrieve documents for the rewritten question and list their sources.
print("Retrieving documents...")
docs = retriever.invoke(rewritten_question, search_type="similarity")

for doc in docs:
    print(f"Document source: {doc.metadata['source']}, tags: {doc.metadata['tags']}")

Rewritten question: What are the key features and functionalities of the Open OnDemand platform?
Retrieving documents...
Document source: https://rc.virginia.edu/userinfo/hpc/login/, tags: ['website', 'blog-post', 'userinfo', 'hpc', 'login', 'openondemand', 'rivanna', 'ssh', 'supercomputer']
Document source: https://rc.virginia.edu/userinfo/hpc/ood/desktop/, tags: ['website', 'blog-post', 'HPC', 'software', 'lang']
Document source: https://rc.virginia.edu/userinfo/hpc/ood/, tags: ['website', 'blog-post', 'userinfo', 'allocations', 'gpu', 'hpc', 'ood', 'parallel-computing', 'queues', 'rivanna']
