In [13]:
from langchain_yt_dlp import YoutubeLoaderDL
from langchain_community.document_loaders import YoutubeLoader
from langchain_community.document_loaders.youtube import TranscriptFormat
from yt_dlp import YoutubeDL

# Load the video metadata and transcript
def load_youtube_video(url):
    metadata_loader = YoutubeLoaderDL.from_youtube_url(
        url,
        add_video_info=True,
    )

    transcript_loader = YoutubeLoader.from_youtube_url(
        url,
        add_video_info=False,
        transcript_format=TranscriptFormat.CHUNKS
    )

    metadata_docs = metadata_loader.load()
    transcript_docs = transcript_loader.load()

    for i, doc in enumerate(transcript_docs):
        doc.metadata.update(metadata_docs[0].metadata)
        doc.metadata['source_type'] = 'youtube'
        doc.metadata['chunk_number'] = i + 1
    
    return transcript_docs

# Load all videos from a YouTube channel
def load_youtube_channel(channel_url):
    ydl_opts = {
        'extract_flat': True,
        'quiet': True,
        'force_generic_extractor': True,
    }

    print(f"Loading channel: {channel_url}")

    with YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(channel_url, download=False)
    
    print(f"Found {len(info.get('entries', []))} videos in channel: {info.get('title', 'Unknown Channel')}")
    
    if 'entries' not in info:
        raise ValueError("No entries found in the channel URL.")
    
    documents = []
    for entry in info['entries']:
        video_url = entry['url']
        print(f"Loading video: {entry['title']} ({video_url})")
        docs = load_youtube_video(video_url)
        documents.extend(docs)
    
    return documents

# documents = load_youtube_channel('https://www.youtube.com/@UVAResearchComputing')
documents = load_youtube_channel('https://www.youtube.com/playlist?list=PLT4bryHgBcRP7N-hB9u6EWs6tq_2nMoRO')

Loading channel: https://www.youtube.com/playlist?list=PLT4bryHgBcRP7N-hB9u6EWs6tq_2nMoRO
Found 2 videos in channel: RC Tutorial Series
Loading video: Connecting to HPC (https://www.youtube.com/watch?v=94qLtfdsXaM)
Loading video: Open OnDemand Interactive Apps (https://www.youtube.com/watch?v=o9XVUhCQuEI)


In [15]:
# print unique video metadatas
import json

unique_metadata = {json.dumps({k: v for k, v in d.metadata.items() if k not in ['chunk_number', 'start_seconds', 'start_timestamp']}, sort_keys=True) for d in documents}

print(f"Total unique video metadata entries: {len(unique_metadata)}")

for metadata in unique_metadata:
    print(json.loads(metadata))

Total unique video metadata entries: 2
{'author': 'UVA Research Computing', 'channel_id': 'UCDjikQvnYrZ3aNIdKgU54ag', 'description': "This short tutorial provides an overview of methods to connect to the University of Virginia's HPC systems, including Open OnDemand, FastX, and SSH.\n\n*Important Links*\nOpen OnDemand: https://ood.hpc.virginia.edu/\nFastX: https://fastx.hpc.virginia.edu/\nUVA VPN: https://in.virginia.edu/vpn\nRC Learning: https://learning.rc.virginia.edu\nMain RC Site: https://rc.virginia.edu\n\n*Video Chapters*\n0:00 Intro\n0:39 Open OnDemand\n2:09 FastX\n4:20 SSH\n5:51 VPN Info\n6:02 Getting Help", 'length': 411, 'publish_date': '2025-05-21', 'source': '94qLtfdsXaM', 'source_type': 'youtube', 'title': 'Connecting to HPC', 'view_count': 23, 'webpage_url': 'https://www.youtube.com/watch?v=94qLtfdsXaM'}
{'author': 'UVA Research Computing', 'channel_id': 'UCDjikQvnYrZ3aNIdKgU54ag', 'description': 'This short tutorial provides an overview of the interactive apps available 

In [None]:
from chromadb import PersistentClient
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

CHROMA_PATH = "./chroma"
COLLECTION_NAME = "data"

embeddings = OllamaEmbeddings(
    base_url="http://localhost:11434",
    model="qwen3"
)

persistent_client = PersistentClient(
    path=CHROMA_PATH,
)

collection = persistent_client.get_or_create_collection(name=COLLECTION_NAME)

vector_store = Chroma(
    client=persistent_client,
    collection_name=COLLECTION_NAME,
    embedding_function=embeddings,
)

# Add documents to the vector store
def add_documents_to_vector_store(documents):
    vector_store.add_documents(
        documents=documents,
        ids=[doc.metadata['source_type'] + "_" + doc.metadata['source'] + "_" + str(doc.metadata['chunk_number']) for doc in documents],
    )
    print(f"Added {len(documents)} documents to the vector store.")

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [19]:
add_documents_to_vector_store(documents)

Added 10 documents to the vector store.


In [20]:
from langchain_ollama import ChatOllama
import json

def rag_query(query):
    """
    Perform a RAG query on the Chroma vector store.
    """
    # print number of documents in the vector store
    print(f"Number of documents in vector store: {len(vector_store.get())}")

    docs = vector_store.similarity_search_with_relevance_scores(query, k=5)
    context = "\n".join([result.page_content for result, _ in docs]) if docs else "No relevant documents found."
    print(vector_store.get())
    # print(f"Context for query '{query}': {context}")


    prompt = f"""
    You are a chatbot for University of Virginia Research Computing (UVA RC). You are given the following context from the UVA RC YouTube channel and from various other UVA RC resources. Respond to the question using the context provided succinctly and accurately.
    Context: {context}
    Question: {query}
    Answer:
    """

    chat = ChatOllama(
        base_url="http://localhost:11434",
        model="qwen3",
    )

    # disable thinking 
    response = chat.invoke(prompt, think=False)

    sources = {doc.metadata["source_type"]: doc.metadata for doc, _score in docs}

    sources_formatted = []
    for source_type, metadata in sources.items():
        if source_type == "youtube":
            sources_formatted.append(f'Source: [{metadata["title"]}]({metadata["webpage_url"]}) by [{metadata["author"]}](https://www.youtube.com/channel/{metadata["channel_id"]})')

    formatted_response = json.dumps({
        "response": response.content,
        "sources": sources_formatted
    }, indent=2)

    print(formatted_response)

# Example query
rag_query("What is Open OnDemand?")

Number of documents in vector store: 7
{
  "response": "Open OnDemand is a web-based portal that provides access to various interactive applications and tools available through the University of Virginia Research Computing (UVA RC). It allows users to launch applications like JupyterLab, RStudio Server, and others, as well as connect to desktop environments and manage computational resources.",
  "sources": [
    "Source: [Connecting to HPC](https://www.youtube.com/watch?v=94qLtfdsXaM) by [UVA Research Computing](https://www.youtube.com/channel/UCDjikQvnYrZ3aNIdKgU54ag)"
  ]
}


In [25]:
# drop langchain collection
vector_store.get()

{'ids': ['youtube_94qLtfdsXaM_1',
  'youtube_94qLtfdsXaM_2',
  'youtube_94qLtfdsXaM_3',
  'youtube_94qLtfdsXaM_4',
  'youtube_o9XVUhCQuEI_1',
  'youtube_o9XVUhCQuEI_2',
  'youtube_o9XVUhCQuEI_3',
  'youtube_o9XVUhCQuEI_4',
  'youtube_o9XVUhCQuEI_5',
  'youtube_o9XVUhCQuEI_6'],
 'embeddings': None,
 'documents': ["Hello and welcome back to the University of Virginia's high performance computing tutorial series. In this module, we will cover three different ways to connect to the HPC system at UVA. The first method is open on demand, which is a web application accessed through a web browser. From there, you can manipulate files, work with jobs, and use a host of different guey applications. The second method is Fastex, which is another web application that gives you direct access to a Linux desktop where you can access your files, run guey applications, and use a browser within the HPC system. Lastly, there is SSH, the secure shell client, which gives you a command lineon view of the clu