In [2]:
from langchain_yt_dlp import YoutubeLoaderDL
from langchain_community.document_loaders import YoutubeLoader
from langchain_community.document_loaders.youtube import TranscriptFormat
from yt_dlp import YoutubeDL

# Load the video metadata and transcript
def load_youtube_video(url):
    """Return the transcript chunks of one YouTube video, tagged with video-level metadata.

    Two loaders are built from the same URL: ``YoutubeLoaderDL`` (with
    ``add_video_info=True``) supplies the video-level metadata, and
    ``YoutubeLoader`` (with ``TranscriptFormat.CHUNKS``) supplies the transcript
    documents.

    Args:
        url: URL of the video, passed unchanged to both loaders.

    Returns:
        The list of transcript documents produced by ``YoutubeLoader``. Each
        document's ``metadata`` is updated in place with the metadata of the
        first ``YoutubeLoaderDL`` document, plus ``source_type='youtube'`` and
        a 1-based ``chunk_number``.
    """
    info_loader = YoutubeLoaderDL.from_youtube_url(url, add_video_info=True)
    chunk_loader = YoutubeLoader.from_youtube_url(
        url,
        add_video_info=False,
        transcript_format=TranscriptFormat.CHUNKS,
    )

    video_docs = info_loader.load()
    chunks = chunk_loader.load()

    for chunk_number, chunk in enumerate(chunks, start=1):
        # Video-level keys overwrite same-named keys already on the chunk;
        # the two tagging keys are applied last so they always win.
        chunk.metadata.update({
            **video_docs[0].metadata,
            'source_type': 'youtube',
            'chunk_number': chunk_number,
        })

    return chunks

# Load all videos from a YouTube channel
def load_youtube_channel(channel_url):
    """Load transcript chunks for every video listed under a channel/playlist URL.

    The URL is resolved with ``YoutubeDL.extract_info(..., download=False)`` and
    each returned entry's ``url`` is passed to ``load_youtube_video``.

    Args:
        channel_url: URL whose extraction result is expected to contain an
            ``entries`` list (the notebook uses it with a playlist URL).

    Returns:
        A flat list with the documents of all videos, in entry order.

    Raises:
        ValueError: If extraction yields nothing or the result has no
            ``entries`` key (for example, when the URL is a single video).
    """
    ydl_opts = {
        # NOTE(review): presumably keeps yt-dlp from resolving every entry in
        # full, so only the listing is fetched here - confirm against yt-dlp docs.
        'extract_flat': True,
        'quiet': True,
        'force_generic_extractor': True,
    }

    print(f"Loading channel: {channel_url}")

    with YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(channel_url, download=False)

    # Validate before reading anything from `info`, so a bad URL fails with the
    # intended error instead of an AttributeError or a misleading
    # "Found 0 videos" line followed by an exception.
    if not info or 'entries' not in info:
        raise ValueError("No entries found in the channel URL.")

    # Materialise the entries once: this tolerates a lazy iterable (len() would
    # fail on it) and drops empty placeholders for entries that did not resolve.
    entries = [entry for entry in info['entries'] if entry]

    print(f"Found {len(entries)} videos in channel: {info.get('title', 'Unknown Channel')}")

    documents = []
    for entry in entries:
        video_url = entry['url']
        # The title is only used for logging, so a missing one must not abort the run.
        print(f"Loading video: {entry.get('title', 'Unknown Title')} ({video_url})")
        documents.extend(load_youtube_video(video_url))

    return documents

# Alternative source (an entire channel), currently disabled:
# documents = load_youtube_channel('https://www.youtube.com/@UVAResearchComputing')

# Playlist that is ingested by this notebook.
PLAYLIST_URL = 'https://www.youtube.com/playlist?list=PLT4bryHgBcRP7N-hB9u6EWs6tq_2nMoRO'
documents = load_youtube_channel(PLAYLIST_URL)

Loading channel: https://www.youtube.com/playlist?list=PLT4bryHgBcRP7N-hB9u6EWs6tq_2nMoRO
Found 3 videos in channel: RC Tutorial Series
Loading video: Connecting to HPC (https://www.youtube.com/watch?v=94qLtfdsXaM)
Loading video: Open OnDemand Interactive Apps (https://www.youtube.com/watch?v=o9XVUhCQuEI)
Loading video: Features of Open OnDemand (https://www.youtube.com/watch?v=MpzThi43iak)


In [3]:
# print unique video metadatas
import json

# Metadata keys that differ from chunk to chunk; everything else is treated as
# video-level metadata.
PER_CHUNK_KEYS = ('chunk_number', 'start_seconds', 'start_timestamp')

# Serialise each chunk's remaining metadata to JSON with sorted keys so that
# equal dicts produce equal strings and collapse to a single set member.
unique_metadata = {
    json.dumps(
        {k: v for k, v in d.metadata.items() if k not in PER_CHUNK_KEYS},
        sort_keys=True,
    )
    for d in documents
}

print(f"Total unique video metadata entries: {len(unique_metadata)}")

# Iterate in sorted order: the iteration order of a set of strings can change
# between interpreter runs, which would make this cell's output unstable.
for metadata in sorted(unique_metadata):
    print(json.loads(metadata))

Total unique video metadata entries: 3
{'author': 'UVA Research Computing', 'channel_id': 'UCDjikQvnYrZ3aNIdKgU54ag', 'description': 'This short tutorial provides an overview of the various features available in Open OnDemand.\n\n*Important Links*\nOpen OnDemand: https://ood.hpc.virginia.edu/\nRC Learning: https://learning.rc.virginia.edu\nMain RC Site: https://rc.virginia.edu', 'length': 611, 'publish_date': '2025-06-30', 'source': 'MpzThi43iak', 'source_type': 'youtube', 'title': 'Features of Open OnDemand', 'view_count': 7, 'webpage_url': 'https://www.youtube.com/watch?v=MpzThi43iak'}
{'author': 'UVA Research Computing', 'channel_id': 'UCDjikQvnYrZ3aNIdKgU54ag', 'description': "This short tutorial provides an overview of methods to connect to the University of Virginia's HPC systems, including Open OnDemand, FastX, and SSH.\n\n*Important Links*\nOpen OnDemand: https://ood.hpc.virginia.edu/\nFastX: https://fastx.hpc.virginia.edu/\nUVA VPN: https://in.virginia.edu/vpn\nRC Learning: ht

In [4]:
from chromadb import PersistentClient
from langchain_chroma import Chroma
from langchain_ollama import OllamaEmbeddings

# On-disk location handed to the Chroma client, and the collection used below.
CHROMA_PATH = "./chroma"
COLLECTION_NAME = "data"

# Embedding function backed by an Ollama server reachable on localhost.
embeddings = OllamaEmbeddings(model="qwen3", base_url="http://localhost:11434")

persistent_client = PersistentClient(path=CHROMA_PATH)

# Obtain the collection up front (created if it does not exist yet, going by
# the client method's name); the handle is kept for ad-hoc inspection.
collection = persistent_client.get_or_create_collection(name=COLLECTION_NAME)

# LangChain wrapper over the same client and collection, embedding with Ollama.
vector_store = Chroma(
    embedding_function=embeddings,
    collection_name=COLLECTION_NAME,
    client=persistent_client,
)

# Add documents to the vector store
def add_documents_to_vector_store(documents):
    """Add documents to the module-level ``vector_store`` under deterministic IDs.

    Each ID is ``<source_type>_<source>_<chunk_number>``, built from the
    document's metadata.

    Args:
        documents: Sized collection of documents whose ``metadata`` contains
            ``source_type``, ``source`` and ``chunk_number``.

    Raises:
        KeyError: If a document lacks one of the three metadata keys.
    """
    # NOTE(review): `document_to_id` in this notebook builds the same ID but also
    # replaces "/", ":", "?" and "#" with "_"; consider unifying the two schemes.
    if documents:
        # f-string formatting also accepts non-string metadata values, which
        # plain "+" concatenation would reject with a TypeError.
        ids = [
            f"{doc.metadata['source_type']}_{doc.metadata['source']}_{doc.metadata['chunk_number']}"
            for doc in documents
        ]
        vector_store.add_documents(documents=documents, ids=ids)
    # With nothing to add the store is not called at all: an empty upsert is
    # pointless and is presumably rejected by the Chroma client - TODO confirm.
    print(f"Added {len(documents)} documents to the vector store.")

Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


In [5]:
def document_to_id(doc):
    """Build the storage ID ``<source_type>_<source>_<chunk_number>`` for a document.

    The characters "/", ":", "?" and "#" are each replaced by "_" in the result.

    Args:
        doc: Object whose ``metadata`` mapping holds ``source_type``,
            ``source`` and ``chunk_number``.

    Returns:
        The ID as a string.
    """
    meta = doc.metadata
    raw_id = f"{meta['source_type']}_{meta['source']}_{meta['chunk_number']}"
    # One translation pass instead of four chained replace() calls.
    unsafe_to_underscore = str.maketrans("/:?#", "____")
    return raw_id.translate(unsafe_to_underscore)

In [6]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain_community.storage import SQLStore
from langchain.storage._lc_store import create_kv_docstore

# SQLite database and namespace in which the parent documents are stored.
DOCUMENT_STORE_URL = "sqlite:///document_store.db"
DOCUMENT_STORE_NAMESPACE = "website_documents"

# Number of documents per existence query and per retriever.add_documents() call.
BATCH_SIZE = 100

# Splitter for the child chunks: 400 characters each, overlapping by 200.
child_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=200,
    length_function=len,
)

sql_store = SQLStore(
    namespace=DOCUMENT_STORE_NAMESPACE,
    db_url=DOCUMENT_STORE_URL,
)
# Create the backing table if it is missing. Without this, the first mget()
# against a brand-new database file would fail, so the notebook would only run
# where an earlier session had already created the schema.
sql_store.create_schema()
doc_store = create_kv_docstore(sql_store)

retriever = ParentDocumentRetriever(
    vectorstore=vector_store,
    docstore=doc_store,
    child_splitter=child_splitter,
)

# Keep only documents whose ID is not in the document store yet. Stored values
# are looked up with one mget() per batch instead of one query per document;
# mget() returns one value per requested key (None when absent), in key order.
document_ids = [document_to_id(doc) for doc in documents]
to_add = []
for start in range(0, len(documents), BATCH_SIZE):
    stored_values = sql_store.mget(keys=document_ids[start:start + BATCH_SIZE])
    to_add.extend(
        doc
        for doc, stored in zip(documents[start:start + BATCH_SIZE], stored_values)
        if stored is None
    )
print(f"Documents to add: {len(to_add)}/{len(documents)} total documents")

# Add documents to the vector store in batches
total_batches = (len(to_add) + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division
for i in range(0, len(to_add), BATCH_SIZE):
    batch = to_add[i:i + BATCH_SIZE]
    retriever.add_documents(documents=batch, ids=[document_to_id(doc) for doc in batch])
    print(f"Processed batch {i // BATCH_SIZE + 1}/{total_batches}")

Documents to add: 16/16 total documents
Processed batch 1/1
