In [1]:
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
import bs4

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [2]:
# Download the podcast transcript page and parse only the elements carrying
# the "post-page" CSS class, so the rest of the page markup is never parsed.
loader = WebBaseLoader(
    web_paths=("https://www.josherich.me/podcast/low-level-technicals-of-llms-daniel-han",),
    bs_kwargs=dict(
        # Plain string on purpose: ("post-page") without a trailing comma was
        # already just a parenthesised string, never a one-element tuple.
        parse_only=bs4.SoupStrainer(class_="post-page")
    ),
)
docs = loader.load()

# Cut the loaded document(s) into overlapping chunks for embedding.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
all_splits = text_splitter.split_documents(docs)

# Show a short preview and the total count rather than echoing every chunk,
# which floods the cell output and bloats the saved notebook.
all_splits[:3], len(all_splits)

([Document(metadata={'source': 'https://www.josherich.me/podcast/low-level-technicals-of-llms-daniel-han'}, page_content='Low Level Technicals of LLMs: Daniel Han\n15 May 2025'),
  Document(metadata={'source': 'https://www.josherich.me/podcast/low-level-technicals-of-llms-daniel-han'}, page_content='Low Level Technicals of LLMs: Daniel Han\n[Music]\nWelcome to the AI Engineers world. This is the first workshop. There are a few others running, but thanks for coming. We just arrived from Australia with my brother. I think he’s over there somewhere. Yes, we just came here. We didn’t know a lot of stuff about SF and I think maybe the US is a bit different from Australia. But yeah, we’re very excited to be here. We’re going to stay here for a few months, so if you want to meet up, you can just hit me up via email or Twitter or wherever.'),
  Document(metadata={'source': 'https://www.josherich.me/podcast/low-level-technicals-of-llms-daniel-han'}, page_content='So today I’m going to be talkin

In [3]:
import getpass
import os

# Prompt for the OpenAI key only when the environment has no non-empty value
# for it; getpass keeps the secret out of the notebook source and output.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain_openai import OpenAIEmbeddings

# Embedding client handed to the vector store in the following cell.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [4]:
from langchain_chroma import Chroma

# Chroma collection that embeds documents with the `embeddings` client
# created in the previous cell.
vector_store = Chroma(
    embedding_function=embeddings,
    collection_name="example_collection",
    # Local directory where the data is saved; remove this argument if
    # persistence is not necessary.
    persist_directory="./chroma_langchain_db",
)

In [11]:
from langchain.chat_models import init_chat_model

# OPENAI_API_KEY is already ensured by the getpass prompt in the embeddings
# cell earlier in this notebook, so the copy-pasted imports and prompt are
# not repeated here. Run the notebook top to bottom (Restart & Run All).
llm = init_chat_model("gpt-4o-mini", model_provider="openai")

In [5]:
# The collection is persisted on disk, so adding documents without ids would
# store a fresh copy of every chunk each time this cell (or the whole
# notebook) is re-run. Deterministic ids built from the source URL and the
# chunk position let a re-run overwrite the same entries instead.
# NOTE(review): relies on langchain_chroma writing by id (upsert) - confirm
# for the installed version.
split_ids = [
    f"{split.metadata.get('source', 'doc')}#{position}"
    for position, split in enumerate(all_splits)
]
# Assign to `_` so the returned list of ids is not echoed as cell output.
_ = vector_store.add_documents(documents=all_splits, ids=split_ids)

In [17]:
import pprint

question = "what gpu does google colab offer?"

# Look up the stored chunks most similar to the question and stitch their
# text together, separated by blank lines, to form the prompt context.
retrieved_docs = vector_store.similarity_search(question)
docs_content = "\n\n".join([hit.page_content for hit in retrieved_docs])
pprint.pp(docs_content)

# Rough context size: number of space-separated pieces.
len(docs_content.split(" "))

('We’ll be showing some Triton code. I don’t know if we have time for '
 'programming Triton, but that’ll be another topic. The purpose of unof is to '
 'make everyone able to fine-tune their language models with very bad GPUs, '
 'like Tesla T4s. Does anyone know that Google Colab has free Tesla T4s?\n'
 'Yes, right? 65 Tera flops; it’s actually not that bad if you use it '
 'properly. Reminder: there’s a common misconception that the P100s on Kaggle '
 'are faster; that’s actually not correct. I think P100s are five times slower '
 'than Tesla T4s. Although it’s more expensive as a GPU, it’s actually slower. '
 'Please do not select the P100s on Kaggle.\n'
 'Kaggle has 30 hours for free per week GPUs, and you get two Tesla T4s, so '
 'that’s 130 teraflops per week. That is actually very powerful. I think it’s '
 'the same as RTX 3070, but I can’t remember exactly. Kaggle has 30 hours for '
 'free per week. Google Colab depends on how much you use; normally you get '
 'four hours per 

528

In [12]:
from langchain import hub

# Fetch the shared "rlm/rag-prompt" template from the hub.
prompt = hub.pull("rlm/rag-prompt")

# Fill the template with the retrieved context and the user's question,
# then send the resulting messages to the chat model.
messages = prompt.invoke({"context": docs_content, "question": question})
response = llm.invoke(messages)

# Display only the text of the model's reply.
response.content



'Google Colab offers free Tesla T4 GPUs for users. The Tesla T4 has a computing capability of 65 teraflops. In comparison, it is mentioned that the P100s available on Kaggle are slower than the T4s.'