In [1]:
!pip install pytube
!pip install youtube-transcript-api

Collecting pytube
  Downloading pytube-15.0.0-py3-none-any.whl.metadata (5.0 kB)
Downloading pytube-15.0.0-py3-none-any.whl (57 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytube
Successfully installed pytube-15.0.0
Collecting youtube-transcript-api
  Downloading youtube_transcript_api-1.0.3-py3-none-any.whl.metadata (23 kB)
Downloading youtube_transcript_api-1.0.3-py3-none-any.whl (2.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.2/2.2 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: youtube-transcript-api
Successfully installed youtube-transcript-api-1.0.3


In [2]:
from pytube import Playlist
from youtube_transcript_api import YouTubeTranscriptApi


playlist_url = "https://www.youtube.com/playlist?list=PL8PYTP1V4I8D4BeyjwWczukWq9d8PNyZp"
playlist = Playlist(playlist_url)

transcripts = {}

print(f"Found {len(list(playlist.video_urls))} videos in the playlist.")
for video in playlist.videos:
    video_id = video.video_id
    print(f"Processing video ID: {video_id}")

    try:

        transcript_segments = YouTubeTranscriptApi.get_transcript(video_id)

        full_text = " ".join(segment['text'] for segment in transcript_segments)
        transcripts[video_id] = full_text
        print(f"Transcript retrieved for video ID: {video_id}")
    except Exception as e:
        print(f"Could not retrieve transcript for video ID {video_id}: {e}")

for vid, text in transcripts.items():
    print(f"\nVideo ID: {vid}\nTranscript (first 200 chars): {text[:200]}...")
    print(f"\n video lenght: {len(text)}")


Found 23 videos in the playlist.
Processing video ID: MM48kc5Zq8A
Transcript retrieved for video ID: MM48kc5Zq8A
Processing video ID: F4ww_V6tA-w
Transcript retrieved for video ID: F4ww_V6tA-w
Processing video ID: 111E7iaRgY4
Transcript retrieved for video ID: 111E7iaRgY4
Processing video ID: vlAIa1eZVR4
Transcript retrieved for video ID: vlAIa1eZVR4
Processing video ID: 2oO8dQwT0ek
Transcript retrieved for video ID: 2oO8dQwT0ek
Processing video ID: iWcGS0gCL1E
Transcript retrieved for video ID: iWcGS0gCL1E
Processing video ID: 1Faf1cTe3T8
Transcript retrieved for video ID: 1Faf1cTe3T8
Processing video ID: hCnKbwPntrE
Transcript retrieved for video ID: hCnKbwPntrE
Processing video ID: hs37ze1j41A
Transcript retrieved for video ID: hs37ze1j41A
Processing video ID: KfQaYk4k9eM
Transcript retrieved for video ID: KfQaYk4k9eM
Processing video ID: DvVGkj4zhVU
Transcript retrieved for video ID: DvVGkj4zhVU
Processing video ID: qHNUVpKO2dc
Transcript retrieved for video ID: qHNUVpKO2dc
Process

In [3]:
!pip install transformers faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [4]:
import re
from transformers import AutoTokenizer, AutoModel
import torch
import faiss
import numpy as np

In [5]:

def preprocess_text(text):
    """Clean the text by removing extra whitespace."""
    text = re.sub(r'\s+', ' ', text).strip()
    return text

def chunk_text(text, chunk_size=200, overlap=20):
    """
    Split text into chunks with a given chunk size and overlap.
    Returns a list of chunks.
    """
    words = text.split()
    chunks = []
    start = 0
    while start < len(words):
        chunk = " ".join(words[start:start + chunk_size])
        chunks.append(chunk)
        start += chunk_size - overlap
    return chunks


all_chunks = []
metadata = []

for vid, text in transcripts.items():
    clean = preprocess_text(text)
    chunks = chunk_text(clean, chunk_size=200, overlap=20)
    for i, chunk in enumerate(chunks):
        all_chunks.append(chunk)
        metadata.append({"video_id": vid, "chunk_index": i})

print(f"Total chunks generated: {len(all_chunks)}")

Total chunks generated: 1385


In [6]:

model_name = "sentence-transformers/all-MiniLM-L6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

def get_embedding(text):
    """Generate an embedding for a text chunk using mean pooling."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        outputs = model(**inputs)

    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().numpy()
    return embeddings

embeddings = [get_embedding(chunk) for chunk in all_chunks]
embedding_dim = embeddings[0].shape[0]
embedding_matrix = np.vstack(embeddings).astype("float32")
print(f"Embeddings generated with dimension: {embedding_dim}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Embeddings generated with dimension: 384


In [7]:
!pip install chromadb

Collecting chromadb
  Downloading chromadb-1.0.4-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.9 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi==0.115.9 (from chromadb)
  Downloading fastapi-0.115.9-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.24.1-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting onnxruntime>=1.14.1 (from chromadb)
  Downloading onnxruntime-1.21.0-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.5 kB)
Collecting opentelemetry-exporter-otlp-proto-grpc>=1.2.0 (from chromadb)
  Downloading opentele

In [8]:
import chromadb

chroma_client = chromadb.PersistentClient(path="/content/chroma_db")

collection = chroma_client.get_or_create_collection(name="youtube_rag")

for idx, (chunk, embedding) in enumerate(zip(all_chunks, embeddings)):
    collection.add(
        ids=[f"doc_{idx}"],
        documents=[chunk],
        embeddings=[embedding.tolist()]
    )

print(" Embeddings stored successfully in ChromaDB!")


 Embeddings stored successfully in ChromaDB!


In [9]:
query_text = "what is transformers"
query_embedding = get_embedding(query_text).tolist()


results = collection.query(
    query_embeddings=[query_embedding],
    n_results=3
)

print(" Retrieved Documents:", results["documents"])


 Retrieved Documents: [["um it's based entirely on attention and so it doesn't use uh recurrent neural networks or any other uh variety of sequence encoding mechanism uh it it just uses attention um it has strong results on a wrote machine translation because that's what they did in the original paper but really everything um and it's fast because it doesn't require any of the um you know incremental processing that rnns require as I mentioned before so there's two types of Transformers uh that we need to be aware of there's encoder decoder Transformers and these were the original type of Transformer they're used in uh semi poopular models like T5 and art um and then there's decoder only models and decoder only models only have a decoder so um the encoder decoder model looks like this the decoder only model looks like this um and I'll explain about uh both of these in turn so the Core Concepts in Transformers uh that are not covered by regular attention are positional encodings multi-h

In [10]:
def retrieve_relevant_docs(query_text, top_k=3):
    """Retrieve top-k relevant documents from ChromaDB."""
    query_embedding = get_embedding(query_text).tolist()
    results = collection.query(
        query_embeddings=[query_embedding],
        n_results=top_k
    )

    return results["documents"]
# Example query
query = "what is  perplexity"
retrieved_docs = retrieve_relevant_docs(query, top_k=3)

print("🔍 Retrieved Documents:")
for idx, doc in enumerate(retrieved_docs[0], 1):
    print(f"{idx}. {doc}")



🔍 Retrieved Documents:
1. and basically this is an exponentiated version of the the previous metrics that I talked about but it has a very intuitive interpretation um and so if I have this sentence here um and I say when a dog sees a squirrel it will usually does anyone have an idea what might come after this Chase spk Run yeah these are pretty good ones um so here's an example of calculating this according to a language model um there's also lots of other things that nobody guess um like be or jump or start or run or try um but the basic idea uh behind perplexity is it's kind of the average number of guesses you would need to take before if you randomly selected from a language model's probability distribution before you got the correct answer and um so if if the next word was B you would basically for this particular token you would get a perplexity of 28.4 because you would need to sample from the language model an average of 28.4 times before you got this and so um for easy tasks y

In [11]:
!pip install huggingface_hub



In [12]:
from huggingface_hub import login
import os

login(token="yours")
api_key = os.getenv("HUGGINGFACE_API_KEY")

from transformers import pipeline

api_key = os.getenv("HUGGINGFACE_API_KEY")
generator = pipeline("text-generation", model="gpt2", use_auth_token=api_key)

query = "Explain machine learning basics"
retrieved_docs = retrieve_relevant_docs(query, top_k=3)

retrieved_docs = [doc for sublist in retrieved_docs for doc in sublist]
context = " ".join(retrieved_docs)

prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"

response = generator(prompt, max_new_tokens=100)

print("🤖 GPT-2 Response:\n", response[0]['generated_text'])

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


🤖 GPT-2 Response:
 Context: so today I want to talk about uh learning from Human feedback and specifically reinforcement learning from Human feedback and so I'm going to talk about both uh you know learning algorithms and how we uh get feedback and how we we learn from it so up until this point most of the stuff we talked about was uh maximum likelihood training and specifically maximum likelihood training we have some sort of input this could be something like a prompt uh we have an output which is you know our predicted uh output and we calculate the log probability or negative log probability of the output given the input and we want to basically minimize this negative log probability minimize our loss and so everybody knows about this um you know you're implementing a lot of the loss calculations and stuff um is is part of assignment one but there's a few problems with this and uh here are some examples of problems uh with these learning algorithms so problem number one is that som

In [13]:


def interactive_rag():
    """Interactive RAG loop: Ask questions until 'exit' is entered."""
    while True:
        query = input("\n Ask a question (or type 'exit' to stop): ")
        if query.lower() == "exit":
            print("👋 Exiting RAG system. Have a great day!")
            break


        results = collection.query(
            query_embeddings=[get_embedding(query).tolist()],
            n_results=3
        )


        retrieved_docs = results["documents"][0]
        retrieved_sources = results["ids"][0]


        context = " ".join(retrieved_docs)


        prompt = f"Context: {context}\n\nQuestion: {query}\n\nAnswer:"
        response = generator(prompt, max_new_tokens=100)


        print("\nGPT-2 Response: ")
        print(response[0]['generated_text'])

        print("\n Sources:")
        for idx, source in enumerate(retrieved_sources, 1):
            print(f"{idx}. {source}")


interactive_rag()



 Ask a question (or type 'exit' to stop): what is RNN


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



GPT-2 Response: 
Context: infinite length sequence um but the disadvantage of rnns is that this you know Vector is a fixed size and so it's very hard to remember all of the information previously in the sequences another issue with rnn's is that each step of the RNN uh requires the output of the previous step so they become quite hard to parallelize on gpus and because of this their flop utilization rate on uh kind of modern gpus is very low and so they aren't really an effective way to use lots and lots of compute that we have on uh on Modern gpus and because of this they've kind of fallen out of favor and aren't used very widely anymore so another option is uh something based on convolution and convolutions uh what they are is they're basically something that applies a single function of the inputs across lots of um across a sliding window on the inputs so uh to give an example we could have a CNN uh that takes in these three vectors and somehow calculates the output another CNN tha

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



GPT-2 Response: 
Context: um it's based entirely on attention and so it doesn't use uh recurrent neural networks or any other uh variety of sequence encoding mechanism uh it it just uses attention um it has strong results on a wrote machine translation because that's what they did in the original paper but really everything um and it's fast because it doesn't require any of the um you know incremental processing that rnns require as I mentioned before so there's two types of Transformers uh that we need to be aware of there's encoder decoder Transformers and these were the original type of Transformer they're used in uh semi poopular models like T5 and art um and then there's decoder only models and decoder only models only have a decoder so um the encoder decoder model looks like this the decoder only model looks like this um and I'll explain about uh both of these in turn so the Core Concepts in Transformers uh that are not covered by regular attention are positional encodings multi

In [14]:

# --- Step 3: index the embeddings with FAISS ---

index = faiss.IndexFlatL2(embedding_dim)  # using L2  distance
index.add(embedding_matrix)
print(f"Indexed {index.ntotal} embeddings.")

def retrieve(query, k=3):
    """Retrieve the top-k relevant chunks for a given query."""
    query_embedding = get_embedding(query).astype("float32")
    query_embedding = np.expand_dims(query_embedding, axis=0)
    distances, indices = index.search(query_embedding, k)
    results = [all_chunks[i] for i in indices[0]]

    results_meta = [metadata[i] for i in indices[0]]
    return results, results_meta


query = "What are the main points discussed in the CMU Advanced NLP lectures?"
retrieved_chunks, retrieved_meta = retrieve(query, k=3)
print("\nRetrieved Chunks:")
for i, (chunk, meta) in enumerate(zip(retrieved_chunks, retrieved_meta), 1):
    print(f"Chunk {i} (from video {meta['video_id']}): {chunk[:200]}...")


Indexed 1385 embeddings.

Retrieved Chunks:
Chunk 1 (from video MM48kc5Zq8A): and also um knowledge based QA information extraction other things like this um and we're also going to talk about Linguistics and multilinguality so um we're going to be talking about um the differen...
Chunk 2 (from video MM48kc5Zq8A): analysis Stanford core NLP and Spacey um and both of them make just very obvious mistakes on things that should be very easy so this is named dentity recognition kind of a task that many people consid...
Chunk 3 (from video 5qA0DtprgFY): and structured um even if you may not agree with certain theories um and all of that aside I think that Linguistics is just fun you don't really need like an engineering reason to study things um so I...


In [16]:
while True:
    user_query = input("Please enter your query for the RAG system (or type 'stop' to exit): ")
    if user_query.lower().strip() == "stop":
        print("Exiting the RAG interactive loop.")
        break
    final_answer = rag_pipeline(user_query, top_k=3)
    print("\nFinal Generated Answer:")
    print(final_answer)
    print("\n" + "="*50 + "\n")

Please enter your query for the RAG system (or type 'stop' to exit): what is bag of word

Final Generated Answer:
Bag of Words for the Word Representation
This question is the answer for this topic and answers for all the other questions that I've seen so far in my research.
In the main part of my article I talked a little bit about
warp learning, how the neural network works and how it works, I also talked
about bag and bag
classification learning in
dense word representations
and the word classification and
word representation learning
for instance
I was also looking at bag learning for example
yes
but the term "bag" is only useful to me
in terms of how to classify a word correctly
from a single word space to a
sequence of
words and I didn't look
at bag or


Please enter your query for the RAG system (or type 'stop' to exit): stop
Exiting the RAG interactive loop.
