In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

# Loading data

In [2]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [3]:
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [5]:
df[df.course == "data-engineering-zoomcamp"]

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
430,data-engineering-zoomcamp,Workshop 2 - RisingWave,Unable to Open Dashboard as xdg-open doesn’t o...,Refer to the solution given in the first solut...
431,data-engineering-zoomcamp,Workshop 2 - RisingWave,Resolving Python Interpreter Path Inconsistenc...,Example Error:\nWhen attempting to execute a P...
432,data-engineering-zoomcamp,Workshop 2 - RisingWave,How does windowing work in Sql?,Ans : Windowing in streaming SQL involves defi...
433,data-engineering-zoomcamp,Triggers in Mage via CLI,"Encountering the error ""ModuleNotFoundError: N...","Python 3.12.1, is not compatible with kafka-py..."


# Vectorization

In [6]:
docs_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

Text into vectors

In [7]:
# Simple bag-of-words model
# Thinking about the frequency of a word in a single document 
cv = CountVectorizer(stop_words='english') #removes English stop words (e.g., “the”, “is”, “and”) during vectorization to keep only more meaningful words.
X = cv.fit_transform(docs_example) #doc to matrix
#each row is one document
#each column is one term (word)
names = cv.get_feature_names_out() # list of words (features/columns) that remain after stop word removal

df_docs = pd.DataFrame(X.toarray(), columns=names).T # transposed matrix
df_docs

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


In [8]:
# Relevance of both frequency and relevance of a word in a document
cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)

names = cv.get_feature_names_out()

df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs.round(2)

Unnamed: 0,0,1,2,3,4
15th,0.46,0.0,0.0,0.0,0.0
2024,0.46,0.0,0.0,0.0,0.0
cloud,0.0,0.0,0.0,0.0,0.46
course,0.37,0.0,0.0,0.0,0.37
date,0.0,0.0,0.5,0.0,0.0
github,0.0,0.58,0.0,0.0,0.0
google,0.0,0.0,0.0,0.0,0.46
homeworks,0.0,0.0,0.5,0.0,0.0
jan,0.46,0.0,0.0,0.0,0.0
listed,0.0,0.58,0.0,0.0,0.0


Example for word="15th", compute TF-IDF for "15th" in Doc 0:

Step 1: TF
Words in Doc 0: ["course", "starts", "15th", "jan", "2024"] (after stop word removal)
TF("15th", Doc 0) = 1 / 5 = 0.20

Step 2: IDF
"15th" appears in only one document → df = 1
N = 5 documents total
Using the formula:
IDF("15th") = log(5 / (1 + 1)) + 1 = log(2.5) + 1 ≈ 0.92 + 1 = 1.92

Step 3: TF-IDF
TF-IDF("15th", Doc 0) = 0.20 × 1.92 ≈ 0.384

Then scikit-learn normalizes the document vector (L2 norm by default), so that the sum of squares of all TF-IDF values for a document = 1.

After normalization, ~0.384 might become ~0.46, depending on the other values in the doc.
_____________________________________

Why course has lower TF-IDF than 15th
Even though "course" appears in two documents, its IDF is lower:
df = 2
IDF(course) = log(5 / (2 + 1)) + 1 = log(1.67) + 1 ≈ 0.51 + 1 = 1.51
So the TF-IDF of "course" is lower than that of a rare word like "15th".

_____________________________________
NOTES
Word	        Appears In Docs	      TF-IDF High If...
Common word	    Many docs	         → Lower IDF → Lower TF-IDF
Rare word	    Few docs	         → Higher IDF → Higher TF-IDF
Frequent in doc	Many times	         → Higher TF → Higher TF-IDF

In [9]:
query = "Do I need to know python to sign up for the January course?"

q = cv.transform([query])
# transforms the query using the same vocabulary learned from docs_example.
# This means it only considers words that exist in the origional documents
# anything new is ignored.

q.toarray()

array([[0.        , 0.        , 0.        , 0.62791376, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.77828292, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ]])

In [10]:
query_dict = dict(zip(names, q.toarray()[0]))
query_dict

doc_dict = dict(zip(names, X.toarray()[1]))
doc_dict

{'15th': np.float64(0.0),
 '2024': np.float64(0.0),
 'cloud': np.float64(0.0),
 'course': np.float64(0.0),
 'date': np.float64(0.0),
 'github': np.float64(0.5773502691896258),
 'google': np.float64(0.0),
 'homeworks': np.float64(0.0),
 'jan': np.float64(0.0),
 'listed': np.float64(0.5773502691896258),
 'participation': np.float64(0.0),
 'prerequisites': np.float64(0.5773502691896258),
 'python': np.float64(0.0),
 'registration': np.float64(0.0),
 'required': np.float64(0.0),
 'setup': np.float64(0.0),
 'start': np.float64(0.0),
 'starts': np.float64(0.0),
 'submit': np.float64(0.0)}

In [11]:
df_qd = pd.DataFrame([query_dict, doc_dict], index=['query', 'doc']).T

(df_qd['query'] * df_qd['doc']).sum()

np.float64(0.0)

In [12]:
X.dot(q.T).toarray()

array([[0.23490553],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.59579005]])

In [13]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(X, q)

array([[0.23490553],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.59579005]])

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(X, q)

array([[0.23490553],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.59579005]])

# Vectorizing all the documents

In [15]:
fields = ['section', 'question', 'text']
transformers = {}
matrices = {}

for field in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=3)
    X = cv.fit_transform(df[field])

    transformers[field] = cv
    matrices[field] = X

transformers['text'].get_feature_names_out()
matrices['text']

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 26463 stored elements and shape (948, 2118)>

In [16]:
query = "I just signed up. Is it too late to join the course?"

q = transformers['text'].transform([query])
score = cosine_similarity(matrices['text'], q).flatten()

In [17]:
mask = (df.course == 'data-engineering-zoomcamp').values
score = score * mask

In [18]:
idx = np.argsort(-score)[:10]

In [19]:
df.iloc[idx].text

0      The purpose of this document is to capture fre...
15     No, late submissions are not allowed. But if t...
22     It's up to you which platform and environment ...
27     You can do most of the course without a cloud....
38     You will have two attempts for a project. If t...
287    This error could result if you are using some ...
3      You don't need it. You're accepted. You can al...
7      Yes, we will keep all the materials after the ...
113    In the join queries, if we mention the column ...
11     No, you can only get a certificate if you fini...
Name: text, dtype: object

In [20]:
boost = {'question': 3.0}

score = np.zeros(len(df))

for f in fields:
    b = boost.get(f, 1.0)
    q = transformers[f].transform([query])
    s = cosine_similarity(matrices[f], q).flatten()
    score = score + b * s

In [21]:
filters = {
    'course': 'data-engineering-zoomcamp'
}

for field, value in filters.items():
    mask = (df[field] == value).values
    score = score * mask

In [22]:
idx = np.argsort(-score)[:10]
results = df.iloc[idx]
results.to_dict(orient='records')

[{'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.'},
 {'course': 'data-engineering-zoomcamp',
  'section': 'General course-related questions',
  'question': 'Course - Which playlist on YouTube should I refer to?',
  'text': 'All the main videos are stored in the Main “DATA ENGINEERING” playlist (no year specified). The Github repository has also been updated to show each video with a thumbnail, that would bring you directly to the same playlist below.\nBelow is the MAIN PLAYLIST’. And then you refer to the

# Embeddings and vector search

In [23]:
from sklearn.decomposition import TruncatedSVD

X = matrices['text']
cv = transformers['text']

svd = TruncatedSVD(n_components=16)
X_emb = svd.fit_transform(X)

X_emb[0]

array([ 0.08800348, -0.07511074, -0.10124063,  0.05141978,  0.05144676,
       -0.0608464 ,  0.0203718 ,  0.05648187, -0.18921236,  0.33783015,
        0.03249474,  0.09268806, -0.10909555, -0.03839472,  0.05036441,
        0.02209768])

## Embeddings and Vector Search
__The Problem with Plain Text Search__
When searching through text using basic methods like keyword matching (code above), we only find exact matches. But what if someone uses a synonym or slightly different wording? That’s where embeddings come in.

What Are Embeddings?
Embeddings are a way to turn words, sentences, or even whole documents into numbers — specifically, into dense numerical vectors (basically arrays of numbers). Here's why that matters:

They capture meaning: Words or phrases with similar meanings end up with similar vectors, even if they’re not exactly the same.

They simplify complexity: Embeddings reduce the complexity of text by turning it into a more compact, structured form.

They enable machine learning: These numerical vectors can be used in models for things like recommendations, sentiment analysis, and finding patterns in text.

__SVD: A Simple Embedding Method__
One basic technique to generate embeddings is Singular Value Decomposition (SVD). It works on top of the Bag-of-Words model and helps reduce the dimensionality of the data.

Keep in mind:

SVD doesn't capture word order (because Bag-of-Words doesn’t either).

It does help compress and simplify the data while preserving key relationships — like finding synonyms.

This is a form of lossy compression: we lose some detail, but the overall structure and meaning are still preserved.

NOTE that SVD helps convert word count data into a more meaningful and compact numerical format that machine learning models can understand better.

In [24]:
from sklearn.decomposition import TruncatedSVD

X = matrices['text']
cv = transformers['text']

svd = TruncatedSVD(n_components=16)
X_emb = svd.fit_transform(X)

X_emb[0]

array([ 0.08799789, -0.07503867, -0.10031805,  0.04984937,  0.0530062 ,
       -0.05854748,  0.02024745,  0.0569222 , -0.19712635,  0.3409762 ,
        0.044602  ,  0.08600388, -0.11498712,  0.07985993, -0.03518501,
        0.00439217])

In [25]:
query = 'I just signed up. Is it too late to join the course?'

Q = cv.transform([query])
Q_emb = svd.transform(Q)
Q_emb[0]

array([ 0.04353632, -0.03055826, -0.04438354,  0.01206052,  0.02563275,
       -0.05145829,  0.00942722,  0.03673467, -0.11967534,  0.17740621,
        0.03776278,  0.06520014, -0.06504636,  0.03812922, -0.0212098 ,
        0.00444466])

In [26]:
np.dot(X_emb[0], Q_emb[0])

np.float64(0.12049567665842252)

For all of the documents

In [27]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
list(df.loc[idx].text)

['Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 'If you have submitted two projects (and peer-reviewed at least 3 course-mates’ projects for each submission), you will get the certificate for the course. According to the course coordinator, Alexey Grigorev, only two projects are needed to get the course certificate.\n(optional) David Odimegwu',
 'Please choose the closest one to your answer. Also do not post your answer in the course slack channel.',
 'No, it’s not possible. The form is closed after the due date. But don’t worry, homework is not mandatory for finishing the course.',
 'The course videos are pre-recorded, you can start watching the course right n

__Non-Negative Matrix Factorization (NMF)__

When we use SVD to create embeddings, the resulting vectors often contain negative numbers, which can make them hard to interpret.

That’s when we use Non-Negative Matrix Factorization (NMF). It works in a similar way to SVD, but with one key difference: it only works with non-negative numbers both in the input and the output.

Why does that matter?
Because with NMF, we can more easily understand the results. Each column in the embedding can be thought of as a topic or concept, and the numbers in the vector tell us how strongly a document relates to each of those topics.

In conclusion, NMF gives us a way to break down a document into recognizable, interpretable pieces, showing how much it's "about" each concept all using only positive numbers, which are more intuitive to work with.

In [28]:
from sklearn.decomposition import NMF
nmf = NMF(n_components=16)
X_emb = nmf.fit_transform(X)
X_emb[0]

array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 7.61547645e-06, 0.00000000e+00,
       0.00000000e+00, 3.07197604e-01, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [29]:
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

array([0.        , 0.00084236, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.17338195,
       0.        , 0.        , 0.        , 0.        , 0.00070699,
       0.        ])

In [30]:
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
list(df.loc[idx].text)

['The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.',
 'Everything is recorded, so you won’t miss anything. You will be able to ask your questions for office hours in advance and we will cover them during the live stream. Also, you can always ask questions in Slack.',
 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
 'No, it’s not possible. The form is closed after the due date. But don’t

__BERT__
Until now our approaches don't consider the word order and manage the words separately.
There are models that solve that issue, one of them is BERT. 

In [31]:
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Set the model to evaluation mode if not training

  from .autonotebook import tqdm as notebook_tqdm


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [32]:
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes"
]
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')

In [33]:
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

In [34]:
sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape

torch.Size([2, 768])

In [35]:
X_emb = sentence_embeddings.numpy()

In [36]:
sentence_embeddings_cpu = sentence_embeddings.cpu()

In [37]:
def make_batches(seq, n):
    result = []
    for i in range(0, len(seq), n):
        batch = seq[i:i+n]
        result.append(batch)
    return result

In [38]:
from tqdm.auto import tqdm
texts = df['text'].tolist()
text_batches = make_batches(texts, 8)

all_embeddings = []

for batch in tqdm(text_batches):
    encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

    with torch.no_grad():
        outputs = model(**encoded_input)
        hidden_states = outputs.last_hidden_state
        
        batch_embeddings = hidden_states.mean(dim=1)
        batch_embeddings_np = batch_embeddings.cpu().numpy()
        all_embeddings.append(batch_embeddings_np)

final_embeddings = np.vstack(all_embeddings)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 119/119 [09:47<00:00,  4.94s/it]


In [42]:
final_embeddings

array([[-0.00456304, -0.11667512,  0.6274718 , ..., -0.03659191,
         0.10031676,  0.02927125],
       [-0.1423361 , -0.19853921,  0.28455415, ..., -0.01139052,
        -0.1539977 ,  0.0953508 ],
       [ 0.19672242, -0.08461309,  0.2820051 , ...,  0.1139587 ,
        -0.06448027, -0.01282615],
       ...,
       [-0.28217432, -0.33324352,  0.29784998, ..., -0.35042733,
         0.03266049,  0.09537259],
       [-0.428071  , -0.39468753,  0.30941996, ..., -0.05943284,
        -0.12965173,  0.0788705 ],
       [-0.16892129, -0.25146273,  0.47843292, ..., -0.18535416,
        -0.16108926,  0.27272925]], shape=(948, 768), dtype=float32)

In [43]:
def compute_embeddings(texts, batch_size=8):
    text_batches = make_batches(texts, 8)
    
    all_embeddings = []
    
    for batch in tqdm(text_batches):
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')
    
        with torch.no_grad():
            outputs = model(**encoded_input)
            hidden_states = outputs.last_hidden_state
            
            batch_embeddings = hidden_states.mean(dim=1)
            batch_embeddings_np = batch_embeddings.cpu().numpy()
            all_embeddings.append(batch_embeddings_np)
    
    final_embeddings = np.vstack(all_embeddings)
    return final_embeddings

In [44]:
X_text = compute_embeddings(df['text'].tolist())

100%|████████████████████████████████████████████| 119/119 [08:21<00:00,  4.22s/it]
