## 1. Preparing the environment


In [2]:
import requests 

# Source of the FAQ data: a JSON list of courses, each carrying its own list of documents.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course nesting into one list, tagging every document with its course name.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [3]:
import pandas as pd

# Tabular view of the flattened documents; `columns` selects and orders the four fields.
df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
df.head()

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [5]:
# Peek at one flattened record to see the fields each document carries.
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

## 2. Basics of Text Search

* Information Retrieval - The process of obtaining relevant information from large datasets based on user queries.
* Vector Spaces - A mathematical representation where text is converted into vectors (points in space) allowing for quantitative comparison.
* Bag of Words - A simple text representation model treating each document as a collection of words disregarding grammar and word order but keeping multiplicity.
* TF-IDF (Term Frequency-Inverse Document Frequency) - A statistical measure used to evaluate how important a word is to a document in a collection or corpus. It increases with the number of times a word appears in the document but is offset by the frequency of the word in the corpus.


Vector Spaces
 - turn the docs into vectors
 - term-document matrix:
   - rows: documents
   - columns: words / tokens

In [24]:
# Tiny hand-written corpus used to illustrate the vectorizers below.
docs_example = [
    "January course details, register now",
    "Course prerequisites listed in January catalog",
    "Submit January course homework by end of month",
    "Register for January course, no prerequisites",
    "January course setup: Python and Google Cloud"
]

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag of words: learn the vocabulary (minus English stop words) and count terms per document.
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)
names = cv.get_feature_names_out()

# Show the counts with one row per term and one column per document.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs

Unnamed: 0,0,1,2,3,4
catalog,0,1,0,0,0
cloud,0,0,0,0,1
course,1,1,1,1,1
details,1,0,0,0,0
end,0,0,1,0,0
google,0,0,0,0,1
homework,0,0,1,0,0
january,1,1,1,1,1
listed,0,1,0,0,0
month,0,0,1,0,0


In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same corpus, but weighted with TF-IDF instead of raw counts.
cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)
names = cv.get_feature_names_out()

# Show the weights with one row per term and one column per document, rounded for display.
df_docs = pd.DataFrame(X.toarray().T, index=names)
df_docs.round(2)

Unnamed: 0,0,1,2,3,4
catalog,0.0,0.57,0.0,0.0,0.0
cloud,0.0,0.0,0.0,0.0,0.47
course,0.33,0.27,0.23,0.36,0.23
details,0.69,0.0,0.0,0.0,0.0
end,0.0,0.0,0.47,0.0,0.0
google,0.0,0.0,0.0,0.0,0.47
homework,0.0,0.0,0.47,0.0,0.0
january,0.33,0.27,0.23,0.36,0.23
listed,0.0,0.57,0.0,0.0,0.0
month,0.0,0.0,0.47,0.0,0.0


Query-Document Similarity


In [27]:
query = "Do I need to know python to sign up for the January course?"

# Reuse the already-fitted vectorizer so the query is expressed over the same vocabulary as the documents.
q = cv.transform([query])
q.toarray()

array([[0.        , 0.        , 0.39515588, 0.        , 0.        ,
        0.        , 0.        , 0.39515588, 0.        , 0.        ,
        0.        , 0.829279  , 0.        , 0.        , 0.        ]])

We can see the words of the query and the words of some document:

In [28]:
# Pair every vocabulary term with its weight in the query vector.
query_dict = {term: weight for term, weight in zip(names, q.toarray()[0])}
query_dict

{'catalog': 0.0,
 'cloud': 0.0,
 'course': 0.39515588491314224,
 'details': 0.0,
 'end': 0.0,
 'google': 0.0,
 'homework': 0.0,
 'january': 0.39515588491314224,
 'listed': 0.0,
 'month': 0.0,
 'prerequisites': 0.0,
 'python': 0.8292789960182417,
 'register': 0.0,
 'setup': 0.0,
 'submit': 0.0}

In [29]:
# Pair every vocabulary term with its weight in the second document (row 1 of X).
doc_dict = {term: weight for term, weight in zip(names, X.toarray()[1])}
doc_dict

{'catalog': 0.5675015398728066,
 'cloud': 0.0,
 'course': 0.2704175244456293,
 'details': 0.0,
 'end': 0.0,
 'google': 0.0,
 'homework': 0.0,
 'january': 0.2704175244456293,
 'listed': 0.5675015398728066,
 'month': 0.0,
 'prerequisites': 0.45785666908911726,
 'python': 0.0,
 'register': 0.0,
 'setup': 0.0,
 'submit': 0.0}

In [30]:
# Line up the query and document weights term by term: one row per term, one column per vector.
df_qd = pd.DataFrame({'query': query_dict, 'doc': doc_dict})

# Dot product by hand: multiply the matching term weights and add them up.
df_qd['query'].mul(df_qd['doc']).sum()

0.21371415233666782

cosine similarity

In [31]:
# Dot product of every document row in X with the query vector, computed in one sparse matrix product.
X.dot(q.T).toarray()

array([[0.25955955],
       [0.21371415],
       [0.17843726],
       [0.28419115],
       [0.57137158]])

In [32]:
from sklearn.metrics.pairwise import cosine_similarity


In [33]:
# Same comparison through scikit-learn's helper: one similarity value per document row of X.
cosine_similarity(X, q)


array([[0.25955955],
       [0.21371415],
       [0.17843726],
       [0.28419115],
       [0.57137158]])

In [34]:
# List the fields of the FAQ frame before choosing which ones to index.
df.columns


Index(['course', 'section', 'question', 'text'], dtype='object')

In [35]:
fields = ['section', 'question', 'text']
transformers, matrices = {}, {}

# Fit one independent TF-IDF model per text field. min_df=3 drops terms that occur in
# fewer than three documents.
# NOTE: the loop rebinds the notebook-level `cv` and `X`; once it finishes they refer to
# the vectorizer and matrix of the last field ('text'), not to the toy example above.
for field in fields:
    cv = TfidfVectorizer(stop_words='english', min_df=3)
    X = cv.fit_transform(df[field])
    transformers[field], matrices[field] = cv, X

In [36]:
# Inspect the vocabulary learned by the vectorizer fitted on the 'text' field.
transformers['text'].get_feature_names_out()


array(['001', '01', '02', ..., 'zones', 'zoom', 'zoomcamp'], dtype=object)

In [37]:
# Sparse TF-IDF matrix produced for the 'text' field (its repr shows shape and sparsity).
matrices['text']


<948x2118 sparse matrix of type '<class 'numpy.float64'>'
	with 26463 stored elements in Compressed Sparse Row format>

In [38]:
# NOTE(review): "singned" looks like a typo for "signed". It is left unchanged because editing
# the query string could change the scores shown below — confirm whether it is intentional.
query = "I just singned up. Is it too late to join the course?"


In [39]:
# Vectorize the query with the vectorizer fitted on the 'text' field so both share one vocabulary.
q = transformers['text'].transform([query])
# cosine_similarity returns a 2-D array of query-vs-document scores; flatten() turns it into a 1-D array.
score = cosine_similarity(matrices['text'], q).flatten()

In [40]:
# Restrict the search to one course: multiplying by the boolean mask zeroes the score of
# every document that belongs to a different course.
mask = df['course'].eq('data-engineering-zoomcamp').to_numpy()
score = mask * score
score[:10]

array([0.3336047 , 0.        , 0.        , 0.1328874 , 0.        ,
       0.        , 0.        , 0.12722114, 0.        , 0.        ])

In [41]:
import numpy as np


In [42]:
# argsort sorts ascending, so negate the scores to get the highest first, then keep the top 10 positions.
idx = np.argsort(-score)[:10]
idx

array([  0,  15,  22,  27,  38, 287,   3,   7, 113,  11])

In [43]:
# Answer text of the top-ranked documents, best match first.
df['text'].iloc[idx]


0      The purpose of this document is to capture fre...
15     No, late submissions are not allowed. But if t...
22     It's up to you which platform and environment ...
27     You can do most of the course without a cloud....
38     You will have two attempts for a project. If t...
287    This error could result if you are using some ...
3      You don't need it. You're accepted. You can al...
7      Yes, we will keep all the materials after the ...
113    In the join queries, if we mention the column ...
11     No, you can only get a certificate if you fini...
Name: text, dtype: object

The TF-IDF vectorizer already outputs L2-normalized vectors, so the plain dot product and `cosine_similarity` give identical results. We won't go into the details of how it works, but you can check "Introduction to Information Retrieval" if you want to learn more.