In [86]:
import requests
import pandas as pd
import numpy as np

In [2]:
# URL of the JSON file that is fetched with `requests` in the next cell.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'

In [5]:
# Download the documents file. `timeout` bounds how long the request may wait,
# and `raise_for_status()` turns a 4xx/5xx response into an exception here
# instead of letting an error page reach `.json()` in the next cell.
docs = requests.get(docs_url, timeout=30)
docs.raise_for_status()

In [8]:
# Decode the response body as JSON. The next cell iterates over it and reads
# the 'course' and 'documents' keys of each entry.
raw_docs = docs.json()

In [40]:
# Flatten the per-course structure into one list of records, tagging every
# record with the name of the course it belongs to. Each record is a new dict,
# so the downloaded `raw_docs` payload is left unmodified.
documents = [
    {**doc, 'course': course['course']}
    for course in raw_docs
    for doc in course['documents']
]

In [43]:
# Tabular view of the flattened records, restricted to these four columns.
data = pd.DataFrame(documents, columns=['text','section','question','course'])

In [46]:
# Keep only the data-engineering-zoomcamp rows. The index is reset so that row
# labels and row positions coincide: later cells rank rows with `np.argsort`,
# which yields positions, and those must address the same rows whether they
# are used with `.iloc` or `.loc`.
df = data[data['course'] == 'data-engineering-zoomcamp'].reset_index(drop=True)

In [47]:
# Preview the first rows of the filtered frame.
df.head()

Unnamed: 0,text,section,question,course
0,The purpose of this document is to capture fre...,General course-related questions,Course - When will the course start?,data-engineering-zoomcamp
1,GitHub - DataTalksClub data-engineering-zoomca...,General course-related questions,Course - What are the prerequisites for this c...,data-engineering-zoomcamp
2,"Yes, even if you don't register, you're still ...",General course-related questions,Course - Can I still join the course after the...,data-engineering-zoomcamp
3,You don't need it. You're accepted. You can al...,General course-related questions,Course - I have registered for the Data Engine...,data-engineering-zoomcamp
4,You can start by installing and setting up all...,General course-related questions,Course - What can I do before the course starts?,data-engineering-zoomcamp


In [48]:
# Small hand-written corpus used to illustrate the vectorizers below.
# NOTE: this rebinds `documents`, which until now held the flattened FAQ
# records; `data` and `df` were already built from those and are unaffected.
documents = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [49]:
from sklearn.feature_extraction.text import CountVectorizer

In [50]:
# Bag-of-words vectorizer. With stop_words='english', common words such as
# "on" or "after" do not appear in the vocabulary printed below.
# NOTE(review): max_df is given as the integer 5 and the toy corpus has 5
# documents, so it presumably filters nothing here -- confirm intent.
cv = CountVectorizer(stop_words='english',max_df=5)

In [51]:
# Learn the vocabulary from the toy corpus.
cv.fit(documents)

In [53]:
# Show the learned vocabulary (one entry per output column).
cv.get_feature_names_out()

array(['15th', '2024', 'cloud', 'course', 'date', 'github', 'google',
       'homeworks', 'jan', 'listed', 'participation', 'prerequisites',
       'python', 'registration', 'required', 'setup', 'start', 'starts',
       'submit'], dtype=object)

In [55]:
# Encode the toy corpus as a document-term count matrix.
X = cv.transform(documents)

In [57]:
# Densify the matrix and label its columns with the vocabulary for display:
# one row per document, one column per term.
pd.DataFrame(X.todense(),columns=cv.get_feature_names_out())

Unnamed: 0,15th,2024,cloud,course,date,github,google,homeworks,jan,listed,participation,prerequisites,python,registration,required,setup,start,starts,submit
0,1,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0
1,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0
2,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,1
3,0,0,0,0,0,0,0,0,0,0,1,0,0,1,1,0,0,0,0
4,0,0,1,1,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0


In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [60]:
# Same toy corpus, now with TF-IDF weighting instead of raw counts.
tfv = TfidfVectorizer(stop_words='english')
tfv.fit(documents)

In [61]:
# Vocabulary learned by the TF-IDF vectorizer (same terms as the count-based
# vocabulary shown above).
tfv.get_feature_names_out()

array(['15th', '2024', 'cloud', 'course', 'date', 'github', 'google',
       'homeworks', 'jan', 'listed', 'participation', 'prerequisites',
       'python', 'registration', 'required', 'setup', 'start', 'starts',
       'submit'], dtype=object)

In [62]:
# TF-IDF weights of the toy corpus, shown as a dense table with one column
# per vocabulary term.
Y = tfv.transform(documents)
pd.DataFrame(Y.todense(),columns=tfv.get_feature_names_out())

Unnamed: 0,15th,2024,cloud,course,date,github,google,homeworks,jan,listed,participation,prerequisites,python,registration,required,setup,start,starts,submit
0,0.463693,0.463693,0.0,0.374105,0.0,0.0,0.0,0.0,0.463693,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.463693,0.0
1,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.0,0.57735,0.0,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.5,0.0,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,0.0,0.5
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.0,0.0,0.57735,0.57735,0.0,0.0,0.0,0.0
4,0.0,0.0,0.463693,0.374105,0.0,0.0,0.463693,0.0,0.0,0.0,0.0,0.0,0.463693,0.0,0.0,0.463693,0.0,0.0,0.0


In [71]:
# Re-create `tfv` (replacing the toy-corpus vectorizer) and fit it on the
# answer texts of the course FAQ, with min_df=5.
tfv = TfidfVectorizer(stop_words='english',min_df=5)
tfv.fit(df.text)

In [77]:
# Vocabulary learned from the FAQ texts.
names = tfv.get_feature_names_out()

In [79]:
# TF-IDF matrix of the FAQ texts. This rebinds `X`, which previously held the
# count matrix of the toy corpus.
X = tfv.transform(df.text)

In [78]:
# Example user question to search for.
query = "Do I need to know python to sign up for the January course?"

# Encode the query with the same fitted vectorizer so it shares the columns of X.
q = tfv.transform([query])

In [95]:
from sklearn.metrics.pairwise import cosine_similarity

In [90]:
# Dot product of every document vector with the query vector. The values match
# the cosine_similarity(X, q) output of the next cell.
# NOTE(review): presumably because the TF-IDF rows are unit-length -- confirm.
X.dot(q.T).todense()

matrix([[0.4389195 ],
        [0.        ],
        [0.        ],
        [0.10272051],
        [0.09061387],
        [0.        ],
        [0.        ],
        [0.37634321],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.34569631],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.16813692],
        [0.        ],
        [0.        ],
        [0.11305004],
        [0.14072055],
        [0.        ],
        [0.09828835],
        [0.35236451],
        [0.17154441],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.39401731],
        [0.24051751],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.13137124],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.        ],
        [0.01703551],
        [0.05064299],
        [0.11656627],
        [0.        ],
        [0.        ],
        [0.        ],
        [0

In [96]:
# Cosine similarity between each document and the query, as a column vector
# with one row per document.
cosine_similarity(X,q)

array([[0.4389195 ],
       [0.        ],
       [0.        ],
       [0.10272051],
       [0.09061387],
       [0.        ],
       [0.        ],
       [0.37634321],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.34569631],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.16813692],
       [0.        ],
       [0.        ],
       [0.11305004],
       [0.14072055],
       [0.        ],
       [0.09828835],
       [0.35236451],
       [0.17154441],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.39401731],
       [0.24051751],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.13137124],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.01703551],
       [0.05064299],
       [0.11656627],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.        ],
       [0.048

In [97]:
# Flatten the similarity column into a 1-D score array, one entry per row of df.
score = cosine_similarity(X,q).flatten()
# argsort is ascending, so the last five positions are the five highest scores
# (best match last).
np.argsort(score)[-5:]

array([ 22, 398,   7,  27,   0])

In [99]:
# Show the five best-scoring rows. The positions are taken from `score`
# instead of being pasted from the previous output, so this cell stays correct
# if the data or the query changes. Order is ascending: best match last.
df.iloc[np.argsort(score)[-5:], :]

Unnamed: 0,text,section,question,course
22,It's up to you which platform and environment ...,General course-related questions,Environment - Do we really have to use GitHub ...,data-engineering-zoomcamp
398,You need to redefine the python environment va...,Project,How to run python as start up script?,data-engineering-zoomcamp
7,"Yes, we will keep all the materials after the ...",General course-related questions,Course - Can I follow the course after it fini...,data-engineering-zoomcamp
27,You can do most of the course without a cloud....,General course-related questions,Environment - The GCP and other cloud provider...,data-engineering-zoomcamp
0,The purpose of this document is to capture fre...,General course-related questions,Course - When will the course start?,data-engineering-zoomcamp


In [101]:
# Full text (column 0) of the row at position 22.
# NOTE(review): 22 is the first entry of the top-5 list above; that list is in
# ascending score order, so this is the weakest of the five hits, not the best.
df.iloc[22,0]

"It's up to you which platform and environment you use for the course.\nGithub codespaces or GCP VM are just possible options, but you can do the entire course from your laptop."

In [102]:
# Fit one TF-IDF vectorizer per text field and keep both the fitted vectorizer
# and the resulting document-term matrix, keyed by field name.
fields = ['section', 'question', 'text']
transformers = {}
matrices = {}

for field in fields:
    # Loop-local names are used on purpose: they must not overwrite the
    # notebook-level `cv` and `X` that earlier cells rely on.
    vectorizer = TfidfVectorizer(stop_words='english', min_df=3)
    matrices[field] = vectorizer.fit_transform(df[field])
    transformers[field] = vectorizer

# Display the matrix built for the 'text' field (its repr shows shape and sparsity).
matrices['text']

<435x1177 sparse matrix of type '<class 'numpy.float64'>'
	with 12020 stored elements in Compressed Sparse Row format>

In [105]:
# New example question.
# NOTE(review): "singned" looks like a typo for "signed"; it is left as-is
# because the recorded outputs below were produced with this exact string.
query = "I just singned up. Is it too late to join the course?"

# Score the query against the 'text' field only.
q = transformers['text'].transform([query])
score = cosine_similarity(matrices['text'], q).flatten()

In [109]:
# Answer texts of the five best-scoring rows (ascending score, best match last).
df.iloc[np.argsort(score)[-5:],:]['text'].values

array(["You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
       "This error could result if you are using some select * query without mentioning the name of table for ex:\nwith dim_zones as (\nselect * from `engaged-cosine-374921`.`dbt_victoria_mola`.`dim_zones`\nwhere borough != 'Unknown'\n),\nfhv as (\nselect * from `engaged-cosine-374921`.`dbt_victoria_mola`.`stg_fhv_tripdata`\n)\nselect * from fhv\ninner join dim_zones as pickup_zone\non fhv.PUlocationID = pickup_zone.locationid\ninner join dim_zones as dropoff_zone\non fhv.DOlocationID = dropoff_zone.locationid\n);\nTo resolve just replace use : select fhv.* from fhv",
       'You can do most of the course without a cloud. Almost everything we use (excluding BigQuery) can be run locally. We won’t be able to provide guidelines for some things, but most of the

In [111]:
# Inspect the fitted per-field vectorizers.
transformers

{'section': TfidfVectorizer(min_df=3, stop_words='english'),
 'question': TfidfVectorizer(min_df=3, stop_words='english'),
 'text': TfidfVectorizer(min_df=3, stop_words='english')}

In [112]:
# Per-field weights; fields that are not listed count with weight 1.0.
boost = {'question': 3.0}

# Accumulate one weighted similarity vector per field into a single score.
score = np.zeros(len(df))

for f in fields:
    q = transformers[f].transform([query])
    b = boost.get(f, 1.0)
    s = cosine_similarity(matrices[f], q).flatten()
    score += b * s

In [116]:
# Positions of the five highest combined scores (ascending, best match last).
idx = np.argsort(score)[-5:]

In [117]:
# Rows at those positions; `idx` is positional, hence `.iloc`.
df.iloc[idx]

Unnamed: 0,text,section,question,course
9,All the main videos are stored in the Main “DA...,General course-related questions,Course - Which playlist on YouTube should I re...,data-engineering-zoomcamp
34,Star the repo! Share it with friends if you fi...,General course-related questions,How can we contribute to the course?,data-engineering-zoomcamp
1,GitHub - DataTalksClub data-engineering-zoomca...,General course-related questions,Course - What are the prerequisites for this c...,data-engineering-zoomcamp
4,You can start by installing and setting up all...,General course-related questions,Course - What can I do before the course starts?,data-engineering-zoomcamp
0,The purpose of this document is to capture fre...,General course-related questions,Course - When will the course start?,data-engineering-zoomcamp


In [118]:
class TextSearch:
    """TF-IDF search over several text fields of a collection of records.

    One ``TfidfVectorizer`` is fitted per text field. A query is compared with
    every field using cosine similarity, the per-field similarities are
    combined into a weighted sum, and the best-scoring records are returned.
    """

    def __init__(self, text_fields):
        """Create an empty index.

        Args:
            text_fields: names of the record fields to index and search.
        """
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params=None):
        """Index ``records``.

        Args:
            records: any input accepted by ``pd.DataFrame`` (for example a
                list of dicts or an existing DataFrame). It must provide every
                field named in ``text_fields``.
            vectorizer_params: optional keyword arguments forwarded to each
                ``TfidfVectorizer``.
        """
        # None is used as the default so no mutable dict is shared between calls.
        params = vectorizer_params or {}
        self.df = pd.DataFrame(records)

        # Drop anything left over from a previous fit before rebuilding.
        self.matrices = {}
        self.vectorizers = {}

        for f in self.text_fields:
            cv = TfidfVectorizer(**params)
            self.matrices[f] = cv.fit_transform(self.df[f])
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost=None, filters=None):
        """Return the records that best match ``query``.

        Args:
            query: free-text query string.
            n_results: maximum number of records to return.
            boost: optional ``{field: weight}`` mapping; fields that are not
                listed get weight 1.0.
            filters: optional ``{field: value}`` mapping; only records whose
                ``field`` equals ``value`` for every entry are eligible.

        Returns:
            A list of record dicts, highest score first. Records excluded by
            ``filters`` are never returned, even when fewer than ``n_results``
            records match.
        """
        boost = boost or {}
        filters = filters or {}

        score = np.zeros(len(self.df))
        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        # Build an eligibility mask instead of multiplying the score by it:
        # zeroing the score would leave filtered-out rows tied with genuine
        # zero-score rows, so they could still end up in the result.
        keep = np.ones(len(self.df), dtype=bool)
        for field, value in filters.items():
            keep &= (self.df[field] == value).to_numpy()

        candidates = np.flatnonzero(keep)
        # A stable sort keeps tied rows in their original order.
        order = np.argsort(-score[candidates], kind='stable')[:n_results]
        results = self.df.iloc[candidates[order]]
        return results.to_dict(orient='records')

In [120]:
# Build a TextSearch index over the three text columns of the FAQ frame.
index = TextSearch(
    text_fields=['section', 'question', 'text']
)
index.fit(df)

# Same query and boost as in the manual computation above. `df` was already
# restricted to this course, so the filter does not exclude any row here.
index.search(
    query='I just singned up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineerin

# Vector Search

In [126]:
from sklearn.decomposition import TruncatedSVD, NMF

# Work with the TF-IDF matrix and vectorizer of the 'text' field.
X = matrices['text']
cv = transformers['text']

# Reduce each document to a 16-dimensional dense vector. A fixed
# `random_state` makes the embeddings (and the search results derived from
# them) reproducible across kernel restarts.
svd = TruncatedSVD(n_components=16, random_state=42)
nmf = NMF(n_components=16, random_state=42)

X_emb = nmf.fit_transform(X) # svd.fit_transform(X)

# Embedding of the first document.
X_emb[0]

array([0.        , 0.        , 0.00216532, 0.        , 0.        ,
       0.        , 0.00088262, 0.        , 0.        , 0.27292856,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        ])

In [128]:
query = 'I just singned up. Is it too late to join the course?'

# Encode the query with the 'text' vectorizer, then project it with the NMF
# model that produced X_emb, so the query and the documents can be compared.
Q = cv.transform([query])
Q_emb = nmf.transform(Q) # svd.transform(Q)
# The query embedding (the single row of Q_emb).
Q_emb[0]

array([7.70406154e-05, 3.75853721e-03, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 1.83952108e-01, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00])

In [129]:
# Rank documents by cosine similarity between their embedding and the query's.
score = cosine_similarity(X_emb, Q_emb).flatten()
# Positions of the ten highest scores (ascending, best match last).
idx = np.argsort(score)[-10:]
# `idx` holds row positions, not index labels, so rows are selected with
# `.iloc`, as in the earlier cells. `.loc` only returns the same rows while
# labels happen to equal positions.
list(df.iloc[idx].text)

['You can do most of the course without a cloud. Almost everything we use (excluding BigQuery) can be run locally. We won’t be able to provide guidelines for some things, but most of the materials are runnable without GCP.\nFor everything in the course, there’s a local alternative. You could even do the whole course locally.',
 'Yes, this applies if you want to use Airflow or Prefect instead of Mage, AWS or Snowflake instead of GCP products or Tableau instead of Metabase or Google data studio.\nThe course covers 2 alternative data stacks, one using GCP and one using local installation of everything. You can use one of them or use your tool of choice.\nShould you consider it instead of the one tool you use? That we can’t support you if you choose to use a different stack, also you would need to explain the different choices of tool for the peer review of your capstone project.',
 'Yes, you can. Just remember to adapt all the information on the videos to AWS. Besides, the final capstone 