In [23]:
import requests
import pandas as pd

In [24]:
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# Bound the wait so a stalled connection cannot hang the kernel indefinitely
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to JSON-decode an error page
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course groups into a single list of FAQ records,
# tagging every record with the name of the course it came from
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [25]:
# Peek at the first flattened record; each record now carries a 'course' key
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [26]:
import pandas as pd

# Tabulate the flattened FAQ records, with the course name as the leading column
df = pd.DataFrame(
    data=documents,
    columns=['course', 'section', 'question', 'text'],
)
df.head(n=5)

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...


In [27]:
# Keep only the rows that belong to the data engineering course
df.loc[df['course'] == 'data-engineering-zoomcamp']

Unnamed: 0,course,section,question,text
0,data-engineering-zoomcamp,General course-related questions,Course - When will the course start?,The purpose of this document is to capture fre...
1,data-engineering-zoomcamp,General course-related questions,Course - What are the prerequisites for this c...,GitHub - DataTalksClub data-engineering-zoomca...
2,data-engineering-zoomcamp,General course-related questions,Course - Can I still join the course after the...,"Yes, even if you don't register, you're still ..."
3,data-engineering-zoomcamp,General course-related questions,Course - I have registered for the Data Engine...,You don't need it. You're accepted. You can al...
4,data-engineering-zoomcamp,General course-related questions,Course - What can I do before the course starts?,You can start by installing and setting up all...
...,...,...,...,...
430,data-engineering-zoomcamp,Workshop 2 - RisingWave,Unable to Open Dashboard as xdg-open doesn’t o...,Refer to the solution given in the first solut...
431,data-engineering-zoomcamp,Workshop 2 - RisingWave,Resolving Python Interpreter Path Inconsistenc...,Example Error:\nWhen attempting to execute a P...
432,data-engineering-zoomcamp,Workshop 2 - RisingWave,How does windowing work in Sql?,Ans : Windowing in streaming SQL involves defi...
433,data-engineering-zoomcamp,Triggers in Mage via CLI,"Encountering the error ""ModuleNotFoundError: N...","Python 3.12.1, is not compatible with kafka-py..."


# Implement text-based search: we will use vector spaces

- Turn the documents into vectors
- document-term matrix:
    - rows: documents
    - columns: words/tokens
- Bag of words:
    - word order is lost
    - sparse matrix

In [28]:
from sklearn.feature_extraction.text import CountVectorizer
# min_df=5: ignore tokens that appear in fewer than five documents
cv = CountVectorizer(min_df=5)

In [29]:
# Learn the vocabulary from the answer texts, then list the tokens that were kept
cv.fit(df['text'])
cv.get_feature_names_out()

array(['01', '02', '03', ..., 'youtube', 'zip', 'zoomcamp'], dtype=object)

In [30]:
# Five short toy sentences, small enough to inspect the resulting matrices by eye
docs_example = [
    "Course starts on 15th Jan 2024",
    "Prerequisites listed on GitHub",
    "Submit homeworks after start date",
    "Registration not required for participation",
    "Setup Google Cloud and Python before course"
]

In [34]:
# Bag-of-words counts for the toy sentences, with English stop words removed
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)
names = cv.get_feature_names_out()

# Transpose so that terms are rows and documents are columns
df_docs = pd.DataFrame(data=X.toarray(), columns=names).transpose()
df_docs

Unnamed: 0,0,1,2,3,4
15th,1,0,0,0,0
2024,1,0,0,0,0
cloud,0,0,0,0,1
course,1,0,0,0,1
date,0,0,1,0,0
github,0,1,0,0,0
google,0,0,0,0,1
homeworks,0,0,1,0,0
jan,1,0,0,0,0
listed,0,1,0,0,0


# Using TF-IDF

In [41]:
# TF-IDF weights for the toy sentences (no stop-word filtering here)
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idfvectorizer = TfidfVectorizer()
X_tfidf = tf_idfvectorizer.fit_transform(docs_example)

feature_names_tfidf = tf_idfvectorizer.get_feature_names_out()

# Label the columns with this vectorizer's own vocabulary. `names` comes from
# the earlier CountVectorizer(stop_words='english') fit, whose vocabulary is
# shorter, so using it here fails with a shape mismatch on a fresh run.
df_docs = pd.DataFrame(X_tfidf.toarray(), columns=feature_names_tfidf).T
df_docs

Unnamed: 0,0,1,2,3,4
15th,0.434297,0.0,0.0,0.0,0.0
2024,0.434297,0.0,0.0,0.0,0.0
after,0.0,0.0,0.447214,0.0,0.0
and,0.0,0.0,0.0,0.0,0.387757
before,0.0,0.0,0.0,0.0,0.387757
cloud,0.0,0.0,0.0,0.0,0.387757
course,0.350388,0.0,0.0,0.0,0.31284
date,0.0,0.0,0.447214,0.0,0.0
for,0.0,0.0,0.0,0.447214,0.0
github,0.0,0.523358,0.0,0.0,0.0


In [44]:
# Apply CountVectorizer to the real FAQ answers: English stop words removed,
# and only tokens that appear in at least five documents are kept
cv = CountVectorizer(min_df=5, stop_words='english')
X = cv.fit_transform(df['text'])
names = cv.get_feature_names_out()

# Transpose so that terms are rows and documents are columns
df_docs = pd.DataFrame(data=X.toarray(), columns=names).transpose()
df_docs

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
02,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
03,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
04,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
05,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0,0,1,0,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
yml,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
youtube,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
zip,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
# Same pipeline on the FAQ answers, but with TF-IDF weights instead of raw counts
from sklearn.feature_extraction.text import TfidfVectorizer
tf_idf = TfidfVectorizer(min_df=5, stop_words='english')
X = tf_idf.fit_transform(df['text'])
names = tf_idf.get_feature_names_out()

# Transpose so that terms are rows and documents are columns; round for display only
df_docs = pd.DataFrame(data=X.toarray(), columns=names).transpose()
df_docs.round(decimals=2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,938,939,940,941,942,943,944,945,946,947
01,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
02,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
03,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
04,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
05,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yes,0.0,0.00,0.28,0.0,0.0,0.00,0.21,0.2,0.15,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
yml,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.11,0.0,0.0,0.0,0.00
youtube,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.15,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00
zip,0.0,0.00,0.00,0.0,0.0,0.00,0.00,0.0,0.00,0.00,...,0.0,0.0,0.0,0.0,0.0,0.00,0.0,0.0,0.0,0.00


In [85]:
query = "Do I need to know python to sign up for the January course?"

# Encode the query with the vectorizer fitted on df.text so it shares the documents' vocabulary
q = tf_idf.transform([query])
# print() lists the non-zero (row, column) weight entries of the sparse vector;
# the dense array shown afterwards is mostly zeros
print(q)
q.toarray()

  (0, 1088)	0.5935519664108326
  (0, 945)	0.31441356049301333
  (0, 778)	0.29796783250107517
  (0, 644)	0.5608269127690405
  (0, 274)	0.38148200594064524


array([[0., 0., 0., ..., 0., 0., 0.]])

In [97]:
# Map each vocabulary term to its TF-IDF weight in the query vector
query_dict = {term: weight for term, weight in zip(names, q.toarray()[0])}

In [57]:
# Map each vocabulary term to its TF-IDF weight in the document at row 1 of X
doc_dict = {term: weight for term, weight in zip(names, X.toarray()[1])}

In [61]:
from sklearn.metrics.pairwise import cosine_similarity
# Cosine similarity between every document row of X and the query vector, flattened to 1-D
score = cosine_similarity(X, q).flatten()

# Same result via a plain dot product (TfidfVectorizer L2-normalises rows by default,
# so the dot product of two rows is already their cosine similarity):
# X.dot(q.T).todense()

In [65]:
import numpy as np
# argsort orders ascending, so the last five positions are the five highest-scoring documents (best match last)
np.argsort(score)[-5:]

array([764,  27, 806, 577, 445], dtype=int64)

In [67]:
# Read the answer text at row position 806, one of the top-ranked positions from the argsort output
df.iloc[806]['text']

'Technically, yes. Advisable? Not really. Reasons:\nSome homework(s) asks for specific python library versions.\nAnswers may not match in MCQ options if using different languages other than Python 3.10 (the recommended version for 2023 cohort)\nAnd as for midterms/capstones, your peer-reviewers may not know these other languages. Do you want to be penalized for others not knowing these other languages?\nYou can create a separate repo using course’s lessons but written in other languages for your own learnings, but not advisable for submissions.\ntx[source]'

In [69]:
# Text columns of df that each get their own TF-IDF vectorizer and matrix
fields = ['section', 'question', 'text']

In [78]:
# Fit an independent TF-IDF model per searchable field, keyed by column name
vectorizers = {}
matrices = {}

for f in fields:
    # `cv` and `X` are notebook-level names and are rebound on every pass;
    # after the loop they hold the vectorizer and matrix of the last field.
    cv = TfidfVectorizer(stop_words='english', min_df=5)
    X = cv.fit_transform(df[f])
    matrices[f], vectorizers[f] = X, cv

  (0, 47)	0.4969848620671003
  (0, 50)	0.510748534028322
  (0, 6)	0.4951242558592849
  (0, 24)	0.4969848620671003
  (1, 47)	0.4969848620671003
  (1, 50)	0.510748534028322
  (1, 6)	0.4951242558592849
  (1, 24)	0.4969848620671003
  (2, 47)	0.4969848620671003
  (2, 50)	0.510748534028322
  (2, 6)	0.4951242558592849
  (2, 24)	0.4969848620671003
  (3, 47)	0.4969848620671003
  (3, 50)	0.510748534028322
  (3, 6)	0.4951242558592849
  (3, 24)	0.4969848620671003
  (4, 47)	0.4969848620671003
  (4, 50)	0.510748534028322
  (4, 6)	0.4951242558592849
  (4, 24)	0.4969848620671003
  (5, 47)	0.4969848620671003
  (5, 50)	0.510748534028322
  (5, 6)	0.4951242558592849
  (5, 24)	0.4969848620671003
  (6, 47)	0.4969848620671003
  :	:
  (939, 37)	0.23534620040390958
  (940, 43)	0.6872452858897771
  (940, 2)	0.6872452858897771
  (940, 37)	0.23534620040390958
  (941, 43)	0.6872452858897771
  (941, 2)	0.6872452858897771
  (941, 37)	0.23534620040390958
  (942, 43)	0.6872452858897771
  (942, 2)	0.6872452858897771
  

In [75]:
# Only the last expression of a cell is rendered, so this shows `vectorizers`;
# the bare `matrices` line on its own displays nothing
matrices
vectorizers

{'section': TfidfVectorizer(min_df=5, stop_words='english'),
 'question': TfidfVectorizer(min_df=5, stop_words='english'),
 'text': TfidfVectorizer(min_df=5, stop_words='english')}

In [None]:
# Put the query and document TF-IDF weights side by side (one row per vocabulary
# term after the transpose) and take the dot product of the two weight vectors.
# query_dict and doc_dict are plain dicts, not columns of df, so they are passed
# directly as the two records of the new frame.
df_pd = pd.DataFrame([query_dict, doc_dict], index=['query', 'doc']).T
(df_pd['query'] * df_pd['doc']).sum()