In [None]:
import requests 

# Download the FAQ dataset: a JSON list of courses, each carrying a 'course'
# name and a list of 'documents'.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten to one record per document, tagging each record with its course name.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        # Append a copy so the parsed payload in documents_raw stays as downloaded.
        documents.append({**doc, 'course': course_name})

In [None]:
# Preview a few records only; printing the whole list floods the cell output.
documents[:3]

In [None]:
import pandas as pd

In [None]:
# One row per flattened document, restricted to these four columns in this order.
df = pd.DataFrame(documents, columns=['course', 'section', 'question', 'text'])
df.head()

In [None]:
# First rows that belong to the data engineering course.
df.loc[df['course'] == 'data-engineering-zoomcamp'].head()

In [None]:
# Toy corpus of five short sentences, vectorized in the cells below.
docs_example = [
    "January course details, register now",
    "Course prerequisites listed in January catalog",
    "Submit January course homework by end of month",
    "Register for January course, no prerequisites",
    "January course setup: Python and Google Cloud"
]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words vectorizer with default settings.
cv = CountVectorizer()

In [None]:
# Learn the vocabulary from the toy corpus.
cv.fit(docs_example)

In [None]:
# Vocabulary terms, in the same order as the columns of the transformed matrix.
names = cv.get_feature_names_out()
names

In [None]:
# Document-term count matrix: one row per sentence, one column per vocabulary term.
X = cv.transform(docs_example)

In [None]:
# Convert to a dense array so the counts can be inspected.
X.toarray()

In [None]:
# Show DataFrames in full. These are global pandas options, so they stay in
# effect for every later cell of the notebook.
pd.set_option('display.max_rows', None)        # no limit on the number of rows shown
pd.set_option('display.max_columns', None)     # no limit on the number of columns shown
pd.set_option('display.width', None)           # no fixed character width (a large number such as 1000 also works)
pd.set_option('display.max_colwidth', None)    # show full cell content, without truncation


In [None]:
# Transposed so that rows are vocabulary terms and columns are documents.
df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs

In [None]:
# Same count table, this time with English stop words removed from the vocabulary.
# Note: cv, X, names and df_docs are rebound here and replace the earlier versions.
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)

names = cv.get_feature_names_out()

# Rows are terms, columns are documents.
df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Same pipeline with TF-IDF weights instead of raw counts.
# Note: cv, X and names now refer to the TF-IDF versions; the query cells below use them.
cv = TfidfVectorizer(stop_words='english')
X = cv.fit_transform(docs_example)

names = cv.get_feature_names_out()

# Rows are terms, columns are documents; rounded only for display.
df_docs = pd.DataFrame(X.toarray(), columns=names).T
df_docs.round(2)

In [None]:
query = "Do I need to know python to sign up for the January course?"

In [None]:
# Vectorize the query with the vocabulary fitted on docs_example;
# query words that are not in that vocabulary contribute nothing.
q = cv.transform([query])
q.toarray()

In [None]:
# Map each vocabulary term to its weight in the query vector.
query_dict = dict(zip(names, q.toarray()[0]))
query_dict

In [None]:
# Same mapping for one document: row 1 of X, i.e. the second sentence of docs_example.
doc_dict = dict(zip(names, X.toarray()[1]))
doc_dict

In [None]:
# Side-by-side table: one row per term, with 'query' and 'doc' weight columns.
df_qd = pd.DataFrame([query_dict, doc_dict], index=['query', 'doc']).T

In [None]:
# Dot product of the query and document vectors, computed term by term.
(df_qd['query'] * df_qd['doc']).sum()

In [None]:
# The same dot product for every document at once, as a single matrix product.
X.dot(q.T).toarray()

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Cosine similarity between every document row of X and the query vector.
cosine_similarity(X, q)

In [None]:
df.columns

In [None]:
# Fit one TF-IDF vectorizer per text field of the FAQ DataFrame and keep both
# the fitted vectorizer and its document-term matrix, keyed by field name.
fields = ['section', 'question', 'text']
transformers = {}
matrices = {}

for field in fields:
    # min_df=3: ignore terms that appear in fewer than 3 rows of this field.
    cv = TfidfVectorizer(stop_words='english', min_df=3)
    X = cv.fit_transform(df[field])

    transformers[field] = cv
    matrices[field] = X

In [None]:
transformers['text'].get_feature_names_out()

In [None]:
matrices['text']

In [None]:
query = "I just signed up. Is it too late to join the course?"

In [None]:
# Score every row by cosine similarity between its 'text' vector and the query.
q = transformers['text'].transform([query])
# flatten() turns the (n_rows, 1) result into a 1-D array of scores.
score = cosine_similarity(matrices['text'], q).flatten()

In [None]:
# Zero out the scores of rows from other courses (True -> 1, False -> 0).
mask = (df.course == 'data-engineering-zoomcamp').values
score = score * mask
# Preview the first ten scores.
score[:10]

In [None]:
import numpy as np

In [None]:
# Row positions of the 10 highest scores; negating the scores makes argsort descending.
idx = np.argsort(-score)[:10]
idx

In [None]:
score[idx]

In [None]:
df.iloc[idx].text

In [None]:
fields

In [None]:
query = "I just signed up. Is it too late to join the course?"

In [None]:
# Combine the per-field similarities into one score per row.
# Matches in 'question' are weighted 3x; fields without an entry use weight 1.0.
boost = {'question': 3.0}

score = np.zeros(len(df))

for f in fields:
    b = boost.get(f, 1.0)
    # Each field has its own vectorizer, so the query is re-vectorized per field.
    q = transformers[f].transform([query])
    s = cosine_similarity(matrices[f], q).flatten()
    score = score + b * s

In [None]:
# Exact-match filters: column name -> required value.
filters = {
    'course': 'data-engineering-zoomcamp'
}

for field, value in filters.items():
    # Multiplying by the boolean mask sets the score of non-matching rows to 0.
    mask = (df[field] == value).values
    score = score * mask

In [None]:
# Take the 10 best-scoring rows and return them as a list of dicts.
# NOTE(review): filtered-out rows only have their score set to 0, so they can
# still be selected here if fewer than 10 rows score above 0 — confirm this is acceptable.
idx = np.argsort(-score)[:10]
results = df.iloc[idx]
results.to_dict(orient='records')

In [None]:
class TextSearch:
    """In-memory search over a list of records using one TF-IDF model per text field.

    Each field in `text_fields` gets its own TfidfVectorizer. A query is scored
    against every field by cosine similarity, and the per-field scores are
    summed, optionally weighted by a boost.
    """

    def __init__(self, text_fields):
        """
        Args:
            text_fields: names of the record fields to index and search.
        """
        self.text_fields = text_fields
        self.matrices = {}
        self.vectorizers = {}

    def fit(self, records, vectorizer_params=None):
        """Index the records.

        Args:
            records: list of dicts; each must contain every field in `text_fields`.
            vectorizer_params: optional keyword arguments passed to every
                TfidfVectorizer. Defaults to no extra arguments.
        """
        # None rather than a shared mutable {} default.
        vectorizer_params = vectorizer_params or {}
        self.df = pd.DataFrame(records)

        for f in self.text_fields:
            cv = TfidfVectorizer(**vectorizer_params)
            X = cv.fit_transform(self.df[f])
            self.matrices[f] = X
            self.vectorizers[f] = cv

    def search(self, query, n_results=10, boost=None, filters=None):
        """Return the best-matching records for `query`.

        Args:
            query: query string.
            n_results: maximum number of records to return.
            boost: optional mapping field -> weight for that field's similarity;
                fields without an entry use weight 1.0.
            filters: optional mapping column -> required value (exact match).
                Rows failing any filter are never returned, even when fewer
                than `n_results` rows match.

        Returns:
            List of record dicts, ordered by descending score.
        """
        boost = boost or {}
        filters = filters or {}

        score = np.zeros(len(self.df))

        for f in self.text_fields:
            b = boost.get(f, 1.0)
            q = self.vectorizers[f].transform([query])
            s = cosine_similarity(self.matrices[f], q).flatten()
            score = score + b * s

        # Track filter matches as a boolean mask instead of multiplying the
        # score by it: a zeroed score still takes part in the ranking and can
        # leak filtered-out rows into the result.
        keep = np.ones(len(self.df), dtype=bool)
        for field, value in filters.items():
            keep &= (self.df[field] == value).values

        # Rank only the rows that passed every filter; the stable sort keeps
        # ties in original row order.
        candidates = np.flatnonzero(keep)
        order = np.argsort(-score[candidates], kind='stable')[:n_results]
        idx = candidates[order]

        results = self.df.iloc[idx]
        return results.to_dict(orient='records')

In [None]:
fields

In [None]:
# Search index over the three text fields of the FAQ records.
index = TextSearch(text_fields=['section', 'question', 'text'])

In [None]:
# Fit one TF-IDF vectorizer per text field on the flattened documents.
index.fit(documents)

In [None]:
query

In [None]:
# Top 5 matches for the query: 'question' matches weighted 3x, one course only.
index.search(
    query='I just signed up. Is it too late to join the course?',
    n_results=5,
    boost={'question': 3.0},
    filters={'course': 'data-engineering-zoomcamp'}
)

In [None]:
from sklearn.decomposition import TruncatedSVD

In [None]:
# Reuse the TF-IDF matrix and the fitted vectorizer of the 'text' field.
X = matrices['text']
cv = transformers['text']

In [None]:
# Project the TF-IDF text matrix onto 16 latent components.
# random_state is fixed because TruncatedSVD uses a randomized solver by default;
# without it the embeddings differ from run to run.
svd = TruncatedSVD(n_components=16, random_state=42)
X_emb = svd.fit_transform(X)

In [None]:
X_emb[0]

In [None]:
query = 'I just signed up. Is it too late to join the course?'

# Embed the query the same way as the documents: TF-IDF first, then the fitted SVD projection.
Q = cv.transform([query])
Q_emb = svd.transform(Q)

In [None]:
Q_emb[0]

In [None]:
# Dot product between the first document embedding and the query embedding.
np.dot(X_emb[0], Q_emb[0])

In [None]:
# Cosine similarity of every document embedding to the query embedding, as a 1-D array.
score = cosine_similarity(X_emb, Q_emb).flatten()

In [None]:
# Row positions of the 10 most similar documents (descending score).
idx = np.argsort(-score)[:10]

In [None]:
# idx holds row positions from argsort, so select with iloc (loc looks up index
# labels and only happens to agree while df has the default integer index).
list(df.iloc[idx].text)

In [None]:
from sklearn.decomposition import NMF

In [None]:
# Factorize the TF-IDF text matrix into 16 non-negative components.
# random_state pins the randomized part of the fit so the embeddings are reproducible.
nmf = NMF(n_components=16, random_state=42)
X_emb = nmf.fit_transform(X)
X_emb[0]

In [None]:
# Embed the query with the same TF-IDF vectorizer, then the fitted NMF model.
Q = cv.transform([query])
Q_emb = nmf.transform(Q)
Q_emb[0]

In [None]:
# Rank documents by cosine similarity in the NMF space and list the top 10 texts.
score = cosine_similarity(X_emb, Q_emb).flatten()
idx = np.argsort(-score)[:10]
# idx holds row positions, so select with iloc (loc would look up index labels).
list(df.iloc[idx].text)

In [None]:
import torch
from transformers import BertModel, BertTokenizer

# Load the tokenizer and the model for the 'bert-base-uncased' checkpoint.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model = BertModel.from_pretrained("bert-base-uncased")
model.eval()  # Evaluation mode: the model is only used for inference here

In [None]:
# Two example sentences, tokenized together as one batch.
texts = [
    "Yes, we will keep all the materials after the course finishes.",
    "You can follow the course at your own pace after it finishes"
]
# padding=True pads the shorter sentence to the longer one; return_tensors='pt' returns PyTorch tensors.
encoded_input = tokenizer(texts, padding=True, truncation=True, return_tensors='pt')


In [None]:
encoded_input

In [None]:
# Forward pass; keep the output of the last layer for every token position.
with torch.no_grad():  # Disable gradient calculation for inference
    outputs = model(**encoded_input)
    hidden_states = outputs.last_hidden_state

In [None]:
hidden_states.shape

In [None]:
# One vector per sentence: average the hidden states over dim 1 (the token axis).
# NOTE(review): the attention mask is not applied, so padding positions are
# included in the mean — confirm this is intended.
sentence_embeddings = hidden_states.mean(dim=1)
sentence_embeddings.shape

In [None]:
sentence_embeddings.numpy()

# Note: if you use a GPU, move the tensor to the CPU before converting it to NumPy:
# sentence_embeddings_cpu = sentence_embeddings.cpu()

In [None]:
def make_batches(seq, n):
    """Split `seq` into consecutive slices of at most `n` items.

    The slices keep the original order; the last one may be shorter than `n`.
    Returns a list of slices (empty when `seq` is empty).
    """
    return [seq[start:start + n] for start in range(0, len(seq), n)]

In [None]:
from tqdm.auto import tqdm

In [None]:
def compute_embeddings(texts, batch_size=8):
    """Embed texts with the notebook-level BERT `tokenizer` and `model`.

    The texts are processed in batches. Each batch is tokenized (padded and
    truncated), passed through the model without gradient tracking, and the
    last hidden state is averaged over dim 1 to give one vector per text.

    Args:
        texts: list of strings to embed.
        batch_size: number of texts per forward pass.

    Returns:
        NumPy array with one row per input text, in input order.

    Raises:
        ValueError: from np.vstack when `texts` is empty.
    """
    # Honour the caller's batch_size (it was previously hard-coded to 8).
    text_batches = make_batches(texts, batch_size)

    all_embeddings = []

    for batch in tqdm(text_batches):
        encoded_input = tokenizer(batch, padding=True, truncation=True, return_tensors='pt')

        with torch.no_grad():
            outputs = model(**encoded_input)
            hidden_states = outputs.last_hidden_state

            # NOTE(review): the attention mask is not applied, so padding
            # positions are included in the mean — confirm this is intended.
            batch_embeddings = hidden_states.mean(dim=1)
            # Move to CPU before converting, so this also works when the model runs on a GPU.
            batch_embeddings_np = batch_embeddings.cpu().numpy()
            all_embeddings.append(batch_embeddings_np)

    final_embeddings = np.vstack(all_embeddings)
    return final_embeddings

In [None]:
# Field name -> array of embeddings, filled in by the next cell.
embeddings = {}

In [None]:
# `fields` was defined earlier as ['section', 'question', 'text'].
# Compute BERT embeddings for every row of each field.

for f in fields:
    print(f'computing embeddings for {f}...')
    embeddings[f] = compute_embeddings(df[f].tolist())

In [None]:
import pickle

In [None]:
# Save the embeddings dict to disk with pickle so it can be reloaded later.
with open('embeddings.bin', 'wb') as f_out:
    pickle.dump(embeddings, f_out)