# Homework 3

Importo los módulos necesarios

In [76]:
import requests
import pandas as pd
import minsearch
import numpy as np

from tqdm.auto import tqdm
from minsearch import VectorSearch
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.metrics.pairwise import cosine_similarity
from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding
from qdrant_client.models import Filter, FieldCondition, MatchValue
from concurrent.futures import ThreadPoolExecutor, as_completed
from rouge import Rouge

### Importo los documentos

In [2]:
# Base location of the evaluation assets in the course repository.
url_prefix = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/03-evaluation/'
docs_url = url_prefix + 'search_evaluation/documents-with-ids.json'

# Fail loudly on an HTTP error or a stalled connection instead of trying to
# JSON-decode an error page (or hanging forever).
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents = docs_response.json()

In [3]:
documents[:3]

[{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'question': 'Course - When will the course start?',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 {'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
  'section': 'General course-related questions',
  'question': 'Course - What are the prerequisites for this course?',
  'course': 'data-engineering-zoomcamp',
  'id': '1f6520ca'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware

### Importo los datos de referencia

In [4]:
# Load the ground-truth CSV and turn it into a list of dicts (one per row).
# `evaluate` reads the `document` key of each record as the expected document id.
ground_truth_url = url_prefix + 'search_evaluation/ground-truth-data.csv'
df_ground_truth = pd.read_csv(ground_truth_url)
ground_truth = df_ground_truth.to_dict(orient='records')

In [5]:
ground_truth[:3]

[{'question': 'When does the course begin?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'How can I get the course schedule?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'},
 {'question': 'What is the link for course registration?',
  'course': 'data-engineering-zoomcamp',
  'document': 'c02e79ef'}]

### Creo funciones para calcular métricas de evaluación y la evaluación genérica

In [6]:
def hit_rate(relevance_total):
    """Return the fraction of queries with at least one relevant result.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            sequence of relevance flags, one per returned result.

    Returns:
        The number of entries containing ``True`` divided by the number of
        entries.

    Raises:
        ZeroDivisionError: If ``relevance_total`` is empty.
    """
    hits = sum(1 for flags in relevance_total if True in flags)
    return hits / len(relevance_total)

def mrr(relevance_total):
    """Compute the Mean Reciprocal Rank (MRR) of a set of ranked results.

    For each query only the FIRST relevant result counts: it contributes
    ``1 / rank`` (rank is 1-based). Queries without a relevant result
    contribute 0. Without stopping at the first hit, a query whose result
    list contains the expected id more than once would score above 1.

    Args:
        relevance_total: Sequence with one entry per query; each entry is a
            sequence of relevance flags ordered by rank.

    Returns:
        The mean reciprocal rank, a float between 0 and 1.

    Raises:
        ZeroDivisionError: If ``relevance_total`` is empty.
    """
    total_score = 0.0

    for flags in relevance_total:
        for rank, is_relevant in enumerate(flags, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the best-ranked hit counts towards the reciprocal rank.
                break

    return total_score / len(relevance_total)

def evaluate(ground_truth, search_function):
    """Run a search function over every ground-truth query and score it.

    Args:
        ground_truth: Iterable of records; each must have a ``document`` key
            holding the id of the expected document.
        search_function: Callable that receives one ground-truth record and
            returns an iterable of result dicts, each with an ``id`` key.

    Returns:
        Dict with ``HIT_RATE`` and ``MRR``, each formatted as a string with
        four decimal places.
    """
    relevance_total = []

    for query in tqdm(ground_truth):
        expected_id = query['document']
        hits = search_function(query)
        relevance_total.append([hit['id'] == expected_id for hit in hits])

    hit_rate_value = hit_rate(relevance_total)
    mrr_value = mrr(relevance_total)

    return {
        'HIT_RATE': f"{hit_rate_value:.4f}",
        'MRR': f"{mrr_value:.4f}",
    }

## Q1. Minsearch text

### Indexo los documentos en minsearch

In [7]:
# Build the minsearch text index: `question`, `text` and `section` are declared
# as text fields; `course` and `id` as keyword fields (`course` is the field
# later passed in `filter_dict` by `minsearch_search`).
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

# Fit the index on the downloaded FAQ documents.
index.fit(documents)

<minsearch.minsearch.Index at 0x7f4b50cbe120>

### Función de búsqueda con minsearch

In [8]:
def minsearch_search(query, course):
    """Search the module-level minsearch ``index`` for one course.

    Args:
        query: Query text.
        course: Course name used as the ``course`` filter.

    Returns:
        Whatever ``index.search`` returns for the top 5 results, with the
        ``question`` field boosted by 1.5 and ``section`` by 0.1.
    """
    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict={'question': 1.5, 'section': 0.1},
        num_results=5,
    )

### Evaluación

In [9]:
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'HIT_RATE': '0.8487', 'MRR': '0.7288'}

## Q2. Vector search for question

Si querés entender cómo funciona la búsqueda vectorial en MinSearch, podés revisar la explicación paso a paso en su repositorio oficial:  
[Vector Search en MinSearch](https://github.com/alexeygrigorev/minsearch?tab=readme-ov-file#vector-search-with-vectorsearch)


Uso el campo question

In [10]:
# Use only the `question` field of every document as the text to vectorize.
texts = [doc['question'] for doc in documents]

In [11]:
texts[:3]

['Course - When will the course start?',
 'Course - What are the prerequisites for this course?',
 'Course - Can I still join the course after the start date?']

### Convierte text a vector

In [12]:
# Build a text-processing pipeline: TF-IDF features followed by a truncated SVD
# with 128 components (random_state fixed for reproducibility).
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

# Fit the pipeline on the texts and transform them into vectors.
X = pipeline.fit_transform(texts)

In [13]:
len(X[0]), X[0]

(128,
 array([ 0.20189188, -0.19028114, -0.10261914,  0.16435334, -0.14004852,
        -0.19928493,  0.0326298 ,  0.03152187,  0.11015991, -0.25056714,
        -0.2297715 , -0.08275686,  0.0107985 ,  0.01912367, -0.03904959,
         0.04858485, -0.03590806,  0.00200877, -0.20405168, -0.01187959,
         0.07217801,  0.21314061,  0.0352884 ,  0.09334844,  0.00800627,
         0.02730576, -0.05747045, -0.08794382,  0.04599191,  0.09568683,
         0.10378307, -0.12981451, -0.03935688,  0.03076194,  0.02946738,
        -0.02071025,  0.09501766,  0.05341492, -0.02582382,  0.08743149,
        -0.03647388, -0.168532  , -0.08957893,  0.03547496,  0.11095151,
         0.13033041, -0.07362053,  0.13634367,  0.09826041, -0.05042163,
         0.10989516,  0.00500543, -0.06848177,  0.0502295 ,  0.06014098,
         0.11383368,  0.05322441, -0.02371468, -0.0975518 ,  0.02646635,
         0.00609063, -0.00198324,  0.11985142,  0.08729537,  0.08522016,
         0.01295767,  0.03040857,  0.04797036

### Indexamos el vector

In [14]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7f4b50a14d70>

### Función de búsqueda vectorial con minsearch

In [15]:
def minsearch_search_vector(query, course):
    """Vector search over the module-level ``vindex`` for one course.

    The query is vectorized with the module-level ``pipeline``. Both globals
    are looked up when the function is called, so rebinding ``pipeline`` or
    ``vindex`` in a later cell changes what this function searches.

    Args:
        query: Query text.
        course: Course name used as the ``course`` filter.

    Returns:
        Whatever ``vindex.search`` returns for the top 5 results.
    """
    embedded = pipeline.transform([query])

    return vindex.search(
        query_vector=embedded[0],
        filter_dict={'course': course},
        num_results=5,
    )

### Evaluación

In [16]:
evaluate(ground_truth, lambda q: minsearch_search_vector(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'HIT_RATE': '0.4817', 'MRR': '0.3573'}

## Q3. Vector search for question and answer

Uso el campo question + text

In [17]:
# Concatenate question and answer text of every document, separated by a space.
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

In [18]:
texts[:3]

["Course - When will the course start? The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'Course - What are the prerequisites for this course? GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 "Course - Can I still join the course after the start date? Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."]

### Convierte text a vector

In [19]:
# Rebuild the TF-IDF + truncated SVD pipeline for the question+answer texts.
# NOTE: this rebinds the global `pipeline` that `minsearch_search_vector` reads
# when called, so searches after this cell use the newly fitted pipeline.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

# Fit on the new texts and transform them into vectors.
X = pipeline.fit_transform(texts)

In [20]:
len(X[0]), X[0]

(128,
 array([ 2.76527456e-01, -1.24063382e-01,  8.16530833e-02, -1.15718482e-01,
        -1.37804071e-01,  3.97703394e-02,  1.41970632e-01, -6.77654722e-02,
         8.82482169e-02, -6.79080171e-02,  1.61867861e-01, -1.30106865e-01,
        -2.58860109e-02,  1.46951057e-01, -3.23798270e-02, -8.83040051e-02,
         3.21253737e-02,  1.08838951e-01,  4.50712481e-03,  4.19611654e-02,
        -1.10769504e-02, -1.19974564e-01,  2.56378554e-03, -1.54568571e-02,
        -1.20440912e-01, -9.67282965e-02, -5.39131813e-02, -3.34985780e-02,
        -1.10056124e-01, -1.09819939e-01, -9.41857147e-02,  9.34912797e-02,
         7.11736963e-02,  3.26605888e-03,  2.90518628e-02, -3.23659350e-02,
        -1.01783460e-01, -2.36213463e-02, -5.20522327e-03,  5.13331133e-03,
        -4.43239513e-02, -1.70147597e-02, -4.23541170e-03,  9.59709639e-02,
         7.01681493e-02, -5.44835979e-02, -3.72399321e-02, -1.61973314e-02,
        -7.61884961e-03, -7.43949287e-02, -1.37833494e-01,  2.45833191e-02,
      

### Indexamos el vector

In [21]:
vindex = VectorSearch(keyword_fields={'course'})
vindex.fit(X, documents)

<minsearch.vector.VectorSearch at 0x7f4b5006e0d0>

### Evaluación

In [22]:
evaluate(ground_truth, lambda q: minsearch_search_vector(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'HIT_RATE': '0.8211', 'MRR': '0.6717'}

## Q4. Qdrant

### Ejecutamos la instancia de Qdrant en un contenedor Docker

Descargar la imagen y lanzar el contenedor usando los siguientes comandos:

```bash
docker pull qdrant/qdrant

docker run -p 6333:6333 -p 6334:6334 \
   -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
   qdrant/qdrant
```

In [23]:
client = QdrantClient("http://localhost", port=6333)

In [24]:
COLLECTION_NAME = "zoomcamp-llm-hw3"
MODEL_SELECT = 'jinaai/jina-embeddings-v2-small-en'

### Función para convertir text a vector

In [25]:
embedding_model = TextEmbedding(model_name=MODEL_SELECT)

In [26]:
def embed_data(text):
    """Embed text with the module-level ``embedding_model``.

    Args:
        text: A single string, or an iterable of strings. A single string is
            wrapped in a one-element list before embedding.

    Returns:
        The embedding of the FIRST text only; when several texts are passed,
        the remaining embeddings are computed and discarded.

    Raises:
        IndexError: If the model produces no embeddings (e.g. empty input).
    """
    # isinstance (rather than an exact type comparison) also accepts str subclasses.
    if isinstance(text, str):
        text = [text]

    embeddings = list(embedding_model.embed(text))
    return embeddings[0]

In [27]:
result = embed_data("run docker in WSL")

In [28]:
SIZE_MODEL_SELECT = len(result)
SIZE_MODEL_SELECT

512

### Creamos la colección

In [30]:
# Start from a clean slate: drop an existing collection with the same name so
# this cell can be re-run.
if client.collection_exists(COLLECTION_NAME):
    client.delete_collection(COLLECTION_NAME)
    
# Vector size comes from SIZE_MODEL_SELECT (length of a sample embedding);
# cosine distance is used to compare vectors.
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=models.VectorParams(
        size=SIZE_MODEL_SELECT, 
        distance=models.Distance.COSINE
    ),
    timeout=60
)

print(f"Colección '{COLLECTION_NAME}' creada exitosamente.")

Colección 'zoomcamp-llm-hw3' creada exitosamente.


### Creamos los puntos de vectores para luego insertar

In [34]:
def create_point(doc_id, doc):
    """Build the Qdrant point for one FAQ document.

    The vector is the embedding of the question and the answer text joined by
    a space; the payload copies the document's fields.

    Args:
        doc_id: Identifier to assign to the point.
        doc: Document dict with ``question``, ``text``, ``section``,
            ``course`` and ``id`` keys.

    Returns:
        A ``models.PointStruct`` with id, vector and payload set.
    """
    embedding = embed_data(f"{doc['question']} {doc['text']}")

    payload_fields = ("question", "text", "section", "course", "id")
    payload = {field: doc[field] for field in payload_fields}

    return models.PointStruct(id=doc_id, vector=embedding, payload=payload)

points = []

# Run create_point for every document on a pool of worker threads.
with ThreadPoolExecutor(max_workers=8) as executor:
    # Submit one task per document; the point id is the document's 1-based
    # position in `documents`.
    futures = {
        executor.submit(create_point, idx, doc): idx
        for idx, doc in enumerate(documents, 1)
    }

    # Collect results as tasks finish, with a tqdm progress bar. as_completed
    # yields in completion order, so `points` is not in document order.
    for future in tqdm(as_completed(futures), total=len(futures), desc="Generando points"):
        point = future.result()
        points.append(point)
        

Generando points:   0%|          | 0/948 [00:00<?, ?it/s]

In [35]:
points[:1]

[PointStruct(id=10, vector=[-0.05681010047071733, -0.06062142031408796, 0.04059363242098396, 0.081996557562762, -0.017428903265783724, -0.02353864677905906, 0.0016774317585954144, 0.015592863453119147, 0.0011412596340648563, 0.008303913626188668, 0.030288560277769806, 0.009150497376839163, -0.02390912604626786, -0.08501716190958854, 0.07569358725999936, -0.06394870104846918, -0.011110742239109236, -0.047765384584131015, -0.05610437724673457, -0.05965616854206685, -0.019541769049176837, -0.028122269292226276, 0.01800579682417359, -0.019146102939885456, 0.0969808054673193, -0.08120651312012883, -0.010541368126525627, 0.008038470905383616, 0.02812608147040058, 0.03313339264587312, -0.10253499661013812, 0.012825846940989226, 0.009104925471710414, 0.0216640471098591, -0.07317648335589157, -0.0048879360337300785, 0.04395219725504, -0.0027838544051711515, -0.08843268983011893, -0.020972566593370715, -0.00938737477724015, -0.004728647489423212, 0.0947695923439483, 0.07074143259742238, -0.05266

In [36]:
# Send all generated points to the collection in a single upsert call.
# NOTE(review): wait=True presumably blocks until the write is applied —
# confirm against the qdrant-client documentation.
client.upsert(
    collection_name=COLLECTION_NAME,
    points=points,
    wait=True
)

print(f"{len(points)} puntos insertados en la colección usando 'upsert'.")

948 puntos insertados en la colección usando 'upsert'.


### Función de búsqueda con Qdrant

In [37]:
def search_qdrant(query, course, limit=5):
    """Query the Qdrant collection for one course.

    Args:
        query: Query text; it is embedded with ``embed_data``.
        course: Value the ``course`` payload field must match.
        limit: Maximum number of points requested.

    Returns:
        List with the payload of every returned point, in the order given by
        the response.
    """
    query_vector = embed_data(query)
    course_condition = FieldCondition(key="course", match=MatchValue(value=course))

    response = client.query_points(
        collection_name=COLLECTION_NAME,
        query=query_vector,
        query_filter=Filter(must=[course_condition]),
        limit=limit,
        with_payload=True,
    )

    payloads = []
    for point in response.points:
        payloads.append(point.payload)
    return payloads

In [38]:
result = search_qdrant('When does the course begin?', 'data-engineering-zoomcamp')
result

[{'question': 'Course - When will the course start?',
  'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
  'section': 'General course-related questions',
  'course': 'data-engineering-zoomcamp',
  'id': 'c02e79ef'},
 {'question': 'Course - Can I follow the course after it finishes?',
  'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone

In [39]:
type(result)

list

### Evaluación

In [40]:
def evaluate_parallel(ground_truth, search_function, max_workers=8):
    """Threaded variant of ``evaluate``: score a search function in parallel.

    Args:
        ground_truth: Iterable of records; each must have a ``document`` key
            holding the id of the expected document.
        search_function: Callable that receives one ground-truth record and
            returns an iterable of result dicts, each with an ``id`` key.
        max_workers: Size of the thread pool.

    Returns:
        Dict with ``HIT_RATE`` and ``MRR``, each formatted as a string with
        four decimal places.
    """
    def _relevance_for(query):
        # Relevance flags of one query, in result order.
        expected_id = query['document']
        return [hit['id'] == expected_id for hit in search_function(query)]

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        pending = [pool.submit(_relevance_for, query) for query in ground_truth]
        # Rows are gathered in completion order, not ground-truth order.
        relevance_total = [
            done.result()
            for done in tqdm(as_completed(pending), total=len(pending))
        ]

    hit_rate_value = hit_rate(relevance_total)
    mrr_value = mrr(relevance_total)

    return {
        'HIT_RATE': f"{hit_rate_value:.4f}",
        'MRR': f"{mrr_value:.4f}",
    }

In [41]:
evaluate_parallel(ground_truth, lambda q: search_qdrant(q['question'], q['course']))

  0%|          | 0/4627 [00:00<?, ?it/s]

{'HIT_RATE': '0.9300', 'MRR': '0.8518'}

## Q5. Cosine similarity

### Defino funciones para calcular la similitud de coseno

In [43]:
def normalize(u):
    """Scale a vector to unit length.

    Args:
        u: Vector supporting ``.dot`` and division (e.g. a 1-D NumPy array).

    Returns:
        ``u`` divided by the square root of its dot product with itself.
    """
    squared_length = u.dot(u)
    return u / np.sqrt(squared_length)

In [45]:
def cosine(u, v):
    """Cosine similarity computed as the dot product of the normalized vectors.

    Args:
        u: First vector.
        v: Second vector.

    Returns:
        Dot product of ``normalize(u)`` and ``normalize(v)``.
    """
    unit_u = normalize(u)
    unit_v = normalize(v)
    return unit_u.dot(unit_v)

In [52]:
def cosine_v2(u, v):
    """Cosine similarity: dot product divided by the product of the norms.

    Args:
        u: First vector (supports ``.dot``).
        v: Second vector (supports ``.dot``).

    Returns:
        ``u·v / (|u| * |v|)``.
    """
    norms_product = np.sqrt(u.dot(u)) * np.sqrt(v.dot(v))
    return u.dot(v) / norms_product

In [71]:
def cosine_similarity_per(a, b):
    """Cosine similarity of two vectors using ``np.dot`` and ``np.linalg.norm``.

    Args:
        a: First vector.
        b: Second vector.

    Returns:
        ``np.dot(a, b)`` divided by the product of both norms.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    return np.dot(a, b) / denominator

### Pruebo el cálculo con dos vectores que deben tener una similitud de coseno muy alta

In [54]:
v1 = np.array([1, 2, 3])
v2 = np.array([1.1, 2.1, 3.1])

In [49]:
cosine(v1, v2)

np.float64(0.9998592903536574)

In [53]:
cosine_v2(v1, v2)

np.float64(0.9998592903536574)

In [72]:
cosine_similarity_per(v1, v2)

np.float64(0.9998592903536574)

### Importo los datos

In [55]:
results_url = url_prefix + 'rag_evaluation/data/results-gpt4o-mini.csv'
df_results = pd.read_csv(results_url)

In [56]:
df_results.head()

Unnamed: 0,answer_llm,answer_orig,document,question,course
0,You can sign up for the course by visiting the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Where can I sign up for the course?,machine-learning-zoomcamp
1,You can sign up using the link provided in the...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Can you provide a link to sign up?,machine-learning-zoomcamp
2,"Yes, there is an FAQ for the Machine Learning ...",Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Is there an FAQ for this Machine Learning course?,machine-learning-zoomcamp
3,The context does not provide any specific info...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,Does this course have a GitHub repository for ...,machine-learning-zoomcamp
4,To structure your questions and answers for th...,Machine Learning Zoomcamp FAQ\nThe purpose of ...,0227b872,How can I structure my questions and answers f...,machine-learning-zoomcamp


### Creo el pipeline de transformación

In [None]:
# TF-IDF + truncated SVD pipeline for the Q5 answer-similarity analysis.
# NOTE: this rebinds the global `pipeline` that `minsearch_search_vector` reads
# when called.
pipeline = make_pipeline(
    TfidfVectorizer(min_df=3),
    TruncatedSVD(n_components=128, random_state=1)
)

In [57]:
pipeline.fit(df_results.answer_llm + ' ' + df_results.answer_orig + ' ' + df_results.question)

0,1,2
,steps,"[('tfidfvectorizer', ...), ('truncatedsvd', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,input,'content'
,encoding,'utf-8'
,decode_error,'strict'
,strip_accents,
,lowercase,True
,preprocessor,
,tokenizer,
,analyzer,'word'
,stop_words,
,token_pattern,'(?u)\\b\\w\\w+\\b'

0,1,2
,n_components,128
,algorithm,'randomized'
,n_iter,5
,n_oversamples,10
,power_iteration_normalizer,'auto'
,random_state,1
,tol,0.0


### Vectorizo la respuesta LLM y Original

In [61]:
answer_llm_embeddings = pipeline.transform(df_results.answer_llm)
answer_orig_embeddings = pipeline.transform(df_results.answer_orig)

In [62]:
answer_llm_embeddings[0]

array([ 1.55498588e-01,  1.12196444e-01, -1.27448731e-01,  7.29928733e-02,
       -8.37144913e-02,  7.12563053e-02, -4.17337192e-02, -8.01481645e-03,
       -2.48426718e-02, -1.96725163e-02, -9.15480506e-03, -2.68991351e-02,
       -4.35971673e-02,  2.90931663e-02, -2.36823437e-02, -4.99815087e-02,
        6.65563175e-02,  8.34148810e-02, -3.29049137e-02,  4.33720994e-02,
        1.95608402e-02,  9.82027760e-03,  6.25639037e-02, -6.11572037e-02,
       -7.70446608e-02, -4.53851792e-02,  9.59588142e-02,  2.33611751e-02,
       -3.75310787e-02, -1.20919087e-02, -4.42022930e-02, -2.80196178e-02,
        4.82849319e-02,  6.44100659e-02, -6.05386231e-02, -3.75276419e-02,
       -3.73247344e-02,  6.05745643e-02,  2.76423210e-02, -6.13524037e-02,
        6.59253837e-02, -1.99575837e-02,  2.05179913e-02,  6.58706921e-03,
        7.37963169e-02,  5.55383382e-03,  7.00592681e-02, -7.29858778e-02,
        2.34118930e-02,  6.07270187e-03, -1.30806947e-02, -7.59930012e-03,
       -6.70146252e-02,  

### Calculo el promedio de la similitud de coseno entre cada par de vectores

In [66]:
# Pairwise cosine similarity: row i of the LLM embeddings against row i of the
# original-answer embeddings.
list_cosine = list(map(cosine, answer_llm_embeddings, answer_orig_embeddings))

np.mean(list_cosine)

np.float64(0.8415841233490402)

Alternativa vectorizada con scikit-learn (calcula la matriz completa de similitudes y toma solo su diagonal)

In [74]:
# cosine_similarity builds the similarity matrix between every LLM answer and
# every original answer; only its diagonal (answer i vs. answer i) is kept.
cosine_similarities = np.diag(cosine_similarity(answer_llm_embeddings, answer_orig_embeddings))
np.mean(cosine_similarities)

np.float64(0.8415841233490402)

## Q6. Rouge

In [77]:
rouge_scorer = Rouge()

answer_llm     Yes, all sessions are recorded, so if you miss...
answer_orig    Everything is recorded, so you won’t miss anyt...
document                                                5170565b
question                    Are sessions recorded if I miss one?
course                                 machine-learning-zoomcamp
Name: 10, dtype: object

### Pruebo el cálculo del rouge

In [77]:
rouge_scorer = Rouge()

r = df_results.iloc[10]
r

answer_llm     Yes, all sessions are recorded, so if you miss...
answer_orig    Everything is recorded, so you won’t miss anyt...
document                                                5170565b
question                    Are sessions recorded if I miss one?
course                                 machine-learning-zoomcamp
Name: 10, dtype: object

In [78]:
scores = rouge_scorer.get_scores(r.answer_llm, r.answer_orig)[0]
scores

{'rouge-1': {'r': 0.45454545454545453,
  'p': 0.45454545454545453,
  'f': 0.45454544954545456},
 'rouge-2': {'r': 0.21621621621621623,
  'p': 0.21621621621621623,
  'f': 0.21621621121621637},
 'rouge-l': {'r': 0.3939393939393939,
  'p': 0.3939393939393939,
  'f': 0.393939388939394}}

In [80]:
scores['rouge-1']

{'r': 0.45454545454545453, 'p': 0.45454545454545453, 'f': 0.45454544954545456}

### Calculo el promedio del F1 correspondiente a ROUGE-1

In [110]:
# Score every (LLM answer, original answer) pair; get_scores returns a list and
# its first element is kept.
list_rouge = [
    rouge_scorer.get_scores(llm_text, orig_text)[0]
    for llm_text, orig_text in zip(df_results.answer_llm, df_results.answer_orig)
]

list_rouge[-1]

{'rouge-1': {'r': 0.09090909090909091,
  'p': 0.3888888888888889,
  'f': 0.1473684179811635},
 'rouge-2': {'r': 0.02127659574468085, 'p': 0.1, 'f': 0.035087716405047945},
 'rouge-l': {'r': 0.09090909090909091,
  'p': 0.3888888888888889,
  'f': 0.1473684179811635}}

In [111]:
# Keep only the `f` value of `rouge-1` for each pair, then average.
list_rouge_1_f1 = [entry['rouge-1']['f'] for entry in list_rouge]
np.mean(list_rouge_1_f1)

np.float64(0.3516946452113943)

### Función genérica que permite obtener diferentes cálculos

In [115]:
def obtener_metrica_rouge(serie_a, serie_b, tipo_rouge, tipo_metrica, tipo_calculo, scorer=None):
    """Aggregate one ROUGE value over paired texts.

    Each pair ``(a, b)`` is scored with ``scorer.get_scores(a, b)``; the value
    ``[tipo_rouge][tipo_metrica]`` of the first returned entry is collected
    and the collected values are aggregated.

    Args:
        serie_a: Iterable of texts passed as the first argument of ``get_scores``.
        serie_b: Iterable of texts passed as the second argument of ``get_scores``.
        tipo_rouge: Key of the ROUGE variant, e.g. ``'rouge-1'``.
        tipo_metrica: Key of the value inside the variant, e.g. ``'f'``.
        tipo_calculo: Aggregation to apply: ``'mean'`` or ``'max'``.
        scorer: Object with a ``get_scores(a, b)`` method. Defaults to the
            module-level ``rouge_scorer``.

    Returns:
        The aggregated value computed from the given series.

    Raises:
        ValueError: If ``tipo_calculo`` is not a supported aggregation.
    """
    aggregators = {"mean": np.mean, "max": np.max}
    if tipo_calculo not in aggregators:
        raise ValueError(
            f"Unsupported tipo_calculo {tipo_calculo!r}; expected one of {sorted(aggregators)}"
        )

    # Resolve the default here so the global is only needed when no scorer is given.
    if scorer is None:
        scorer = rouge_scorer

    # Use the values computed from the arguments, not a list left over from
    # another cell.
    values = [
        scorer.get_scores(a, b)[0][tipo_rouge][tipo_metrica]
        for a, b in zip(serie_a, serie_b)
    ]

    return aggregators[tipo_calculo](values)

In [116]:
obtener_metrica_rouge(df_results.answer_llm, df_results.answer_orig, 'rouge-1', 'f', 'mean')    

np.float64(0.3516946452113943)