# Task 2 : Retrieval and Generation

In [1]:
%pip install ollama
%pip install langchain-core
%pip install langchain-ollama

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Standard Library
import numpy as np
from tqdm import tqdm
import time
import json
import os
import enum

# PySpark
from pyspark.sql import SparkSession #type: ignore[import]
from pyspark.ml.feature import Word2Vec #type: ignore[import]
from pyspark.ml.feature import Word2VecModel #type: ignore[import]
from sentence_transformers import SentenceTransformer #type: ignore[import]
# LangChain
from langchain_core.prompts import ChatPromptTemplate #type: ignore[import]
from langchain_ollama.llms import OllamaLLM #type: ignore[import]

In [3]:
spark = SparkSession.builder \
    .appName("retieval") \
    .getOrCreate()

sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/16 08:46:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/05/16 08:46:17 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [None]:
# Configuration
class EmbeddingStrategy(enum.Enum):
    WORD2VEC = "word2vec"
    SENTENCE_TRANSFORMERS = "SBERT"
    
class SimilarityStrategy(enum.Enum):
    COSINE = "cosine"
    EUCLIDEAN = "euclidean"
    MANHATTAN = "manhattan"

EMBEDDING_STRATEGY = EmbeddingStrategy.SENTENCE_TRANSFORMERS
SIMILARITY_STRATEGY = SimilarityStrategy.COSINE

# Constants
QUERY = "Generate 5 questions on big data framworks"
NUMBER_OF_TEXT_CHUNKS_TO_RETRIEVE = 10
NUMBER_OF_QUESTIONS_TO_GENERATE = 2
INPUT_DIR = "../dataset/parquet"
OUTPUT_DIR = "../dataset/finals"

In [5]:
dataset_name = EMBEDDING_STRATEGY.value
df = spark.read.parquet(f"{INPUT_DIR}/{dataset_name}")
df.show(1, truncate=False)

                                                                                

+---+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

# Simple query 

In [6]:
query_df = spark.createDataFrame([(QUERY.split(" "),), ], ["text_chunks"])
match EMBEDDING_STRATEGY:
    case EmbeddingStrategy.WORD2VEC:
        model = Word2VecModel.load(f"../dataset/models/word2vec")
        # docummentDF = spark.createDataFrame([(Query.split(" "),)], ["text"])
        # docummentDF.show(truncate=False)
        # from pyspark.ml.feature import Word2Vec
        result = model.transform(query_df)
        result.show(truncate=False)
    case EmbeddingStrategy.SENTENCE_TRANSFORMERS:
        model = SentenceTransformer("all-MiniLM-L6-v2")
        result = model.encode(QUERY, convert_to_tensor=True)
        

# Distance Metrics

In [7]:
def euclidean_distance(v1, v2) -> float:
    return np.sqrt(np.sum(np.square(v1 - v2)))

def cosine_similarity(v1, v2) -> float:
    return np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2))

def manhattan_distance(v1, v2) -> float:
    return np.sum(np.abs(v1 - v2))

distance_functions = {
    SimilarityStrategy.EUCLIDEAN : euclidean_distance,
    SimilarityStrategy.COSINE: cosine_similarity,
    SimilarityStrategy.MANHATTAN: manhattan_distance
}

score_idx = []
# get the embddings of the df 
match EMBEDDING_STRATEGY:
    case EmbeddingStrategy.WORD2VEC: _ , vector = result.collect().pop(0)
    case EmbeddingStrategy.SENTENCE_TRANSFORMERS: vector = result.numpy()
for (cid, words, vector2) in tqdm(df.collect()):
    score_idx.append((cid, distance_functions[SIMILARITY_STRATEGY](vector, vector2)))
        

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 292/292 [00:00<00:00, 18943.82it/s]


In [8]:
sorted_score_idx = sorted(score_idx, key=lambda x: x[1])
get_top_n = sorted_score_idx[:NUMBER_OF_TEXT_CHUNKS_TO_RETRIEVE]
print(f"Top {NUMBER_OF_TEXT_CHUNKS_TO_RETRIEVE} similar documents:")
df_filtered = df.filter(df.id.isin([x[0] for x in get_top_n]))
df_filtered.show()
print(f"Top {NUMBER_OF_TEXT_CHUNKS_TO_RETRIEVE} scores:")
for i in range(NUMBER_OF_TEXT_CHUNKS_TO_RETRIEVE):
    print(f"Document ID: {get_top_n[i][0]}, Score: {get_top_n[i][1]}")

Top 10 similar documents:


[Stage 4:>                                                          (0 + 1) / 1]

+---+--------------------+--------------------+
| id|         text_chunks|           embedding|
+---+--------------------+--------------------+
|167|[application, set...|[-0.0051523344591...|
|234|[75, accurate, fi...|[0.05355837568640...|
|  5|[spotify, uses, d...|[-0.0309972520917...|
|111|[characteristic, ...|[0.03156685829162...|
|253|[meaning, attache...|[0.00292743742465...|
|212|[login, issue, se...|[-0.0119140520691...|
|108|[vectors, angle, ...|[-0.0096985651180...|
|259|[otherwise, 15, f...|[-0.0391906648874...|
|158|[activated, deact...|[-0.0557411611080...|
| 15|[without, replace...|[-0.0137828057631...|
+---+--------------------+--------------------+

Top 10 scores:
Document ID: 158, Score: -0.0570474401525899
Document ID: 212, Score: -0.05083313293484924
Document ID: 111, Score: -0.04146233958818848
Document ID: 253, Score: -0.0337617235115026
Document ID: 108, Score: -0.031500513312840255
Document ID: 15, Score: -0.015803721403359134
Document ID: 234, Score: -0.013375327

                                                                                

In [9]:
def generate_context_from_dataframe(df):
    context = ""
    for row in df.collect():
        # get the text field from the row and append it to the context
        context += " ".join(row.text_chunks)+"\n"
    return context

context = "Ressources given to you for the course of INFO-H515:"
context += generate_context_from_dataframe(df_filtered)

template = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. If it's a generating question, make {number} question(s)\n\n{context}\n\nQuestion: {question}\nHelpful Answer:"""

prompt = ChatPromptTemplate.from_template(template)

model = OllamaLLM(model="llama3.2")

chain = prompt | model

info  = chain.invoke({"number" : NUMBER_OF_QUESTIONS_TO_GENERATE, "question":QUERY,"context": context}).split("\n")

                                                                                

In [None]:
os.makedirs(OUTPUT_DIR, exist_ok=True)
#save the reqquest, response and the top N documents, time taken to a json file
def save_to_json(request, response, top_n_score,top_n_words, distance_type, embedding):
    data = {
        "request": request,
        "response": response,
        "top_n_documents": top_n_score,
        "top_n_words": top_n_words,
        "algorithm": distance_type,
        "embedding_model": embedding,
    }
    now = time.strftime("%Y_%m_%d_%H_%M_%S")
    
    with open(f"{OUTPUT_DIR}/{now}.json", "w") as json_file:
        json.dump(data, json_file, indent=4)

In [11]:
save_to_json(
    request=QUERY,
    response=info,
    top_n_score=get_top_n,
    top_n_words=[context.split("\n")],
    distance_type=SIMILARITY_STRATEGY.value,
    embedding=EMBEDDING_STRATEGY.value,
)