In [None]:
## Implementación 2 de vector embeddings - Búsqueda de preguntas similares sintéticas generadas por LLM
## Se extrae del user_query preguntas similares, que luego se procesan calculando para cada una de las preguntas
## la similitud de vectores, luego se eliminan duplicados y se filtran los 3 mejores globales (de los 9 obtenidos)
## Con los resultados obtenidos se obtienen los contextos más relevantes y sus DOIs y se introducen en el
## system prompt del modelo con instrucciones de devolver SÓLO los DOIs relevantes para el usuario.

In [1]:
%%capture
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from unsloth import FastLanguageModel
import torch


  from tqdm.autonotebook import tqdm, trange
2024-08-07 20:50:10.307859: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-08-07 20:50:10.327051: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-07 20:50:10.345551: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-07 20:50:10.351305: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-08-07 20:50:10.368580

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
CPU times: user 5.8 s, sys: 917 ms, total: 6.72 s
Wall time: 6.03 s


In [2]:
%%time
## Load the embedding model and the language model

# Load the dataset. The "answer" column is the text that gets embedded
# (in this dataset it holds the questions extracted from sentences).
df = pd.read_csv('cleaned_dataset2.csv')
texts = df['answer'].tolist()

# Load the sentence-embedding model, all-MiniLM-L6-v2
modelRAG = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = modelRAG.encode(texts) # Embed every text once; later cells use positions in `embeddings` to look rows up in `df`

# Load the LLM
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# `model` and `tokenizer` are notebook globals reused by the generation cells below.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.43.3.
   \\   /|    GPU: Quadro RTX 6000. Max memory: 23.645 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
CPU times: user 8.13 s, sys: 3.55 s, total: 11.7 s
Wall time: 14 s


In [3]:
!nvidia-smi

Wed Aug  7 16:43:51 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Quadro RTX 6000                Off | 00000000:00:10.0 Off |                  Off |
| 36%   48C    P8               7W / 260W |   6401MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM

# Alternative user queries tried during experimentation (kept for reference).
#query_text = "Where can I find research on positioning systems using physics models?"
#query_text = "Where can I find research about new navigational location techniques"
#query_text = "Tell me where can I read about innovative ways insurance is being used in the boating industry"
#query_text = "Show me sources with information on innovative insurance implementations."
query_text = "Are satellite positioning systems used in conjunction with position estimation models?"


# Prompt (Llama 3 chat format) asking the LLM for ONE question similar to the
# user's query. The `{}` placeholder receives the user query.
prompt ="""<|start_header_id|>system<|end_header_id|>

            You are a helpful assistant.
            Your objective is to take the sentence the user inputs and generate similar sentences of related, interesting subjects.
            Try to keep the question/demand format. For example, create ONLY ONE sentence similar to the ones in the examples:
            user: Where can I find research about new navigational location tecniques?
            assistant: Show me sources regarding innovative satellite positioning systems.
            OR
            assistant: What are the comprehensive studies that explore the relationship between physics and the broader environment?
            <|eot_id|><|start_header_id|>user<|end_header_id|>

            {}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

# Insert the user query into the prompt
formatted_prompt = prompt.format(query_text)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
    formatted_prompt,
    return_tensors="pt"
).to("cuda")

# Shared sampling settings for both paraphrase generations.
# `do_sample=True` is explicit because `temperature` only has an effect when
# sampling is enabled; relying on the checkpoint's default generation config
# could otherwise yield two identical (greedy) paraphrases.
generation_kwargs = dict(
    do_sample=True,
    temperature=0.7,
    max_new_tokens=128,
    use_cache=True,
)

## Inference: generate two extra questions with a context similar to the query
## Extra question 1
model_answer1 = model.generate(**inputs, **generation_kwargs)
output1 = tokenizer.batch_decode(model_answer1)

## Extra question 2
model_answer2 = model.generate(**inputs, **generation_kwargs)
output2 = tokenizer.batch_decode(model_answer2)

## Helper that extracts ONLY the model's generated answer from a decoded output.
def output_cleaner(output):
    """Return the assistant's reply contained in a decoded generation.

    The decoded text holds the whole prompt followed by the generated answer.
    Everything up to and including the first ``assistant<|end_header_id|>``
    header is dropped, leading whitespace is stripped, and a trailing
    ``<|eot_id|>`` token is removed.

    Args:
        output: Sequence of decoded strings (as returned by
            ``tokenizer.batch_decode``); only the first element is used.
            A plain string is also accepted.

    Returns:
        The cleaned answer text. If the assistant header is not present the
        whole string is treated as the answer. An empty sequence yields "".
    """
    assistant_header = "assistant<|end_header_id|>"
    eot_token = "<|eot_id|>"

    # Accept either a bare string or a batch (list) of decoded strings.
    if isinstance(output, str):
        output_str = output
    elif len(output) == 0:
        return ""
    else:
        output_str = output[0]

    # str.find returns -1 when the header is absent; slicing with that value
    # would return unrelated prompt text, so handle the case explicitly.
    header_index = output_str.find(assistant_header)
    if header_index != -1:
        processed_output = output_str[header_index + len(assistant_header):]
    else:
        processed_output = output_str

    # The model may or may not reproduce the prompt's indentation after the
    # header, so strip any amount of leading whitespace instead of a fixed run.
    processed_output = processed_output.lstrip()

    # Remove the end-of-turn token if generation stopped on it.
    if processed_output.endswith(eot_token):
        processed_output = processed_output[:-len(eot_token)]
    return processed_output


# Similarity search: the original query plus the two LLM-generated variants.
# `query_text` was defined earlier in this cell.
query_text_2 = output_cleaner(output1)
query_text_3 = output_cleaner(output2)

# Embed each query on its own (one-element batch each).
query_embedding1, query_embedding2, query_embedding3 = (
    modelRAG.encode([question])
    for question in (query_text, query_text_2, query_text_3)
)

# Cosine similarity of each query against every dataset embedding, as 1-D arrays.
similarity_scores1, similarity_scores2, similarity_scores3 = (
    cosine_similarity(query_vector, embeddings).flatten()
    for query_vector in (query_embedding1, query_embedding2, query_embedding3)
)

# Indices of the 3 most similar texts per query, best match first.
top_3_indices1, top_3_indices2, top_3_indices3 = (
    np.argsort(scores)[::-1][:3]
    for scores in (similarity_scores1, similarity_scores2, similarity_scores3)
)

# Scores matching those indices, in the same (descending) order.
sim_scores_array1 = list(similarity_scores1[top_3_indices1])
sim_scores_array2 = list(similarity_scores2[top_3_indices2])
sim_scores_array3 = list(similarity_scores3[top_3_indices3])


In [None]:
import numpy as np

# Merge the three per-query result sets into a single global top 3.
# Each hit is a pair (dataset index, similarity score).
array1 = top_3_indices1
list1 = sim_scores_array1
array2 = top_3_indices2
list2 = sim_scores_array2
array3 = top_3_indices3
list3 = sim_scores_array3

# Pair each index with its corresponding score
combined1 = list(zip(array1, list1))
combined2 = list(zip(array2, list2))
combined3 = list(zip(array3, list3))

# Per-query views ordered by dataset index. They are not used below;
# kept so any later cell that inspects them keeps working.
combined1_sorted = sorted(combined1, key=lambda x: x[0])
combined2_sorted = sorted(combined2, key=lambda x: x[0])
combined3_sorted = sorted(combined3, key=lambda x: x[0])

# Combine all groups into a single list
combined_all = combined1 + combined2 + combined3

# Sort by score, best first
combined_all_sorted = sorted(combined_all, key=lambda x: x[1], reverse=True)

# Remove duplicate dataset indices, keeping the first (highest-scoring)
# occurrence. The mapping is keyed by INDEX rather than by score: keying by
# score silently drops a hit whenever two different rows share exactly the
# same score (e.g. duplicated texts in the dataset), because assigning to an
# existing dict key overwrites its value.
best_score_by_index = {}
for array_val, list_val in combined_all_sorted:
    best_score_by_index.setdefault(array_val, list_val)

# Legacy score -> index view under the original name, for backward
# compatibility only. It is lossy when scores tie; prefer `best_score_by_index`.
unique_dict = {list_val: array_val for array_val, list_val in best_score_by_index.items()}

# Unpack into parallel index / score sequences (dicts preserve insertion
# order, so both stay sorted by descending score).
sorted_arrays = np.array(list(best_score_by_index.keys()))
sorted_lists = list(best_score_by_index.values())

# Reduce to the top 3 values
sorted_arrays_top3 = sorted_arrays[:3]
sorted_lists_top3 = sorted_lists[:3]

In [18]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM


from transformers import TextStreamer

# Build the retrieval context: one block per selected dataset row, holding the
# matched text, its DOIs and the similarity score of the match.
RAG_string = ""
for idx, (index, similarity_score) in enumerate(zip(sorted_arrays_top3, sorted_lists_top3)):
    text = df.loc[index, 'answer']
    respuesta_DOIs = df.loc[index, 'DOIs']
    respuesta_Titulo = df.loc[index, 'Title']  # read but not included in the context block below

    RAG_string += (
        f"\nContext {idx + 1}: {text}\n"
        f"DOIs: {respuesta_DOIs}\n"
        f"Similarity Score: {similarity_score:.4f}\n"
        "\n"
    )

# Note: "Title: {respuesta_Titulo}" used to follow the DOIs line and was removed.

# System prompt for the DOI-retrieval step (Llama 3 chat format). The two `{}`
# placeholders receive the retrieved context and the user query, in that order.
prompt = """<|start_header_id|>system<|end_header_id|>

            You are a research assistant model.
            Your objective is to retrieve Digital Object Identifiers (DOIs) of research papers that relate to the user's query.
            Read the context provided and decide if it's relevant to what the user is asking. If it is, provide the DOIs for all the contexts that are relevant.
            If you are unsure if the context is relevant for given question, DO NOT make stuff up and instead answer truthfully.
            If there are DOIs, reply ONLY with "A scientific article regarding those subjects can be found with the DOI" (use the plural if needed), followed by the DOIs that are more relevant (if at all), don't explain your reasoning.
            If there are no DOIs, DON'T MAKE THEM UP, explain you couldn't find the answer in your data sources.
            If all similarity scores are below 0.45, tell the user you couldn't find research in your training dataset on that topic.<|eot_id|><|start_header_id|>user<|end_header_id|>

            {}
            {}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

# Fill in the retrieved context and the user query
formatted_prompt = prompt.format(RAG_string, query_text)

FastLanguageModel.for_inference(model)  # Enable native 2x faster inference
inputs = tokenizer(formatted_prompt, return_tensors="pt").to("cuda")

# Stream the generated tokens to the cell output as they are produced
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=256)


<|begin_of_text|><|start_header_id|>system<|end_header_id|>

            You are a research assistant model.
            Your objective is to retrieve Digital Object Identifiers (DOIs) of research papers that relate to the user's query.
            Read the context provided and decide if it's relevant to what the user is asking. If it is, provide the DOIs for all the contexts that are relevant.
            If you are unsure if the context is relevant for given question, DO NOT make stuff up and instead answer truthfully.
            If there are DOIs, reply ONLY with "A scientific article regarding those subjects can be found with the DOI" (use the plural if needed), followed by the DOIs that are more relevant (if at all), don't explain your reasoning.
            If there are no DOIs, DON'T MAKE THEM UP, explain you couldn't find the answer in your data sources.
            If all similarity scores are below 0.45, tell the user you couldn't find research in your training dataset on th