In [None]:
## Implementación 3 de vector embeddings - Búsqueda con palabras clave extraídas con un LLM
## Se extraen del user_query las palabras clave, que luego se procesan calculando la similitud de vectores.
## Con los vectores obtenidos se recuperan los contextos más relevantes y sus DOIs, y se introducen en el
## prompt del modelo (turno de usuario) con instrucciones de devolver SÓLO los DOIs relevantes para el usuario.

In [5]:
%%capture 
# Prefiero no ver el tiempo a ver los warnings (eliminamos %%time)
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Load the dataset and take the "Context" column (in this case, the sentences extracted from PDFs)
# as the list of texts to embed.
# NOTE(review): assumes 'Context' has no missing values — TODO confirm against the CSV.
df = pd.read_csv('cleaned_dataset2.csv')
texts = df['Context'].tolist()

# Generate embeddings with the all-MiniLM-L6-v2 model.
# The retrieval cell further down uses positions in `embeddings` to look rows up in `df`,
# so `texts`/`embeddings` must keep the same row order as `df`.
modelRAG = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = modelRAG.encode(texts)

In [2]:
%%time
from unsloth import FastLanguageModel
import torch
# Loading options passed straight to FastLanguageModel.from_pretrained below.
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# Load the 4-bit Llama 3.1 8B Instruct checkpoint; `model` and `tokenizer` are reused by
# both generation cells later in the notebook.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.43.3.
   \\   /|    GPU: Quadro RTX 6000. Max memory: 23.645 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
CPU times: user 7.65 s, sys: 2.14 s, total: 9.79 s
Wall time: 10.3 s


In [3]:
# Shell escape: print the GPU status table (driver, memory in use) after the model has been loaded.
!nvidia-smi

Wed Aug  7 19:51:50 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Quadro RTX 6000                Off | 00000000:00:10.0 Off |                  Off |
| 46%   60C    P8               9W / 260W |   6401MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [18]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM

# Step 3 (part 1): extract keywords from the user query with the LLM.
# The similarity search itself runs in the next cell, on the extracted keywords.
# `query_text` is the user question; the commented-out lines are alternative test queries.
#query_text = "Where can I find research on positioning systems using physics models?"
#query_text = "Where can I find research about new navigational location techniques"
query_text = "Tell me where can I read about innovative ways insurance is being used in the boating industry"


# Keyword-extraction prompt: a system turn with two few-shot examples, then the user turn
# (the {} placeholder) and an opened assistant header so the model writes the keywords next.
prompt ="""<|start_header_id|>system<|end_header_id|>

            You are a research assistant model.
            Your objective is to extract the main subjects that relate to the user's query.
            You should write a few words for each subject present in the query.
            For example:
            query: What are the key studies on the impact of high task-oriented interdependence in global supply chains and its significance for platform ecosystem adoption?
            assistant: high task-oriented interdependence, global supply chains, platform ecosystem adoption
            example 2:
            query: What are some authoritative sources on the process of encoding sequences of integers using the most frequent pairs?
            assistant: encoding sequences of integers, most frequent pairs
            <|eot_id|><|start_header_id|>user<|end_header_id|>

            {}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

# Fill the placeholder with the user's query (no retrieved context is involved at this stage).
formatted_prompt = prompt.format(query_text)

# Print the formatted prompt for debugging
#print(formatted_prompt)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# Tokenize the prompt and move the tensors to the GPU.
inputs = tokenizer(
    formatted_prompt,
    return_tensors="pt"
).to("cuda")


## Inference: have the model produce the keyword list for `query_text`
## (up to 128 new tokens).
# NOTE(review): `temperature` only has an effect when sampling is enabled — confirm
# `do_sample` in the model's generation config.
model_answer1= model.generate(**inputs, temperature=0.7, max_new_tokens = 128)
# Decode to text; `output1` is a list whose first item holds the prompt followed by the
# generated answer, special tokens included.
output1=tokenizer.batch_decode(model_answer1)

## Función para extraer SOLO el output del modelo.
def output_cleaner(output):
    """Return only the assistant's reply from a decoded Llama-3 chat transcript.

    Args:
        output: Sequence of decoded strings (e.g. the result of
            ``tokenizer.batch_decode``); only the first element is used.

    Returns:
        The text that follows the first ``assistant<|end_header_id|>`` header,
        without the newlines after the header and without a trailing
        ``<|eot_id|>`` token. If no assistant header is present, the whole
        string is returned (minus a trailing ``<|eot_id|>``).

    Raises:
        IndexError: If ``output`` is empty.
    """
    header = "assistant<|end_header_id|>"
    marker = header + "\n\n"
    eot_token = "<|eot_id|>"

    # Only the first decoded sequence is processed.
    output_str = output[0]

    marker_index = output_str.find(marker)
    if marker_index != -1:
        # Usual case: drop the prompt and the assistant header with its blank line.
        processed_output = output_str[marker_index + len(marker):]
    else:
        # str.find returned -1: slicing with that index would silently chop an
        # arbitrary prefix off the text. Fall back to the bare header, and if
        # that is missing too, keep the text untouched.
        header_index = output_str.find(header)
        if header_index != -1:
            processed_output = output_str[header_index + len(header):].lstrip("\n")
        else:
            processed_output = output_str

    # Remove the end-of-turn token when the generation finished with it.
    if processed_output.endswith(eot_token):
        processed_output = processed_output[:-len(eot_token)]
    return processed_output
    
# Keep only the model's generated keywords; this string is what gets embedded for retrieval in the next cell.
embedding_search_query = output_cleaner(output1)


["<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n            You are a research assistant model.\n            Your objective is to extract the main subjects that relate to the user's query.\n            You should write a few words for each subject present in the query.\n            For example:\n            query: What are the key studies on the impact of high task-oriented interdependence in global supply chains and its significance for platform ecosystem adoption?\n            assistant: high task-oriented interdependence, global supply chains, platform ecosystem adoption\n            example 2:\n            query: What are some authoritative sources on the process of encoding sequences of integers using the most frequent pairs?\n            assistant: encoding sequences of integers, most frequent pairs\n            <|eot_id|><|start_header_id|>user<|end_header_id|>\n\n            Tell me where can I read about innovative ways insurance is being used in the boating i

In [None]:
# Embed the extracted keywords and score them against every context embedding.
query_embedding = modelRAG.encode([embedding_search_query])
similarity_scores = cosine_similarity(query_embedding, embeddings).flatten()

# Get the indices of the top 3 most similar texts (highest score first)
top_3_indices = np.argsort(similarity_scores)[-3:][::-1]

# Build the context block that is later inserted into the answer prompt.
RAG_string = ""
for idx, index in enumerate(top_3_indices):
    # NOTE(review): `index` is a position from argsort but is used as a label with .loc;
    # this matches only while `df` keeps its default 0..n-1 index — confirm.
    text = df.loc[index, 'Context']
    respuesta_DOIs = df.loc[index, 'DOIs']
    similarity_score = similarity_scores[index]  # not used in the template below
    
    # Use variables in a multi-line f-string
    RAG_string += f"""
Context {idx + 1}: {text}
DOIs: {respuesta_DOIs}

"""
# "Similarity Score: {similarity_score:.4f}" was removed from the template so as not to bias the LLM
print(RAG_string)

# Define the answer prompt: a system turn with the DOI-retrieval instructions, then a user
# turn with two placeholders (the retrieved contexts, then the user's query) and an opened
# assistant header for generation. Note this rebinds `prompt` from the keyword-extraction cell.
prompt ="""<|start_header_id|>system<|end_header_id|>

            You are a helpful assistant.
            Your objective is to retrieve Digital Object Identifiers (DOIs) of research papers that relate to the user's query.
            Read the context provided and decide if it's relevant to what the user is asking. If it is, provide the DOIs for all the contexts that are relevant.
            If you are unsure if the context is relevant for given question, try to not make stuff up and instead answer truthfully.
            If there a DOIs, reply ONLY with "A scientific article regarding those subjects can be found with the DOI" (use the plural if needed), followed by the DOIs that are more relevant (if at all), don't explain your reasoning.
            If there are no DOIs, DON'T MAKE THEM UP, explain you couldn't find the answer in your data sources.<|eot_id|><|start_header_id|>user<|end_header_id|>

            {}
            
            User query: {}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

# The instruction "If all similarity scores are below 0.45, tell the user you couldn't find
# research on that topic." was removed, since the scores are no longer part of RAG_string.

# Fill the placeholders with the retrieved contexts and the original user query
formatted_prompt = prompt.format(RAG_string, query_text)

# Print the formatted prompt for debugging
#print(formatted_prompt)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference
# Tokenize the answer prompt and move the tensors to the GPU.
inputs = tokenizer(
    formatted_prompt,
    return_tensors="pt"
).to("cuda")

# Stream the generated text to the cell output as it is produced.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)

# Generate text (up to 256 new tokens); the return value is discarded because the
# streamer already prints the answer.
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=256)