In [None]:
## Implementación original de vector embeddings - Búsqueda de preguntas similares y decisión con LLM
## Se calculan las 3 preguntas en el dataset más similares a la pregunta del usuario
## Con los resultados obtenidos se obtienen los contextos más relevantes y sus DOIs y se introducen en el
## system prompt del modelo con instrucciones de devolver SÓLO los DOIs relevantes para el usuario.

In [1]:
%%capture
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

2024-09-02 22:26:18.337052: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-02 22:26:18.361632: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-02 22:26:18.384022: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-02 22:26:18.390810: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-02 22:26:18.411570: I tensorflow/core/platform/cpu_feature_guar

In [2]:
%%time
# Cargar y procesar el dataset (columna "answer", que contiene en este caso las preguntas extraídas de oraciones)
df2 = pd.read_csv('dataset.csv')
texts = df2['answer'].tolist()

# Generar Embeddings con el modelo all-MiniLM-L6-v2
modelRAG2 = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = modelRAG2.encode(texts)

CPU times: user 1.36 s, sys: 363 ms, total: 1.72 s
Wall time: 3.6 s


In [3]:
%%time
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
==((====))==  Unsloth 2024.8: Fast Llama patching. Transformers = 4.43.3.
   \\   /|    GPU: Quadro RTX 6000. Max memory: 23.645 GB. Platform = Linux.
O^O/ \_/ \    Pytorch: 2.3.0+cu121. CUDA = 7.5. CUDA Toolkit = 12.1.
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.26.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
CPU times: user 6.82 s, sys: 2.08 s, total: 8.9 s
Wall time: 9.71 s


In [4]:
# Shell escape: run `nvidia-smi` and print its GPU status report as the cell output.
!nvidia-smi

Tue Aug 20 08:57:25 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.183.01             Driver Version: 535.183.01   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Quadro RTX 6000                Off | 00000000:00:10.0 Off |                  Off |
| 36%   50C    P2              60W / 260W |   6401MiB / 24576MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from transformers import AutoTokenizer, AutoModelForCausalLM

# Step 3: Similarity Search
# This cell depends on objects created by earlier cells:
#   - `df2`, `modelRAG2`, `embeddings` (dataset + embedding cell)
#   - `model`, `tokenizer`, `FastLanguageModel` (LLM loading cell)
# It previously referenced `df` / `modelRAG`, which no cell defines, so it only
# worked with leftover kernel state and failed on Restart & Run All.
query_text = "Show me sources on innovative digitalization methods implemented in the maritime industry"
#query_text = "Where can I find research about new navigational location techniques"
#query_text = "Tell me where can I read about innovative ways insurance is being used in the boating industry"

# Embed the query with the SAME model that produced `embeddings`; scores from
# a different embedding model would not be comparable.
query_embedding = modelRAG2.encode([query_text])
similarity_scores = cosine_similarity(query_embedding, embeddings).flatten()

# Get the indices of the top 3 most similar texts (highest score first).
# argsort is ascending, so take the last three and reverse them.
top_3_indices = np.argsort(similarity_scores)[-3:][::-1]

# Build the context block that is injected into the user turn of the prompt.
RAG_string = ""
for idx, index in enumerate(top_3_indices):
    # `index` is a row POSITION (it comes from argsort over `embeddings`, which
    # was built from `df2` in row order), so use positional `.iloc` lookups.
    text = df2['answer'].iloc[index]
    respuesta_DOIs = df2['DOIs'].iloc[index]
    similarity_score = similarity_scores[index]

    # Use variables in a multi-line f-string
    RAG_string += f"""
Context {idx + 1}: {text}
DOIs: {respuesta_DOIs}
Similarity Score: {similarity_score:.4f}

"""

# Define the prompt (Llama 3.1 chat format: system turn, user turn, then an
# open assistant header so the model generates the reply).
prompt ="""<|start_header_id|>system<|end_header_id|>

            You are a research assistant model.
            Your objective is to retrieve Digital Object Identifiers (DOIs) of research papers that relate to the user's query.
            Read the context provided and decide if it's relevant to what the user is asking. If it is, provide the DOIs for all the contexts that are relevant.
            If you are unsure if the context is relevant for given question, try to not make stuff up and instead answer truthfully.
            If there a DOIs, reply ONLY with "A scientific article regarding those subjects can be found with the DOI" (use the plural if needed), followed by the DOIs that are more relevant (if at all), don't explain your reasoning.
            If there are no DOIs, DON'T MAKE THEM UP, explain you couldn't find the answer in your data sources.
            If all similarity scores are below 0.45, tell the user you couldn't find research on that topic.<|eot_id|><|start_header_id|>user<|end_header_id|>

            {}
            Question: {}<|eot_id|><|start_header_id|>assistant<|end_header_id|>"""

# Format the prompt with RAG_string (first placeholder) and the user's
# question (second placeholder).
formatted_prompt = prompt.format(RAG_string, query_text)

# Print the formatted prompt for debugging
#print(formatted_prompt)

FastLanguageModel.for_inference(model) # Enable native 2x faster inference
inputs = tokenizer(
    formatted_prompt,
    return_tensors="pt"
).to("cuda")

# Stream decoded tokens to the cell output as they are generated.
from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)

# Generate text; the return value is discarded because the streamer already
# prints the output.
_ = model.generate(**inputs, streamer=text_streamer, max_new_tokens=256)

<|begin_of_text|><|start_header_id|>system<|end_header_id|>

            You are a research assistant model.
            Your objective is to retrieve Digital Object Identifiers (DOIs) of research papers that relate to the user's query.
            Read the context provided and decide if it's relevant to what the user is asking. If it is, provide the DOIs for all the contexts that are relevant.
            If you are unsure if the context is relevant for given question, try to not make stuff up and instead answer truthfully.
            If there a DOIs, reply ONLY with "A scientific article regarding those subjects can be found with the DOI" (use the plural if needed), followed by the DOIs that are more relevant (if at all), don't explain your reasoning.
            If there are no DOIs, DON'T MAKE THEM UP, explain you couldn't find the answer in your data sources.
            If all similarity scores are below 0.45, tell the user you couldn't find research on that topic.<|eot_id|><|st