In [None]:
%%capture
# Install the LangChain, FAISS, Groq and sentence-transformers packages imported below.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different releases.
!pip install langchain langchain-community langchain-huggingface faiss-cpu groq langchain-groq sentence-transformers

In [None]:
# Mount Google Drive at /content/drive; later cells read the FAISS index and the
# Excel file from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Drive folder that is passed to FAISS.load_local below as the location of the saved index.
path_local = "/content/drive/MyDrive/faiss_mid"

In [None]:
import os
from google.colab import userdata

In [None]:
# Copy the Groq key from Colab's secret store into the process environment.
# NOTE(review): ChatGroq is created below without an explicit key, so it presumably
# picks GROQ_API_KEY up from the environment — confirm.
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [None]:
# Embedding model handed to FAISS.load_local below; it is configured to run on CPU.
# NOTE(review): presumably the same model the saved index was built with — confirm.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = dict(device="cpu")
embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Load the pre-built FAISS index from the Drive folder.
# SECURITY: allow_dangerous_deserialization=True permits unpickling the stored data,
# which can execute arbitrary code; only load index files you created yourself.
vectorstore = FAISS.load_local(
    folder_path=path_local,
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

In [None]:
from langchain_groq import ChatGroq

# Chat model served by Groq; temperature is set to 0 for the generation calls below.
llm = ChatGroq(temperature=0, model_name="llama-3.3-70b-versatile")

In [None]:
# RAFT generation prompt: asks the model for one exam-style question plus a detailed,
# step-by-step answer based strictly on the passages substituted into {context}.
# The output must start with "QUESTION:" and contain "ANSWER:", which is what the
# bulk-generation loop parses. The section label is "CONTEXT:" so that it matches the
# name the instructions refer to ("Use the provided CONTEXT").

from langchain_core.prompts import ChatPromptTemplate

raft_generation_prompt = ChatPromptTemplate.from_template("""
You are a Pharmacology professor creating a high-difficulty exam.
Use the provided CONTEXT to create one question and a detailed answer.

FORMAT RULES:
1. Start directly with "QUESTION:" followed by the created question.
2. Then write "ANSWER:" followed by the response.
3. The answer must be detailed, showing a step-by-step reasoning (Chain of Thought)
and ending with a conclusion, but EVERYTHING must flow naturally without using
section headers like "Step 1" or "Introduction".
4. Base the answer strictly on the provided context.

CONTEXT:
{context}

Create a question and an answer based strictly on this context.
""")

In [None]:
def generate_raft_example(query, k=3):
    """Generate one question/answer pair from passages retrieved for ``query``.

    Args:
        query: Text used for the similarity search against ``vectorstore``.
        k: Number of passages to retrieve. Defaults to 3, the value that was
            previously hard-coded, so existing callers are unaffected.

    Returns:
        The raw model output (``response.content``). The prompt asks the model to
        format it as ``QUESTION: ...`` followed by ``ANSWER: ...``.
    """
    docs = vectorstore.similarity_search(query, k=k)
    # Passages are separated by a blank line inside the prompt's {context} slot.
    context_text = "\n\n".join(d.page_content for d in docs)

    chain = raft_generation_prompt | llm
    response = chain.invoke({"context": context_text})

    return response.content

In [None]:
# Smoke test: generate a single example and print the raw model output.
print(generate_raft_example("What is the mechanism of action of Ibuprofen?"))

QUESTION: What is the primary mechanism of action of Ibuvon syrup, and how does it achieve its therapeutic effects in relieving pain and reducing fever, considering its classification as a nonsteroidal anti-inflammatory drug (NSAID) and its composition of ibuprofen?

ANSWER: Ibuvon syrup, being a nonsteroidal anti-inflammatory drug (NSAID), works by blocking the release of certain chemical messengers that cause fever, pain, and inflammation, which is characteristic of its classification as a nonselective COX 1/2 inhibitor within the propionic acid derivatives class. This action is primarily due to its composition of ibuprofen, which is the active ingredient responsible for its therapeutic effects. By inhibiting the COX enzymes, ibuprofen reduces the production of prostaglandins, which are key mediators of pain, fever, and inflammation. As a result, Ibuvon syrup is effective in relieving pain caused by various conditions such as headache, migraine, nerve pain, toothache, sore throat, pe

In [None]:
import json
import time
import pandas as pd
import random

In [None]:
# Load medicine names from the excel file
df_path = "/content/drive/MyDrive/MID_processed.xlsx"

df = pd.read_excel(df_path)
# Drop missing names first: unique() keeps NaN, which random.choice could pick and the
# generation loop would then turn into the literal query "... of nan".
med_list = df['name'].dropna().unique().tolist()
# Fail fast on an empty list; otherwise random.choice raises inside the generation
# loop's try/except and the loop just keeps retrying.
if not med_list:
    raise ValueError(f"No medicine names found in column 'name' of {df_path}")

In [None]:
# Function to save each example
def append_to_jsonl(data, filename="pharma_raft_dataset.jsonl"):
    """Append ``data`` as a single JSON line to ``filename``.

    The file is opened in append mode with UTF-8 encoding and non-ASCII characters
    are written unescaped.

    The record is serialised *before* the file is opened. ``json.dump`` writes its
    output in chunks, so a value that is not JSON-serialisable would otherwise leave
    a truncated line behind and corrupt the JSONL file.

    Args:
        data: JSON-serialisable object to store.
        filename: Path of the JSONL file to append to.

    Raises:
        TypeError: If ``data`` contains a value that cannot be serialised; the file
            is left untouched in that case.
    """
    line = json.dumps(data, ensure_ascii=False)
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(line + '\n')

In [None]:
# Bulk generation loop
count = 0
target = 1000
# Give up after this many failed attempts in a row, so a persistent problem (invalid
# API key, exhausted quota, model never following the format) cannot loop forever.
max_consecutive_failures = 10
consecutive_failures = 0

# The prompt -> LLM chain does not change between iterations, so build it once.
generation_chain = raft_generation_prompt | llm

print(f" Generating {target} RAFT examples.")

while count < target:
    if consecutive_failures >= max_consecutive_failures:
        raise RuntimeError(
            f"Aborting after {consecutive_failures} consecutive failed attempts "
            f"({count}/{target} examples saved)."
        )

    try:
        medicine = random.choice(med_list)
        print(f"[{count+1}/{target}] Processing: {medicine}")

        # Retrieve once and use the very same passages both as the model input and as
        # the saved 'context' field. Searching again with a different query could
        # return other passages, leaving the stored answer ungrounded in its context.
        query = f"Mechanism, uses and side effects of {medicine}"
        docs = vectorstore.similarity_search(query, k=3)
        context = "\n\n".join(d.page_content for d in docs)

        # Generate the content with the English prompt
        raw_output = generation_chain.invoke({"context": context}).content

        # Separate Question and Answer for the dataset. Split on the first "ANSWER:"
        # only, so an answer that repeats the marker is not truncated.
        question_part, marker, answer_part = raw_output.partition("ANSWER:")
        question = question_part.replace("QUESTION:", "").strip()
        answer = answer_part.strip()

        if marker and "QUESTION:" in question_part and question and answer:
            # Save
            example = {
                "instruction": question,
                "context": context,
                "answer": answer
            }
            append_to_jsonl(example)
            count += 1
            consecutive_failures = 0
        else:
            consecutive_failures += 1
            print("⚠️ Output did not match the QUESTION:/ANSWER: format. Skipping.")

        # Pause to avoid Groq Rate Limits
        time.sleep(2.0)

    except Exception as e:
        # Best-effort retry: report the error, back off longer, and try another medicine.
        consecutive_failures += 1
        print(f"⚠️ Error: {e}. Retrying...")
        time.sleep(10)

print("\nFinished")