In [None]:
%%capture
# Install the notebook's dependencies; %%capture hides pip's output.
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): versions are unpinned — pin them once a known-good set is identified.
%pip install langchain langchain-community langchain-huggingface faiss-cpu groq langchain-groq sentence-transformers

In [None]:
# Mount Google Drive at /content/drive; the paths used later in this notebook live under it
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Folder on the mounted Drive that is later passed to FAISS.load_local
path_local = "/content/drive/MyDrive/faiss_mid"

In [None]:
import os
from google.colab import userdata

In [None]:
# Copy the GROQ_API_KEY Colab secret into the process environment
# (presumably where the Groq client looks for it — confirm)
os.environ["GROQ_API_KEY"] = userdata.get('GROQ_API_KEY')

In [None]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [None]:
# Build the sentence-embedding model that is handed to FAISS.load_local below
# NOTE(review): presumably the same model the index was built with — confirm
import torch

model_name = "sentence-transformers/all-MiniLM-L6-v2"
# Use the GPU when the runtime has one; otherwise fall back to CPU so the
# notebook still runs on a CPU-only Colab runtime instead of failing on 'cuda'
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
embeddings = HuggingFaceEmbeddings(model_name=model_name, model_kwargs=model_kwargs)

In [None]:
# Load the FAISS index stored under path_local, using `embeddings` to embed queries.
# SECURITY: allow_dangerous_deserialization=True lets load_local unpickle the stored
# index metadata; only point this at index folders you created or fully trust.
vectorstore = FAISS.load_local(
    folder_path=path_local,
    embeddings=embeddings,
    allow_dangerous_deserialization=True,
)

In [None]:
from langchain_groq import ChatGroq

# Groq-hosted chat model used to write the question/answer pairs
llm = ChatGroq(
    model_name="llama-3.1-8b-instant",
    temperature=0,  # presumably chosen for more repeatable generations — confirm
)

In [None]:
# Define the RAFT prompt: enforces step-by-step reasoning (Chain of Thought) based strictly on the provided medical context

from langchain_core.prompts import ChatPromptTemplate

# Prompt template with a single input variable, {context}. The section label is
# "CONTEXT:" so it matches the name the instructions use for the retrieved text.
raft_generation_prompt = ChatPromptTemplate.from_template("""
You are a Pharmacology professor creating a high-difficulty exam.
Use the provided CONTEXT to create one question and a detailed answer.

FORMAT RULES:
1. Start directly with "QUESTION:" followed by the created question.
2. Then write "ANSWER:" followed by the response.
3. The answer must be detailed, showing a step-by-step reasoning (Chain of Thought)
and ending with a conclusion, but EVERYTHING must flow naturally without using
section headers like "Step 1" or "Introduction".
4. Base the answer strictly on the provided context.
5. The ANSWER must be highly concise (Maximum 4 or 5 sentences).

CONTEXT:
{context}

Create a question and an answer based strictly on this context.
""")

In [None]:
def generate_raft_example(query, k=3):
    """Generate one exam-style question/answer pair from retrieved documents.

    Args:
        query: Text used to search the vector store.
        k: Number of documents to retrieve. Defaults to 3, the value that was
            previously hard-coded.

    Returns:
        The model's reply text (``response.content``). The prompt asks for a
        "QUESTION: ... ANSWER: ..." layout, but the reply is returned unparsed.

    Raises:
        ValueError: If the search returns no documents, since the prompt
            requires the answer to be based strictly on the context.
    """
    docs = vectorstore.similarity_search(query, k=k)
    if not docs:
        raise ValueError(f"No documents retrieved for query: {query!r}")

    # Blank line between documents so their boundaries stay visible in the prompt
    context_text = "\n\n".join(d.page_content for d in docs)

    chain = raft_generation_prompt | llm
    response = chain.invoke({"context": context_text})

    return response.content

In [None]:
# Smoke test: generate a single example for a hand-written query and print the raw model reply
print(generate_raft_example("What is the mechanism of action of Ibuprofen?"))

QUESTION: A patient is prescribed Ibuvon Syrup for the treatment of fever and pain relief. However, the patient has a history of heart disease and is taking the medication for long-term treatment. What is the most appropriate action for the patient's doctor to take in this situation?

ANSWER: The doctor should regularly monitor the patient's kidney function, liver function, and levels of blood components to prevent potential complications such as stomach bleeding and kidney problems. This is because long-term use of Ibuvon Syrup may lead to serious complications, and the patient's history of heart disease increases the risk of these complications. The doctor should also inform the patient about the importance of taking the medication as prescribed and avoiding alcohol consumption, as it can increase the risk of stomach problems.


In [None]:
import json
import time
import pandas as pd
import random

In [None]:
# Load medicine names from the spreadsheet
import zipfile

df_path = "/content/drive/MyDrive/MID_processed.xlsx"

# A real .xlsx workbook is a ZIP container that pd.read_csv cannot parse, so the
# reader is chosen from the file's actual format rather than assumed. A plain-text
# file saved under this name still goes through read_csv, as before.
if zipfile.is_zipfile(df_path):
    df = pd.read_excel(df_path)
else:
    df = pd.read_csv(df_path)

# Drop missing names so a NaN is never formatted into a retrieval query
med_list = df['name'].dropna().unique().tolist()

In [None]:
# Function to save each example
def append_to_jsonl(data, filename="pharma_raft_tab.jsonl"):
    """Append ``data`` to ``filename`` as one JSON line.

    The record is serialized before the file is opened and written with a
    single ``write`` call. A record that cannot be serialized therefore leaves
    the file untouched instead of leaving a truncated line behind.

    Args:
        data: JSON-serializable object to store.
        filename: Path of the JSON Lines file; created if it does not exist.

    Raises:
        TypeError: If ``data`` contains a value ``json`` cannot serialize.
    """
    # ensure_ascii=False keeps non-ASCII characters readable in the output file
    line = json.dumps(data, ensure_ascii=False)
    with open(filename, 'a', encoding='utf-8') as f:
        f.write(line + '\n')

In [None]:
# Bulk generation loop: build random queries, generate a Q/A pair for each one,
# and append every well-formed example to the JSONL dataset.
count = 0
target = 300
# Upper bound on loop iterations. Without it, a persistent failure (invalid API
# key, exhausted quota, replies that never parse) keeps this cell running forever.
max_attempts = target * 5
attempts = 0
print(f" Generating {target} RAFT examples.")

query_templates = [
    "Mechanism, uses and side effects of {medicine}",
    "What is the chemical composition and active ingredients of {medicine}?",
    "Who is the manufacturer of {medicine} and what are its primary uses?",
    "Detailed side effects and contraindications for {medicine}",
    "Clinical applications and therapeutic uses of {medicine}",
    "General overview and pharmacological properties of {medicine}"
]

# random.choice raises IndexError on an empty list; inside the try block below
# that error would be caught and retried on every iteration, so fail fast here.
if not med_list:
    raise ValueError("med_list is empty: no medicine names to build queries from.")

while count < target and attempts < max_attempts:
    attempts += 1
    try:
        medicine = random.choice(med_list)

        # Pick a random query template and fill in the medicine name
        selected_template = random.choice(query_templates)
        query = selected_template.format(medicine=medicine)

        print(f"[{count+1}/{target}] Processing: {medicine} -> Query: {query}")

        # Generate the question/answer pair from the documents retrieved for this query
        raw_output = generate_raft_example(query)

        # Split on the FIRST "ANSWER:" marker only, so an answer that repeats
        # the marker is kept whole instead of being cut at the second one.
        question_part, marker, answer_part = raw_output.partition("ANSWER:")
        question = question_part.replace("QUESTION:", "").strip()
        answer = answer_part.strip()

        if "QUESTION:" in question_part and marker and question and answer:
            # Retrieve with the same query the generator used, so the stored
            # context is the text the answer was written from. Searching by the
            # bare medicine name can return different documents.
            docs = vectorstore.similarity_search(query, k=3)
            # Each document is capped at 1500 characters to bound the example size
            context = "\n\n".join([d.page_content[:1500] for d in docs])

            # Save
            example = {
                "image": None,
                "instruction": question,
                "context": context,
                "answer": answer
            }
            append_to_jsonl(example)
            count += 1
        else:
            print("⚠️ Output did not match the QUESTION:/ANSWER: format. Skipping.")

    except Exception as e:
        # Any failure (rate limit, network, retrieval) backs off and tries again
        # with a new random query; max_attempts bounds the number of retries.
        print(f"⚠️ Error: {e}. Retrying...")
        time.sleep(10)

if count < target:
    print(f"\nStopped after {attempts} attempts with {count}/{target} examples saved.")

print("\nFinished")

In [None]:
# Read the generated JSON Lines dataset back into a DataFrame
df_dataset = pd.read_json("pharma_raft_tab.jsonl", lines=True)

# Report how many examples were saved
print(f"Total de ejemplos: {df_dataset.shape[0]}")

# Show the first 5 rows
df_dataset.head(5)

Total de ejemplos: 244


Unnamed: 0,image,instruction,context,answer
0,,Compare and contrast the mechanism of action o...,medicine name: hotpan d 10mg40mg tablet\nchemi...,Hotpan D 10mg40mg tablet and Hotpan 40mg table...
1,,"A patient is prescribed Glimcor M2 Tablet PR, ...",medicine name: glimcor m2 tablet pr\nchemical ...,"Glimepiride, a sulfonylurea, works by increasi..."
2,,A patient is prescribed Blazextin D 10mg40mg t...,medicine name: blazex d 10mg40mg tablet\nchemi...,The doctor should consider the potential risks...
3,,A 6-year-old patient is prescribed Delpocalm 1...,medicine name: delpocure 100mg5ml suspension\n...,The doctor should advise the mother to consult...
4,,Compare the pharmacological properties of Trz ...,medicine name: trz 5mg tablet\nchemical class:...,Both Trz 5mg tablet and Trolzin 5mg tablet bel...
