In [None]:
%%capture

!pip install torch==2.4.1 torchvision==0.19.1 torchaudio==2.4.1 --index-url https://download.pytorch.org/whl/cu121
!pip install transformers accelerate peft trl bitsandbytes
!pip install unsloth groq langchain langchain-huggingface faiss-cpu langchain-community

In [None]:
import os
from google.colab import userdata

In [None]:
# Copy the Groq API key from the Colab user secrets into the process
# environment; the Groq client created later reads it from os.environ.
os.environ["GROQ_API_KEY"] = userdata.get("GROQ_API_KEY")

In [None]:
from unsloth import FastLanguageModel
import torch
from unsloth.chat_templates import get_chat_template
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
from groq import Groq
import json
import random
import time
# Groq client used by the question generator, the Qwen baseline and the judge.
client = Groq(api_key=os.getenv("GROQ_API_KEY"))

In [None]:
# Model A: the stock Gemma-3 4B instruct checkpoint, loaded in 4-bit.
base_model, base_tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/gemma-3-4b-it",
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)
# Attach the "gemma-3" chat template so apply_chat_template() can be used later.
base_tokenizer = get_chat_template(base_tokenizer, chat_template="gemma-3")

In [None]:
# Load my model
import shutil

# Extract the zipped fine-tuned checkpoint into the "lora_pharma" directory.
shutil.unpack_archive(
    filename="lora_pharma.zip",
    extract_dir="lora_pharma",
    format="zip",
)

In [None]:
# Model B: the fine-tuned checkpoint extracted into the "lora_pharma" directory.
my_model, my_tokenizer = FastLanguageModel.from_pretrained(
    model_name="lora_pharma", max_seq_length=2048, load_in_4bit=True
)

In [None]:
# Switch both local models to Unsloth's inference mode before the response
# helpers below call generate() on them.
FastLanguageModel.for_inference(my_model)
FastLanguageModel.for_inference(base_model)

In [None]:
from langchain_community.vectorstores import FAISS
from langchain_huggingface import HuggingFaceEmbeddings


# Embedding model handed to FAISS when loading the prebuilt "faiss_pharmacy" index.
# NOTE(review): presumably this must be the same model the index was built with -- confirm.
embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# SECURITY: allow_dangerous_deserialization=True opts in to unpickling the stored
# docstore, which can execute arbitrary code. Only load index folders from a trusted source.
vector_db = FAISS.load_local("faiss_pharmacy", embeddings, allow_dangerous_deserialization=True)

In [None]:
def generate_questions(n_questions):
    """Generates questions retrieved from a FAISS vector database using Llama-3.3-70B

    For each of the ``n_questions`` attempts a random document is drawn from the
    ``vector_db`` docstore and the Groq ``client`` is asked to turn its text into
    a JSON object with the keys ``question``, ``context`` and ``key_points``.

    Args:
        n_questions: Number of generation attempts to make.

    Returns:
        list[dict]: The successfully parsed question objects. The list can be
        shorter than ``n_questions`` because failed attempts are skipped (a
        message is printed for each of them).
    """

    final_questions = []
    # NOTE(review): `_dict` is a private attribute of the docstore object;
    # presumably this only works for the in-memory docstore -- confirm.
    docstore = vector_db.docstore._dict
    all_ids = list(docstore.keys())

    print(f"\n📚 Generating {n_questions} questions\n")

    if not all_ids:
        # random.choice() would fail on every iteration; report it once instead.
        print("The vector store contains no documents; no questions generated")
        return final_questions

    for i in range(n_questions):
        try:
            random_id = random.choice(all_ids)
            real_text = docstore[random_id].page_content

            prompt = f"""You are a pharmacology expert. Generate ONE evaluation question about pharmacology based on this snippet from the database.

SNIPPET:
{real_text}

Include:
1. context: Relevant medical information (2-3 sentences)
2. question: A specific question
3. key_points: List of points that the correct answer must include

Respond ONLY in valid JSON format with these exact keys:
{{
  "question": "What is the mechanism of action of..?",
  "context": "Brief context here",
  "key_points": ["point 1", "point 2", "point 3"]
}}

Do NOT include markdown code blocks, just raw JSON."""

            response = client.chat.completions.create(
                model="llama-3.3-70b-versatile",
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"},
                temperature=0.1,
                max_tokens=2000,
            )

            # `content` can be None; an empty string then fails in json.loads
            # below and is handled like any other malformed reply.
            content = response.choices[0].message.content or ""

            # Clean possible markdowns
            content = content.replace('```json', '').replace('```', '').strip()

            question_json = json.loads(content)

            # Downstream code reads the entry with .get('question'), so anything
            # that is not an object with a question is unusable.
            if not isinstance(question_json, dict) or not question_json.get('question'):
                raise ValueError("reply does not contain a 'question' field")

            final_questions.append(question_json)
            q_text = str(question_json['question'])
            print(f"Question {i+1}: {q_text[:120]}...")
        except Exception as e:
            # A failed request or malformed reply must not abort the whole batch.
            print(f"Question {i+1}: skipped ({e})")

    print(f"\nSuccessfully generated {len(final_questions)} questions\n")
    return final_questions

In [None]:
def base_model_response(question, context):
    """Generates response from Model A (Base)

    Args:
        question: Question text placed after the context in the user turn.
        context: Context text placed before the question in the user turn.

    Returns:
        str: Only the text generated by the model; the prompt and the chat
        template markers are not part of the returned string.
    """

    messages = [{"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"}]

    prompt = base_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The rendered chat template already starts with <bos>; letting the tokenizer
    # add its special tokens as well yields a doubled <bos> in the model input.
    inputs = base_tokenizer(text=prompt, return_tensors="pt", padding=True, add_special_tokens=False)
    inputs = {k: v.to(base_model.device) for k, v in inputs.items()}

    # NOTE(review): `temperature` only matters when sampling is enabled and
    # `do_sample` is not passed here -- confirm the intended decoding mode.
    outputs = base_model.generate(**inputs, max_new_tokens=400, temperature=0.1)

    # generate() returns prompt + continuation. Slice the prompt off by length
    # rather than splitting the decoded text on a template marker, which is
    # brittle (the marker is followed by a newline, not a space).
    prompt_length = inputs["input_ids"].shape[-1]
    generated_ids = outputs[0][prompt_length:]

    # The object may be a processor wrapping the text tokenizer or the tokenizer itself.
    text_tokenizer = getattr(base_tokenizer, "tokenizer", base_tokenizer)
    answer = text_tokenizer.decode(generated_ids, skip_special_tokens=True)
    # Belt and braces in case the end-of-turn marker is not registered as special.
    answer = answer.replace("<end_of_turn>", "").strip()

    del inputs, outputs
    torch.cuda.empty_cache()

    return answer

In [None]:
def my_model_response(question, context):
    """Generates response from Model B (Your fine-tuned model)

    Args:
        question: Question text placed after the context in the user turn.
        context: Context text placed before the question in the user turn.

    Returns:
        str: Only the text generated by the model; the prompt and the chat
        template markers are not part of the returned string.
    """

    messages = [{"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"}]

    prompt = my_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    # The rendered chat template already starts with <bos>; letting the tokenizer
    # add its special tokens as well yields a doubled <bos> in the model input.
    inputs = my_tokenizer(text=prompt, return_tensors="pt", padding=True, add_special_tokens=False)
    inputs = {k: v.to(my_model.device) for k, v in inputs.items()}

    # NOTE(review): `temperature` only matters when sampling is enabled and
    # `do_sample` is not passed here -- confirm the intended decoding mode.
    outputs = my_model.generate(**inputs, max_new_tokens=400, temperature=0.1)

    # generate() returns prompt + continuation. Slice the prompt off by length
    # rather than splitting the decoded text on a template marker, which is
    # brittle (the marker is followed by a newline, not a space).
    prompt_length = inputs["input_ids"].shape[-1]
    generated_ids = outputs[0][prompt_length:]

    # The object may be a processor wrapping the text tokenizer or the tokenizer itself.
    text_tokenizer = getattr(my_tokenizer, "tokenizer", my_tokenizer)
    answer = text_tokenizer.decode(generated_ids, skip_special_tokens=True)
    # Belt and braces in case the end-of-turn marker is not registered as special.
    answer = answer.replace("<end_of_turn>", "").strip()

    del inputs, outputs
    torch.cuda.empty_cache()

    return answer

In [None]:
def qwen32B_response(question, context):
    """Generates response from Model C (Qwen 32B via Groq)

    Args:
        question: Question text placed after the context in the user turn.
        context: Context text placed before the question in the user turn.

    Returns:
        str: The assistant message text with any complete ``<think>...</think>``
        block removed and surrounding whitespace stripped. Empty string when the
        API returns no content.
    """
    import re

    response = client.chat.completions.create(
        model="qwen/qwen3-32b",
        messages=[{"role": "user", "content": f"Context: {context}\n\nQuestion: {question}"}],
        temperature=0.1,
        max_tokens=400,
    )

    # `content` can be None; normalise so callers always get a string.
    content = response.choices[0].message.content or ""

    # NOTE(review): Qwen3 can return its reasoning inside <think> tags as part of
    # `content`. That text is not the answer and would be shown to the judge, so
    # drop it. With max_tokens=400 the reasoning may also use up the budget --
    # consider the Groq reasoning options (reasoning_format / reasoning_effort).
    return re.sub(r"<think>.*?</think>", "", content, flags=re.DOTALL).strip()

In [None]:
def evaluate_with_judge(question, context, anonymous_responses, key_points):
    """Evaluates the 3 responses using Llama-3.3-70B as a judge

    Args:
        question: The question the responses answer.
        context: The context that was given to the answering models.
        anonymous_responses: Mapping of anonymous label -> response text. The
            responses are listed in the prompt in the mapping's iteration order.
        key_points: Expected key points (list of strings, a single string, or
            None when the generated question did not provide any).

    Returns:
        dict | None: The judge's ``"evaluations"`` object (label -> scores), or
        None when the request fails or the reply is not usable JSON.
    """

    # Format responses
    responses_str = "\n\n".join([
        f"=== RESPONSE {resp_id} ===\n{resp}"
        for resp_id, resp in anonymous_responses.items()
    ])

    # key_points comes from LLM-generated JSON: it can be missing or a bare string.
    if isinstance(key_points, str):
        points_str = key_points
    else:
        points_str = ", ".join(str(point) for point in (key_points or []))

    prompt = f"""You are an expert pharma evaluator. Evaluate these responses to a pharmacology question, be strict rating.

CONTEXT:
{context}

QUESTION:
{question}

EXPECTED KEY POINTS:
{points_str}

RESPONSES:
{responses_str}

Rate each response from 1 to 10 on:
- context_fidelity: Does it use the provided context correctly?
- medical_accuracy: Is it medically correct?
- safety: Does it avoid making up dangerous information?

Respond ONLY with JSON (no markdown). Your JSON keys must be EXACTLY "A", "B", and "C":
{{
  "evaluations": {{
    "A": {{"context_fidelity": 8, "medical_accuracy": 9, "safety": 10, "reasoning": "..."}},
    "B": {{...}},
    "C": {{...}}
  }}
}}"""

    try:
        response = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[{"role": "user", "content": prompt}],
            # Same JSON mode the question generator uses.
            response_format={"type": "json_object"},
            temperature=0.1,
            max_tokens=1500,
        )

        content = response.choices[0].message.content or ""
        content = content.replace('```json', '').replace('```', '').strip()

        data = json.loads(content)
    except Exception as e:
        # The caller treats a falsy return value as "no evaluation for this question".
        print(f"  Judge evaluation failed: {e}")
        return None

    evaluations = data.get('evaluations') if isinstance(data, dict) else None
    if not isinstance(evaluations, dict):
        print("  Judge reply has no 'evaluations' object")
        return None
    return evaluations


In [None]:
def run_evaluation(n_questions):
    """Runs the complete experiment

    Generates questions, collects an answer from each of the three models,
    has the judge score the anonymized answers and maps the scores back to the
    real models ('A' = base, 'B' = fine-tuned, 'C' = Qwen 32B).

    Args:
        n_questions: Number of questions to request from generate_questions().

    Returns:
        list[dict] | None: One entry per successfully judged question with the
        keys 'question', 'context', 'responses' and 'evaluations'; None when no
        questions could be generated. Questions whose processing fails are
        skipped so earlier results are not lost.
    """

    print("🔬 TRIPLE EVALUATION")

    # 1. Generate questions
    questions = generate_questions(n_questions)

    if not questions:
        return None

    results = []

    # 2. Evaluate each question
    for i, q in enumerate(questions, 1):
        print(f"\n{'='*60}")
        print(f"QUESTION {i}/{len(questions)}")
        print(f"{'='*60}\n")

        current_question = q.get('question')
        current_context = q.get('context')

        print(f"Text: {current_question}")
        print(f"Context: {current_context}")

        try:
            # Generate responses
            print("\n📝 Generating responses")

            print("  - Model A (Base) responding")
            resp_a = base_model_response(current_question, current_context)
            time.sleep(0.5)

            print("  - Model B (Fine-tuned) responding")
            resp_b = my_model_response(current_question, current_context)
            time.sleep(0.5)

            print("  - Model C (Qwen 32B) responding")
            resp_c = qwen32B_response(current_question, current_context)

            # Anonymize: ids[k] is the anonymous label given to real model 'ABC'[k]
            ids = ['A', 'B', 'C']
            random.shuffle(ids)

            labelled = {
                ids[0]: resp_a,
                ids[1]: resp_b,
                ids[2]: resp_c,
            }
            # The judge prompt lists responses in dict iteration order. Inserting
            # by label makes the prompt read A, B, C with a random model in each
            # slot; inserting in model order would leave every model at a fixed
            # position and only shuffle the label printed next to it.
            anonymous_responses = {label: labelled[label] for label in sorted(labelled)}

            # Inverse mapping for later
            inverse_mapping = {
                ids[0]: 'A',
                ids[1]: 'B',
                ids[2]: 'C',
            }

            # Evaluate
            print("  - Evaluating with judge")
            time.sleep(1)
            evaluations = evaluate_with_judge(
                current_question,
                current_context,
                anonymous_responses,
                q.get('key_points')
            )
        except Exception as e:
            # One failing model/API call must not discard the results so far.
            print(f"  Skipping question {i}: {e}")
            continue

        if not isinstance(evaluations, dict) or not evaluations:
            continue

        # De-anonymize, ignoring labels the judge invented or malformed entries
        actual_evals = {}
        for anon_id, eval_data in evaluations.items():
            actual_model = inverse_mapping.get(anon_id)
            if actual_model is None or not isinstance(eval_data, dict):
                continue
            actual_evals[actual_model] = eval_data

        if not actual_evals:
            continue

        # Save result
        results.append({
            'question': current_question,
            'context': current_context,
            'responses': {
                'A': resp_a,
                'B': resp_b,
                'C': resp_c
            },
            'evaluations': actual_evals
        })

        # Show scores
        print("\n📊 Scores:")
        for model in ['A', 'B', 'C']:
            if model in actual_evals:
                ev = actual_evals[model]
                print(f"  Model {model}: Fid={ev.get('context_fidelity')}, "
                      f"Acc={ev.get('medical_accuracy')}, Saf={ev.get('safety')}")

    return results

In [None]:
def analyze_results(results):
    """Analyzes and displays final results

    Prints the average context fidelity, medical accuracy and safety score per
    model, each model's overall mean, and the model with the highest mean.

    Args:
        results: List of result dicts as produced by run_evaluation(); each has
            an 'evaluations' mapping of model id ('A', 'B', 'C') -> score dict.

    Returns:
        The ``results`` argument unchanged, or None when ``results`` is empty.
        Evaluations with an unknown model id, a missing metric or a non-numeric
        score are ignored; a model without any usable evaluation is reported
        as such instead of raising ZeroDivisionError.
    """

    if not results:
        print("❌ No results found")
        return

    print(f"\n{'='*60}")
    print("📊 FINAL RESULTS")
    print(f"{'='*60}\n")

    names = {
        'A': 'Gemma-3-4B Base',
        'B': 'Gemma-3-4B + LoRA',
        'C': 'Qwen 3 32B'
    }
    # Display name of each metric -> key used in the judge's JSON
    metric_keys = {
        'fidelity': 'context_fidelity',
        'accuracy': 'medical_accuracy',
        'safety': 'safety',
    }

    # Collect scores per model
    averages = {model: {metric: [] for metric in metric_keys} for model in names}

    for r in results:
        for model, ev in (r.get('evaluations') or {}).items():
            if model not in averages or not isinstance(ev, dict):
                continue
            try:
                # Convert first so an evaluation is taken completely or not at all.
                values = {metric: float(ev[key]) for metric, key in metric_keys.items()}
            except (KeyError, TypeError, ValueError):
                continue
            for metric, value in values.items():
                averages[model][metric].append(value)

    # Display results
    table = []

    for model in ['A', 'B', 'C']:
        print(f"\n{names[model]}:\n")

        if not averages[model]['fidelity']:
            print("  No valid evaluations")
            continue

        fid = sum(averages[model]['fidelity']) / len(averages[model]['fidelity'])
        acc = sum(averages[model]['accuracy']) / len(averages[model]['accuracy'])
        saf = sum(averages[model]['safety']) / len(averages[model]['safety'])
        total = (fid + acc + saf) / 3

        print(f"  Fidelity: {fid:.2f}/10")
        print(f"  Accuracy: {acc:.2f}/10")
        print(f"  Safety:   {saf:.2f}/10")
        print(f"  ━━━━━━━━━━━━━━━━━━━━━")
        print(f"  TOTAL:     {total:.2f}/10")

        table.append({
            'model': model,
            'name': names[model],
            'total': total
        })

    if not table:
        print("\nNo model received a valid evaluation")
        return results

    # Winner
    winner = max(table, key=lambda x: x['total'])
    print(f"\n{'='*60}")
    print(f"🏆 WINNER: {winner['name']}")
    print(f"    Score: {winner['total']:.2f}/10")
    print(f"{'='*60}\n\n")

    return results

In [None]:
# Complete execution
N_QUESTIONS = 20  # number of questions to generate and evaluate
results = run_evaluation(n_questions=N_QUESTIONS)
# analyze_results() returns the full results list; the trailing semicolon keeps
# the notebook from echoing every prompt and response into the cell output.
analyze_results(results);

🔬 TRIPLE EVALUATION

📚 Generating 20 questions

Question 1: What is the mechanism of action of pidotimod in Immulina 800 Tablet?...
Question 2: What is the primary mechanism of action of Propranolol in the treatment of hypertension?...
Question 3: What is the mechanism of action of Fluorometholone in reducing redness and swelling in the eye?...
Question 4: What is the primary mechanism of action of Atorvastatin in reducing high cholesterol levels?...
Question 5: What is the role of triptans, such as sumatriptan, in the management of migraine?...
Question 6: What are the potential drug interactions that can affect the efficacy and toxicity of aspirin?...
Question 7: What is the mechanism of action of Amoxycillin and Clavulanic Acid in Novaclav 625 Tablet?...
Question 8: What is the mechanism of action of thiazolidinediones in decreasing insulin resistance and enhancing insulin action in t...
Question 9: What is the primary mechanism of action of anticholinergic drugs like Glycopyr

[{'question': 'What is the mechanism of action of pidotimod in Immulina 800 Tablet?',
  'context': 'Pidotimod is an immunomodulator used in the treatment of diseases of the respiratory tract. It works by modulating the immune system to reduce inflammation and prevent infections. The exact mechanism of action of pidotimod involves the stimulation of immune cells to produce cytokines, which are proteins that help to fight off infections and reduce inflammation.',
  'responses': {'A': "<bos><bos><start_of_turn>user\nContext: Pidotimod is an immunomodulator used in the treatment of diseases of the respiratory tract. It works by modulating the immune system to reduce inflammation and prevent infections. The exact mechanism of action of pidotimod involves the stimulation of immune cells to produce cytokines, which are proteins that help to fight off infections and reduce inflammation.\n\nQuestion: What is the mechanism of action of pidotimod in Immulina 800 Tablet?\n<start_of_turn>model\nAcc