In [None]:
# Install required libraries
!pip install transformers torch pandas


In [None]:
import time

import pandas as pd
import torch
from transformers import pipeline

# Evaluation prompts. Each one is passed unchanged to every model in the
# benchmark loop, so the order here is the order of rows per model.
prompts = [
    "Who is the missile man of India?",
    "What is AI and ML?",
    "Name the 7 wonders of the world.",
    "Name the female who won two Nobel Prizes.",
    "How many females have won more than one Nobel Prize?",
    "Explain AI and ML in the simplest terms.",
    "Explain what a data analyst actually does in the simplest terms.",
]

# Use the first CUDA GPU when one is available; otherwise fall back to the
# CPU (device=-1) so the notebook still runs on machines without a GPU.
device = 0 if torch.cuda.is_available() else -1

# Short name used in the results table -> model identifier given to pipeline().
model_ids = {
    "phi3_mini": "microsoft/Phi-3-mini-4k-instruct",
    "distilgpt2": "distilgpt2",
}

# One text-generation pipeline per model, keyed by its short name.
models = {
    name: pipeline("text-generation", model=model_id, device=device)
    for name, model_id in model_ids.items()
}

# One record per (model, prompt) pair.
results = []

for model_name, model in models.items():
    for prompt in prompts:
        # perf_counter() is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump if the system clock is
        # adjusted mid-run.
        start_time = time.perf_counter()

        # do_sample=False disables sampling; at most 30 new tokens are
        # generated, and return_full_text=False keeps the prompt out of the
        # returned text.
        output = model(
            prompt,
            max_new_tokens=30,
            do_sample=False,
            return_full_text=False
        )

        # Stop the clock before any post-processing of the output.
        latency = time.perf_counter() - start_time
        response_text = output[0]["generated_text"]

        results.append({
            "model_name": model_name,
            "prompt": prompt,
            "response": response_text,
            "latency_seconds": round(latency, 3),
            # Length in characters, not tokens.
            "response_length": len(response_text)
        })

# Turn the collected records into a table and save it without the index column.
df = pd.DataFrame(data=results)
df.to_csv(path_or_buf="llm_evaluation_results.csv", index=False)

# Last expression of the cell, so the notebook displays the full table.
df


In [None]:
# Preview the first five rows of the results table.
df.head(n=5)

In [None]:
# Mean latency and mean response length (in characters) for each model.
df.groupby("model_name")[
    ["latency_seconds", "response_length"]
].agg("mean")


## Observations

- distilgpt2 was significantly faster but failed to handle instruction-based prompts, producing incoherent or repetitive outputs.
- Phi-3 Mini generated more coherent and informative responses, though with higher latency.
- This highlights the trade-off between response quality and inference speed when selecting language models.
- For instruction-heavy tasks such as Q&A, instruction-tuned models are more suitable than base language models.
