In [6]:
#Phase 3 LLM Pipeline
#Developed By : Sriram P
# task2_llm_pipeline_llama.py

# 📌 Install required libraries
!pip install -U transformers datasets huggingface_hub python-dotenv



In [4]:


# 🛠️ Import libraries (standard library first, then third-party)
import csv
import os

import torch
from datasets import load_dataset
from dotenv import load_dotenv
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load environment variables BEFORE authenticating, so that a token stored in
# a local .env file (as HF_TOKEN) is actually visible to the login step below.
load_dotenv()

# Log in to Hugging Face (only required once per session). A token with the
# right access is needed for gated repos such as meta-llama/Llama-3.2-1B.
# If HF_TOKEN is set (e.g. via .env) the login is non-interactive, which keeps
# "Restart & Run All" working; otherwise fall back to the interactive prompt.
hf_token = os.getenv("HF_TOKEN")
if hf_token:
    login(token=hf_token)
else:
    login()  # Enter your HF token when prompted

# Load tokenizer and model
# model_id = "meta-llama/Llama-3.2-1B" (meta-llama/Llama-3.2-1B is a gated repo, waiting to receive access)
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
tokenizer = AutoTokenizer.from_pretrained(model_id)

# Use half precision only when a CUDA device is available. On a CPU-only host
# float16 inference is slow and, on some PyTorch builds, unsupported for certain
# ops -- any such failure would be swallowed by the per-question try/except in
# the processing loop and written out as "ERROR: ..." rows.
model_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=model_dtype, device_map="auto")

# Fetch the "main" configuration of GSM8K (train split), then keep only the
# first five rows so the pipeline can be smoke-tested quickly.
gsm8k_train = load_dataset("gsm8k", "main", split="train")
dataset = gsm8k_train.select(range(5))

# Process dataset: for every question, ask the model for the ordered list of
# prerequisite mathematical concepts and collect one result row per question.
results = []
print("Processing questions...")

# Generation below samples (do_sample=True); seed torch's RNG so that a fresh
# re-run of the notebook starts from the same random state.
torch.manual_seed(42)

# Instruction text appended to every question. It does not depend on the loop
# variable, so it is built once here instead of on every iteration.
# (An earlier, more generic "what are the prerequisite concepts" prompt was too
# generalised and did not provide the expected output.)
# NOTE(review): model_id names a chat-tuned checkpoint; consider formatting the
# prompt with tokenizer.apply_chat_template -- confirm against the model card.
PROMPT_INSTRUCTIONS = (
    "Identify the core **mathematical concepts** a student must understand in order to correctly solve this question.\n"
    "Return only the list of concepts, in order from most basic to most advanced, separated by commas.\n"
    "Only include concepts directly relevant to solving the problem (e.g., Addition, Division, Unit Conversion, Fractions, etc.).\n"
    "Do not include explanations or unrelated general knowledge. Strictly return the ordered concept list.\n"
    "\nExample output:\nAddition, Unit Conversion, Multiplication"
)

for idx, entry in enumerate(dataset):
    question = entry["question"]
    prompt = f"Question: {question}\n\n" + PROMPT_INSTRUCTIONS

    try:
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            do_sample=True,
            top_p=0.95,
            temperature=0.7
        )
        # The returned sequence is the prompt tokens followed by the newly
        # generated ones. Drop the prompt by token count rather than by
        # string-replacing the decoded text: decoding is not guaranteed to
        # reproduce the prompt byte-for-byte, in which case .replace() silently
        # leaves the whole prompt in the output.
        prompt_length = inputs["input_ids"].shape[-1]
        generated_ids = outputs[0][prompt_length:]
        prerequisites = tokenizer.decode(generated_ids, skip_special_tokens=True).strip()
    except Exception as e:
        # Best effort: record the failure for this question and keep going so
        # one bad sample does not abort the whole run.
        prerequisites = f"ERROR: {str(e)}"

    results.append({
        "question": question,
        "prerequisite_concepts": prerequisites
    })

    print(f"[{idx + 1}/{len(dataset)}] Processed")

# Persist the collected rows as a UTF-8 CSV with a header line.
output_file = "task2_llm_prerequisites_output.csv"
csv_columns = ["question", "prerequisite_concepts"]

# newline='' lets the csv module control line endings itself.
with open(output_file, mode="w", newline='', encoding="utf-8") as csv_handle:
    writer = csv.DictWriter(csv_handle, fieldnames=csv_columns)
    writer.writeheader()
    for row in results:
        writer.writerow(row)

print(f"Results saved to {output_file}")


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Processing questions...
[1/5] Processed
[2/5] Processed
[3/5] Processed
[4/5] Processed
[5/5] Processed
Results saved to task2_llm_prerequisites_output.csv
