# Install Dependencies

In [18]:
!pip install -q -U google-generativeai

# Setting up Gemini for prompt processing and evaluation

In [19]:
import os
from getpass import getpass

import google.generativeai as genai

# Never paste the key into the notebook: read it from the GEMINI_API_KEY
# environment variable, and fall back to a hidden interactive prompt when the
# variable is unset or empty.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY") or getpass("Gemini API key: ").strip()
genai.configure(api_key=GEMINI_API_KEY)

# Shared model handle used by get_gemini_response().
model = genai.GenerativeModel('gemini-2.0-flash')

def get_gemini_response(prompt):
    """
    Ask the module-level Gemini `model` to complete `prompt`.

    Returns the `text` of the generated response. If anything raises while
    generating or reading the response, the exception is not propagated;
    a string of the form "Error: <exception message>" is returned instead.
    """
    try:
        return model.generate_content(prompt).text
    except Exception as err:
        # Failures are reported in-band as text, so callers always get a string.
        return f"Error: {err}"

# Loading Datasets

In [29]:
import json


def load_dataset(path):
    """
    Read one JSON dataset file and return its parsed content.

    The file is opened explicitly as UTF-8 so that decoding does not depend on
    the platform's default locale encoding.
    """
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


# Load each dataset
logic_puzzles = load_dataset('logic-puzzles.json')
math_problems = load_dataset('math-problems.json')
reasoning_tasks = load_dataset('reasoning-tasks.json')

# Show the first record of each dataset as a quick sanity check of its structure.
print("First logic puzzle:", "\n", logic_puzzles[0])
print("First math problem:", "\n", math_problems[0])
print("First reasoning task:", "\n", reasoning_tasks[0])

First logic puzzle: 
 {'id': 1, 'puzzle': 'Alice is older than Bob. Bob is older than Charlie. Who is the youngest?', 'expected_answer': 'Charlie'}
First math problem: 
 {'id': 1, 'problem': 'A train travels 60 km in 1 hour. How far will it go in 4 hours?', 'expected_answer': '240 km'}
First reasoning task: 
 {'id': 1, 'task': 'If John is taller than Mary, and Mary is taller than Sam, who is the tallest?', 'expected_answer': 'John'}


# Selecting Test Data from the Datasets

In [31]:
# Pick one record per dataset: the first logic puzzle, the first math problem
# and the second reasoning task.
selected_logic_puzzle, selected_math_problem, selected_reasoning_task = (
    logic_puzzles[0],
    math_problems[0],
    reasoning_tasks[1],
)

{'id': 2, 'task': 'A farmer has 5 cows, each produces 8 liters of milk per day. How much milk in total per day?', 'expected_answer': '40 liters'}


# Zero Shot Prompting

In [46]:
# Zero-shot: the bare question text is the whole prompt, with no examples or instructions.
zero_shot_logic, zero_shot_math = (
    selected_logic_puzzle['puzzle'],
    selected_math_problem['problem'],
)

# Few Shot Prompting

In [36]:
# Few-shot prompt for the Logic task with 3 examples
few_shot_logic = "".join([
    # Role / instruction
    "You are expert in solving logic puzzles, your task is to solve the following puzzles.\n\n",
    # Worked example 1
    "Q: Who is the youngest if A is older than B, and B is older than C?\n",
    "A: C is the youngest.\n\n",
    # Worked example 2
    "Q: Who is the youngest if David is older than Emily, and Emily is older than Frank?\n",
    "A: Frank is the youngest.\n\n",
    # Worked example 3
    "Q: Who is the youngest if Jane is older than Mike, and Mike is older than Chris?\n",
    "A: Chris is the youngest.\n\n",
    # The selected puzzle, left open for the model to answer
    f"Q: {selected_logic_puzzle['puzzle']}\n",
    "A:",
])

# Few-shot prompt for the Math task with 3 examples
few_shot_math = "".join([
    # Role / instruction
    "You are expert mathematician, your task is to solve the following questions.\n\n",
    # Worked example 1
    "Q: A car travels 50 km in 1 hour. How far will it go in 3 hours?\n",
    "A: 150 km.\n\n",
    # Worked example 2
    "Q: A runner's average speed is 10 km per hour. How far will they run in 2 hours?\n",
    "A: 20 km.\n\n",
    # Worked example 3
    "Q: A cyclist travels at a constant speed of 25 km/h. How far will they travel in 4 hours?\n",
    "A: 100 km.\n\n",
    # The selected problem, left open for the model to answer
    f"Q: {selected_math_problem['problem']}\n",
    "A:",
])

# Show both assembled few-shot prompts, one item per line.
print(
    "--- Few-Shot Prompts ---",
    "Logic Prompt:",
    few_shot_logic,
    "\nMath Prompt:",
    few_shot_math,
    sep="\n",
)


--- Few-Shot Prompts ---
Logic Prompt:
You are expert in solving logic puzzles, your task is to solve the following puzzles.

Q: Who is the youngest if A is older than B, and B is older than C?
A: C is the youngest.

Q: Who is the youngest if David is older than Emily, and Emily is older than Frank?
A: Frank is the youngest.

Q: Who is the youngest if Jane is older than Mike, and Mike is older than Chris?
A: Chris is the youngest.

Q: Alice is older than Bob. Bob is older than Charlie. Who is the youngest?
A:

Math Prompt:
You are expert mathematician, your task is to solve the following questions.

Q: A car travels 50 km in 1 hour. How far will it go in 3 hours?
A: 150 km.

Q: A runner's average speed is 10 km per hour. How far will they run in 2 hours?
A: 20 km.

Q: A cyclist travels at a constant speed of 25 km/h. How far will they travel in 4 hours?
A: 100 km.

Q: A train travels 60 km in 1 hour. How far will it go in 4 hours?
A:


# Chain-of-Thought Prompting

In [38]:
# Chain-of-thought: the question followed by a cue asking the model to show its reasoning.
cot_logic = "{} Let's think step by step.".format(selected_logic_puzzle['puzzle'])
cot_math = "{} Explain your reasoning.".format(selected_math_problem['problem'])

# Text generation using Gemini 2.0 Flash

In [39]:
def get_gemini_response(prompt):
    """
    Send `prompt` to the module-level Gemini `model` and return the reply text.

    Any exception raised while generating or reading the reply is caught and
    reported as the string "Error: <exception message>" rather than re-raised.
    """
    # NOTE(review): this repeats the definition from the Gemini setup cell;
    # running this cell simply rebinds the same name to equivalent code.
    try:
        reply = model.generate_content(prompt)
    except Exception as exc:
        return f"Error: {exc}"
    try:
        return reply.text
    except Exception as exc:
        return f"Error: {exc}"

# Returning all model outputs

In [43]:

# Prompts grouped by task. Each group is announced with its banner, then its
# prompts are sent in the listed order (zero-shot, few-shot, chain-of-thought).
prompt_groups = [
    ("Running Logic Puzzle prompts...", [
        ('logic_zero_shot', zero_shot_logic),
        ('logic_few_shot', few_shot_logic),
        ('logic_cot', cot_logic),
    ]),
    ("\nRunning Math Problem prompts...", [
        ('math_zero_shot', zero_shot_math),
        ('math_few_shot', few_shot_math),
        ('math_cot', cot_math),
    ]),
]

# Maps "<task>_<prompt style>" to the text returned for that prompt.
model_outputs = {}
for banner, labelled_prompts in prompt_groups:
    print(banner)
    for label, prompt_text in labelled_prompts:
        model_outputs[label] = get_gemini_response(prompt_text)

print("\nAll prompts have been run. Outputs are stored in the 'model_outputs' dictionary.")

Running Logic Puzzle prompts...

Running Math Problem prompts...

All prompts have been run. Outputs are stored in the 'model_outputs' dictionary.


# Printing Model Outputs

In [44]:
# Dump every stored response under an upper-cased heading derived from its key,
# followed by a 30-dash separator.
separator = "-" * 30
for label, text in model_outputs.items():
    heading = label.replace('_', ' ').upper()
    print(f"--- {heading} ---", text, separator, sep="\n")

--- LOGIC ZERO SHOT ---
Charlie is the youngest.

------------------------------
--- LOGIC FEW SHOT ---
Charlie is the youngest.

------------------------------
--- LOGIC COT ---
1. **Alice is older than Bob:** This tells us Bob is younger than Alice.
2. **Bob is older than Charlie:** This tells us Charlie is younger than Bob.

Since Charlie is younger than Bob, and Bob is younger than Alice, Charlie must be the youngest.

**Answer:** Charlie is the youngest.

------------------------------
--- MATH ZERO SHOT ---
The train will travel 240 km in 4 hours.

**Explanation**

* **Distance = Speed x Time**

* **Speed:** The train travels 60 km in 1 hour, so its speed is 60 km/hour.
* **Time:**  The time is 4 hours.

* **Distance = 60 km/hour * 4 hours = 240 km**

------------------------------
--- MATH FEW SHOT ---
240 km.

------------------------------
--- MATH COT ---
If the train travels 60 km in 1 hour, then in 4 hours it will travel four times that distance.

So, 60 km/hour * 4 hours =

# Evaluating model output using the specified evaluation rubric

In [45]:
import pandas as pd

def evaluate_response(prompt_type: str, model_output: str) -> dict:
    """
    Scores a model response against the Reasoning Evaluation Rubric using
    simple text heuristics.

    Args:
        prompt_type: Label of the prompting strategy (e.g. "Zero-Shot",
            "Few-Shot", "CoT"). Compared case-insensitively; only "cot"
            influences the scores.
        model_output: The text produced by the model.

    Returns:
        A dict with integer scores under "correctness", "reasoning_clarity",
        "completeness" and "conciseness", plus their sum under "total_score".

    Note:
        Correctness is NOT verified here. Every response is assigned the
        maximum correctness score of 3, so the answers must be checked
        separately before the totals are trusted.
    """
    # Correctness is assumed to be fully correct for all provided outputs.
    correctness = 3

    # --- Reasoning Clarity & Completeness ---
    # Detailed reasoning is detected from formatting/wording cues: a bold
    # numbered list, the word "Explanation", or a step-by-step phrase.
    lowered_output = model_output.lower()
    has_detailed_reasoning = (
        "1. **" in model_output
        or "Explanation" in model_output
        or "step by step" in lowered_output
        # Needle must be lower-case too: a capitalised "Let's think" can never
        # occur in lower-cased text, which made this check dead code before.
        or "let's think" in lowered_output
    )

    if has_detailed_reasoning:
        reasoning_clarity = 3
        completeness = 3
    elif prompt_type.lower() == "cot":
        # Fallback for CoT when no cue was detected: the prompt type suggests
        # a detailed response was intended, so award partial credit.
        reasoning_clarity = 2
        completeness = 2
    else:
        # Bare answers from zero-shot or few-shot have low clarity/completeness.
        reasoning_clarity = 0
        completeness = 1

    # --- Conciseness ---
    if has_detailed_reasoning:
        # Structured reasoning is treated as concise for its type.
        conciseness = 3
    elif len(model_output.strip().split()) <= 4:
        # Very short answers (at most four words) are highly concise.
        conciseness = 3
    else:
        # Longer answers without detected structure are somewhat concise.
        conciseness = 2

    return {
        "correctness": correctness,
        "reasoning_clarity": reasoning_clarity,
        "completeness": completeness,
        "conciseness": conciseness,
        "total_score": correctness + reasoning_clarity + completeness + conciseness,
    }

# --- Data to be evaluated ---
# The responses are stored here as literal text (they are not read from
# `model_outputs`). Each row is (task, prompt type, output text).
_CASE_FIELDS = ("task", "prompt_type", "output")
_CASE_ROWS = [
    ("Logic", "Zero-Shot", "Charlie is the youngest."),
    ("Logic", "Few-Shot", "Charlie is the youngest."),
    ("Logic", "CoT", (
        "1. **Alice is older than Bob:** This tells us Bob is younger than Alice.\n"
        "2. **Bob is older than Charlie:** This tells us Charlie is younger than Bob.\n\n"
        "Since Charlie is younger than Bob, and Bob is younger than Alice, Charlie must be the youngest.\n\n"
        "**Answer:** Charlie is the youngest."
    )),
    ("Math", "Zero-Shot", (
        "The train will travel 240 km in 4 hours.\n\n"
        "**Explanation**\n\n"
        "* **Distance = Speed x Time**\n\n"
        "* **Speed:** The train travels 60 km in 1 hour, so its speed is 60 km/hour.\n"
        "* **Time:** The time is 4 hours.\n\n"
        "* **Distance = 60 km/hour * 4 hours = 240 km**"
    )),
    ("Math", "Few-Shot", "240 km."),
    ("Math", "CoT", (
        "If the train travels 60 km in 1 hour, then in 4 hours it will travel four times that distance.\n\n"
        "So, 60 km/hour * 4 hours = 240 km\n\n"
        "The train will travel **240 km** in 4 hours."
    )),
]

# Expand each row into the dict shape consumed by the evaluation loop.
test_cases = [dict(zip(_CASE_FIELDS, row)) for row in _CASE_ROWS]

# --- Score every test case and tabulate the results ---
# Report column name -> key in the dict returned by evaluate_response().
score_columns = {
    "Correctness": "correctness",
    "Clarity": "reasoning_clarity",
    "Completeness": "completeness",
    "Conciseness": "conciseness",
    "Total Score": "total_score",
}

evaluation_data = []
for entry in test_cases:
    result = evaluate_response(entry["prompt_type"], entry["output"])
    row = {"Task": entry["task"], "Prompt Type": entry["prompt_type"]}
    row.update({column: result[key] for column, key in score_columns.items()})
    evaluation_data.append(row)

df = pd.DataFrame(evaluation_data)

# --- Print the full table as plain text ---
print("Reasoning Evaluation Results (DataFrame)\n")
print(df.to_string())

Reasoning Evaluation Results (DataFrame)

    Task Prompt Type  Correctness  Clarity  Completeness  Conciseness  Total Score
0  Logic   Zero-Shot            3        0             1            3            7
1  Logic    Few-Shot            3        0             1            3            7
2  Logic         CoT            3        3             3            3           12
3   Math   Zero-Shot            3        3             3            3           12
4   Math    Few-Shot            3        0             1            3            7
5   Math         CoT            3        2             2            2            9


### Justification of Evaluation Results

The evaluation results are based on a rubric that scores four key criteria: Correctness, Clarity, Completeness, and Conciseness.

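*   **Correctness (Score: 3)**: The evaluator does not check answers programmatically; it assumes every response is correct and assigns the maximum score of 3. The answers shown above were confirmed correct by inspection.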
    
*   **Clarity (Score: 0, 2, or 3)**: This score reflects how well the reasoning is explained.
    
    *   **Score 0**: Given for the Logic Zero-Shot, Logic Few-Shot, and Math Few-Shot outputs, which provide only the final answer without any explanation.
        
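    *   **Score 2**: Given for the Math CoT output. Its prose explanation contains none of the markers the evaluator looks for (a bold numbered list, the word "Explanation", or a "step by step" phrase), so it receives the fallback score reserved for CoT prompts: a good but not explicitly structured explanation.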
        
    *   **Score 3**: Given for Logic CoT and Math Zero-Shot, which provide detailed, step-by-step breakdowns of the solution.
        
*   **Completeness (Score: 1, 2, or 3)**: This measures if the full solution is explained.
    
    *   **Score 1**: Given for the Logic Zero-Shot, Logic Few-Shot, and Math Few-Shot outputs, which are incomplete as they only provide the final answer.
        
    *   **Score 2**: Given for the Math CoT output, which presents core reasoning but lacks a full breakdown of the problem.
        
    *   **Score 3**: Given for Logic CoT and Math Zero-Shot, which provide a comprehensive solution with full reasoning and steps.
        
*   **Conciseness (Score: 2 or 3)**: This score measures how brief and to the point the response is.
    
    *   **Score 3**: Given for all responses except the Math CoT. The brief answers are perfectly concise, while the detailed answers are highly efficient for their purpose.
        
    *   **Score 2**: Given for the Math CoT output, as its explanation is slightly conversational and less structured.
        

### Summary of Total Scores

*   **Logic Zero-Shot**, **Logic Few-Shot**, and **Math Few-Shot** responses score a **7** due to low scores in Clarity and Completeness.
    
*   **Logic CoT** and **Math Zero-Shot** achieve a perfect **12**, showing that a correct, well-structured explanation earns high marks across all categories.
    
*   **Math CoT** scores a **9**, reflecting a solid but less structured explanation compared to the top-scoring outputs.
    

This justification shows that the highest-rated responses are those that are not only correct but also provide a clear, complete, and concise explanation of their reasoning.