# STEP 1: Install Dependencies

In [1]:
%pip install --upgrade pip
%pip install torch transformers accelerate bitsandbytes sentencepiece tiktoken

Collecting pip
  Downloading pip-25.2-py3-none-any.whl.metadata (4.7 kB)
Downloading pip-25.2-py3-none-any.whl (1.8 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.8/1.8 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.2
Collecting bitsandbytes
  Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl.metadata (11 kB)
Downloading bitsandbytes-0.47.0-py3-none-manylinux_2_24_x86_64.whl (61.3 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.3/61.3 MB[0m [31m51.9 MB/s[0m  [33m0:00:01[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbyte

# STEP 2: Load Model and Tokenizer

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "Qwen/Qwen3-4B"   # Base model (with thinking capability)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16,
    load_in_8bit=True
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/726 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/3.99G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/99.6M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

# STEP 3: Helper Function

In [5]:
def ask_qwen(prompt, enable_thinking=True, max_new_tokens=200):
    inputs = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=True,
        return_tensors="pt",
        enable_thinking=enable_thinking
    ).to(model.device)

    attention_mask = torch.ones(inputs.shape, dtype=torch.long, device=inputs.device)  # Manual mask (all 1s for unpadded input)

    outputs = model.generate(inputs, attention_mask=attention_mask, max_new_tokens=max_new_tokens)
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# STEP 4: Interactive Loop

In [None]:
while True:
    prompt = input("\nüîπ prompt (or type 'exit' to quit): ")
    if prompt.lower() == "exit":
        print("Exiting... ‚úÖ")
        break

    choice = input("Do you want enable_thinking? (y/n): ").strip().lower()
    enable_thinking = True if choice == "y" else False

    print("\n=== Model Output ===")
    print(ask_qwen(prompt, enable_thinking=enable_thinking))
    print("====================")

# Import libaries

In [7]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import pandas as pd
from datetime import datetime
import json
import re

# step 5: # STEP 4: Define Task Dataset (Project Step 1: Task Selection)

In [8]:
tasks = [
    {
        "category": "arithmetic",
        "question": "What is 15 * 12?",
        "expected_answer": "180",
        "type": "single-step"
    },
    {
        "category": "math_word_problem",
        "question": "A store sells apples at $2 each and oranges at $3 each. If you buy 4 apples and 5 oranges, how much do you spend?",
        "expected_answer": "23",
        "type": "multi-step"
    },
    {
        "category": "logic_puzzle",
        "question": "If A is taller than B, and B is taller than C, who is the tallest?",
        "expected_answer": "A",
        "type": "reasoning"
    },
    {
        "category": "commonsense_reasoning",
        "question": "If it‚Äôs raining outside, should you bring an umbrella? Why?",
        "expected_answer": "Yes, to stay dry.",
        "type": "reasoning"
    }
    # Add more tasks as needed
]

In [9]:
# STEP 5: Prompt Templates (Project Step 2: Prompt Design)
prompt_templates = {
    "chain_of_thought": """Solve the following problem step-by-step. Show all your reasoning clearly, and provide the final answer at the end with 'Final Answer:'.
Question: {question}""",
    "self_consistency": """Solve the following problem multiple times (at least 3 reasoning paths) and choose the most consistent answer. Show each reasoning path clearly, and provide the final answer with 'Final Answer:'.
Question: {question}""",
    "reflection": """Solve the following problem step-by-step. After solving, double-check your reasoning for errors and refine if needed. Show all steps, including the reflection, and provide the final answer with 'Final Answer:'.
Question: {question}"""
}

In [10]:
# STEP 6: Helper Function for Model Execution (Project Step 3)
def ask_qwen(prompt, enable_thinking=True, max_new_tokens=500):
    inputs = tokenizer.apply_chat_template(
        [{"role": "user", "content": prompt}],
        tokenize=True,
        return_tensors="pt",
        enable_thinking=enable_thinking
    ).to(model.device)

    attention_mask = torch.ones(inputs.shape, dtype=torch.long, device=inputs.device)
    outputs = model.generate(inputs, attention_mask=attention_mask, max_new_tokens=max_new_tokens)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract final answer (if present)
    final_answer_match = re.search(r'Final Answer: (.*?)(?:\n|$)', response, re.IGNORECASE)
    final_answer = final_answer_match.group(1).strip() if final_answer_match else response.strip()

    return {"full_response": response, "final_answer": final_answer}

In [11]:
# STEP 7: Evaluation Function (Project Step 4)
def evaluate_response(task, response):
    expected = str(task["expected_answer"]).strip().lower()
    actual = str(response["final_answer"]).strip().lower()

    # Correctness
    is_correct = expected == actual

    # Basic reasoning quality check (heuristic-based)
    reasoning_steps = response["full_response"].count("\n") > 2  # More than 2 lines suggests steps
    logical_coherence = reasoning_steps  # Placeholder: enhance with more sophisticated checks
    error_types = []

    if not is_correct:
        if not reasoning_steps:
            error_types.append("skipped_steps")
        if "hallucination" in response["full_response"].lower():  # Placeholder check
            error_types.append("hallucination")
        if task["category"] == "arithmetic" and not re.search(r'\d+\s*[\+\-\*/]\s*\d+', response["full_response"]):
            error_types.append("arithmetic_mistake")

    return {
        "is_correct": is_correct,
        "logical_coherence": logical_coherence,
        "error_types": error_types
    }

In [None]:
# STEP 8: Run Experiments (Project Steps 3 & 4)
results = []
for task in tasks:
    for prompt_type, template in prompt_templates.items():
        print(f"\n=== Running {prompt_type} for task: {task['question']} ===")
        prompt = template.format(question=task["question"])
        response = ask_qwen(prompt, enable_thinking=True)

        # Print full response
        print("\nFull Model Response:")
        print(response["full_response"])
        print("\nExtracted Final Answer:", response["final_answer"])
        print("="*50)

        evaluation = evaluate_response(task, response)

        results.append({
            "task_category": task["category"],
            "question": task["question"],
            "expected_answer": task["expected_answer"],
            "prompt_type": prompt_type,
            "full_response": response["full_response"],
            "final_answer": response["final_answer"],
            "is_correct": evaluation["is_correct"],
            "logical_coherence": evaluation["logical_coherence"],
            "error_types": evaluation["error_types"],
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })


=== Running chain_of_thought for task: What is 15 * 12? ===

Full Model Response:
user
Solve the following problem step-by-step. Show all your reasoning clearly, and provide the final answer at the end with 'Final Answer:'.  
Question: What is 15 * 12?
</think>

</think>

We are asked to compute the product of 15 and 12.

### Step 1: Multiply 15 by 12
We can break this down as follows:

$$
15 \times 12 = 15 \times (10 + 2) = (15 \times 10) + (15 \times 2)
$$

$$
= 150 + 30 = 180
$$

### Final Answer:
$$
\boxed{180}
$$

Extracted Final Answer: user
Solve the following problem step-by-step. Show all your reasoning clearly, and provide the final answer at the end with 'Final Answer:'.  
Question: What is 15 * 12?
</think>

</think>

We are asked to compute the product of 15 and 12.

### Step 1: Multiply 15 by 12
We can break this down as follows:

$$
15 \times 12 = 15 \times (10 + 2) = (15 \times 10) + (15 \times 2)
$$

$$
= 150 + 30 = 180
$$

### Final Answer:
$$
\boxed{180}
$$

=== Run

In [13]:
# STEP 9: Save Results to CSV (Project Steps 4 & 6)
results_df = pd.DataFrame(results)
results_df.to_csv("qwen3_4b_results.csv", index=False)
print("Results saved to qwen3_4b_results.csv")

Results saved to qwen3_4b_results.csv


In [14]:
# STEP 10: Comparative Analysis Placeholder (Project Step 5)
# Note: To compare with a stronger LLM (e.g., Llama-3 70B), you'll need API access or a local setup.
# Below is a placeholder function to extend later.
def run_comparative_analysis(task, prompt_template, model_api="llama-3-70b"):
    # Example: Use an API (e.g., Hugging Face, OpenAI) for a stronger model
    print(f"Placeholder: Running {model_api} for task: {task['question']}")
    # Implement API call or local model inference here
    return {"full_response": "Placeholder response", "final_answer": "Placeholder answer"}

# Run comparative analysis (optional)
comparative_results = []
for task in tasks:
    for prompt_type, template in prompt_templates.items():
        response = run_comparative_analysis(task, template)
        evaluation = evaluate_response(task, response)
        comparative_results.append({
            "task_category": task["category"],
            "question": task["question"],
            "expected_answer": task["expected_answer"],
            "prompt_type": prompt_type,
            "model": "stronger_llm",
            "full_response": response["full_response"],
            "final_answer": response["final_answer"],
            "is_correct": evaluation["is_correct"],
            "logical_coherence": evaluation["logical_coherence"],
            "error_types": evaluation["error_types"],
            "timestamp": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        })

Placeholder: Running llama-3-70b for task: What is 15 * 12?
Placeholder: Running llama-3-70b for task: What is 15 * 12?
Placeholder: Running llama-3-70b for task: What is 15 * 12?
Placeholder: Running llama-3-70b for task: A store sells apples at $2 each and oranges at $3 each. If you buy 4 apples and 5 oranges, how much do you spend?
Placeholder: Running llama-3-70b for task: A store sells apples at $2 each and oranges at $3 each. If you buy 4 apples and 5 oranges, how much do you spend?
Placeholder: Running llama-3-70b for task: A store sells apples at $2 each and oranges at $3 each. If you buy 4 apples and 5 oranges, how much do you spend?
Placeholder: Running llama-3-70b for task: If A is taller than B, and B is taller than C, who is the tallest?
Placeholder: Running llama-3-70b for task: If A is taller than B, and B is taller than C, who is the tallest?
Placeholder: Running llama-3-70b for task: If A is taller than B, and B is taller than C, who is the tallest?
Placeholder: Runnin

In [15]:
# Save comparative results
comparative_df = pd.DataFrame(comparative_results)
comparative_df.to_csv("comparative_results.csv", index=False)
print("Comparative results saved to comparative_results.csv")

Comparative results saved to comparative_results.csv


In [16]:
import pandas as pd
df = pd.read_csv("qwen3_4b_results.csv")
accuracy = df.groupby(["task_category", "prompt_type"])["is_correct"].mean()
error_counts = df["error_types"].value_counts()
print("Accuracy by Category and Prompt Type:\n", accuracy)
print("Error Type Counts:\n", error_counts)

Accuracy by Category and Prompt Type:
 task_category          prompt_type     
arithmetic             chain_of_thought    0.0
                       reflection          0.0
                       self_consistency    0.0
commonsense_reasoning  chain_of_thought    0.0
                       reflection          0.0
                       self_consistency    0.0
logic_puzzle           chain_of_thought    0.0
                       reflection          0.0
                       self_consistency    1.0
math_word_problem      chain_of_thought    1.0
                       reflection          0.0
                       self_consistency    0.0
Name: is_correct, dtype: float64
Error Type Counts:
 error_types
[]    12
Name: count, dtype: int64
