In [None]:
! pip install pydantic google-genai datasets -q

In [None]:
import json
import os
from typing import List, Dict, Any
from collections import Counter
from pydantic import BaseModel, Field, ValidationError
from google.colab import userdata
from typing import Dict, Any
from pydantic import BaseModel, Field, ValidationError
from google import genai
from google.colab import userdata
from google.genai import types, errors
from datasets import Dataset, load_dataset
import random

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells read the test JSON from a path under this mount.
drive.mount('/content/drive')

Mounted at /content/drive


### Define schema

In [None]:
# Structured-output contract for the solver: three required string fields. It is
# passed as `response_schema` to the Gemini call and re-used to validate the reply.
# NOTE(review): pydantic normally exports the class docstring and each Field
# description into the generated JSON schema, so this text presumably reaches the
# model — treat it as prompt text when editing; confirm against google-genai.
class CoTSolverOutput(BaseModel):
    """
    Defines the strict JSON schema for the LLM's output.
    """
    # Echo of the problem statement that was sent to the model.
    problem: str = Field(description="The original problem description the LLM was asked to solve.")
    # Free-text step-by-step explanation.
    reasoning: str = Field(description="The complete, step-by-step Chain-of-Thought (CoT) explanation.")
    # Final solution text; `solve` performs its majority vote on this exact string.
    answer: str = Field(description="The final, executable code solution or high-level pseudo-solution.")

## LLM API Call

In [None]:
import getpass
import os

# Ask for the key interactively only when the environment does not already hold one;
# getpass keeps the typed value out of the cell output.
if 'GEMINI_API_KEY' not in os.environ:
  os.environ['GEMINI_API_KEY'] = getpass.getpass('Enter your Gemini API Key: ')

from google import genai
# No key is passed explicitly — presumably the SDK reads GEMINI_API_KEY from the
# environment; confirm against the google-genai documentation.
client = genai.Client()

print("Client successfully initialized.")

Enter your Gemini API Key: ··········
Client successfully initialized.


In [None]:
# --- Configuration ---
MODEL_NAME = 'gemini-2.5-flash'

# Prefer the key stored in the Colab Secrets Manager. When no secret can be read,
# keep whatever is already in the environment (for example a key typed in through
# getpass earlier) and only warn if there is no key at all.
try:
    _colab_secret = userdata.get('GEMINI_API_KEY')
except Exception:
    # Deliberately best-effort: a missing secret or denied notebook access must not
    # stop the notebook. `Exception` (not a bare except) so that an interrupt from
    # the user still propagates.
    _colab_secret = None

if _colab_secret:
    os.environ['GEMINI_API_KEY'] = _colab_secret
elif 'GEMINI_API_KEY' not in os.environ:
    print("WARNING: GEMINI_API_KEY not found in Colab Secrets. Please set it manually.")

In [None]:
# Initialize the Gemini Client
# This rebinds the module-level `client` that an earlier cell already created with the
# same call; `_gemini_api_call` uses whichever binding is current when it runs.
try:
    client = genai.Client()
    print(f"Gemini Client initialized. Target model: {MODEL_NAME}")
except Exception as e:
    # Re-raised with a setup hint; the original error text is embedded in the message.
    raise Exception(f"Failed to initialize Gemini Client. Check your API key setup. Error: {e}")

Gemini Client initialized. Target model: gemini-2.5-flash


In [None]:
# --- Helper Function for API Call (Fix: system_instruction moved to config) ---
def _gemini_api_call(
    system_instruction: str,
    user_prompt: str,
    response_schema: BaseModel,
    temperature: float = 0.2
) -> str:
    """Send one prompt to Gemini and hand back the reply text untouched.

    The request goes through the module-level ``client`` to ``MODEL_NAME``. The
    system instruction travels inside the ``GenerateContentConfig`` together with
    the JSON mime type, the response schema and the sampling temperature.

    Args:
        system_instruction: System-level instruction placed in the config.
        user_prompt: Text sent as the request ``contents``.
        response_schema: Schema passed to the SDK as ``response_schema``
            (callers in this notebook pass the ``CoTSolverOutput`` class).
        temperature: Sampling temperature forwarded to the config.

    Returns:
        ``response.text`` as exposed by the SDK. Nothing is parsed or validated
        here; the caller is responsible for validating the JSON.
    """
    return client.models.generate_content(
        model=MODEL_NAME,
        contents=user_prompt,
        config=types.GenerateContentConfig(
            temperature=temperature,
            response_schema=response_schema,
            response_mime_type="application/json",
            system_instruction=system_instruction,
        ),
    ).text

# --- 2. & 3. Core Prompt Design & API Wrapper Implementation ---

def generate_single_response(problem_string: str, few_shot_examples_str: str) -> Dict[str, Any]:
    """Produce one schema-validated CoT solution for a problem.

    Builds the solver prompts, calls ``_gemini_api_call`` with the
    ``CoTSolverOutput`` schema and validates the raw reply against that schema.

    Args:
        problem_string: The problem to solve; appended after the few-shot block.
        few_shot_examples_str: Pre-formatted few-shot examples placed first in
            the user prompt.

    Returns:
        The validated output as a plain dict, or ``{"error": ...}`` naming the
        exception type when the API call or validation fails. Failures are
        printed rather than raised.
    """
    instruction_sentences = [
        "You are an elite algorithmic problem solver. Your task is to provide the full ",
        "Chain-of-Thought (CoT) reasoning and the final executable code solution for the problem. ",
        "Your entire response MUST be a single, valid JSON object strictly conforming to the required schema.",
    ]
    user_message = "--- FEW-SHOT EXAMPLES ---\n{}\n\n--- NEW PROBLEM TO SOLVE ---\n{}".format(
        few_shot_examples_str, problem_string
    )

    try:
        reply_text = _gemini_api_call(
            system_instruction="".join(instruction_sentences),
            user_prompt=user_message,
            response_schema=CoTSolverOutput
        )
        validated = CoTSolverOutput.model_validate_json(reply_text)
        solution = validated.model_dump()
    except (errors.APIError, ValidationError, json.JSONDecodeError) as failure:
        # Report and convert to an error dict so the caller's loop keeps going.
        print(f"CRITICAL FAILURE: API or Data Validation Failed. Error: {failure}")
        return {"error": f"Failed to generate valid JSON: {type(failure).__name__}"}

    print("Parsing: Initial response is valid JSON. Proceeding.")
    return solution

# --- The solve function (Self-Consistency Decoding) remains the same ---
def solve(problem_string: str, few_shot_examples_str: str, N: int = 5) -> Dict[str, Any]:
    """Self-consistency decoding: sample N solutions and return the majority answer.

    Calls ``generate_single_response`` N times, keeps the replies that carry an
    ``answer`` and no ``error`` key, and votes on the exact ``answer`` string.

    Args:
        problem_string: Problem text forwarded to every attempt.
        few_shot_examples_str: Few-shot block forwarded to every attempt.
        N: Number of generation attempts.

    Returns:
        The first valid response whose answer won the vote, extended with
        ``reliability_score`` (winner's share of the valid responses) and
        ``N_attempts``. If no attempt was valid, a dict with a single ``error`` key.
    """
    print(f"\n--- Starting Self-Consistency Decoding with N={N} attempts ---")

    valid_candidates: List[Dict[str, Any]] = []
    for attempt_no in range(1, N + 1):
        print(f"--- Attempt {attempt_no}/{N} ---")
        candidate = generate_single_response(problem_string, few_shot_examples_str)
        # Discard failed generations and anything without an answer to vote on.
        if 'error' in candidate or 'answer' not in candidate:
            continue
        valid_candidates.append(candidate)

    valid_total = len(valid_candidates)
    if valid_total == 0:
        return {"error": "All LLM generation attempts failed to return a valid structured response."}

    # Tally votes; Counter preserves first-seen order, which decides ties.
    tally = Counter()
    for candidate in valid_candidates:
        tally[candidate['answer']] += 1
    [(winning_answer, votes)] = tally.most_common(1)

    # The earliest response carrying the winning answer supplies problem/reasoning.
    for candidate in valid_candidates:
        if candidate['answer'] == winning_answer:
            winner = candidate
            break

    print("\n--- Self-Consistency Results ---")
    print(f"Total valid responses: {valid_total}")
    print(f"Winning answer appeared: {votes}/{valid_total} times.")

    # Metadata for downstream consumers of the result.
    winner['reliability_score'] = votes / valid_total
    winner['N_attempts'] = N
    return winner


In [None]:
# --- Demonstration ---

# Minimal few-shot block handed verbatim to the solver prompt. The single example is
# a placeholder (bracketed problem text, elided code), not a complete worked solution.
FEW_SHOT_DATA = """
Example 1:
Problem: [Two Sum problem details]
{ "reasoning": "Uses a hash map to track seen numbers in O(N).", "answer": "def twoSum(nums, target):..." }
"""
# Problem used for the smoke test below.
USER_PROBLEM = "Given an array of integers, return a new array such that each element at index i of the new array is the product of all the numbers in the original array except the one at i. Solve without division."

if __name__ == "__main__":
    # Three attempts; `solve` returns either the winning response or an error dict,
    # both of which are JSON-serialisable and printed as-is.
    solution = solve(USER_PROBLEM, FEW_SHOT_DATA, N=3)

    print("\n====================================")
    print("      FINAL SELF-CONSISTENCY SOLUTION")
    print("====================================")
    print(json.dumps(solution, indent=4))


--- Starting Self-Consistency Decoding with N=3 attempts ---
--- Attempt 1/3 ---
CRITICAL FAILURE: API or Data Validation Failed. Error: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'The model is overloaded. Please try again later.', 'status': 'UNAVAILABLE'}}
--- Attempt 2/3 ---
Parsing: Initial response is valid JSON. Proceeding.
--- Attempt 3/3 ---
CRITICAL FAILURE: API or Data Validation Failed. Error: 503 UNAVAILABLE. {'error': {'code': 503, 'message': 'The model is overloaded. Please try again later.', 'status': 'UNAVAILABLE'}}

--- Self-Consistency Results ---
Total valid responses: 1
Winning answer appeared: 1/1 times.

      FINAL SELF-CONSISTENCY SOLUTION
{
    "problem": "Given an array of integers, return a new array such that each element at index i of the new array is the product of all the numbers in the original array except the one at i. Solve without division.",
    "reasoning": "The problem asks us to compute an array `output` where `output[i]` is the product 

## Load Test Dataset & SFT Data Generation

In [None]:
import json
from datasets import load_dataset, Dataset
from typing import Dict, Any, List, Set
import random
from huggingface_hub import login

# Interactive Hugging Face Hub login (renders a widget in the notebook).
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
from huggingface_hub import HfFolder

# Token read back from the local Hugging Face folder.
token = HfFolder.get_token()

# Loads every split of the repository into `dataset`.
# NOTE(review): a later cell loads the same repository again (train split only) into
# `raw_dataset`, and `dataset` is not referenced in the cells visible here — confirm
# whether this cell is still needed.
dataset = load_dataset(
    "LimYeri/LeetCode_YT_CC_CoT_Summary",
    token=token
)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/1.82k [00:00<?, ?B/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/73.3M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/71.1M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/17053 [00:00<?, ? examples/s]

In [None]:
import json
from datasets import load_dataset, Dataset
from typing import Dict, Any, List, Set
import random
from huggingface_hub import login

# --- File Paths ---
# Test set lives on the mounted Google Drive; the SFT data comes from the Hub.
JSON_TEST_FILE_PATH = "/content/drive/MyDrive/AML-FINAL-COPY/Copy of merged_data.json"
HF_DATASET_PATH = "LimYeri/LeetCode_YT_CC_CoT_Summary"
# NOTE(review): MOD is not referenced anywhere in the cells visible here — confirm it is still needed.
MOD = 10**9 + 7
# Key in the SFT data (Hugging Face dataset)
SFT_PROBLEM_COLUMN = "question_content"
# Key in the external JSON test set (used for comparison)
JSON_PROBLEM_COLUMN = "problem"

# Columns of the SFT dataset that map onto the solver schema fields.
TARGET_COLUMNS = [
    SFT_PROBLEM_COLUMN,  # Our 'problem'
    "Summary",           # Our 'reasoning' (CoT)
    "python"             # Our 'answer' (Code)
]

# --- 1. Load the Manually Verified Test Set from JSON ---
# Starts empty so that a failed load leaves a defined (falsy) value for later cells.
test_problem_set: Set[str] = set()

try:
    with open(JSON_TEST_FILE_PATH, 'r', encoding='utf-8') as f:
        # Assumes the JSON file is a list of objects, each with a "problem" key.
        verified_test_data: List[Dict[str, Any]] = json.load(f)
    print(f"Successfully loaded {len(verified_test_data)} examples from the JSON test file.")

    # Collect the whitespace-stripped problem text of every entry that has one.
    # Entries whose "problem" value is missing or falsy are skipped.
    test_problem_set = {
        example.get(JSON_PROBLEM_COLUMN, "").strip()
        for example in verified_test_data
        if example.get(JSON_PROBLEM_COLUMN)
    }

    # A whitespace-only problem strips to "", which must not count as a test problem.
    test_problem_set.discard("")

    print(f"Extracted {len(test_problem_set)} unique problem texts for testing/filtering.")

# All failures are reported and swallowed: the notebook continues with an empty set,
# and the filtering cell is skipped when the set is empty.
except FileNotFoundError:
    print(f"Error: JSON file not found at {JSON_TEST_FILE_PATH}. Please check the path and mounted drive.")
except json.JSONDecodeError as e:
    print(f"Error: Could not decode JSON from {JSON_TEST_FILE_PATH}. Please check the file format. Error: {e}")
except Exception as e:
    print(f"An unexpected error occurred during JSON loading: {e}")

# --- 2. Load and Clean the SFT Data ---

# 1. Load the dataset from Hugging Face
print(f"\nLoading dataset from: {HF_DATASET_PATH}...")
try:
    # Load only the 'train' split
    raw_dataset = load_dataset(HF_DATASET_PATH, split="train")
    print(f"Dataset loaded with {len(raw_dataset)} examples.")
except Exception as e:
    # Any load failure is reported, and `raw_dataset` becomes None so that the
    # filtering cell's `if raw_dataset and ...` guard skips the pipeline.
    print(f"Error loading dataset. Please ensure you have accepted the usage conditions. Error: {e}")
    raw_dataset = None


def clean_and_filter_data(example: Dict[str, Any]) -> bool:
    """Row predicate for ``Dataset.filter``: keep only usable training rows.

    A row is kept when the problem text, the ``Summary`` and the ``python``
    solution are all present and non-empty, and each meets its minimum length
    (20, 20 and 10 characters respectively).
    """
    minimum_lengths = (
        (SFT_PROBLEM_COLUMN, 20),
        ("Summary", 20),
        ("python", 10),
    )

    # Presence is checked for every column before any length is measured.
    if not all(example.get(column) for column, _ in minimum_lengths):
        return False

    return all(len(example[column]) >= floor for column, floor in minimum_lengths)

def remove_test_problems(example: Dict[str, Any]) -> bool:
    """Row predicate for ``Dataset.filter`` that drops rows overlapping the test set.

    Args:
        example: One dataset row; the problem text is read from ``SFT_PROBLEM_COLUMN``.

    Returns:
        True (keep for training) when the whitespace-stripped problem text is NOT in
        ``test_problem_set``; False (drop) when it is.

    NOTE(review): membership is exact string equality after ``strip()``. The recorded
    run removed 0 overlaps, so verify that both sources format problem text identically.
    """
    # `or ""` also covers a key that is present but null: `.get(key, "")` would
    # return None in that case and `.strip()` would raise AttributeError.
    problem_text = (example.get(SFT_PROBLEM_COLUMN) or "").strip()

    return problem_text not in test_problem_set


# Runs only when the dataset loaded AND at least one test problem was extracted; an
# empty test set therefore skips the whole pipeline rather than training unfiltered.
if raw_dataset and test_problem_set:
    # 1. Apply the initial quality filter
    filtered_dataset = raw_dataset.filter(clean_and_filter_data)
    print(f"\nDataset after initial quality cleaning: {len(filtered_dataset)} examples.")

    # 2. Apply the data leakage prevention filter
    sft_training_dataset = filtered_dataset.filter(remove_test_problems)

    # Number of rows whose problem text matched a test problem exactly.
    removed_count = len(filtered_dataset) - len(sft_training_dataset)
    print(f"--- Data Leakage Prevention Complete ---")
    print(f"Removed {removed_count} examples (Test Set Overlaps) from the SFT training data.")
    print(f"Final training dataset size (no test set leakage): {len(sft_training_dataset)} examples.")


# --- 3. Format the Remaining Data for SFT ---

def format_cot_data(example: Dict[str, Any]) -> Dict[str, str]:
    """Render one dataset row as a single Llama-3 chat-formatted training string.

    The assistant turn is a JSON object (indent 4) with the keys ``problem``,
    ``reasoning`` and ``answer``, taken from the problem column, ``Summary`` and
    ``python`` respectively, each stripped of surrounding whitespace.

    Returns:
        ``{"text": <system turn + user turn + assistant turn>}``.
    """
    fields = {
        "problem": example[SFT_PROBLEM_COLUMN].strip(),
        "reasoning": example["Summary"].strip(),
        "answer": example["python"].strip(),
    }
    # Target the model must learn to emit.
    payload = json.dumps(fields, indent=4)

    system_turn = (
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
        "You are an expert algorithmic solver. You must provide a Chain-of-Thought (CoT) "
        "reasoning path and output a valid JSON object.<|eot_id|>"
    )
    user_turn = f"<|start_header_id|>user<|end_header_id|>\n\n{fields['problem']}<|eot_id|>"
    assistant_turn = f"<|start_header_id|>assistant<|end_header_id|>\n\n{payload}<|eot_id|>"

    return {"text": system_turn + user_turn + assistant_turn}


# NOTE(review): this checks only that the name exists in the current namespace, so a
# value left over from an earlier run of the filtering cell would also pass — confirm
# this is acceptable.
if 'sft_training_dataset' in locals():
    # Replace every original column with the single formatted "text" column.
    processed_dataset = sft_training_dataset.map(format_cot_data, remove_columns=sft_training_dataset.column_names)

    print("\n--- Final Preprocessing Complete ---")
    print(f"Final dataset size for SFT: {len(processed_dataset)} examples.")

    # Print the first 1000 characters of one randomly chosen example to eyeball the format
    print("\n--- Example of Final SFT Training String ---")
    example_index = random.randint(0, len(processed_dataset) - 1)
    print(processed_dataset[example_index]["text"][:1000] + "...")

    # Persist with `save_to_disk`, which writes a dataset directory (not a JSON Lines
    # file); the SFT cell reads it back with `load_from_disk`.
    processed_dataset.save_to_disk("cot_sft_training_data")
    print("\nDataset saved to 'cot_sft_training_data' directory.")
else:
    print("\nPreprocessing skipped because the raw dataset or test set problems could not be loaded/identified.")

Successfully loaded 95 examples from the JSON test file.
Extracted 95 unique problem texts for testing/filtering.

Loading dataset from: LimYeri/LeetCode_YT_CC_CoT_Summary...
Dataset loaded with 17053 examples.


Filter:   0%|          | 0/17053 [00:00<?, ? examples/s]


Dataset after initial quality cleaning: 17053 examples.


Filter:   0%|          | 0/17053 [00:00<?, ? examples/s]

--- Data Leakage Prevention Complete ---
Removed 0 examples (Test Set Overlaps) from the SFT training data.
Final training dataset size (no test set leakage): 17053 examples.


Map:   0%|          | 0/17053 [00:00<?, ? examples/s]


--- Final Preprocessing Complete ---
Final dataset size for SFT: 17053 examples.

--- Example of Final SFT Training String ---
<|begin_of_text|><|start_header_id|>system<|end_header_id|>

You are an expert algorithmic solver. You must provide a Chain-of-Thought (CoT) reasoning path and output a valid JSON object.<|eot_id|><|start_header_id|>user<|end_header_id|>

You are given a floating-point number `hour`, representing the amount of time you have to reach the office. To commute to the office, you must take `n` trains in sequential order. You are also given an integer array `dist` of length `n`, where `dist[i]` describes the distance (in kilometers) of the `ith` train ride.

Each train can only depart at an integer hour, so you may need to wait in between each train ride.

*   For example, if the `1st` train ride takes `1.5` hours, you must wait for an additional `0.5` hours before you can depart on the `2nd` train ride at the 2 hour mark.

Return _the **minimum positive integer** sp

Saving the dataset (0/1 shards):   0%|          | 0/17053 [00:00<?, ? examples/s]


Dataset saved to 'cot_sft_training_data' directory.


# SFT

In [None]:
!pip install -q transformers peft accelerate bitsandbytes trl datasets
!pip install -U bitsandbytes

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.1/59.1 MB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.4/517.4 kB[0m [31m42.0 MB/s[0m eta [36m0:00:00[0m


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig, prepare_model_for_kbit_training
from trl import SFTTrainer
from datasets import load_from_disk
import torch
from transformers.trainer_utils import get_last_checkpoint

In [None]:
import os
# Allocator setting for PyTorch's CUDA caching allocator.
# NOTE(review): torch is imported in an earlier cell; presumably this still takes
# effect because CUDA has not been initialised yet — confirm.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
print("Set PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True")

Set PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


In [None]:
# --- Configuration ---
BASE_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
OUTPUT_DIR = "llama3_cot_finetuned"      # checkpoints and the final adapter are written here
INPUT_DATA_DIR = "cot_sft_training_data" # directory written by the preprocessing cell

# --- Load the Dataset ---

try:
    train_dataset = load_from_disk(INPUT_DATA_DIR)
    print(f"Loaded training data with {len(train_dataset)} examples.")
except Exception as e:
    print(f"CRITICAL ERROR: Could not load data from '{INPUT_DATA_DIR}'. Error: {e}")
    # The error is only printed: `train_dataset` stays undefined, so `run_fine_tuning`
    # will fail later. Fix the data loading issue before proceeding.

# --- 4. Fine-Tuning Setup (LoRA/QLoRA) ---

def run_fine_tuning():
    """Fine-tune BASE_MODEL with a LoRA adapter on 4-bit quantized weights (QLoRA-style).

    Reads the module-level ``BASE_MODEL``, ``OUTPUT_DIR`` and ``train_dataset``.
    Steps: load the base model with a 4-bit NF4 quantization config and SDPA
    attention, prepare it for k-bit training, build LoRA and trainer arguments,
    train with ``SFTTrainer`` (resuming from the newest checkpoint in
    ``OUTPUT_DIR`` if one exists), then save the adapter to ``OUTPUT_DIR``.

    Returns:
        None. Results are written to ``OUTPUT_DIR``.
    """

    torch.cuda.empty_cache()
    # 4-bit Quantization Config (saves VRAM)
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
    )

    # Load Base Model (Quantized)
    model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        quantization_config=bnb_config,
        device_map=None,
        # Use PyTorch's built-in SDPA attention, which requires no separate compilation or installation.
        attn_implementation="sdpa",
        # None when HF_TOKEN is not set in the environment.
        token=os.environ.get("HF_TOKEN")
    )

    # --- Gradient Checkpointing for memory ---
    # The KV cache is disabled for training.
    model.config.use_cache = False
    # NOTE(review): setting this config attribute may not by itself enable gradient
    # checkpointing; presumably prepare_model_for_kbit_training takes care of it — confirm.
    model.config.gradient_checkpointing = True
    model = prepare_model_for_kbit_training(model)

    # device_map=None above, so the model is placed on the GPU explicitly here.
    model.to(torch.device("cuda"))

    # LoRA Config: rank-32 adapters on the attention query and value projections only.
    lora_config = LoraConfig(
        r=32,
        lora_alpha=24,
        target_modules=["q_proj", "v_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    # Trainer Arguments (Adjusted for bf16 and training stability)
    # Effective batch size is 1 x 16 per device via gradient accumulation.
    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=16,
        optim="paged_adamw_8bit",
        logging_steps=10,
        learning_rate=2e-4,
        fp16=False,
        bf16=True, # NOTE(review): bf16 needs GPU support (e.g. A100); V100/T4 reportedly lack native bf16 — confirm the target GPU
        save_strategy="epoch",
        push_to_hub=False,
    )

    # Load the tokenizer and give it a pad token if it has none.
    # NOTE(review): this tokenizer is never handed to SFTTrainer below, so the trainer
    # presumably loads its own and the pad-token fallback has no effect — confirm.
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # NOTE(review): this sets `max_length` on the model config; whether it limits the
    # training sequence length is not evident from this code — confirm.
    model.config.max_length = 512

    # The trainer wraps the model with the LoRA adapter described by `peft_config`.
    trainer = SFTTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        peft_config=lora_config,
    )

    # Resume from the most recent checkpoint in OUTPUT_DIR, if any; None means a fresh run.
    last_checkpoint = get_last_checkpoint(OUTPUT_DIR)
    if last_checkpoint is not None:
      print(f"Resuming training from checkpoint: {last_checkpoint}")
    else:
      print("No checkpoint found. Starting from scratch.")
    trainer.train(resume_from_checkpoint=last_checkpoint)

    print("Fine-Tuning complete. Saving model adapter.")
    trainer.model.save_pretrained(OUTPUT_DIR)

    # The rest of the saving/merging code is optional and can be run later.

Loaded training data with 17053 examples.


In [None]:
if __name__ == "__main__":
    # FINAL CHECK: Ensure you have run 'huggingface_hub.login()' in a previous cell!
    # Same allocator setting as the dedicated cell above, repeated so that this cell
    # also works when that one was not run.
    os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
    print("Set PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True")
    run_fine_tuning()

Set PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True


config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Adding EOS to train dataset:   0%|          | 0/17053 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/17053 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/17053 [00:00<?, ? examples/s]

The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'eos_token_id': 128009}.


No checkpoint found. Starting from scratch.


  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:[34m[1mwandb[0m: You chose 'Create a W&B account'
[34m[1mwandb[0m: Create an account here: https://wandb.ai/authorize?signup=true&ref=models
[34m[1mwandb[0m: Paste an API key from your profile and hit enter:[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mgsr2149[0m ([33mgsr2149-columbia-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


[34m[1mwandb[0m: Detected [google.genai, mcp] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/
  return fn(*args, **kwargs)


Step,Training Loss
10,0.8229
20,0.6706
30,0.5152
40,0.4875
50,0.492
60,0.4648
70,0.5415
80,0.4702
90,0.4901
100,0.4848


  return fn(*args, **kwargs)
  return fn(*args, **kwargs)


Step,Training Loss
10,0.8229
20,0.6706
30,0.5152
40,0.4875
50,0.492
60,0.4648
70,0.5415
80,0.4702
90,0.4901
100,0.4848


Fine-Tuning complete. Saving model adapter.


# SFT Eval

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch

BASE_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
# Hub repository holding the trained LoRA adapter.
LORA_REPO = "gsr2149/llama3-cot-lora"

tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)

# Base weights in fp16; device placement is delegated to device_map="auto".
base_model = AutoModelForCausalLM.from_pretrained(
    BASE_MODEL,
    torch_dtype=torch.float16,
    device_map="auto"
)

# Attach the adapter without merging. These module-level `model` / `tokenizer`
# objects are the ones `run_fresh_evaluation` uses.
model = PeftModel.from_pretrained(base_model, LORA_REPO)
model.eval()


tokenizer_config.json:   0%|          | 0.00/55.4k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/855 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/184 [00:00<?, ?B/s]

adapter_config.json:   0%|          | 0.00/986 [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/27.3M [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(128256, 4096)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaAttention(
              (q_proj): lora.Linear(
                (base_layer): Linear(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=32, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=32, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
              (k_proj): Linear(in_f

In [None]:
import torch

# Define your device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the model to the GPU
# NOTE(review): the model was loaded with device_map="auto" in the previous cell;
# confirm that an explicit .to() is appropriate for a model dispatched that way.
model = model.to(device)

In [None]:
from tqdm import tqdm

# --- Configuration ---
BASE_MODEL = "meta-llama/Llama-3.1-8B-Instruct"
OUTPUT_DIR = LORA_REPO # Adapter location handed to PeftModel.from_pretrained; here a Hub repo id (LORA_REPO), not the local training output directory
JSON_TEST_FILE_PATH = '/content/drive/MyDrive/AML-FINAL-COPY/Copy of merged_data.json'# Your source JSON

# --- Helper Functions ---

def load_and_prepare_test_dataset(json_path: str) -> Dataset:
    """Read a JSON file of records and wrap it in a Hugging Face ``Dataset``.

    Args:
        json_path: Path to a UTF-8 JSON file holding a list of dicts.

    Returns:
        The dataset built with ``Dataset.from_list``, or ``None`` when reading or
        conversion fails for any reason (the error is printed, not raised).
    """
    try:
        with open(json_path, 'r', encoding='utf-8') as handle:
            records = json.load(handle)
        prepared = Dataset.from_list(records)
    except Exception as load_error:
        print(f"Error loading test dataset: {load_error}")
        return None

    print(f"Loaded test dataset with {len(prepared)} examples.")
    return prepared

def generate_response(model, tokenizer, problem_text: str, device: str) -> Dict[str, Any]:
    """Generate the model's structured JSON answer for one problem.

    Builds the same Llama-3 chat prompt used for SFT, decodes greedily, cuts the
    assistant turn out of the decoded text and parses it as JSON.

    Args:
        model: Causal LM exposing ``generate`` (and, normally, a ``device`` attribute).
        tokenizer: Matching tokenizer; its ``pad_token_id`` is forwarded to ``generate``.
        problem_text: Problem statement placed in the user turn.
        device: Fallback device for the inputs, used only when the model does not
            report a usable device of its own.

    Returns:
        The parsed JSON object, or ``{"error": "Invalid JSON output",
        "raw_output": <full decoded text>}`` when the assistant turn is not a
        valid JSON object. Parse failures are printed, never raised.
    """

    # Llama 3.1 Instruction Template (Same as used in SFT training)
    system_prompt = "You are an expert algorithmic solver. You must provide a Chain-of-Thought (CoT) reasoning path and output a valid JSON object."

    # Construct the full prompt string
    prompt = f"""<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{system_prompt}<|eot_id|><|start_header_id|>user<|end_header_id|>\n\n{problem_text}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"""

    # Put the inputs where the model actually lives. A model loaded with
    # device_map="auto" can end up on the CPU even when CUDA is available, and
    # sending the inputs to "cuda" then fails with a device-mismatch RuntimeError.
    # A "meta" device (offloaded weights) is not a real target, so fall back.
    target_device = getattr(model, "device", None)
    if target_device is None or str(target_device) == "meta":
        target_device = device
    inputs = tokenizer(prompt, return_tensors="pt").to(target_device)

    # Budget for CoT + code inside the JSON; a typical LeetCode solution may need
    # 512-1024 tokens.
    max_new_tokens = 768

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id,
            do_sample=False, # Use greedy decoding for deterministic results
        )

    # Special tokens are kept so the assistant header and <|eot_id|> can be used as
    # delimiters below.
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # Everything after the last assistant header, up to the first <|eot_id|>.
    start_tag = "<|start_header_id|>assistant<|end_header_id|>\n\n"
    model_output = generated_text.split(start_tag)[-1]
    json_string = model_output.split("<|eot_id|>")[0].strip()

    try:
        parsed_json = json.loads(json_string)
    except (ValueError, IndexError, json.JSONDecodeError) as e:
        # This handles cases where the model fails to produce valid/parsable JSON
        print(f"Warning: Failed to parse JSON for a prediction. Error: {e}")
        return {"error": "Invalid JSON output", "raw_output": generated_text}

    # Valid JSON that is not an object (list, string, number) would break callers
    # that use `.get(...)`, so report it the same way as unparsable output.
    if not isinstance(parsed_json, dict):
        print(
            "Warning: Failed to parse JSON for a prediction. "
            f"Error: expected a JSON object, got {type(parsed_json).__name__}"
        )
        return {"error": "Invalid JSON output", "raw_output": generated_text}

    return parsed_json

# --- Main Evaluation Logic ---

def run_evaluation():
    """Evaluate the merged LoRA model on the JSON test set with exact-match scoring.

    Reads the module-level ``BASE_MODEL``, ``OUTPUT_DIR`` (adapter location) and
    ``JSON_TEST_FILE_PATH``. Loads the base model, merges the adapter, generates
    one prediction per test example with ``generate_response``, prints summary
    statistics and writes per-example results to ``evaluation_results.json``.

    Returns:
        None. Returns early, without writing results, when the test set cannot
        be loaded.
    """

    # 1. Load and Merge Model
    device = "cuda" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    # Load the base model in bf16 on GPU (fp32 on CPU); placement is delegated to
    # device_map="auto".
    base_model = AutoModelForCausalLM.from_pretrained(
        BASE_MODEL,
        torch_dtype=torch.bfloat16 if device == "cuda" else torch.float32,
        device_map="auto",
        attn_implementation="sdpa",
    )

    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print(f"Merging LoRA adapter from {OUTPUT_DIR}...")
    model = PeftModel.from_pretrained(base_model, OUTPUT_DIR)
    # Merge the adapter weights into the base model weights
    model = model.merge_and_unload()
    model.eval()
    print("Model merged and ready for inference.")

    # device_map="auto" may place the model somewhere other than `device` (for
    # example on the CPU when the GPU is already occupied). Inputs must follow the
    # model, otherwise generation fails with a device-mismatch RuntimeError.
    placed_on = getattr(model, "device", None)
    if placed_on is not None and str(placed_on) != "meta":
        device = str(placed_on)
        print(f"Model weights are on: {device}")

    # 2. Load Test Data
    test_dataset = load_and_prepare_test_dataset(JSON_TEST_FILE_PATH)
    if test_dataset is None:
        return

    # 3. Run Inference and Collect Results
    results = []

    for i, example in enumerate(test_dataset):
        print(f"\n--- Running Inference for Example {i+1}/{len(test_dataset)} ---")

        # The column in your test JSON is 'problem'
        problem_text = example['problem']
        gold_answer = example['answer'].strip()

        predicted_json = generate_response(model, tokenizer, problem_text, device)
        if not isinstance(predicted_json, dict):
            # Defensive: keep the bookkeeping below uniform for any odd return value.
            predicted_json = {"error": "Invalid JSON output", "raw_output": str(predicted_json)}

        # Extract the predicted code; a missing or non-string answer counts as no prediction.
        raw_answer = predicted_json.get('answer', '')
        predicted_answer = raw_answer.strip() if isinstance(raw_answer, str) else ''

        results.append({
            "problem": problem_text,
            "gold_answer": gold_answer,
            "predicted_answer": predicted_answer,
            "prediction_successful": predicted_answer != "" and "error" not in predicted_json,
            "raw_model_output": predicted_json.get('raw_output', 'N/A')
        })

    # 4. Evaluation (Exact Match)
    # For coding problems an execution-based metric (Pass@k) is the usual gold
    # standard; exact string match on the code is used here for simplicity.
    correct_matches = 0
    total_successful_predictions = 0

    for res in results:
        if res["prediction_successful"]:
            total_successful_predictions += 1
            # Simple Exact Match: Check if the predicted code string is identical to the gold code string
            if res["predicted_answer"] == res["gold_answer"]:
                correct_matches += 1

    # --- Summary ---
    # Accuracy is over ALL test examples, so unparsable predictions count as misses.
    total_examples = len(test_dataset)
    exact_match_accuracy = (correct_matches / total_examples) * 100 if total_examples > 0 else 0

    print("\n\n#############################################")
    print(f"Evaluation Complete on {total_examples} Test Examples")
    print("#############################################")
    print(f"Total Successful/Parsable Predictions: {total_successful_predictions}")
    print(f"Exact Match Accuracy (Code String): {exact_match_accuracy:0.2f}% ({correct_matches}/{total_examples})")

    # Save detailed results for later analysis
    with open("evaluation_results.json", 'w') as f:
        json.dump(results, f, indent=4)

    print("\nDetailed results saved to 'evaluation_results.json'")


# Entry point for the merged-model evaluation. The recorded run below stopped at the
# first example with a device-mismatch RuntimeError (inputs on cuda:0, weights on cpu).
if __name__ == "__main__":
    run_evaluation()

Using device: cuda


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]



Merging LoRA adapter from gsr2149/llama3-cot-lora...
Model merged and ready for inference.
Loaded test dataset with 95 examples.

--- Running Inference for Example 1/95 ---




RuntimeError: Expected all tensors to be on the same device, but got index is on cuda:0, different from other tensors on cpu (when checking argument in method wrapper_CUDA__index_select)

In [None]:
import json
import torch
from tqdm import tqdm

def run_fresh_evaluation(JSON_TEST_FILE_PATH):
    """Run greedy inference over a JSON test set and save the per-example outputs.

    Uses the notebook-level ``model`` and ``tokenizer`` objects, which must already
    exist in the kernel when this function is called.

    Args:
        JSON_TEST_FILE_PATH: Path to a JSON file containing a list of records. Each
            record is read for an "instruction" (falling back to "problem") key
            holding the question, and an "output" key holding the expected answer.

    Side effects:
        Writes one result record per test entry to "sft_evaluation_results.json" in
        the current working directory and prints a short summary.
    """
    answer_marker = "Answer:"
    parse_failure = "Parsing Error"

    # 1. Load the test dataset
    with open(JSON_TEST_FILE_PATH, 'r', encoding="utf-8") as f:
        test_data = json.load(f)

    results = []
    total_successful_predictions = 0

    print(f"Starting evaluation on {len(test_data)} examples...")

    # 2. Set model to evaluation mode
    model.eval()

    for entry in tqdm(test_data):
        # The question may live under either key depending on the file's layout.
        problem_text = entry.get("instruction") or entry.get("problem")

        if not problem_text:
            # Without a question the prompt would read "Question: None"; record the
            # entry as a failure instead of spending a generation pass on it.
            results.append({
                "input": problem_text,
                "expected": entry.get("output"),
                "model_output": parse_failure,
                "full_generation": ""
            })
            continue

        # 3. Format the prompt
        prompt = f"Instruction: Solve the following. Question: {problem_text} Answer:"

        # 4. Tokenize and move to the model's device
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        # 5. Generate output. Decoding is greedy, so no temperature is passed (it is
        #    only used when sampling). pad_token_id is set explicitly so generate()
        #    does not have to fall back to eos and warn on every example.
        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,  # Sufficient for Chain of Thought
                do_sample=False,
                pad_token_id=tokenizer.eos_token_id
            )

        # The decoded text contains the prompt followed by the continuation.
        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        # 6. Parsing: keep whatever follows the last "Answer:" marker. A prediction
        #    only counts as parsable when the marker is present and followed by
        #    non-empty text.
        _, marker_found, tail = generated_text.rpartition(answer_marker)
        clean_output = tail.strip() if marker_found else ""
        if clean_output:
            total_successful_predictions += 1
        else:
            clean_output = parse_failure

        results.append({
            "input": problem_text,
            "expected": entry.get("output"),
            "model_output": clean_output,
            "full_generation": generated_text
        })

    # 7. Save evaluation results
    with open("sft_evaluation_results.json", 'w', encoding="utf-8") as f:
        json.dump(results, f, indent=4)

    print("\nEvaluation Complete.")
    print(f"Total Parsable Predictions: {total_successful_predictions}")
    print("Results saved to 'sft_evaluation_results.json'")

# Run the evaluation on the test file.
# NOTE(review): JSON_TEST_FILE_PATH is not defined in this cell; it is expected to
# already exist in the kernel from an earlier cell.
run_fresh_evaluation(JSON_TEST_FILE_PATH)

Starting evaluation on 95 examples...


  0%|          | 0/95 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  1%|          | 1/95 [00:35<55:27, 35.40s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  2%|▏         | 2/95 [01:09<53:49, 34.73s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  3%|▎         | 3/95 [01:43<52:58, 34.55s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  4%|▍         | 4/95 [02:01<42:27, 28.00s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  5%|▌         | 5/95 [02:22<38:02, 25.36s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  6%|▋         | 6/95 [02:56<42:09, 28.42s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  7%|▋         | 7/95 [03:31<44:31, 30.36s/it]Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  8%|▊         | 8/95 [04:05<45:49, 31.60s/it]Setting `p


Evaluation Complete.
Total Parsable Predictions: 95
Results saved to 'sft_evaluation_results.json'





In [None]:
from google.colab import files

# Download the saved evaluation results from the Colab runtime to the local machine
# (typically the browser's 'Downloads' folder).
files.download('sft_evaluation_results.json')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
def ask_model():
    """Interactive question/answer loop against the notebook-level model.

    Reads questions from standard input until the user types 'exit' (case and
    surrounding whitespace are ignored) or interrupts the prompt, and prints the
    model's sampled continuation for each question.

    Uses the notebook-level ``model`` and ``tokenizer`` objects, which must already
    exist in the kernel.
    """
    model.eval()
    print("Type 'exit' to stop.")
    while True:
        try:
            question = input("Your Question: ")
        except (KeyboardInterrupt, EOFError):
            # Stopping the cell (or a closed stdin) ends the session cleanly
            # instead of surfacing a traceback.
            print("\nSession ended.")
            break

        command = question.strip()
        if command.lower() == 'exit':
            break
        if not command:
            # Nothing was asked; prompt again rather than generating from an empty question.
            continue

        prompt = f"Instruction: Provide a detailed reasoning path.\nQuestion: {question}\nAnswer:"
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

        with torch.no_grad():
            outputs = model.generate(
                **inputs,
                max_new_tokens=512,
                eos_token_id=tokenizer.eos_token_id,  # Stop as soon as EOS is produced
                pad_token_id=tokenizer.eos_token_id,  # Explicit pad id for open-ended generation
                do_sample=True,
                temperature=0.7,
                repetition_penalty=1.2  # Discourages repetitive loops
            )

        # generate() returns prompt tokens followed by the continuation; decode only
        # the newly generated part so the prompt is not echoed back as the response.
        prompt_length = inputs["input_ids"].shape[-1]
        response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)

        print(f"\nModel Response:\n{response}\n")
        print("-" * 30)

# Start the interactive session; it keeps prompting until the user types 'exit'.
ask_model()

Type 'exit' to stop.
Your Question:  Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target.  You may assume that each input would have exactly one solution, and you may not use the same element twice.

Model Response:
Instruction: Provide a detailed reasoning path.
Question:  Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target.  You may assume that each input would have exactly one solution, and you may not use the same element twice.
Answer: 
### Reasoning Path
#### Step 1: Problem Understanding
- The problem involves finding two elements in an array whose sum equals a given target value.
- Each input is expected to have only one unique solution.

#### Step 2: Approach Overview
- Utilize a hash table (dictionary) for efficient lookup of values.
- Iterate through the list once while using the second pointer to find matches.
- Update pointers accordingl

KeyboardInterrupt: Interrupted by user

In [None]:
# 1. Define your test question
user_question = "Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target."

# 2. Build the "Dual-Output" Prompt
# We are 'pre-loading' the start of the answer to steer the model
prompt = f"""Instruction: Solve the LeetCode problem below.
Step 1: Provide a step-by-step reasoning (Chain of Thought).
Step 2: Provide the optimized Python code.

Question: {user_question}

Answer:
### Reasoning:"""

# 3. Tokenize and move to the model's device
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# 4. Generate with specific 'Stop' and 'Length' settings
with torch.no_grad():
    outputs = model.generate(
        **inputs,
        max_new_tokens=1024,      # Higher limit so it doesn't cut off the code
        do_sample=True,            # temperature only applies when sampling, so enable it explicitly
        temperature=0.2,           # Lower temp keeps it focused on the logic
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        repetition_penalty=1.1     # Prevents the model from getting stuck in a loop
    )

# 5. Decode and Print (the decoded text includes the prompt followed by the continuation)
full_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(full_output)

Instruction: Solve the LeetCode problem below. 
Step 1: Provide a step-by-step reasoning (Chain of Thought). 
Step 2: Provide the optimized Python code.

Question: Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target.

Answer:
### Reasoning: 

#### Step 1: Understanding the Problem

- The task is to find two numbers in an array that sum up to a given target.
- The goal is to identify these two numbers and return their indices.

#### Step 2: Approach Overview

- Create a dictionary to store the elements we have seen so far along with their indices.
- Iterate through the array and for each element, check if its complement (target - current element) exists in the dictionary.
- If found, return the indices of both numbers.
- If not found, update the dictionary with the current element and its index.

#### Step 3: Implementation Details

- Initialize an empty dictionary `seen` to store elements and their indices.
- Iterate 

In [None]:
# Push the adapter and the tokenizer to your HF account
# NOTE(review): this repo id says "llama-2-7b", while other cells in this notebook
# load from and upload to "gsr2149/llama3-cot-lora" — confirm the intended target repo.
model.push_to_hub("gsr2149/llama-2-7b-leetcode-cot")
tokenizer.push_to_hub("gsr2149/llama-2-7b-leetcode-cot")

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   2%|2         | 1.21MB / 54.5MB            

README.md: 0.00B [00:00, ?B/s]

Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...mppar678kp/tokenizer.json:  89%|########9 | 15.3MB / 17.2MB            

CommitInfo(commit_url='https://huggingface.co/gsr2149/llama-2-7b-leetcode-cot/commit/6914b64acb4d84116741a8bfaba3693e82f5ff64', commit_message='Upload tokenizer', commit_description='', oid='6914b64acb4d84116741a8bfaba3693e82f5ff64', pr_url=None, repo_url=RepoUrl('https://huggingface.co/gsr2149/llama-2-7b-leetcode-cot', endpoint='https://huggingface.co', repo_type='model', repo_id='gsr2149/llama-2-7b-leetcode-cot'), pr_revision=None, pr_num=None)

In [None]:
from huggingface_hub import HfApi

# Create the target model repository on the Hugging Face Hub.
# exist_ok=True makes this safe to re-run when the repo already exists.
api = HfApi()
api.create_repo(
    repo_id="gsr2149/llama3-cot-lora",
    repo_type="model",
    exist_ok=True
)


RepoUrl('https://huggingface.co/gsr2149/llama3-cot-lora', endpoint='https://huggingface.co', repo_type='model', repo_id='gsr2149/llama3-cot-lora')

In [None]:
# Upload the fine-tuning output folder from Google Drive to the model repo.
# Relies on `api` (the HfApi instance) created in an earlier cell.
# NOTE(review): the whole folder is uploaded as-is; the progress output of this cell
# lists checkpoint optimizer.pt / scheduler.pt / rng_state.pth files, so intermediate
# training state is published too. Consider upload_folder's `ignore_patterns` if only
# the final adapter is meant to be shared.
api.upload_folder(
    folder_path="/content/drive/MyDrive/llama3_cot_finetuned",
    repo_id="gsr2149/llama3-cot-lora",
    repo_type="model"
)


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...kpoint-1066/rng_state.pth:  77%|#######7  | 11.3kB / 14.6kB            

  ...kpoint-2132/rng_state.pth:  77%|#######7  | 11.3kB / 14.6kB            

  ...kpoint-3198/rng_state.pth:  77%|#######7  | 11.3kB / 14.6kB            

  ...ckpoint-1066/optimizer.pt:   1%|1         |  349kB / 27.8MB            

  ...adapter_model.safetensors:   1%|          |  209kB / 27.3MB            

  ...ckpoint-3198/scheduler.pt:   1%|1         |  18.0B / 1.47kB            

  ...ckpoint-2132/optimizer.pt:   8%|8         | 2.36MB / 27.8MB            

  ...ckpoint-3198/optimizer.pt:   8%|8         | 2.36MB / 27.8MB            

  ...912482.c97b5462718c.242.0:   8%|8         | 10.9kB /  129kB            

  ...adapter_model.safetensors:  11%|#1        | 3.09MB / 27.3MB            

CommitInfo(commit_url='https://huggingface.co/gsr2149/llama3-cot-lora/commit/6972a14ad78d9a74423888042bf98f4a0e720174', commit_message='Upload folder using huggingface_hub', commit_description='', oid='6972a14ad78d9a74423888042bf98f4a0e720174', pr_url=None, repo_url=RepoUrl('https://huggingface.co/gsr2149/llama3-cot-lora', endpoint='https://huggingface.co', repo_type='model', repo_id='gsr2149/llama3-cot-lora'), pr_revision=None, pr_num=None)