# Testing Our Loaded Models
- This notebook loads the saved Leetcode and English-Spanish translation models for inference. It is only meant for trying out prompts and for a manual, qualitative analysis of the models' responses to those prompts.

In [1]:
import torch
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported
from unsloth.chat_templates import standardize_sharegpt
from datasets import load_dataset
from transformers import TextStreamer
from tqdm import tqdm
from peft import PeftModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


  from .autonotebook import tqdm as notebook_tqdm


Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!


# Load Base Llama 3.2 1B Model

In [2]:
# Loader options; the same three values are reused when the second base model is loaded.
max_seq_length, load_in_4bit = 2048, True
dtype = None  # None leaves the dtype choice to the loader (auto-detect)

# Fetch the Llama 3.2 1B Instruct base model together with its tokenizer.
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)


==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    GRID A100X-20C. Num GPUs = 1. Max memory: 19.996 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


# Apply the Leetcode Finetuning

In [3]:
# Attach the adapter saved under models/llama-leetcode-4bit to the base model.
model = PeftModel.from_pretrained(base_model, model_id="models/llama-leetcode-4bit")


In [None]:
# Perplexity of the finetuned model over 1000 shuffled examples, scored with a
# strided sliding window over each tokenized example.
#
# NOTE(review): `dataset` is not defined in any cell visible in this notebook. It has
# to be loaded (with a 'text' column) before this cell can run on a fresh kernel.
max_length = model.config.max_position_embeddings
stride = 512

device = model.device

# shuffle and select 1000 examples
shuffled_dataset = dataset.shuffle(seed=42).select(range(1000))

# accumulators for total negative log likelihood and number of scored tokens
nll_sum = 0.0
n_tokens = 0

model.eval()


# loop over each prompt
for example in tqdm(shuffled_dataset):
    prompt = example['text']
    encodings = tokenizer(prompt, return_tensors="pt")
    input_ids = encodings.input_ids
    seq_len = input_ids.size(1)

    prev_end_loc = 0

    for begin_loc in range(0, seq_len, stride):
        end_loc = min(begin_loc + max_length, seq_len)
        # Only tokens that the previous window has not already scored are targets.
        trg_len = end_loc - prev_end_loc
        inputs = input_ids[:, begin_loc:end_loc].to(device)

        # Overlapping context is masked with -100 so the loss ignores it.
        targets = inputs.clone()
        targets[:, :-trg_len] = -100

        # The causal-LM loss shifts labels by one position, so the label in column 0
        # never contributes. Count exactly the labels that remain after that shift
        # instead of assuming column 0 was always a valid (unmasked) label.
        num_loss_tokens = (targets[:, 1:] != -100).sum().item()

        # A window with nothing to score (e.g. a one-token example) yields a NaN mean
        # loss, and NaN * 0 would poison the running sum, so such windows are skipped.
        if num_loss_tokens > 0:
            with torch.no_grad():
                outputs = model(inputs, labels=targets)
                n_log = outputs.loss

            # outputs.loss is a mean over the scored tokens; scale it back to a sum.
            nll_sum += n_log * num_loss_tokens
            n_tokens += num_loss_tokens

        prev_end_loc = end_loc
        if end_loc == seq_len:
            break

# calculate final perplexity
if n_tokens == 0:
    raise ValueError("no tokens were scored; cannot compute perplexity")
avg_nll = nll_sum / n_tokens
ppl = torch.exp(avg_nll)

print(f"perplexity: {ppl}")


# Inference of Finetuned Leetcode Model

In [19]:
# Switch the Leetcode model to Unsloth's inference mode before any generation,
# then print a confirmation so the cell has visible output.
FastLanguageModel.for_inference(model)
print("in inference mode now")

in inference mode now


In [21]:
# Repeated from the previous cell so this cell can be run on its own.
FastLanguageModel.for_inference(model)

# Rebind the tokenizer to the version configured with the "llama-3.1" chat template;
# apply_chat_template() in the generation loop below relies on it.
tokenizer = get_chat_template(tokenizer, chat_template="llama-3.1")



# Streamer handed to generate() below; skip_prompt=True asks it not to echo the prompt.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)

# To produce these prompts, ChatGPT was given some examples from the Leetcode dataset
# and asked to write concise prompts. Each entry is one problem statement that is sent
# to the model as a single user message.

prompts = [
    "You are given an integer array `nums` and an integer `k`. Find the maximum subarray sum of all the subarrays of `nums` that meet the following conditions:\n- The length of the subarray is `k`, and\n- All the elements of the subarray are **distinct**.\nReturn the maximum subarray sum of all the subarrays that meet the conditions. If no subarray meets the conditions, return `0`.",
    
    "You are given an array `nums` consisting of positive integers. You are also given an integer array `queries` of size `m`. For the `ith` query, you want to make all of the elements of `nums` equal to `queries[i]`. You can perform the following operation on the array any number of times:\n- Increase or decrease an element of the array by `1`.\nReturn an array `answer` of size `m` where `answer[i]` is the **minimum** number of operations to make all elements of `nums` equal to `queries[i]`.",
    
    "You are a professional robber planning to rob houses along a street. Each house has a certain amount of money stashed, the only constraint stopping you from robbing each of them is that adjacent houses have security systems connected and it will automatically contact the police if two adjacent houses were broken into on the same night.\nGiven an integer array `nums` representing the amount of money of each house, return the maximum amount of money you can rob tonight **without alerting the police**.",
    
    "Given an integer number `n`, return the difference between the product of its digits and the sum of its digits.",
    
    "Design a data structure that follows the constraints of a **Least Recently Used (LRU) cache**.\nImplement the `LRUCache` class:\n- `LRUCache(int capacity)` initializes the LRU cache with positive size `capacity`.\n- `int get(int key)` returns the value of the key if the key exists, otherwise return -1.\n- `void put(int key, int value)` update the value of the key if it exists. Otherwise, add the key-value pair to the cache. If the number of keys exceeds the capacity, evict the least recently used key.\nThe functions `get` and `put` must each run in O(1) average time complexity.",
    
    "Given a 0-indexed integer array `nums`, return `true` if it can be made **strictly increasing** after removing **exactly one** element, or `false` otherwise. If the array is already strictly increasing, return `true`.",
    
    "You are given an integer array `nums` and an integer `target`.\nYou want to build an **expression** out of nums by adding one of the symbols `'+'` and `'-'` before each integer in nums and then concatenate all the integers.\nReturn the number of different **expressions** that you can build, which evaluates to `target`.",
    
    "Alice and Bob want to water `n` plants in their garden. Each plant needs a specific amount of water. They water the plants simultaneously from opposite ends with watering cans of given capacities.\nWhenever one can isn’t enough, they refill it.\nIn case they meet at the same plant, the one with more water should water it.\nReturn the **number of times** they have to refill to water all the plants.",
    
    "Given two binary trees `original` and `cloned` and a reference to a node `target` in the original tree, return a reference to the same node in the cloned tree.",
    
    "In some array `arr`, the values were in arithmetic progression: the values `arr[i + 1] - arr[i]` are all equal for every `0 <= i < arr.length - 1`.\nA value from `arr` was removed that **was not the first or last value in the array**.\nGiven `arr`, return the removed value.",
    
    "The Fibonacci numbers form a sequence such that each number is the sum of the two preceding ones, starting from `0` and `1`.\nGiven `n`, calculate `F(n)`.",
    
    "You are given a 2D integer array `questions` where each question earns you points and blocks the next `brainpower[i]` questions if you solve it.\nReturn the **maximum** points you can earn for the exam.",
    
    "In a row of dominoes, `tops[i]` and `bottoms[i]` represent the top and bottom halves of the ith domino.\nWe may rotate the ith domino. Return the minimum number of rotations so that all the values in `tops` are the same, or all the values in `bottoms` are the same.\nIf it cannot be done, return `-1`.",
    
    "You are given a string `s` consisting only of letters `'a'` and `'b'`. In a single step you can remove one **palindromic subsequence** from `s`.\nReturn the **minimum** number of steps to make the given string empty.",
    
    "You are given several logs, each with a unique ID and a timestamp in format `Year:Month:Day:Hour:Minute:Second`.\nImplement a `LogSystem` class to `put` logs and `retrieve` logs that fall within a time range, with precision defined by a `granularity`.",
    
    "You are given an integer array `nums`. Pick a number to delete and earn points equal to that number. Then, all elements equal to `nums[i] - 1` and `nums[i] + 1` are also deleted.\nReturn the maximum number of points you can earn by repeating this process.",
    
    "The cafeteria offers sandwiches and students queue to eat. Each student either takes the sandwich on top of the stack or goes to the end of the queue.\nThis continues until no student wants the top sandwich.\nReturn the number of students that are unable to eat.",
    
    "You are given an array `security`, where `security[i]` is the number of guards on day `i`, and an integer `time`.\nReturn all days where the `time` days before are non-increasing and the `time` days after are non-decreasing.",
    
    "Given a string `s`, find the first non-repeating character in it and return its index. If it does not exist, return `-1`.",
    
    "Implement the `myAtoi(string s)` function, which converts a string to a 32-bit signed integer following rules similar to C/C++'s `atoi`.",
    
    "You are given two string arrays `words1` and `words2`.\nA string `a` in `words1` is universal if for every string `b` in `words2`, `b` is a subset of `a`.\nReturn an array of all the universal strings in `words1`.",
    
    "You are given an array `nums` and an integer `k`.\nFind the longest subsequence of `nums` that is strictly increasing and the difference between adjacent elements is at most `k`.",
    
    "Given an array `nums` consisting of `2n` elements in the form `[x1,x2,...,xn,y1,y2,...,yn]`, return the array in the form `[x1,y1,x2,y2,...,xn,yn]`."
]




# Stream one completion per prompt, printing a marker after each so the
# individual answers can be told apart in the cell output.
for prompt in prompts:
    # Single-turn conversation holding just the problem statement.
    messages = [{"role": "user", "content": prompt}]

    # Render through the chat template, then move the token ids to the model's device.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        tokenize=True,
        return_tensors="pt",
    )
    inputs = inputs.to(model.device)

    # The streamer prints the text as it is produced, so the returned ids are discarded.
    _ = model.generate(
        input_ids=inputs,
        max_new_tokens=512,
        temperature=0.7,
        min_p=0.1,
        use_cache=True,
        streamer=text_streamer,
    )

    # Emits the same bytes as three consecutive print() calls on these strings.
    print("\n", "end of example", "\n", sep="\n")


def maximumSubarray(self, nums: List[int], k: int) -> int:
        ans = 0
        s = 0
        for i in range(len(nums)):
            if len(nums[i:])!= k: continue
            s += nums[i]
            ans = max(ans, s)
        return ans<|eot_id|>


end of example


def minOperations(self, nums: List[int], queries: List[int]) -> List[int]:
        answer = []
        for q in queries:
            nums.sort()
            min_diff = q - nums[0]
            nums[0] = q
            max_diff = q - nums[-1]
            nums[-1] = q
            min_ops = min_diff + max_diff
            answer.append(min_ops)
        return answer<|eot_id|>


end of example


def rob(nums):
    dp = [0] * len(nums)
    dp[0] = nums[0]
    dp[1] = max(nums[0], nums[1])
    for i in range(2, len(nums)):
        dp[i] = max(dp[i-1], dp[i-2] + nums[i])
    return dp[-1]<|eot_id|>


end of example


**Product vs Sum of Digits Difference**.<br> <br> https://leetcode.com/problems/product-vs-sum-of-digits-after-n-n

# Manual Output Analysis from the leetcode Model
- Outputs are very mixed.
- Some completions show logical structure, but many of them also have significant issues.
- These results align with the BLEU scores from earlier; however, I noticed that the quality of the prompts matters a lot here.
- The model performs better when it gets prompts formatted exactly as they were during finetuning - I didn't see any major issues when printing out random predictions while calculating the BLEU scores earlier.

# Load the English-Spanish Translation Finetuned Model

In [6]:
# Load the base model
# Load a second, separate copy of the base model and tokenizer (same options as the
# first load) to carry the English-Spanish adapter.
base_model2, tokenizer2 = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-1B-Instruct",
    load_in_4bit=load_in_4bit,
    dtype=dtype,
    max_seq_length=max_seq_length,
)

==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    GRID A100X-20C. Num GPUs = 1. Max memory: 19.996 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [7]:
# Attach the adapter saved under models/llama-enes-4bit to the second base model.
model2 = PeftModel.from_pretrained(base_model2, model_id="models/llama-enes-4bit")

# Switch it to Unsloth's inference mode and confirm in the cell output.
FastLanguageModel.for_inference(model2)
print("second model in inference mode now")

second model in inference mode now


# Inference of English-Spanish Translation Model

In [12]:
# Translate one English sentence with the EN-ES model and stream the result.
# Everything in this cell uses the "2" objects (model2 / tokenizer2 / text_streamer2)
# so it does not depend on the Leetcode inference cell having been run first.
FastLanguageModel.for_inference(model2)

tokenizer2 = get_chat_template(tokenizer2, chat_template="llama-3.1")

# NOTE(review): the prompt text is kept exactly as it was when the recorded output was
# produced; the trailing double quote and the whitespace after it look unintentional -
# confirm before cleaning them up, since any change alters the model input.
messages2 = [
    {
        "role": "user",
        "content": """ Muriel is now 20.  translate the above text to spanish" 
        """
    }
]

# Follow the model's device instead of hard-coding "cuda".
inputs = tokenizer2.apply_chat_template(
    messages2,
    tokenize=True,
    add_generation_prompt=True,
    return_tensors="pt",
).to(model2.device)

# Build the streamer from this model's own tokenizer and pass that same streamer to
# generate(); previously a streamer from another cell was used here.
text_streamer2 = TextStreamer(tokenizer2, skip_prompt=True)
_ = model2.generate(
    input_ids=inputs,
    streamer=text_streamer2,
    max_new_tokens=512,
    use_cache=True,
    temperature=0.7,
    min_p=0.1,
)


Ahora Muriel tiene 20 años.<|eot_id|>


# Manual Output Analysis from the EN-ES Model
- Most outputs looked fine.
- I didn't notice any repeated prompts or similar artifacts here, apart from the instruction "translate this english text to spanish" occasionally showing up in the Spanish translation of a prompt; after running the prompt again, this issue was resolved.
- I cross-checked some outputs against Google Translate, and the results seemed consistent with the strong BLEU results observed earlier.