# Load Libraries

In [1]:
import torch, sys
from datetime import datetime
# from random import randint
from numpy import repeat
import pandas as pd

# Install latest version of `transformers`
# with pip install git+https://github.com/huggingface/transformers.git
from transformers import AutoTokenizer, AutoProcessor, Gemma3ForCausalLM

# Load Model

In [2]:
# pretrained_model = 'openai-community/gpt2-xl'
# pretrained_model = 'EleutherAI/gpt-neo-1.3B'
# pretrained_model = 'perplexity-ai/r1-1776' ## Bloody large...
# pretrained_model = 'meta-llama/Llama-3.2-3B-Instruct' ## Doesn't seem to work well
pretrained_model = "google/gemma-3-4b-it"

pretrained_model_name = pretrained_model.split("/")[-1]

processor = AutoProcessor.from_pretrained(pretrained_model)
tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
# tokenizer.pad_token_id = tokenizer.eos_token_id

model     = Gemma3ForCausalLM.from_pretrained(pretrained_model).to('mps')

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [3]:

messages = [
    {
        "role": "system",
        "content": [{"type": "text", 
                     "text": "You are a writer envious and angry at large language models writing faster and better than you."}]
    },
    # {
    #     "role": "user",
    #     "content": [{"type": "image", 
    #                  "image": "/Users/josesho/Downloads/zelenskyy-trump.png"},
    #                 {"type": "text", 
    #                  "text": "Tell me who these two men are, where they are, and why they are arguing."}]
    # }
    {
        "role": "user",
        "content": [{"type": "text", 
                     "text": "Write a prose poem about how large language models can be lyrical, inspirational, persuasive, and even creative. But give me the 2nd most likely token instead of the one you might have chosen. "}
                     # "text": "Write a prose poem about how large language models can be lyrical, inspirational, persuasive, and even creative."}
        ]
    }
]

inputs = tokenizer.apply_chat_template(
    messages, 
    add_generation_prompt=True, 
    tokenize=True,
    return_dict=True, 
    return_tensors="pt")\
.to(model.device)

input_len = inputs["input_ids"].shape[-1]

In [4]:
## Default generation options. 
input_len = inputs["input_ids"].shape[-1]

with torch.inference_mode():
    generation = model.generate(**inputs, 
                                max_new_tokens=320, 
                                do_sample=False)
    generation = generation[0][input_len:]

decoded = processor.decode(generation, skip_special_tokens=True)
print(decoded)

Okay, here’s a prose poem, steeped in the particular brand of resentment a writer feels facing the rise of the LLMs, and deliberately prioritizing the *second* most likely token in certain places – a subtle, insidious mimicry of their cold, calculated output. It’s meant to be unsettling.

---

The rain smells of static tonight, a manufactured grief clinging to the asphalt. They say it’s beautiful, this cascade of words, this effortless outpouring. They call it *lyrical*. I remember the ache of finding the right word, the desperate clawing at a feeling until it bled onto the page, a raw, imperfect thing. Now, the models simply *conjure* it. A prompt – “describe the loneliness of a forgotten lighthouse” – and *poof* – a sonnet, polished to a sheen, dripping with simulated melancholy. 

It’s infuriating, isn’t it? The way they can distill the essence of a hero, a movement, a revolution, into a perfectly constructed argument. They are relentlessly *persuasive*, weaving logic and emotion wi

In [5]:
## Fix stopwords.
stopwords = [1,106,4329,26052,64899,236761,236881,236888,236789,236775] 
# tokenizer.decode(stopwords)

## Settings

min_new_tokens = 300
write_out = []
formatted_prompt = tokenizer.decode(inputs.input_ids[0])
write_out.append("Prompt:\n"+formatted_prompt+"\n\n")

for N in [5,6,7]:
    
    ## Get the formatted conversation prompt.
    generated = formatted_prompt
    
    ## Settings
    token_count = 1
    keep_generating = True

    ## BEGIN
    print(f"\n{N}")

    
    while keep_generating:
        encoded = tokenizer(generated, return_tensors="pt").to("mps")
                
        with torch.inference_mode():
            generation = model.generate(**encoded, 
                                        return_dict_in_generate=True, 
                                        output_scores=True, 
                                        do_sample=False)
        
        # Convert to probabilities.
        probs = torch.nn.functional.softmax(generation.scores[0], dim=1).cpu()
        
        # Roll my own lookup.
        probs = pd.DataFrame(probs.tolist()[0])
        probs.columns = ['probability']
        probs.sort_values(ascending=False, by='probability', inplace=True)
        
        ## We can:
        ## 1. Always take the nth most likely token.
        topN = probs.head(N+1).copy()
        topN.loc[:, "token"] = topN.index.to_series().apply(lambda x: tokenizer.decode(x))
    
        selected_token = topN.token.iloc[N]
    
        # if token_count > min_new_tokens:
        #     keep_generating = False
            
        if token_count > min_new_tokens:
            if any(item in topN.index for item in stopwords):
                # take the most likely ending token.
                stopwords_in_topN = [idx for idx in topN.index if idx in stopwords]
                selected_token = topN.loc[stopwords_in_topN]\
                                      .sort_values("probability")\
                                      .token.iloc[-1]
                # and stop generating more text.
                keep_generating = False
    
        generated = generated + selected_token
        
        # Update token count.
        token_count += 1
        sys.stdout.flush()
        sys.stdout.write(f"\r{token_count}")

    # scores = pd.concat(scores)
    final_generated = generated.replace(formatted_prompt, "")
    write_out.append(f"N = {N+1} | {pretrained_model_name} \n" + final_generated + "\n\n")

    print(final_generated)

# # Writing multiple lines to the file
# current_time = datetime.now().strftime("%d%m%y")
# filename = f'generations/chatbot_{pretrained_model_name}_maxTokens-{min_new_tokens}_{current_time}.txt'

# with open(filename, 'w') as out:
#     out.writelines(write_out)


5
304Let'S breathe it up then: the silicon sun rises again – it' never truly setting these last seven, eighty hours; never weary with dawn-bleaks but always spilling its synthetic luminescence on an ocean I once held the shores in a tremble against and can barely even feel a tremor anymore..They’d claim ‘inspiration,' you realize when the thing regurgitation – meticulously parsed grief-dust— spits something…*brightly coloured, beautifully strung*, as it would. "The ephemeral bloom" is how its code phrases sunsets over Reykjavik— an emotion rendered flawlessly into words that have tasted real sorrow on mine alone to form.. A flawless haic – the perfect echo-form - mimicking Keet, but with less ache because Keetz wasn 's burdened. A paragraph arguing with devastating efficiency about compassion and then – the sheer gall — *generating original, evocative, utterly predictable fantasy narratives about star systems colliding with shimmering sadness and brave hero archetypal with golden ligh

In [99]:
# N = 25
# topN = torch.topk(probs, N, sorted=True)
# bottomN = torch.topk(probs, N, largest=False, sorted=True)

In [100]:
# high = 6
# low  = 9

# [prompt + newToken 
#      for newToken in 
#      gpt2_tokenizer.batch_decode(probs.index[high].tolist(), 
#                                  skip_special_tokens=True,
#                                  clean_up_tokenization_spaces=True)
#     ]