# Load Libraries

In [3]:
import sys, torch
from random import randint
from numpy import repeat
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

from pi import pi_digits

# Load Model

In [4]:
# Checkpoint identifier handed to `from_pretrained` below. Swap in one of the
# commented alternatives to experiment with a different model.
pretrained_model = 'openai-community/gpt2-xl'
# pretrained_model = 'EleutherAI/gpt-neo-1.3B' ## Takes very long....
# pretrained_model = 'perplexity-ai/r1-1776' ## Bloody large...
# pretrained_model = 'meta-llama/Llama-3.2-3B-Instruct' ## Doesn't seem to work well

# Short name: everything after the last "/" of the identifier.
pretrained_model_name = pretrained_model.split("/")[-1]

# Prefer Apple's Metal backend (the device this notebook was written for),
# but fall back to CUDA or the CPU so the cell does not crash elsewhere.
if torch.backends.mps.is_available():
    device = 'mps'
elif torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

tokenizer = AutoTokenizer.from_pretrained(pretrained_model)
# tokenizer.pad_token_id = tokenizer.eos_token_id

model     = AutoModelForCausalLM.from_pretrained(pretrained_model).to(device)

# Debug Loop

In [8]:
# Seed text for the generation loop: `generated` starts out as this string and
# grows by one decoded token per iteration. Note the embedded newline.
prompt = "The poet do not simply\n 'predict' the next word"

In [9]:
# Token ids that are allowed to end the generation:
# exclamation mark, fullstop, question mark, two question marks, 
# two exclamation marks, EOS.
# NOTE(review): these ids are specific to the GPT-2 vocabulary; they need to
# be revisited if `pretrained_model` is switched to another tokenizer.
stopwords = [0, 13, 30, 3548, 3228, 50256]

# Not used by this debug cell; kept because the (commented-out) main loop
# further down works with a list of the same name.
write_out = []
score_frames = []    # one small DataFrame (top N+1 candidates) per step

N = 5                # 0-based rank of the token to pick (5 -> 6th most likely)
MIN_TOKENS = 77      # stopping is only considered once token_count exceeds this
generated = prompt

# Feeding more tokens than the model's context window is expected to fail, so
# the loop below bails out before that happens. Falls back to 1024 when the
# config does not expose the attribute.
context_limit = getattr(model.config, "max_position_embeddings", 1024)

token_count = 1
keep_generating = True

while keep_generating:
    # Re-encode the full text so far and put it on whatever device the model
    # lives on (instead of hard-coding "mps").
    encoded = tokenizer(generated, return_tensors="pt").to(model.device)

    # Safety valve: without it the loop never terminates if no stop token
    # ever makes it into the top N+1 candidates.
    if encoded["input_ids"].shape[1] >= context_limit:
        break

    # Get output from decoder: a single greedy step, keeping the scores.
    outputs = model.generate(**encoded,
                             return_dict_in_generate=True,
                             output_scores=True,
                             # output_hidden_states=True,
                             max_new_tokens=1,
                             do_sample=False,
                             pad_token_id=tokenizer.eos_token_id
                             )

    # Convert the scores of the one generated step to probabilities.
    # outputs.scores[0] is indexed [batch, vocab]; there is one sequence.
    step_probs = torch.nn.functional.softmax(outputs.scores[0], dim=1)[0]

    # Rank the whole vocabulary; the index of `probs` is the token id.
    # The cast to float64 happens on the CPU (after .cpu()).
    probs = (pd.Series(step_probs.cpu().double().numpy(), name="probability")
               .sort_values(ascending=False)
               .to_frame())

    ## We can:
    ## 1. Always take the nth most likely token.
    topN = probs.head(N + 1).copy()
    topN["token_count"] = token_count
    score_frames.append(topN)

    token_id = int(probs.index[N])

    if token_count > MIN_TOKENS:
        # topN is sorted by descending probability, so the last match is the
        # least likely ending token among the candidates.
        ending_ids = [idx for idx in topN.index if idx in stopwords]
        if ending_ids:
            token_id = int(ending_ids[-1])
            keep_generating = False

    ## 2. Pick a random token from the top 10.
    # token_id = probs.index[randint(4, 9)] # 5th to 9th place.

    ## 3. Use the digits of pi!
    # token_id = probs.index[int(d)]

    generated = generated + tokenizer.decode(token_id,
                                             skip_special_tokens=True,
                                             clean_up_tokenization_spaces=True)

    # Update token count.
    token_count += 1

    # Show progress; flush right away so the final state is displayed too.
    sys.stdout.write(f"\r{generated}")
    sys.stdout.flush()

# One row per (step, candidate token); the index holds the token ids.
scores = pd.concat(score_frames)
scores["decoded"] = scores.index.map(lambda x: tokenizer.decode(x))

The poet do not simply
The poet do not simplyrd;
The poet do not simplyrd; she
The poet do not simplyrd; she has
The poet do not simplyrd; she has '
The poet do not simplyrd; she has 'made
The poet do not simplyrd; she has 'made her
The poet do not simplyrd; she has 'made her words
The poet do not simplyrd; she has 'made her words fit
The poet do not simplyrd; she has 'made her words fit her
The poet do not simplyrd; she has 'made her words fit her context
The poet do not simplyrd; she has 'made her words fit her context';
The poet do not simplyrd; she has 'made her words fit her context'; in
The poet do not simplyrd; she has 'made her words fit her context'; in a
The poet do not simplyrd; she has 'made her words fit her context'; in a sentence
The poet do not simplyrd; she has 'made her words fit her context'; in a sentence of
The poet do not simplyrd; she has 'made her words fit her context'; in a sentence of two
The poet do not simplyrd; she has 'made her words fit her context'; in 

# Main functional loop

In [4]:
# prompts = ["Humans do not simply predict the next word.",
#            "Longing for an unlikely outcome,",
#            "Do you enjoy listening to a detuned piano?",
#            "The mirrors always break in the same ways.",
#            "This tongue is inevitably forked."
#           ]

In [5]:
# NOTE(review): this whole cell is commented out. It is a batch variant of the
# debug loop: for every prompt and every rank N in 4..9 it generates a
# continuation, collects the text in `write_out`, and writes one file per
# prompt under generations/.
# NOTE(review): `to_write` is assigned but never used in this cell.
# for prompt in prompts:
    
#     print(prompt)
    
#     write_out = []
#     # scores = []
#     to_write = []
    
#     # for idx, d in enumerate(pi_digits[:100]):
#     for N in [4,5,6,7,8,9]:

#         generated = prompt

#         token_count = 1
#         # token_id = 0
#         keep_generating = True
        
#         while keep_generating:
#             encoded = tokenizer(generated, return_tensors="pt").to("mps")
            
#             # Get output from decoder.
#             outputs = model.generate(**encoded,
#                                      return_dict_in_generate=True, 
#                                      output_scores=True,
#                                      # output_hidden_states=True,
#                                      max_new_tokens=1,
#                                      do_sample=False,
#                                      pad_token_id=50256 #128001
#                                      )
            
#             # Convert to probabilities.
#             probs = torch.nn.functional.softmax(outputs.scores[0], dim=1).cpu()
            
#             # Roll my own lookup.
#             probs = pd.DataFrame(probs.tolist()[0])
#             probs.columns = ['probability']
#             probs.sort_values(ascending=False, by='probability', inplace=True)
            
#             ## We can:
#             ## 1. Always take the nth most likely token.
#             topN = probs.head(N+1).copy()
#             # topN.loc[:, "token_count"] = repeat(token_count, len(topN))
#             # scores.append(topN)
        
#             # NOTE(review): the `else` below pairs with the OUTER `if`. Once
#             # token_count > 77 and no stop token is in topN, neither branch
#             # assigns token_id, so the previous iteration's token is appended
#             # again. Also, the stop branch always emits token 13 (fullstop)
#             # regardless of which stop token was found. Fix before re-enabling.
#             if token_count > 77:
#                 if any(item in topN.index for item in [0, 13, 30, 3548, 3228, 50256]):
#                     # exclamation mark, fullstop, question mark, two question marks, 
#                     # two exclamation marks, EOS.
#                     token_id = 13
#                     keep_generating = False
#             else:
#                 token_id = probs.index[N]
        
#             ## 2. Pick a random token from the top 10.
#             # token_id = probs.index[randint(4, 9)] # 5th to 9th place.
        
#             ## 3. Use the digits of pi!
#             # token_id = probs.index[int(d)]
        
#             generated = generated + tokenizer.decode(token_id, 
#                                                      skip_special_tokens=True, 
#                                                      clean_up_tokenization_spaces=True)
        
        
#             # Update token count.
#             token_count += 1
        
#         # scores = pd.concat(scores)
#         write_out.append(f"N = {N+1}\n" + generated + "\n\n")

#     # Writing multiple lines to the file
#     with open(f'generations/{pretrained_model_name}-{prompt}.txt', 'w') as out:
#         out.writelines(write_out)

    

Humans do not simply predict the next word.



KeyboardInterrupt



# Recycle Bin

In [99]:
# N = 25
# topN = torch.topk(probs, N, sorted=True)
# bottomN = torch.topk(probs, N, largest=False, sorted=True)

In [100]:
# high = 6
# low  = 9

# [prompt + newToken 
#      for newToken in 
#      gpt2_tokenizer.batch_decode(probs.index[high].tolist(), 
#                                  skip_special_tokens=True,
#                                  clean_up_tokenization_spaces=True)
#     ]