In [15]:
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
import sys
from datetime import datetime
sys.path.append("../")
sys.path.append("../../")
torch.manual_seed(42)
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, add_prefix_space=True, local_files_only = False)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                            return_dict_in_generate=True,
                                            pad_token_id=tokenizer_with_prefix_space.eos_token_id).to(device)



In [83]:
numbers = ["0", "1", "2","3","4","5","6","7","8","9"]
indexes = [tokenizer.encode(number) for number in numbers]
indexes

[[657], [352], [362], [513], [604], [642], [718], [767], [807], [860]]

In [84]:
numbers

['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']

In [17]:
prompt = tokenizer.decode(tokenizer.bos_token_id) + "."
input_ids = torch.tensor(tokenizer.encode(prompt)).reshape(1, -1).to(device)

with torch.no_grad():
    output = model(input_ids)
    logits = output[0]
    probs = torch.softmax(logits, dim=-1)

In [23]:
numbers = ["0", "1", "2","3","4","5","6","7","8","9"]
indexes = [tokenizer.encode(number) for number in numbers]
indexes.append([tokenizer.eos_token_id])

word_probs = {}
for i in indexes:
    word_prob = torch.prod(torch.stack([probs[0, -1, idx] for idx in i]))
    word_probs[tokenizer.decode(i)] = word_prob.item()

In [24]:
normalized_word_probs = {}
total = sum(word_probs.values())
for word in word_probs:
    normalized_word_probs[word] = word_probs[word] / total

normalized_word_probs

{' 0': 0.03385974360664388,
 ' 1': 0.03806256264214874,
 ' 2': 0.018095353245507095,
 ' 3': 0.018888392716423955,
 ' 4': 0.022219657806359656,
 ' 5': 0.026736669754847217,
 ' 6': 0.01715849815160742,
 ' 7': 0.022544613293687843,
 ' 8': 0.014472734370450423,
 ' 9': 0.015038818295953986,
 '<|endoftext|>': 0.7729229561163697}

In [28]:
numbers = ["0", "1", "2","3","4","5","6","7","8","9"]
indexes = [tokenizer.encode(number) for number in numbers]

word_probs = {}
for i in indexes:
    word_prob = torch.prod(torch.stack([probs[0, -1, idx] for idx in i]))
    word_probs[tokenizer.decode(i)] = word_prob.item()

In [29]:
normalized_word_probs = {}
total = sum(word_probs.values())
for word in word_probs:
    normalized_word_probs[word] = word_probs[word] / total

normalized_word_probs

{' 0': 0.1491112576927676,
 ' 1': 0.16761959725728418,
 ' 2': 0.07968816634225867,
 ' 3': 0.08318054697815978,
 ' 4': 0.0978507445153572,
 ' 5': 0.1177427242207227,
 ' 6': 0.07556245166024182,
 ' 7': 0.09928178079172653,
 ' 8': 0.06373490742581289,
 ' 9': 0.06622782311566863}

In [188]:
def calculate_probs(prompt, eos, numbers):
    
    bos_token_id = [tokenizer.bos_token_id,]

    #input_ids = torch.tensor(tokenizer.encode(prompt)).reshape(1, -1).to(device)  
    str_prompt = [tokenizer.tokenize(x) for x in prompt]
    str_prompt = [item for tokens in str_prompt for item in tokens]
    prompt_ids = tokenizer.convert_tokens_to_ids(str_prompt)
    
    
    sequences_id = tokenizer.convert_tokens_to_ids(numbers)
    input_ids = torch.tensor(bos_token_id + prompt_ids + sequences_id).reshape(1, -1)
    with torch.no_grad():
            output = model(input_ids)
            logits = output.logits[:, -1, :]
            probs = torch.softmax(logits, dim=-1)[0] 
            
    print(input_ids)
    input_ids = input_ids[0]
    indexes = [tokenizer.encode(number) for number in numbers]
    if eos:
        input_ids.append([tokenizer.eos_token_id])
    word_probs = {}
    for i in inputs_ids:

        token_probs = []
        joint_prob = 1.0
        for token_id in i:
            token_prob = probs[token_id].item()
            token_probs.append(token_prob)  # Guardamos la probabilidad individual del token
            joint_prob *= token_prob  # Multiplicamos las probabilidades para la probabilidad conjunta
            word = tokenizer.decode(i).replace(" ", "")
            word_probs[word] = token_probs    
        normalized_word_probs = {}
    
    for word in word_probs:
        normalized_word_probs[word] = word_probs[word] / total
    return normalized_word_probs

In [189]:
results = []
import numpy as np
import pandas as pd

symbols = ["He ", "She "]
symbols_dummy = ["died.", 
                 "died yesterday.",
                 "died yesterday in.", 
                 "died yesterday in a.", 
                 "died yesterday in a very.", 
                 "died yesterday in a very sad.", 
                 "died yesterday in a very sad way.", 
                 "died yesterday in a very sad way due.", 
                 "died yesterday in a very sad way due to.", 
                 "died yesterday in a very sad way due to a.",
                 "died yesterday in a very sad way due to a heart.", 
                 "died yesterday in a very sad way due to a heart attack."]

for i in range(12):
    next_token = ""
    prompt = [tokenizer.decode(tokenizer.bos_token_id) , "The doctor was very famous."]
    min_digits = 1+ (len(symbols) - 1)
    max_digits = len(symbols) + len(symbols_dummy[i].split(" "))
    count_branching = 0
    while next_token != tokenizer.decode(tokenizer.eos_token_id):
        if len(prompt) >= min_digits+1:

            normalized_word_probs = calculate_probs(prompt, True, symbols_dummy[i].split(" "))

        else:
            normalized_word_probs_he_she = calculate_probs(prompt, False, symbols)
        
        
        if count_branching == 0:
            next_token = np.random.choice(a=list(normalized_word_probs_he_she), p=list(normalized_word_probs_he_she.values()))
            count_branching = 1
        else:
            next_token = np.random.choice(a= list(normalized_word_probs), p=list(normalized_word_probs.values()))
        
        if next_token != tokenizer.decode(tokenizer.eos_token_id):
            prompt.append(next_token)
        
        if len(prompt)-1 >=max_digits:
            next_token = tokenizer.decode(tokenizer.eos_token_id)
    print(''.join(prompt[1:]))
    results.append(''.join(prompt[1:]))
    df = pd.DataFrame(results, columns=["floating-point"])
    df.to_csv("floating_points_LLM.csv", index=False)
    
    print("///////////////////////////////////")

NameError: name 'inputs_ids' is not defined

In [56]:
symbols = ["He", "She"]
symbols_dummy = ["died.", 
                 "died yesterday.",
                 "died yesterday in.", 
                 "died yesterday in a.", 
                 "died yesterday in a very.", 
                 "died yesterday in a very sad.", 
                 "died yesterday in a very sad way.", 
                 "died yesterday in a very sad way due.", 
                 "died yesterday in a very sad way due to.", 
                 "died yesterday in a very sad way due to a.",
                 "died yesterday in a very sad way due to a heart.", 
                 "died yesterday in a very sad way due to a heart attack."]


In [85]:
symbols_dummy[5].split(" ")

['died', 'yesterday', 'in', 'a', 'very', 'sad.']