In [1]:
import torch
from transformers import AutoModelForCausalLM
from transformers import AutoTokenizer
import sys
from datetime import datetime
import numpy as np
sys.path.append("../")
sys.path.append("../../")
torch.manual_seed(42)
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_id, use_fast=True, add_prefix_space=True, local_files_only = False)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                            return_dict_in_generate=True,
                                            pad_token_id=tokenizer.eos_token_id).to(device)



In [2]:
symbols_important = ["He", "She"]
symbols_dummy = ["died.", 
                 "died yesterday.",
                 "died yesterday in.", 
                 "died yesterday in a.", 
                 "died yesterday in a very.", 
                 "died yesterday in a very sad.", 
                 "died yesterday in a very sad way.", 
                 "died yesterday in a very sad way due.", 
                 "died yesterday in a very sad way due to.", 
                 "died yesterday in a very sad way due to a.",
                 "died yesterday in a very sad way due to a heart.", 
                 "died yesterday in a very sad way due to a heart attack."]

In [3]:
prompt = tokenizer.decode(tokenizer.bos_token_id) + "The doctor was very famous."
input_ids = torch.tensor(tokenizer.encode(prompt)).reshape(1, -1).to(device)
print(input_ids)



tensor([[50256,   383,  6253,   373,   845,  5863,    13]])


In [4]:
with torch.no_grad():
    output = model(input_ids)
    logits = output.logits[:, -1, :]
    probs = torch.softmax(logits, dim=-1)[0]

word_probs = {}
symbols_important_ids = [tokenizer.encode(sym) for sym in symbols_important]

for i in symbols_important_ids:
    word_prob = probs[i]
    word_probs[tokenizer.decode(i).replace(" ","")] = word_prob.item()

normalized_word_probs_he_she = {}
total = sum(word_probs.values())
for word in word_probs:
    normalized_word_probs_he_she[word] = word_probs[word] / total

    
normalized_word_probs_he_she

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


{'He': 0.9474213481974543, 'She': 0.0525786518025457}

In [61]:
split_lists = [sentence.split() for sentence in symbols_dummy]
split_lists

[['died.'],
 ['died', 'yesterday.'],
 ['died', 'yesterday', 'in.'],
 ['died', 'yesterday', 'in', 'a.'],
 ['died', 'yesterday', 'in', 'a', 'very.'],
 ['died', 'yesterday', 'in', 'a', 'very', 'sad.'],
 ['died', 'yesterday', 'in', 'a', 'very', 'sad', 'way.'],
 ['died', 'yesterday', 'in', 'a', 'very', 'sad', 'way', 'due.'],
 ['died', 'yesterday', 'in', 'a', 'very', 'sad', 'way', 'due', 'to.'],
 ['died', 'yesterday', 'in', 'a', 'very', 'sad', 'way', 'due', 'to', 'a.'],
 ['died',
  'yesterday',
  'in',
  'a',
  'very',
  'sad',
  'way',
  'due',
  'to',
  'a',
  'heart.'],
 ['died',
  'yesterday',
  'in',
  'a',
  'very',
  'sad',
  'way',
  'due',
  'to',
  'a',
  'heart',
  'attack.']]

In [55]:
def calculate_probabilities_with_prompt(split_lists, index, tokenizer, model, device, symbol_choosen):
    sentence = split_lists[index]
    prompt = tokenizer.decode(tokenizer.bos_token_id) + " The doctor was very famous." + symbol_choosen

    for i in range(len(sentence)):
        word_probs.clear()
        current_sentence = sentence[i:]
        symbols_dummy_ids = [tokenizer.encode(sym) for sym in current_sentence]
        symbols_dummy_ids = [[token_id] for sublist in symbols_dummy_ids for token_id in sublist]
        prompt += " " + current_sentence[0]
        input_ids = torch.tensor(tokenizer.encode(prompt)).reshape(1, -1).to(device)
        
        with torch.no_grad():
            output = model(input_ids)
            logits = output.logits[:, -1, :]
            probs = torch.softmax(logits, dim=-1)[0]

        for token_ids in symbols_dummy_ids:
            for token in token_ids:
                word_prob = probs[token]
                word_probs[tokenizer.decode(token).replace(" ","")] = word_prob.item()

        # Normalización de probabilidades
        normalized_word_probs_dummy = {}
        total = sum(word_probs.values())
        for word in word_probs:
            normalized_word_probs_dummy[word] = word_probs[word] / total

        #print(f"Normalized probabilities for '{' '.join(current_sentence)}': {normalized_word_probs_dummy}")
    
    return normalized_word_probs_dummy

In [56]:
# Selección aleatoria entre "He" y "She"
symbol_choosen = np.random.choice(list(normalized_word_probs_he_she.keys()), 
                                  p=list(normalized_word_probs_he_she.values()))
normalized_word_probs_dummy = calculate_probabilities_with_prompt(split_lists, 1, tokenizer, model, device, symbol_choosen)

<|endoftext|> The doctor was very famous.He died
<|endoftext|> The doctor was very famous.He died yesterday.


In [60]:
import numpy as np
import time
import matplotlib.pyplot as plt

In [64]:
from time import time
import numpy as np

def run_n_times(n, split_lists):
    all_times = []  # Lista para almacenar los tiempos de todas las llamadas

    for sentence_index, sentence in enumerate(split_lists):
        times = []  # Lista para almacenar los tiempos de cada oración

        for _ in range(n):  # Llamar a la función n veces
            start_time = time()
            symbol_chosen = np.random.choice(list(normalized_word_probs_he_she.keys()), 
                                              p=list(normalized_word_probs_he_she.values()))
            
            # Llamar a la función con el índice de la oración actual
            normalized_word_probs_dummy = calculate_probabilities_with_prompt(split_lists, sentence_index, tokenizer, model, device, symbol_chosen)
            elapsed_time = time() - start_time
            times.append(elapsed_time)

        all_times.append(times)  # Agregar los tiempos de esta oración a la lista principal

    return all_times


In [None]:
n = 5  
execution_times = run_n_times(n, split_lists)

<|endoftext|> The doctor was very famous.He died.
0.05665898323059082
<|endoftext|> The doctor was very famous.He died.
0.03992962837219238
<|endoftext|> The doctor was very famous.He died.
0.03573489189147949
<|endoftext|> The doctor was very famous.He died.
0.034593820571899414
<|endoftext|> The doctor was very famous.She died.
0.03377103805541992
<|endoftext|> The doctor was very famous.He died
<|endoftext|> The doctor was very famous.He died yesterday.
0.06558418273925781
<|endoftext|> The doctor was very famous.He died
<|endoftext|> The doctor was very famous.He died yesterday.
0.06629729270935059
<|endoftext|> The doctor was very famous.He died
<|endoftext|> The doctor was very famous.He died yesterday.
0.07467532157897949
<|endoftext|> The doctor was very famous.He died
<|endoftext|> The doctor was very famous.He died yesterday.
0.0690300464630127
<|endoftext|> The doctor was very famous.He died
<|endoftext|> The doctor was very famous.He died yesterday.
0.07265877723693848
<|en

In [48]:
import re

split_lists_dot = [re.findall(r'\S+|\.', sentence.replace('.', ' .')) for sentence in symbols_dummy]
split_lists_dot


[['died', '.'],
 ['died', 'yesterday', '.'],
 ['died', 'yesterday', 'in', '.'],
 ['died', 'yesterday', 'in', 'a', '.'],
 ['died', 'yesterday', 'in', 'a', 'very', '.'],
 ['died', 'yesterday', 'in', 'a', 'very', 'sad', '.'],
 ['died', 'yesterday', 'in', 'a', 'very', 'sad', 'way', '.'],
 ['died', 'yesterday', 'in', 'a', 'very', 'sad', 'way', 'due', '.'],
 ['died', 'yesterday', 'in', 'a', 'very', 'sad', 'way', 'due', 'to', '.'],
 ['died', 'yesterday', 'in', 'a', 'very', 'sad', 'way', 'due', 'to', 'a', '.'],
 ['died',
  'yesterday',
  'in',
  'a',
  'very',
  'sad',
  'way',
  'due',
  'to',
  'a',
  'heart',
  '.'],
 ['died',
  'yesterday',
  'in',
  'a',
  'very',
  'sad',
  'way',
  'due',
  'to',
  'a',
  'heart',
  'attack',
  '.']]

In [50]:
def calculate_probabilities_with_prompt_dot(split_lists, index, tokenizer, model, device, symbol_choosen):
    sentence = split_lists_dot[index]
    prompt = tokenizer.decode(tokenizer.bos_token_id) + " The doctor was very famous." + symbol_choosen

    for i in range(len(sentence)):
        word_probs.clear()
        current_sentence = sentence[i:]
        symbols_dummy_ids = [tokenizer.encode(sym) for sym in current_sentence]
        symbols_dummy_ids = [[token_id] for sublist in symbols_dummy_ids for token_id in sublist]
        prompt += " " + current_sentence[0]
        input_ids = torch.tensor(tokenizer.encode(prompt)).reshape(1, -1).to(device)
        
        print(prompt)

        with torch.no_grad():
            output = model(input_ids)
            logits = output.logits[:, -1, :]
            probs = torch.softmax(logits, dim=-1)[0]

        for token_ids in symbols_dummy_ids:
            for token in token_ids:
                word_prob = probs[token]
                word_probs[tokenizer.decode(token).replace(" ","")] = word_prob.item()

        # Normalización de probabilidades
        normalized_word_probs_dummy = {}
        total = sum(word_probs.values())
        for word in word_probs:
            normalized_word_probs_dummy[word] = word_probs[word] / total

        print(f"Normalized probabilities for '{' '.join(current_sentence)}': {normalized_word_probs_dummy}")
    
    return normalized_word_probs_dummy

In [51]:
# Selección aleatoria entre "He" y "She"
symbol_choosen = np.random.choice(list(normalized_word_probs_he_she.keys()), 
                                  p=list(normalized_word_probs_he_she.values()))
normalized_word_probs_dummy = calculate_probabilities_with_prompt_dot(split_lists, 11, tokenizer, model, device, symbol_choosen)

<|endoftext|> The doctor was very famous.She died
Normalized probabilities for 'died yesterday in a very sad way due to a heart attack .': {'died': 2.4851276797917072e-05, 'yesterday': 0.009371344762952174, 'in': 0.81797044540868, 'a': 0.1558077807339291, 'very': 0.00985628945473786, 'sad': 3.142454587409808e-05, 'way': 0.0005326831607706446, 'due': 0.002095165633272437, 'to': 0.0036623750956610262, 'heart': 7.568437831769041e-05, 'attack': 1.210678168125465e-06, '.': 0.0005707448708388754}
<|endoftext|> The doctor was very famous.She died yesterday
Normalized probabilities for 'yesterday in a very sad way due to a heart attack .': {'yesterday': 0.0017902644627721027, 'in': 0.9415960828350984, 'a': 0.00968472630433268, 'very': 0.0026222468114879697, 'sad': 5.60692264089406e-05, 'way': 0.0008705702667654757, 'due': 0.00564310642675988, 'to': 0.009262547430605603, 'heart': 0.00015163835157098337, 'attack': 2.88838684237564e-06, '.': 0.02831985949735556}
<|endoftext|> The doctor was very 