In [1]:
import sys
import torch
import time

In [2]:
# Make the local Speculative-Decoding checkout importable; the `sampling` and
# `utils` packages imported further down are presumably found through this entry.
# NOTE(review): Colab-specific absolute path — change REPO_DIR when running elsewhere.
REPO_DIR = "/content/Speculative-Decoding-main/"

# Only append once so that re-running this cell does not keep growing sys.path.
if REPO_DIR not in sys.path:
    sys.path.append(REPO_DIR)

In [3]:
# Run on the first GPU when one is present; otherwise fall back to the CPU so the
# notebook still executes (slowly) instead of failing when the models are moved.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModelForSeq2SeqLM
from transformers import set_seed
# Fix the random seed before any model is loaded or sampled from
set_seed(42)

# Target: FLAN-T5 XL, the large encoder-decoder model whose generation we want to accelerate
# (roughly 3B parameters according to its model card)
target_model_name = "google/flan-t5-xl"
target = AutoModelForSeq2SeqLM.from_pretrained(target_model_name)
target.to(device)

# Drafter: FLAN-T5 Small, the lightweight model that proposes the draft tokens
# (roughly 80M parameters according to its model card)
drafter_model_name = "google/flan-t5-small"
drafter = AutoModelForSeq2SeqLM.from_pretrained(drafter_model_name)
drafter.to(device)

# Only the target's tokenizer is loaded and it is used for every run below;
# presumably both FLAN-T5 checkpoints share the same vocabulary — TODO confirm
tokenizer = AutoTokenizer.from_pretrained(target_model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# Prompt handed to the models. No chat template is applied: the raw instruction
# string is tokenized as-is.
# Alternative prompt: "A group of flamingos is called "
prefix = "Translate to English: Je m'appelle Ayan. N'hésitez pas à contribuer à mon projet !"
chat_templated = prefix

# Tokenize to a batch of one, then unwrap the single row into a plain Python list,
# which is the input format the generation helpers are called with below.
encoded_prompt = tokenizer(chat_templated, return_tensors="pt")
input_ids = encoded_prompt.input_ids[0].tolist()

In [6]:
# from sampling import speculative_generate, autoregressive_generate
from sampling import speculative_generate_encoder_decoder, autoregressive_generate_encoder_decoder
from utils.logits_processor import NucleusProcessor

# Parameters shared by the autoregressive baseline and the speculative runs
gen_len = 100       # Maximum number of tokens generated (may be exceeded when using speculative decoding)
gamma = [3, 5, 7]  # Candidate values to sweep for the number of drafts the drafter model generates at each step
logits_processor = NucleusProcessor(temperature=.6, top_p=.5) # Nucleus sampling with p=0.5 and T=0.6



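# Note: the ratio "number of drafts accepted by the target model divided by the number of drafts generated"
# presumably describes `alpha`, the second value returned by the speculative generator used below.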

In [7]:
# Baseline: classic auto-regressive decoding with the target model only (slow).
# Single-shot measurement, so treat the number as indicative rather than exact.

# Drain any GPU work that is still queued so it is not billed to this measurement.
if torch.cuda.is_available():
    torch.cuda.synchronize()

# perf_counter() is the monotonic, high-resolution clock intended for measuring
# intervals; time.time() can jump when the system clock is adjusted.
start_time = time.perf_counter()
output_ids_ar = autoregressive_generate_encoder_decoder(
    input_ids,
    target,
    logits_processor=logits_processor,
    max_gen_len=gen_len,
    eos_tokens_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)
output_ar = tokenizer.decode(output_ids_ar, skip_special_tokens=True)

# Make sure every kernel launched by the generation has finished before stopping the clock.
if torch.cuda.is_available():
    torch.cuda.synchronize()
end_time = time.perf_counter()

print("Time taken: ", end_time - start_time)
output_ar


Time taken:  3.5887839794158936


"I'm Ayan. Please don't hesitate to contribute to my project!"

In [9]:
# Sweep over the candidate draft lengths and time speculative decoding for each.
# Every configuration is run once only, so the timings are indicative (the first
# iteration may also include one-off warm-up cost).
for num_drafts in gamma:
    print("gamma value: ", num_drafts)

    # Drain any GPU work that is still queued so it is not billed to this iteration.
    if torch.cuda.is_available():
        torch.cuda.synchronize()

    # perf_counter() is the monotonic, high-resolution clock intended for intervals.
    start_time = time.perf_counter()
    # Speculative decoding: the drafter proposes `num_drafts` tokens per step and
    # the target model verifies them (faster than the baseline when drafts are accepted).
    output_ids_sd, alpha = speculative_generate_encoder_decoder(
        input_ids,
        drafter,
        target,
        logits_processor=logits_processor,
        gamma=num_drafts,
        max_gen_len=gen_len,
        eos_tokens_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
    )
    output_sd = tokenizer.decode(output_ids_sd, skip_special_tokens=True)

    # Make sure all launched kernels have finished before stopping the clock.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    end_time = time.perf_counter()

    print("Time taken: ", end_time - start_time)
    print("alpha: ", alpha)
    print("Generated output: ", output_sd)
    print("######")

gamma value:  3
Time taken:  2.0771307945251465
alpha:  0.6666666666666666
Generated output:  I'm Ayan. Don't hesitate to contribute to my project!
######
gamma value:  5
Time taken:  0.864091157913208
alpha:  0.7
Generated output:  My name is Ayan. Don't hesitate to contribute to my project!
######
gamma value:  7
Time taken:  1.5532920360565186
alpha:  0.2857142857142857
Generated output:  I am Ayan. Don't hesitate to contribute to my project !
######


In [11]:
# One untimed speculative-decoding run with a fixed draft length.
# NOTE(review): this rebinds `gamma` from the list of candidate values defined in
# the parameters cell to a single int, so any cell that iterates over `gamma`
# will fail if it is re-run after this one — consider a dedicated name here.
gamma = 3

# The call returns the generated token ids together with `alpha`.
output_ids_sd, alpha = speculative_generate_encoder_decoder(
    input_ids,
    drafter,
    target,
    logits_processor=logits_processor,
    gamma=gamma,
    max_gen_len=gen_len,
    eos_tokens_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
)

# Turn the ids back into text, dropping special tokens such as padding/EOS.
output_sd = tokenizer.decode(output_ids_sd, skip_special_tokens=True)
print("Generated output: ", output_sd)

Draft model input: tensor([[ 0, 27,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0]], device='cuda:0')
Draft model output:  [0, 27, 580, 71, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Target model input:  tensor([[  0,  27, 580,  71,  63]], device='cuda:0')
target model output: tensor([[31]], device='cuda:0')
target model output: tensor([[1512]], device='c