<a href="https://colab.research.google.com/github/kinjaljoshi/llm_param_config/blob/main/prompt_bos_eos_tokens.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers torch bitsandbytes accelerate sentencepiece

In [4]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoModelForCausalLM, AutoTokenizer

# Define models
MODELS = {
    "T5-Small": {"path": "t5-small", "type": "seq2seq"},
    "Mistral": {"path": "mistralai/Mistral-7B-Instruct-v0.1", "type": "causal"},
    "Qwen (8bit)": {"path": "Qwen/Qwen-7B-Chat", "type": "causal", "quantized": True},
    "LLaMA": {"path": "meta-llama/Llama-2-7b-chat-hf", "type": "causal"}
}

# Define prompt templates
PROMPT_TEMPLATES = {
    "without_bos_eos": "Summarize the following text: {text}",
    "with_bos_eos": "<s>Summarize the following text: {text}</s>"
}

# Test text input
TEST_TEXT = "Artificial intelligence is transforming various industries by automating tasks, improving efficiency, and enabling new capabilities."

# Function to load model and tokenizer
def load_model(model_info):
    """Loads the correct model type (Seq2Seq or Causal)."""
    print(f"\nLoading {model_info['path']} (Quantized: {model_info.get('quantized', False)})...")

    tokenizer = AutoTokenizer.from_pretrained(model_info["path"])

    if model_info["type"] == "seq2seq":
        model = AutoModelForSeq2SeqLM.from_pretrained(model_info["path"], torch_dtype=torch.float16, device_map="auto")
    else:  # Causal models (Mistral, Qwen, LLaMA)
        if model_info.get("quantized", False):
            model = AutoModelForCausalLM.from_pretrained(
                model_info["path"],
                torch_dtype=torch.float16,
                load_in_8bit=True,
                device_map="auto"
            )
        else:
            model = AutoModelForCausalLM.from_pretrained(model_info["path"], torch_dtype=torch.float16, device_map="auto")

    return model, tokenizer

# Function to print BOS and EOS tokens
def print_bos_eos_tokens(tokenizer, model_name):
    """Prints BOS and EOS tokens for the model."""
    bos_token = tokenizer.bos_token
    eos_token = tokenizer.eos_token

    print(f"\n{model_name} Token Details:")
    print(f"  - BOS Token: {bos_token} (ID: {tokenizer.bos_token_id})")
    print(f"  - EOS Token: {eos_token} (ID: {tokenizer.eos_token_id})")

    bos_eos_prompt = f"{bos_token} Summarize the following text: {TEST_TEXT} {eos_token}" if bos_token and eos_token else PROMPT_TEMPLATES["without_bos_eos"]
    return bos_eos_prompt

# Function to run inference
def run_inference(model_info, model_name):
    """Runs inference for a given model."""
    model, tokenizer = load_model(model_info)

    # Print BOS and EOS tokens
    bos_eos_prompt = print_bos_eos_tokens(tokenizer, model_name)

    for template_type, template in PROMPT_TEMPLATES.items():
        print(f"\nTesting {model_name} - {template_type} Prompt...")

        # Format prompt
        prompt = bos_eos_prompt if template_type == "with_bos_eos" else template.format(text=TEST_TEXT)
        inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

        # Generate output
        with torch.no_grad():
            output = model.generate(**inputs, max_length=100)

        decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
        print(f"{model_name} Output ({template_type}):\n{decoded_output}")


In [13]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Load Flan-T5-Large
MODEL_NAME = "google/flan-t5-large"

print(f"\n Loading {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto")

# Print EOS token details
print(f"\nToken Details for {MODEL_NAME}:")
print(f"  - EOS Token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})")

PROMPT_TEMPLATE = """
System: You are a helpful AI assistant that provides clear and concise answers.
User: {user_input}
Assistant:
"""

# Define test input
TEST_TEXT = "Summarize following text : Artificial intelligence is transforming industries by automating tasks, improving efficiency, and enabling new capabilities."

formatted_prompt = PROMPT_TEMPLATE.format(user_input=TEST_TEXT)

input_ids_without_eos = tokenizer(formatted_prompt, return_tensors="pt").input_ids.to("cuda")

# Encode input WITH EOS
input_ids_with_eos = tokenizer(formatted_prompt + " </s>", return_tensors="pt").input_ids.to("cuda")

# Generate output WITHOUT EOS
with torch.no_grad():
    output_without_eos = model.generate(input_ids_without_eos, max_length=100)
decoded_output_without_eos = tokenizer.decode(output_without_eos[0], skip_special_tokens=True)

# Generate output WITH EOS
with torch.no_grad():
    output_with_eos = model.generate(input_ids_with_eos, max_length=100)
decoded_output_with_eos = tokenizer.decode(output_with_eos[0], skip_special_tokens=True)

# Display results
print("\nOutput (Without EOS):")
print(decoded_output_without_eos)

print("\nOutput (With EOS):")
print(decoded_output_with_eos)



 Loading google/flan-t5-large...

Token Details for google/flan-t5-large:
  - EOS Token: </s> (ID: 1)

Output (Without EOS):
You are a helpful AI assistant that provides clear and concise answers.

Output (With EOS):
Artificial intelligence is transforming industries by automating tasks, improving efficiency, and enabling new capabilities.


In [1]:
from huggingface_hub import notebook_login, login
from google.colab import userdata

hf_token = userdata.get('HF_TOKEN')

login(hf_token)


In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load Mistral-7B-Instruct
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"

print(f"\nLoading {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto")

# Print BOS and EOS tokens
print(f"\nToken Details for {MODEL_NAME}:")
print(f"  - BOS Token: {tokenizer.bos_token} (ID: {tokenizer.bos_token_id})")
print(f"  - EOS Token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})")

# Define user input
USER_INPUT = "Summarize: Artificial intelligence is transforming industries by automating tasks, improving efficiency, and enabling new capabilities."

# With [INST] Format
PROMPT_WITH_INST = f"""
<s> [INST] <<SYS>>
You are a helpful AI assistant. Provide clear and concise responses.
<</SYS>>

{USER_INPUT} [/INST]
"""

# Without [INST] Format
PROMPT_WITHOUT_INST = f"""
<s> You are a helpful AI assistant. Provide clear and concise responses.

{USER_INPUT}
"""

# Encode input
input_ids_with_inst = tokenizer(PROMPT_WITH_INST, return_tensors="pt").input_ids.to("cuda")
input_ids_without_inst = tokenizer(PROMPT_WITHOUT_INST, return_tensors="pt").input_ids.to("cuda")

# Generate output WITH [INST]
with torch.no_grad():
    output_with_inst = model.generate(input_ids_with_inst, max_length=100)
decoded_output_with_inst = tokenizer.decode(output_with_inst[0], skip_special_tokens=True)

# Generate output WITHOUT [INST]
with torch.no_grad():
    output_without_inst = model.generate(input_ids_without_inst, max_length=100)
decoded_output_without_inst = tokenizer.decode(output_without_inst[0], skip_special_tokens=True)

# Display results
print("\nOutput (With [INST] format):")
print(decoded_output_with_inst)

print("\n Output (Without [INST] format):")
print(decoded_output_without_inst)



Loading mistralai/Mistral-7B-Instruct-v0.1...


tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.



Token Details for mistralai/Mistral-7B-Instruct-v0.1:
  - BOS Token: <s> (ID: 1)
  - EOS Token: </s> (ID: 2)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



Output (With [INST] format):

 [INST] <<SYS>>
You are a helpful AI assistant. Provide clear and concise responses.
<</SYS>>

Summarize: Artificial intelligence is transforming industries by automating tasks, improving efficiency, and enabling new capabilities. [/INST]

AI is revolutionizing businesses by automating tasks, enhancing productivity, and introducing innovative functionalities.

 Output (Without [INST] format):

 You are a helpful AI assistant. Provide clear and concise responses.

Summarize: Artificial intelligence is transforming industries by automating tasks, improving efficiency, and enabling new capabilities.

Explanation: Artificial intelligence (AI) is a rapidly growing field that is changing the way businesses operate. AI technology can automate repetitive tasks, such as data entry and customer service, freeing up employees to focus on more complex and creative work.


In [None]:
!pip install tiktoken transformers_stream_generator

In [8]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load Qwen-7B-Chat
MODEL_NAME = "Qwen/Qwen-7B-Chat"

print(f"\nLoading {MODEL_NAME}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, torch_dtype=torch.float16, device_map="auto", trust_remote_code=True)

# Print BOS and EOS tokens
print(f"\n Token Details for {MODEL_NAME}:")
print(f"  - BOS Token: {tokenizer.bos_token} (ID: {tokenizer.bos_token_id})")
print(f"  - EOS Token: {tokenizer.eos_token} (ID: {tokenizer.eos_token_id})")

# Define user input
USER_INPUT = "Summarize: Artificial intelligence is transforming industries by automating tasks, improving efficiency, and enabling new capabilities."

# With `<|im_start|>` Format
PROMPT_WITH_INST = f"""
<|im_start|>system
You are a helpful AI assistant. Provide clear and concise responses.
<|im_end|>
<|im_start|>user
{USER_INPUT}
<|im_end|>
<|im_start|>assistant
"""

# Without `<|im_start|>` Format
PROMPT_WITHOUT_INST = f"""
You are a helpful AI assistant. Provide clear and concise responses.

{USER_INPUT}
"""

# Encode input
input_ids_with_inst = tokenizer(PROMPT_WITH_INST, return_tensors="pt").input_ids.to("cuda")
input_ids_without_inst = tokenizer(PROMPT_WITHOUT_INST, return_tensors="pt").input_ids.to("cuda")

# Generate output WITH `<|im_start|>`
with torch.no_grad():
    output_with_inst = model.generate(input_ids_with_inst, max_length=100)
decoded_output_with_inst = tokenizer.decode(output_with_inst[0], skip_special_tokens=True)

# Generate output WITHOUT `<|im_start|>`
with torch.no_grad():
    output_without_inst = model.generate(input_ids_without_inst, max_length=100)
decoded_output_without_inst = tokenizer.decode(output_without_inst[0], skip_special_tokens=True)

# Display results
print("\nOutput (With `<|im_start|>` format):")
print(decoded_output_with_inst)

print("\nOutput (Without `<|im_start|>` format):")
print(decoded_output_without_inst)



Loading Qwen/Qwen-7B-Chat...


qwen_generation_utils.py:   0%|          | 0.00/14.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen-7B-Chat:
- qwen_generation_utils.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


cpp_kernels.py:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen-7B-Chat:
- cpp_kernels.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Qwen/Qwen-7B-Chat:
- qwen_generation_utils.py
- cpp_kernels.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors.index.json:   0%|          | 0.00/19.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

model-00001-of-00008.safetensors:   0%|          | 0.00/1.96G [00:00<?, ?B/s]

model-00002-of-00008.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

model-00003-of-00008.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

model-00004-of-00008.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

model-00005-of-00008.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

model-00006-of-00008.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

model-00007-of-00008.safetensors:   0%|          | 0.00/2.02G [00:00<?, ?B/s]

model-00008-of-00008.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]



Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/273 [00:00<?, ?B/s]

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Both `max_new_tokens` (=512) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



 Token Details for Qwen/Qwen-7B-Chat:
  - BOS Token: None (ID: None)
  - EOS Token: None (ID: None)


Both `max_new_tokens` (=512) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)



Output (With `<|im_start|>` format):

system
You are a helpful AI assistant. Provide clear and concise responses.

user
Summarize: Artificial intelligence is transforming industries by automating tasks, improving efficiency, and enabling new capabilities.

assistant
Artificial intelligence (AI) is revolutionizing various sectors by automating routine tasks, enhancing productivity, and offering novel functionalities. It has the potential to transform numerous industries, such as healthcare, finance, transportation, manufacturing, and more. AI-powered solutions can help streamline operations, reduce costs, enhance decision-making, and provide valuable insights into customer behavior and preferences. The continued development of AI technologies is poised to bring about even more significant changes in the future.


Output (Without `<|im_start|>` format):

You are a helpful AI assistant. Provide clear and concise responses.

Summarize: Artificial intelligence is transforming industries by