# dlab project: Preliminaries

## 1. likelihoods

In [3]:
# pip install transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
checkpoint = "HuggingFaceTB/SmolLM-135M-Instruct"

device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# for multiple GPUs install accelerate and do `model = AutoModelForCausalLM.from_pretrained(checkpoint, device_map="auto")`
model = AutoModelForCausalLM.from_pretrained(checkpoint).to(device)

messages = [{"role": "user", "content": "Write a story about a data scientist."}]
input_text=tokenizer.apply_chat_template(messages, tokenize=False)
print(input_text)
inputs = tokenizer.encode(input_text, return_tensors="pt").to(device)
n = 50
outputs = model.generate(inputs, max_new_tokens=n, do_sample=True, return_dict_in_generate=True, output_scores=True)
print(tokenizer.decode(outputs['sequences'][0]))



<|im_start|>user
Write a story about a data scientist.<|im_end|>

<|im_start|>user
Write a story about a data scientist.<|im_end|>
<|im_start|>assistant
**The Data Catastrophe**

Ramsaura, the young woman without a surname, was assigned to the prestigious Data Science department at the prestigious Harvard Business School. She was tasked with analyzing an enormous dataset of pet purchases


In [4]:
# likelihood, and geometric mean

likelihood = 1
for logits in outputs['scores']:
    probs = torch.softmax(logits, dim=1)
    likelihood *= float(torch.max(probs))
print('Likelihood:', likelihood)
print('Geometric mean:', likelihood**(1/n))


Likelihood: 1.3217267224320587e-13
Geometric mean: 0.5526152090933232


Product of likelihoods: for 50 tokens we could expect something between 1e-30 and 1e-100, here we have 4e-9, which tells our sequence might be very likely.

Geometric mean: vary between 0 and 1 (less to most likely, respectively), here we have 0.7, which again confirms how likely our sequence is.

## 2. mask special tokens

In [5]:
# more special tokens might exist, search for them

from transformers import LogitsProcessorList, LogitsProcessor
class BlacklistTokensProcessor(LogitsProcessor):
    def __init__(self, blacklist_token_ids):
        self.blacklist_token_ids = blacklist_token_ids

    def __call__(self, input_ids, scores):
        # Set the logits of blacklisted tokens to a very large negative value (effectively 0 probability)
        for token_id in self.blacklist_token_ids:
            scores[:, token_id] = -float('inf')
        return scores

blacklist_tokens = [".", "!", "?", ",", ";", ":", "*", "**",
                    "...", "(", ")", "-", "\"", ",\"", "Ġ(",
                    ").", "Ġ-", "ĊĊ", "**:", "Ġ**", "!**",
                    "<|im_end|>", ".\"", "\":", "---", "..",
                    "<|endoftext|>", "ĠâĢĵ"]
blacklist_token_ids = tokenizer.convert_tokens_to_ids(blacklist_tokens)

logits_processor = LogitsProcessorList()
logits_processor.append(BlacklistTokensProcessor(blacklist_token_ids))

n=100
outputs = model.generate(inputs, max_new_tokens=n, do_sample=True, return_dict_in_generate=True, output_scores=True, logits_processor=logits_processor)

In [6]:
output_text = tokenizer.decode(outputs['sequences'][0])
print(output_text)

print('\n## TOKENS ANALYSIS:')
token_pos = 16
print('Output:',tokenizer.decode(outputs['sequences'][0][token_pos]))
token_id = outputs['sequences'][0][token_pos].item()
print('TokenID:',token_id)
print('Token:',tokenizer.convert_ids_to_tokens(token_id))

<|im_start|>user
Write a story about a data scientist.<|im_end|>
<|im_start|>assistant
Title
Story 2

It was a typical Monday morning at the corporate firm where the CEO of Tech Industries was leading a team of data scientists to investigate a new project of his that had been stalled for months due to the sheer volume of data generating around it all the time over the past year or two of progress made on the project's timeline was a losing battle for the tech industry as a whole with the sheer volume of data being generated continuously and the sheer level of noise

## TOKENS ANALYSIS:
Output: 

TokenID: 198
Token: Ċ


## 3. beamsearch

In [7]:
# mask generations of forbidden tokens

n=50
outputs = model.generate(
    inputs,
    max_new_tokens=n,
    do_sample=True,
    logits_processor=logits_processor,
    return_dict_in_generate=True,
    output_scores=True,
    num_beams=2)

In [8]:
output_text = tokenizer.decode(outputs['sequences'][0])
print(output_text)

<|im_start|>user
Write a story about a data scientist.<|im_end|>
<|im_start|>assistant
Meet Maya Singh

Maya Singh was a data scientist at a leading tech firm in Silicon Valley who had been working on a project to develop a predictive model to forecast the sales of a new product launch in the city's coffee shops
