# Task 1

In [2]:
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    LogitsProcessorList,
    MinLengthLogitsProcessor,
    StoppingCriteriaList,
    MaxLengthCriteria,
)
import numpy as np

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2")

model.generation_config.pad_token_id = model.generation_config.eos_token_id

maxLength = 30

prompt = "Today I believe we can finally"
k = 20
p = 0.95

# return Pytorch tensor
inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs.input_ids

In [3]:
# Greedy Search
greedy_output = model.generate(
    input_ids, 
    max_new_tokens=maxLength, 
    return_dict_in_generate=True, 
    output_scores=True
)

# Beam Search
beam_output = model.generate(
    input_ids, 
    max_new_tokens=maxLength, 
    num_beams=5, 
    no_repeat_ngram_size=2, 
    early_stopping=True,
    return_dict_in_generate=True, 
    output_scores=True
)

# Top-K
# set top_k to 20
top_k_output = model.generate(
    input_ids, 
    do_sample=True, 
    max_new_tokens=maxLength, 
    top_k=k,
    return_dict_in_generate=True, 
    output_scores=True
)

# Top-P (nuclear sampling)

# set top_k = 20 and set top_p = 0.95
top_p_output = model.generate(
    input_ids,
    do_sample=True, 
    max_new_tokens=maxLength, 
    top_k=k, 
    top_p=p, 
    return_dict_in_generate=True, 
    output_scores=True
)



The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

### Print outputs

In [4]:
print(f'Greedy Search',tokenizer.batch_decode(greedy_output[0], skip_special_tokens=True))
print(f'\nBeam Search',tokenizer.batch_decode(beam_output[0], skip_special_tokens=True))
print(f'\nTop-K',tokenizer.batch_decode(top_k_output[0], skip_special_tokens=True))
print(f'\nTop-P',tokenizer.batch_decode(top_p_output[0], skip_special_tokens=True))

Greedy Search ['Today I believe we can finally get to the point where we can make a difference in the lives of the people of the United States of America.\n\nI believe that we can']

Beam Search ['Today I believe we can finally get to the bottom of this issue.\n\n"We need to find a way to make sure that we don\'t get into a situation where we']

Top-K ['Today I believe we can finally start a new chapter in our history.\n\nI love people who are dedicated to their craft.\n\nI want my kids to be like my']

Top-P ['Today I believe we can finally do things better."\n\nThat includes making sure that all new vehicles meet a set of safety regulations before new ones are launched. This means they must']


### Generate Scores

In [5]:
input_length = inputs.input_ids.shape[1]
# Greedy
greedy_output_scores = model.compute_transition_scores(
    greedy_output.sequences, greedy_output.scores, normalize_logits=True
)
greedy_output_tokens = greedy_output.sequences[:, input_length:]

# Beam
beam_output_scores = model.compute_transition_scores(
    beam_output.sequences, beam_output.scores, normalize_logits=True
)
beam_output_tokens = beam_output.sequences[:, input_length:]

# Top-K
top_k_output_scores = model.compute_transition_scores(
    top_k_output.sequences, top_k_output.scores, normalize_logits=True
)
top_k_output_tokens = top_k_output.sequences[:, input_length:]

# Top-P
top_p_output_scores = model.compute_transition_scores(
    top_p_output.sequences, top_p_output.scores, normalize_logits=True
)
top_p_output_tokens = top_p_output.sequences[:, input_length:]

# for tok, score in zip(greedy_output_tokens[0], greedy_output_scores[0]):
#     # | token | token string | logits | probability
#     print(f"| {tok:5d} | {tokenizer.decode(tok):8s} | {score.numpy():.3f} | {np.exp(score.numpy()):.2%}")


  beam_indices[beam_indices_mask] = 0
  beam_indices[beam_indices_mask] = 0


In [6]:
# Define function for perplexity and likelihood
def calculate_perplexity_and_likelihood(scores):
    # compute the sum of the log-likelihoods
    perplexity = 0
    likelihood = 0
    for score in scores[0]:
        perplexity += score.numpy()
        likelihood += np.exp(perplexity)
    perplexity = perplexity / len(scores[0])    
    return perplexity, likelihood

In [7]:
# calculate the perplexity and likelihood for the greedy output
greedy_perplexity, greedy_likelihood = calculate_perplexity_and_likelihood(greedy_output_scores)
print(f"Greedy perplexity: {greedy_perplexity:.3f}")
print(f"Greedy likelihood: {greedy_likelihood:.2%}")

# calculate the perplexity and likelihood for the beam output
beam_perplexity, beam_likelihood = calculate_perplexity_and_likelihood(beam_output_scores)
print(f"Beam perplexity: {beam_perplexity:.3f}")
print(f"Beam likelihood: {beam_likelihood:.2%}")

# calculate the perplexity and likelihood for the top-k output
top_k_perplexity, top_k_likelihood = calculate_perplexity_and_likelihood(top_k_output_scores)
print(f"Top-k perplexity: {top_k_perplexity:.3f}")
print(f"Top-k likelihood: {top_k_likelihood:.2%}")

# calculate the perplexity and likelihood for the top-p output
top_p_perplexity, top_p_likelihood = calculate_perplexity_and_likelihood(top_p_output_scores)
print(f"Top-p perplexity: {top_p_perplexity:.3f}")
print(f"Top-p likelihood: {top_p_likelihood:.2%}")

Greedy perplexity: -1.346
Greedy likelihood: 7.93%
Beam perplexity: -6.135
Beam likelihood: 7.48%
Top-k perplexity: -1.799
Top-k likelihood: 5.79%
Top-p perplexity: -2.432
Top-p likelihood: 4.73%


# Task 2

### Samsum: https://huggingface.co/datasets/samsum


In [8]:
# To load_dataset("samsum")
# !pip install py7zr

In [9]:
from datasets import load_dataset

dataset = load_dataset("samsum")

Found cached dataset samsum (C:/Users/Jeonghoon Kim/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)


  0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("philschmid/bart-large-cnn-samsum")

model = AutoModelForSeq2SeqLM.from_pretrained("philschmid/bart-large-cnn-samsum")

In [14]:
import pandas as pd

# create an empty DataFrame with 5 columns
df = pd.DataFrame(columns=['prompt', 'greedy_search', 'beam_search', 'top_k', 'top_p'])


# maxLength to 150 to assure full text
maxLength = 150

# Create summary from first 50 with test set 
print('Please... be patient...\nThis takes a while...')
for i in range(50):
    prompt = dataset['test'][i]['dialogue']
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs.input_ids

    # Greedy Search
    greedy_output = model.generate(
        input_ids, 
        max_new_tokens=maxLength, 
        return_dict_in_generate=True, 
        output_scores=True
    )

    # Beam Search
    beam_output = model.generate(
        input_ids, 
        max_new_tokens=maxLength, 
        num_beams=5, 
        no_repeat_ngram_size=2, 
        early_stopping=True,
        return_dict_in_generate=True, 
        output_scores=True
    )

    # Top-K
    # set top_k to 20
    top_k_output = model.generate(
        input_ids, 
        do_sample=True, 
        max_new_tokens=maxLength, 
        top_k=k,
        return_dict_in_generate=True, 
        output_scores=True
    )

    # Top-P (nuclear sampling)
    # set top_k = 20 and set top_p = 0.95
    top_p_output = model.generate(
        input_ids,
        do_sample=True, 
        max_new_tokens=maxLength, 
        top_k=k, 
        top_p=p, 
        return_dict_in_generate=True, 
        output_scores=True
    )
    
    # convert list to string
    greedy = ' '.join(tokenizer.batch_decode(greedy_output[0], skip_special_tokens=True))
    beam = ' '.join(tokenizer.batch_decode(beam_output[0], skip_special_tokens=True))
    top_k = ' '.join(tokenizer.batch_decode(top_k_output[0], skip_special_tokens=True))
    top_p = ' '.join(tokenizer.batch_decode(top_p_output[0], skip_special_tokens=True))
    
    row = {'prompt': prompt, 'greedy_search': greedy, 'beam_search': beam, 'top_k': top_k, 'top_p': top_p}
    df = df.append(row, ignore_index=True)
    
# save to csv file
df.to_csv('output.csv', encoding='utf-8-sig', index=False)
print('\ndf saved!')

Please... be patient...
This takes a while...

df saved!


# Task 3