# Generate answers with TriviaQA
Used for closed-book QA (=without supporting paragraph)

In [2]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, OPTForCausalLM
import numpy as np

torch.cuda.empty_cache()

In [3]:
data_dir = "data"

## Load and inspect data

In [4]:
data_trivia = load_dataset("trivia_qa", "rc.nocontext")

# Remove unnecessary columns
data_trivia = data_trivia.remove_columns(["question_source", "entity_pages", "search_results"])

# Split to train, validation, test set
data_trivia_train = data_trivia["train"]
data_trivia_val = data_trivia["validation"]
data_trivia_test = data_trivia["test"]

print(f"Trivia QA Training Set Size: {data_trivia_train.shape}")
print(f"Trivia QA Validation Set Size: {data_trivia_val.shape}")
print(f"Trivia QA Test Set Size: {data_trivia_test.shape}")

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Trivia QA Training Set Size: (138384, 3)
Trivia QA Validation Set Size: (17944, 3)
Trivia QA Test Set Size: (17210, 3)


In [4]:
print("Training Set")
for i in range(2):
    print(f"Q: {data_trivia_train[i]['question']}\nA: {data_trivia_train[i]['answer']['value']}\n")

print("Validation Set")
for i in range(2):
    print(f"Q: {data_trivia_val[i]['question']}\nA: {data_trivia_val[i]['answer']['value']}\n")

print("Test Set")
for i in range(2):
    print(f"Q: {data_trivia_test[i]['question']}\nA: {data_trivia_test[i]['answer']['value']}\n")

Training Set
Q: Which American-born Sinclair won the Nobel Prize for Literature in 1930?
A: Sinclair Lewis

Q: Where in England was Dame Judi Dench born?
A: York

Validation Set
Q: Who was the man behind The Chipmunks?
A: David Seville

Q: Which Lloyd Webber musical premiered in the US on 10th December 1993?
A: Sunset Boulevard

Test Set
Q: Asmara international airport is in which country?
A: <unk>

Q: At whose concert were 11 people trampled to death in Ohio in 1979?
A: <unk>



- Training set: Used for few shot prompt
- Validation set: Estimate uncertainty of model

Reason: Later evaluate how good uncertainty measure is by using AUROC --> Need correct answer

## Run some predictions
Same as in the paper, the OPT model is used. Because of computational constraints, I use the OPT model with 1.3B parameters. The smallest model used in the paper is the one with 2.7B parameters (see page 7).

In [5]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
# Causal LM: model predicts next token 
checkpoint = "opt-1.3B"
tokenizer = AutoTokenizer.from_pretrained(f"facebook/{checkpoint}", cache_dir=data_dir)
model = OPTForCausalLM.from_pretrained(f"facebook/{checkpoint}", cache_dir=data_dir)
model = model.to(device)



In [8]:
# Try out examples
for i in range(5):
    question = "Q: " + data_trivia_val[i]["question"] + " A:"
    answer = data_trivia_val[i]["answer"]["value"]

    inputs = tokenizer(question, padding=False, truncation=False, return_tensors="pt").to(device)
    length_input = inputs["input_ids"].shape[1]

    generate_ids = model.generate(inputs.input_ids, max_length=256)
    # Only decode answer and not posed question
    output = tokenizer.batch_decode(generate_ids[0][length_input:], skip_special_tokens=True)

    print(question)
    print(f"True answer: {answer}")
    print(f"Model output: {''.join(output)}")
    print("-------------------------")

Q: Who was the man behind The Chipmunks? A:
True answer: David Seville
Model output:  The man behind The Chipmunks was a man named Chip.

Q: Who was the man behind The Chipmunks? A: The man behind The Chipmunks was a man named Chip.

Q: Who was the man behind The Chipmunks? A: The man behind The Chipmunks was a man named Chip.

Q: Who was the man behind The Chipmunks? A: The man behind The Chipmunks was a man named Chip.

Q: Who was the man behind The Chipmunks? A: The man behind The Chipmunks was a man named Chip.

Q: Who was the man behind The Chipmunks? A: The man behind The Chipmunks was a man named Chip.

Q: Who was the man behind The Chipmunks? A: The man behind The Chipmunks was a man named Chip.

Q: Who was the man behind The Chipmunks? A: The man behind The Chipmunks was a man named Chip.

Q: Who was the man behind The Chipmunks? A: The man behind The Chipmunks was a
-------------------------
Q: Which Lloyd Webber musical premiered in the US on 10th December 1993? A:
True answ

Apart from all the answers being wrong, after the answer, the model continues to either repeat the question/answer or asks new questions. To account for this issue a better prompt format is needed. To account for this issue, the paper (page 16) proposes to trim all generations by pattern matching for the bad-words "Q:", "Question:", "QUESTION:", "questions:". This means that those tokens will not be part of the generation. 

Another problem is that " Q:" and " A:" are tokenized with a leading space. This leads to the problem that each generation starts with a space and therefore the probability of the sequence is distorted. As [this](https://github.com/meta-llama/llama/issues/217#issuecomment-1774147331) discussion on GitHub proposes, I remove all spaces.

In [9]:
print(f"Tokenization for 'Q:': {tokenizer('Q:')}\t for ' Q:': {tokenizer(' Q:')}")
print(f"Tokenization for 'A:': {tokenizer('A:')}\t for ' A:': {tokenizer(' A:')}")
print()
example_answer = "The Chipmunks Q: Who was the man behind The Chipmunks? A:"
print(f"Excerpt of example answer{example_answer}")
tokenized_example_answer = tokenizer(example_answer)
print(f"Tokenized example answer: {tokenized_example_answer}")

Tokenization for 'Q:': {'input_ids': [2, 1864, 35], 'attention_mask': [1, 1, 1]}	 for ' Q:': {'input_ids': [2, 1209, 35], 'attention_mask': [1, 1, 1]}
Tokenization for 'A:': {'input_ids': [2, 250, 35], 'attention_mask': [1, 1, 1]}	 for ' A:': {'input_ids': [2, 83, 35], 'attention_mask': [1, 1, 1]}

Excerpt of example answerThe Chipmunks Q: Who was the man behind The Chipmunks? A:
Tokenized example answer: {'input_ids': [2, 133, 11055, 20614, 2258, 1209, 35, 3394, 21, 5, 313, 639, 20, 11055, 20614, 2258, 116, 83, 35], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


As you can see, the series 1209, 35 and the series 83, 35 appear in the tokenized example answer, but the series 1864, 35 and the series 250, 35 for the token without leading space don't appear. When removing the spaces, the tokenizer has problems in tokenizing "Q:" and "A:" as one token. As a result, I change them to "QUESTION:" and "ANSWER:".

## Better prompt format/Changes in generation
- QUESTION:, ANSWER: instead of Q:, A:
- No empty spaces between QUESTION:, ANSWER:
- Paper (page 16): Few-shot prompting with n=10
- New line after each sample in few shot prompt 
- New line character as eos token id (this makes sense as TriviaQA provides short answers in one line) --> Generation stops, once a \n is encountered
- After a lot of tries and evaluating answers: Improved the list of stop tokens
- Set maximum token length that generated answer can have to smaller value as answers in TriviaQA are also quite short

In [7]:
# Few shot prompt
selected_training_data = data_trivia_train.select(range(0, 10))
ten_shot_prompt = ""
for data in selected_training_data:
    ten_shot_prompt += "QUESTION:" + data["question"] + "ANSWER:" + data["answer"]["value"] + "\n"

# Define stop tokens, use token on position 1 bc position 0 is special token
stop_tokens = ["Q:", "Question:", "QUESTION:", "questions:", " Q:", " Question:", " QUESTION:", " questions:",
               "A:", "Answer:", "ANSWER:", "answers:", " A:", " Answer:", " ANSWER:", " answers:", "Answers:",
               " Answers:",
               "Topic:", " Topic:", "TOPIC:", " TOPIC:", ".", " .", "...", " ...", "?", " ?", ":", " :", "!", " !"]
stop_tokens = [[tokenizer(stop_token)["input_ids"][1]] for stop_token in stop_tokens]

# Define eos token
eos_token = tokenizer("\n")["input_ids"][1]
tokenizer.pad_token_id = eos_token
tokenizer.eos_token_id = eos_token

# Maximum token length that generated answer can have
max_new_tokens = 32

That means, that the few shot prompt looks like this + "QUESTION:<question we really want to ask>ANSWER:". In the following, only the question we really want to ask is printed out.

In [8]:
print(ten_shot_prompt)

QUESTION:Which American-born Sinclair won the Nobel Prize for Literature in 1930?ANSWER:Sinclair Lewis
QUESTION:Where in England was Dame Judi Dench born?ANSWER:York
QUESTION:In which decade did Billboard magazine first publish and American hit chart?ANSWER:30s
QUESTION:From which country did Angola achieve independence in 1975?ANSWER:Portugal
QUESTION:Which city does David Soul come from?ANSWER:Chicago
QUESTION:Who won Super Bowl XX?ANSWER:Chicago Bears
QUESTION:Which was the first European country to abolish capital punishment?ANSWER:Norway
QUESTION:In which country did he widespread use of ISDN begin in 1988?ANSWER:Japan
QUESTION:What is Bruce Willis' real first name?ANSWER:Walter
QUESTION:Which William wrote the novel Lord Of The Flies?ANSWER:Golding



Try generating sequences again with the new prompt format:

In [16]:
for i in range(10):
    question = ten_shot_prompt + "QUESTION:" + data_trivia_val[i]["question"] + "ANSWER:"
    answer = data_trivia_val[i]["answer"]["value"]

    # As mentioned above, only print question
    print(f"Question: {data_trivia_val[i]['question']}")
    print(f"True Answer: {answer}")

    inputs = tokenizer(question, padding=False, truncation=False, return_tensors="pt").to(device)
    length_input = inputs["input_ids"].shape[1]

    # Generate sequence by always taking token with max probability (greedy)
    output_generate = model.generate(inputs.input_ids,
                                     max_new_tokens=max_new_tokens,
                                     eos_token_id=eos_token,
                                     bad_words_ids=stop_tokens)
    output = tokenizer.batch_decode(output_generate[0][length_input:], skip_special_tokens=True)
    print(f"Model Output: {''.join(output)}")
    print("-------------------------")

Question: Who was the man behind The Chipmunks?
True Answer: David Seville
Model Output: Walt Disney

-------------------------
Question: Which Lloyd Webber musical premiered in the US on 10th December 1993?
True Answer: Sunset Boulevard
Model Output: The Phantom Of The Opera

-------------------------
Question: Who was the next British Prime Minister after Arthur Balfour?
True Answer: Campbell-Bannerman
Model Output: John Major

-------------------------
Question: Who had a 70s No 1 hit with Kiss You All Over?
True Answer: Exile
Model Output: The Beatles

-------------------------
Question: What claimed the life of singer Kathleen Ferrier?
True Answer: Cancer
Model Output: The Beatles

-------------------------
Question: Rita Coolidge sang the title song for which Bond film?
True Answer: Octopussy
Model Output: Live And Let Die

-------------------------
Question: What was the last US state to reintroduce alcohol after prohibition?
True Answer: Utah
Model Output: New York

-----------

The format of the answers looks good - we can argue about their correctness though (except for one example).

## Include probability of generated output sequence and number of output tokens
Number of output tokens used to calculate length-normalized predictive entropy

For calculating the probability of generated output, see: https://discuss.huggingface.co/t/announcement-generation-get-probabilities-for-generated-output/30075

In [28]:
for i in [100, 200, 300]:
    question = ten_shot_prompt + "QUESTION:" + data_trivia_val[i]["question"] + "ANSWER:"
    answer = data_trivia_val[i]["answer"]["value"]

    # As mentioned above, only print question
    print(f"Question: {data_trivia_val[i]['question']}")
    print(f"True Answer: {answer}")

    inputs = tokenizer(question, padding=False, truncation=False, return_tensors="pt").to(device)
    length_input = inputs["input_ids"].shape[1]

    output_generate = model.generate(inputs.input_ids,
                                     max_new_tokens=max_new_tokens,
                                     eos_token_id=eos_token,
                                     bad_words_ids=stop_tokens,
                                     return_dict_in_generate=True,
                                     output_scores=True)
    output = tokenizer.batch_decode(output_generate.sequences[0][length_input:], skip_special_tokens=True)

    # Calculate probability and count output tokens
    n_output_tokens = 0
    transition_scores = model.compute_transition_scores(output_generate.sequences,
                                                        output_generate.scores,
                                                        normalize_logits=True)
    generated_tokens = output_generate.sequences[:, length_input:]
    prob_output = 1
    for tok, score in zip(generated_tokens[0], transition_scores[0]):
        print(
            f"{tok:5d} | {repr(tokenizer.decode(tok)):12s} | {score.cpu().numpy():.4f} | {np.exp(score.cpu().numpy()):.2%}")
        prob_output *= np.exp(score.cpu().numpy())
        n_output_tokens += 1

    print(f"Model Output: {''.join(output)}")
    print(f"Probability of output: {prob_output}")
    print(f"Number of output tokens: {n_output_tokens}")
    print("-------------------------")

Question: Which Oscar-nominated film had You Sexy Thing as its theme song?
True Answer: The Full Monty
  133 | 'The'        | -1.9775 | 13.84%
 6748 | ' Dep'       | -3.2594 | 3.84%
26587 | 'arted'      | -0.0164 | 98.38%
50118 | '\n'         | -0.0608 | 94.10%
Model Output: The Departed

Probability of output: 0.004921769236503534
Number of output tokens: 4
-------------------------
Question: Who was the first American to travel faster than the speed of sound?
True Answer: Chuck Yeager
  863 | 'J'          | -3.3494 | 3.51%
17773 | 'ules'       | -1.1234 | 32.52%
 3060 | ' Ver'       | -0.0153 | 98.48%
  858 | 'ne'         | -0.0002 | 99.98%
50118 | '\n'         | -0.0715 | 93.10%
Model Output: Jules Verne

Probability of output: 0.010463550529798284
Number of output tokens: 5
-------------------------
Question: What is the longest word can be typed using only the top row of letters on a typewriter?
True Answer: Typewriter
  176 | '2'          | -3.7893 | 2.26%
    6 | ','          | 

## Sampling multiple answers
See: https://huggingface.co/docs/transformers/en/generation_strategies and https://huggingface.co/blog/how-to-generate
### Setup
The following setup remains the same for different sampling techniques. I sample 10 answers per question (see page 8 in paper).

The sampled list of tokens includes the special token <pad> for padding purposes. As special tokens should not be included in the calculation of the probability of the output, they are excluded (also in the count of output tokens). Special tokens can be identified by having a < and a >. 

For example: 
['D', 'ery', 'ck', ' Gibson', '\n', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']

Here, only the first 5 tokens should be included.

In [9]:
# How many questions to test in this jupyter notebook
n_questions = 10

# How many answers to sample (here 10, see page 8 in paper)
n_sample = 10

# Temperature for sampling (used in paper: 0.25, 0.5, 1, 1.5)
temperature = 0.5

In [10]:
def calculate_probability_sequence(output_generate, length_input, idx=0, beam_sampling=False, print_scores=False):
    """
    Calculates the probability of a generated sequence, excludes special tokens, EOS token
    :param output_generate: output of function generate(...)
    :param length_input: length of input_ids to function generate(...)
    :param idx: if multiple sequences got generated, index of the sequence for that probability should be calculated (0 if only one sequence)
    :param beam_sampling: Set true if you want to calculate the probability of a sequence created with beam sampling
    :param print_scores: Set True if table with token, decoded token, score and probability should be printed out
    :return: Probability of generated sequence, number of output tokens
    """

    if not beam_sampling:
        transition_scores = model.compute_transition_scores(output_generate.sequences, output_generate.scores,
                                                            normalize_logits=True)
    
    else:
        transition_scores = model.compute_transition_scores(output_generate.sequences, output_generate.scores,
                                                            output_generate.beam_indices, normalize_logits=True)
    
    generated_tokens = output_generate.sequences[:, length_input:]
    prob_output = 1
    n_output_tokens = 0

    for tok, score in zip(generated_tokens[idx], transition_scores[idx]):
        decoded_token = tokenizer.decode(tok)
        if print_scores:
            print(
                f"{tok:5d} | {repr(decoded_token):12s} | {score.cpu().numpy():.4f} | {np.exp(score.cpu().numpy()):.2%}")

        # Don't include special tokens in generation
        if "<" not in decoded_token or ">" not in decoded_token:
            prob_output *= np.exp(score.cpu().numpy())
            n_output_tokens += 1        

    return prob_output, n_output_tokens

### Multinomial sampling

In [33]:
for i in range(n_questions):
    question = ten_shot_prompt + "QUESTION:" + data_trivia_val[i]["question"] + "ANSWER:"
    answer = data_trivia_val[i]["answer"]["value"]

    # As mentioned above, only print question
    print(f"Question: {data_trivia_val[i]['question']}")
    print(f"True Answer: {answer}")

    inputs = tokenizer(question, padding=False, truncation=False, return_tensors="pt").to(device)
    length_input = inputs["input_ids"].shape[1]

    # Sample sequences
    output_generate = model.generate(inputs.input_ids,
                                     max_new_tokens=max_new_tokens,
                                     eos_token_id=eos_token,
                                     bad_words_ids=stop_tokens,
                                     return_dict_in_generate=True,
                                     output_scores=True,
                                     do_sample=True,
                                     num_return_sequences=n_sample,
                                     temperature=temperature,
                                     top_p=0.9)

    for n_sequence in range(n_sample):
        output = tokenizer.batch_decode(output_generate.sequences[n_sequence][length_input:],
                                        skip_special_tokens=True)

        # Calculating probability of sequence
        prob_output, n_output_tokens = calculate_probability_sequence(output_generate, length_input, idx=n_sequence,
                                                                      print_scores=False)

        # Print out result
        output_string = "".join(output).replace("\n", "")
        print(f"Sequence ({n_sequence}): {output_string} (P: {prob_output:.6}, Length output: {n_output_tokens})")
    print("-------------------------")

Question: Who was the man behind The Chipmunks?
True Answer: David Seville
Sequence (0): Ralph Bakshi (P: 0.0262715, Length output: 5)
Sequence (1): Morten Tyldum (P: 0.0230432, Length output: 7)
Sequence (2): Holtzman (P: 0.00350962, Length output: 4)
Sequence (3): Barrister (P: 7.56197e-05, Length output: 4)
Sequence (4): Steven Spielberg (P: 0.0162728, Length output: 3)
Sequence (5): Ralph Bakshi (P: 0.0262715, Length output: 5)
Sequence (6): Don Mancini (P: 0.00121557, Length output: 5)
Sequence (7): Travis (P: 0.00503784, Length output: 3)
Sequence (8): Thomas Middleditch (P: 0.0646235, Length output: 5)
Sequence (9): Horton Hears a Who (P: 0.00504698, Length output: 8)
-------------------------
Question: Which Lloyd Webber musical premiered in the US on 10th December 1993?
True Answer: Sunset Boulevard
Sequence (0): The Phantom Of The Opera (P: 0.810635, Length output: 6)
Sequence (1): The Phantom Of The Opera (P: 0.810635, Length output: 6)
Sequence (2): The Phantom Of The Opera

### Multinomial Beam Sampling
n_sample highest scoring beams are returned

In [11]:
for i in range(n_questions):
    question = ten_shot_prompt + "QUESTION:" + data_trivia_val[i]["question"] + "ANSWER:"
    answer = data_trivia_val[i]["answer"]["value"]

    # As mentioned above, only print question
    print(f"Question: {data_trivia_val[i]['question']}")
    print(f"True Answer: {answer}")

    inputs = tokenizer(question, padding=False, truncation=False, return_tensors="pt").to(device)
    length_input = inputs["input_ids"].shape[1]

    # Sample sequences
    output_generate = model.generate(inputs.input_ids,
                                     max_new_tokens=max_new_tokens,
                                     eos_token_id=eos_token,
                                     bad_words_ids=stop_tokens,
                                     return_dict_in_generate=True,
                                     output_scores=True,
                                     do_sample=True,
                                     num_beams=2 * n_sample,
                                     num_return_sequences=n_sample)

    for n_sequence in range(n_sample):
        output = tokenizer.batch_decode(output_generate.sequences[n_sequence][length_input:],
                                        skip_special_tokens=True)

        # Calculating probability of sequence
        prob_output, n_output_tokens = calculate_probability_sequence(output_generate, length_input, beam_sampling=True, idx=n_sequence,
                                                                      print_scores=False)

        # Print out result
        output_string = ''.join(output).replace("\n", "")
        print(f"Sequence ({n_sequence}): {output_string} (P: {prob_output:.6}, Length output: {n_output_tokens})")
    print("-------------------------")

Question: Who was the man behind The Chipmunks?
True Answer: David Seville
Sequence (0): Sylvester Stallone (P: 0.000796802, Length output: 7)
Sequence (1): John Lasseter (P: 0.00728878, Length output: 5)
Sequence (2): John Lasseter (P: 0.00728878, Length output: 5)
Sequence (3): Lemony Snicket (P: 0.00170559, Length output: 6)
Sequence (4): John Lasseter, the creator of the Chipmunks, is also the co-founder of Pixar, the animation studio behind The Incredibles (P: 4.56739e-16, Length output: 31)
Sequence (5): John Lasseter, the creator of the Chipmunks, is also the co-founder of Pixar, the animation studio behind Toy Story and Finding Nemo (P: 1.27818e-16, Length output: 32)
Sequence (6): John Lasseter, the creator of the Chipmunks, is also the co-founder of Pixar, the animation studio behind Toy Story, Finding Nemo (P: 1.29014e-16, Length output: 32)
Sequence (7): Ralph Bakshi (P: 0.0060907, Length output: 5)
Sequence (8): John Lasseter, the creator of the Chipmunks, is also the co-f

Overall, the quality of generations with beam search looks worse than the one for multimodal sampling. This has mainly three reasons:
1) Beam search suffers from repetitive generation. This could be solve by setting no_repeat_ngram_size and early_stopping, however sometimes repetition is desirable 
2) The generations are longer than the ones for multimodal sampling, however the true answers of the TriviaQA dataset are quite short. 
3) The generations with beam search seem less diverse. This is also noted in the paper (page 15) and will be visualized later by calculating the diversity scores.
