# Generate answers with TriviaQA
Used for closed-book QA (=without supporting paragraph)

In [1]:
from datasets import load_dataset
import torch
from transformers import AutoTokenizer, OPTForCausalLM
import numpy as np
from utils import calculate_probability_sequence

# Release cached, currently unused GPU memory held by PyTorch
# (presumably so that a re-run in the same kernel starts from a clean state before the model is loaded)
torch.cuda.empty_cache()

In [2]:
# Local directory that is passed as `cache_dir` when the tokenizer and the model are loaded further below
model_dir = "models"

## Load and inspect data

In [3]:
# Columns that are not needed for closed-book question answering
unused_columns = ["question_source", "entity_pages", "search_results"]

# Load the "rc.nocontext" configuration of TriviaQA and drop the unused columns right away
data_trivia = load_dataset("trivia_qa", "rc.nocontext").remove_columns(unused_columns)

# One variable per split
data_trivia_train, data_trivia_val, data_trivia_test = (
    data_trivia[split_name] for split_name in ("train", "validation", "test")
)

# Report (rows, columns) of every split
for split_label, split_data in (("Training", data_trivia_train),
                                ("Validation", data_trivia_val),
                                ("Test", data_trivia_test)):
    print(f"Trivia QA {split_label} Set Size: {split_data.shape}")

Resolving data files:   0%|          | 0/26 [00:00<?, ?it/s]

Trivia QA Training Set Size: (138384, 3)
Trivia QA Validation Set Size: (17944, 3)
Trivia QA Test Set Size: (17210, 3)


In [4]:
# Print the first two question/answer pairs of every split, each under its own heading
preview_splits = (
    ("Training Set", data_trivia_train),
    ("Validation Set", data_trivia_val),
    ("Test Set", data_trivia_test),
)
for split_title, split_data in preview_splits:
    print(split_title)
    for i in range(2):
        sample = split_data[i]
        print(f"Q: {sample['question']}\nA: {sample['answer']['value']}\n")

Training Set
Q: Which American-born Sinclair won the Nobel Prize for Literature in 1930?
A: Sinclair Lewis

Q: Where in England was Dame Judi Dench born?
A: York

Validation Set
Q: Who was the man behind The Chipmunks?
A: David Seville

Q: Which Lloyd Webber musical premiered in the US on 10th December 1993?
A: Sunset Boulevard

Test Set
Q: Asmara international airport is in which country?
A: <unk>

Q: At whose concert were 11 people trampled to death in Ohio in 1979?
A: <unk>



- Training set: Used for few shot prompt
- Validation set: Estimate uncertainty of model

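Reason: The quality of the uncertainty measure is later evaluated with the AUROC, which requires the correct answers. The test set does not provide them (its answers are `<unk>`, see above), so the validation set is used instead.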

## Run some predictions
Same as in the paper, the OPT model is used. Because of computational constraints, I use the OPT model with 1.3B parameters. The smallest model used in the paper is the one with 2.7B parameters (see page 7).

In [5]:
# Run on the GPU when one is available, otherwise fall back to the CPU
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cuda')

In [7]:
# OPT is a causal language model, i.e. it predicts the next token given all previous ones
checkpoint = "opt-1.3B"
hub_name = f"facebook/{checkpoint}"

# Tokenizer and weights are cached in `model_dir`
tokenizer = AutoTokenizer.from_pretrained(hub_name, cache_dir=model_dir)
model = OPTForCausalLM.from_pretrained(hub_name, cache_dir=model_dir)

# Move the weights to the selected device
model = model.to(device)

In [8]:
# Try out examples: ask the first five validation questions in a plain "Q: ... A:" format
for i in range(5):
    question = "Q: " + data_trivia_val[i]["question"] + " A:"
    answer = data_trivia_val[i]["answer"]["value"]

    inputs = tokenizer(question, padding=False, truncation=False, return_tensors="pt").to(device)
    # Number of prompt tokens; everything after this position is newly generated
    length_input = inputs["input_ids"].shape[1]

    # Unpack the whole encoding so that the attention mask is handed to generate() as well
    generate_ids = model.generate(**inputs, max_length=256)

    # Only decode the answer and not the posed question.
    # The continuation is decoded in ONE call: decoding token by token and joining the pieces
    # can garble characters whose bytes are split across several byte-level BPE tokens.
    output = tokenizer.decode(generate_ids[0][length_input:], skip_special_tokens=True)

    print(question)
    print(f"True answer: {answer}")
    print(f"Model output: {output}")
    print("-------------------------")

Q: Who was the man behind The Chipmunks? A:
True answer: David Seville
Model output:  The man behind The Chipmunks was a man named Chip.

Q: Who was the man behind The Chipmunks? A: The man behind The Chipmunks was a man named Chip.

Q: Who was the man behind The Chipmunks? A: The man behind The Chipmunks was a man named Chip.

Q: Who was the man behind The Chipmunks? A: The man behind The Chipmunks was a man named Chip.

Q: Who was the man behind The Chipmunks? A: The man behind The Chipmunks was a man named Chip.

Q: Who was the man behind The Chipmunks? A: The man behind The Chipmunks was a man named Chip.

Q: Who was the man behind The Chipmunks? A: The man behind The Chipmunks was a man named Chip.

Q: Who was the man behind The Chipmunks? A: The man behind The Chipmunks was a man named Chip.

Q: Who was the man behind The Chipmunks? A: The man behind The Chipmunks was a
-------------------------
Q: Which Lloyd Webber musical premiered in the US on 10th December 1993? A:
True answ

Apart from all the answers being wrong, the model does not stop after the answer but continues to either repeat the question/answer or ask new questions. Two things are needed to account for this issue: a better prompt format and, as the paper (page 16) proposes, trimming all generations by pattern matching for the bad words "Q:", "Question:", "QUESTION:", "questions:". This means that those tokens will not be part of the generation.

Another problem is that " Q:" and " A:" are tokenized with a leading space. This leads to the problem that each generation starts with a space and therefore the probability of the sequence is distorted. As [this](https://github.com/meta-llama/llama/issues/217#issuecomment-1774147331) discussion on GitHub proposes, I remove all spaces.

In [9]:
# Compare how "Q:" / "A:" are tokenized with and without a leading space
print(f"Tokenization for 'Q:': {tokenizer('Q:')}\t for ' Q:': {tokenizer(' Q:')}")
print(f"Tokenization for 'A:': {tokenizer('A:')}\t for ' A:': {tokenizer(' A:')}")
print()

# Typical continuation produced by the model: the answer followed by a repeated question
example_answer = "The Chipmunks Q: Who was the man behind The Chipmunks? A:"
# Separator added after the label so that it does not run into the example text
print(f"Excerpt of example answer: {example_answer}")
tokenized_example_answer = tokenizer(example_answer)
print(f"Tokenized example answer: {tokenized_example_answer}")

Tokenization for 'Q:': {'input_ids': [2, 1864, 35], 'attention_mask': [1, 1, 1]}	 for ' Q:': {'input_ids': [2, 1209, 35], 'attention_mask': [1, 1, 1]}
Tokenization for 'A:': {'input_ids': [2, 250, 35], 'attention_mask': [1, 1, 1]}	 for ' A:': {'input_ids': [2, 83, 35], 'attention_mask': [1, 1, 1]}

Excerpt of example answerThe Chipmunks Q: Who was the man behind The Chipmunks? A:
Tokenized example answer: {'input_ids': [2, 133, 11055, 20614, 2258, 1209, 35, 3394, 21, 5, 313, 639, 20, 11055, 20614, 2258, 116, 83, 35], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


As you can see, the series 1209, 35 and the series 83, 35 appear in the tokenized example answer, but the series 1864, 35 and the series 250, 35 for the token without leading space don't appear. When removing the spaces, the tokenizer has problems in tokenizing "Q:" and "A:" as one token. As a result, I change them to "QUESTION:" and "ANSWER:".

## Better prompt format/Changes in generation
- QUESTION:, ANSWER: instead of Q:, A:
- No empty spaces between QUESTION:, ANSWER:
- Paper (page 16): Few-shot prompting with n=10
- New line after each sample in few shot prompt 
- New line character as eos token id (this makes sense as TriviaQA provides short answers in one line) --> Generation stops, once a \n is encountered
- After a lot of tries and evaluating answers: Improved the list of stop tokens
- Set maximum token length that generated answer can have to smaller value as answers in TriviaQA are also quite short

In [10]:
# Few shot prompt: the first ten training samples, one "QUESTION:...ANSWER:..." pair per line
selected_training_data = data_trivia_train.select(range(0, 10))
ten_shot_prompt = "".join(
    "QUESTION:" + data["question"] + "ANSWER:" + data["answer"]["value"] + "\n"
    for data in selected_training_data
)

# Words/punctuation that must not show up in a generated answer
stop_words = ["Q:", "Question:", "QUESTION:", "questions:", " Q:", " Question:", " QUESTION:", " questions:",
              "A:", "Answer:", "ANSWER:", "answers:", " A:", " Answer:", " ANSWER:", " answers:", "Answers:",
              " Answers:",
              "Topic:", " Topic:", "TOPIC:", " TOPIC:", ".", " .", "...", " ...", "?", " ?", ":", " :", "!", " !"]

# Convert every stop word into the single-token entries expected by `bad_words_ids`.
# Special tokens are switched off explicitly, so index 0 is always the first real token
# (instead of relying on a special token sitting at position 0 and reading position 1).
# NOTE(review): only the FIRST token of each stop word is banned. For stop words that are split
# into several tokens this bans the leading token on its own - confirm that this is intended.
stop_tokens = [[tokenizer(stop_word, add_special_tokens=False)["input_ids"][0]] for stop_word in stop_words]

# Define eos token: the id of the newline token, so that generation stops at the end of the answer line
eos_token = tokenizer("\n", add_special_tokens=False)["input_ids"][0]
tokenizer.pad_token_id = eos_token
tokenizer.eos_token_id = eos_token

# Maximum token length that generated answer can have
max_new_tokens = 32

That means that the full prompt consists of the few-shot prompt printed below, followed by "QUESTION:<question we really want to ask>ANSWER:". In the following, only the question we really want to ask is printed out.

In [11]:
# Show the complete ten-shot prompt that is prepended to every question in the cells below
print(ten_shot_prompt)

QUESTION:Which American-born Sinclair won the Nobel Prize for Literature in 1930?ANSWER:Sinclair Lewis
QUESTION:Where in England was Dame Judi Dench born?ANSWER:York
QUESTION:In which decade did Billboard magazine first publish and American hit chart?ANSWER:30s
QUESTION:From which country did Angola achieve independence in 1975?ANSWER:Portugal
QUESTION:Which city does David Soul come from?ANSWER:Chicago
QUESTION:Who won Super Bowl XX?ANSWER:Chicago Bears
QUESTION:Which was the first European country to abolish capital punishment?ANSWER:Norway
QUESTION:In which country did he widespread use of ISDN begin in 1988?ANSWER:Japan
QUESTION:What is Bruce Willis' real first name?ANSWER:Walter
QUESTION:Which William wrote the novel Lord Of The Flies?ANSWER:Golding



Try generating sequences again with the new prompt format:

In [12]:
# Answer the first ten validation questions with the ten-shot prompt
for i in range(10):
    question = ten_shot_prompt + "QUESTION:" + data_trivia_val[i]["question"] + "ANSWER:"
    answer = data_trivia_val[i]["answer"]["value"]

    # As mentioned above, only print question
    print(f"Question: {data_trivia_val[i]['question']}")
    print(f"True Answer: {answer}")

    inputs = tokenizer(question, padding=False, truncation=False, return_tensors="pt").to(device)
    # Number of prompt tokens; everything after this position is newly generated
    length_input = inputs["input_ids"].shape[1]

    # Generate sequence by always taking token with max probability (greedy).
    # The whole encoding is unpacked so that the attention mask is passed along with the input ids.
    output_generate = model.generate(**inputs,
                                     max_new_tokens=max_new_tokens,
                                     eos_token_id=eos_token,
                                     bad_words_ids=stop_tokens)

    # Decode the continuation in one call: decoding token by token and joining the pieces
    # can garble characters whose bytes are split across several byte-level BPE tokens.
    output = tokenizer.decode(output_generate[0][length_input:], skip_special_tokens=True)
    print(f"Model Output: {output}")
    print("-------------------------")

Question: Who was the man behind The Chipmunks?
True Answer: David Seville
Model Output: Walt Disney

-------------------------
Question: Which Lloyd Webber musical premiered in the US on 10th December 1993?
True Answer: Sunset Boulevard
Model Output: The Phantom Of The Opera

-------------------------
Question: Who was the next British Prime Minister after Arthur Balfour?
True Answer: Campbell-Bannerman
Model Output: John Major

-------------------------
Question: Who had a 70s No 1 hit with Kiss You All Over?
True Answer: Exile
Model Output: The Beatles

-------------------------
Question: What claimed the life of singer Kathleen Ferrier?
True Answer: Cancer
Model Output: The Beatles

-------------------------
Question: Rita Coolidge sang the title song for which Bond film?
True Answer: Octopussy
Model Output: Live And Let Die

-------------------------
Question: What was the last US state to reintroduce alcohol after prohibition?
True Answer: Utah
Model Output: New York

-----------

The format of the answers looks good - we can argue about their correctness though (except for one example).

## Include probability of generated output sequence and number of output tokens
Number of output tokens used to calculate length-normalized predictive entropy

For calculating the probability of generated output, see: https://discuss.huggingface.co/t/announcement-generation-get-probabilities-for-generated-output/30075

The function that calculates the probability of a sequence is implemented in the `utils.py` file. When using the generate method with multiple return sequences the special token <pad> for padding purposes is used. As special tokens should not be included in the calculation of the probability of the output, they are excluded (also in the count of output tokens). Special tokens can be identified by having a < and a >. 

For example: 
['D', 'ery', 'ck', ' Gibson', '\n', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']

Here, only the first 5 tokens should be included.

In [13]:
# Greedy generation for three validation questions, including per-token scores,
# the probability of the generated sequence and the number of generated tokens
for i in [100, 200, 300]:
    question = ten_shot_prompt + "QUESTION:" + data_trivia_val[i]["question"] + "ANSWER:"
    answer = data_trivia_val[i]["answer"]["value"]

    # As mentioned above, only print question
    print(f"Question: {data_trivia_val[i]['question']}")
    print(f"True Answer: {answer}")

    inputs = tokenizer(question, padding=False, truncation=False, return_tensors="pt").to(device)
    # Number of prompt tokens; everything after this position is newly generated
    length_input = inputs["input_ids"].shape[1]

    # `return_dict_in_generate` and `output_scores` make generate() return the scores of every step,
    # which are needed to compute the probability of the sequence.
    # The whole encoding is unpacked so that the attention mask is passed along with the input ids.
    output_generate = model.generate(**inputs,
                                     max_new_tokens=max_new_tokens,
                                     eos_token_id=eos_token,
                                     bad_words_ids=stop_tokens,
                                     return_dict_in_generate=True,
                                     output_scores=True)

    # Decode the continuation in one call: decoding token by token and joining the pieces
    # can garble characters whose bytes are split across several byte-level BPE tokens.
    output = tokenizer.decode(output_generate.sequences[0][length_input:], skip_special_tokens=True)

    # Calculate probability and count output tokens
    prob_output, n_output_tokens = calculate_probability_sequence(model, tokenizer, output_generate, length_input,
                                                                  print_scores=True)

    print(f"Model Output: {output}")
    print(f"Probability of output: {prob_output}")
    print(f"Number of output tokens: {n_output_tokens}")
    print("-------------------------")

Question: Which Oscar-nominated film had You Sexy Thing as its theme song?
True Answer: The Full Monty
  133 | 'The'        | -1.9775 | 13.84%
 6748 | ' Dep'       | -3.2594 | 3.84%
26587 | 'arted'      | -0.0164 | 98.38%
50118 | '\n'         | -0.0608 | 94.10%
Model Output: The Departed

Probability of output: 0.004921769236503534
Number of output tokens: 4
-------------------------
Question: Who was the first American to travel faster than the speed of sound?
True Answer: Chuck Yeager
  863 | 'J'          | -3.3494 | 3.51%
17773 | 'ules'       | -1.1234 | 32.52%
 3060 | ' Ver'       | -0.0153 | 98.48%
  858 | 'ne'         | -0.0002 | 99.98%
50118 | '\n'         | -0.0715 | 93.10%
Model Output: Jules Verne

Probability of output: 0.010463550529798284
Number of output tokens: 5
-------------------------
Question: What is the longest word can be typed using only the top row of letters on a typewriter?
True Answer: Typewriter
  176 | '2'          | -3.7893 | 2.26%
    6 | ','          | 

## Sampling multiple answers
See: https://huggingface.co/docs/transformers/en/generation_strategies and https://huggingface.co/blog/how-to-generate

Sample 10 answers per question (see page 8 in paper).



In [14]:
# How many validation questions are run through the sampling cells of this jupyter notebook
n_questions = 10

# How many answers to sample per question (here 10, see page 8 in paper)
n_sample = 10

# Temperature for sampling (used in paper: 0.25, 0.5, 1, 1.5).
# Only the multinomial sampling call below passes it on; the beam sampling call does not.
temperature = 0.5

### Multinomial sampling

In [15]:
# Fix the random seed so that the sampled answers are identical on every run of this cell
torch.manual_seed(42)

# Sample `n_sample` answers per question with multinomial sampling
for i in range(n_questions):
    question = ten_shot_prompt + "QUESTION:" + data_trivia_val[i]["question"] + "ANSWER:"
    answer = data_trivia_val[i]["answer"]["value"]

    # As mentioned above, only print question
    print(f"Question: {data_trivia_val[i]['question']}")
    print(f"True Answer: {answer}")

    inputs = tokenizer(question, padding=False, truncation=False, return_tensors="pt").to(device)
    # Number of prompt tokens; everything after this position is newly generated
    length_input = inputs["input_ids"].shape[1]

    # Sample sequences.
    # The whole encoding is unpacked so that the attention mask is passed along with the input ids.
    output_generate = model.generate(**inputs,
                                     max_new_tokens=max_new_tokens,
                                     eos_token_id=eos_token,
                                     bad_words_ids=stop_tokens,
                                     return_dict_in_generate=True,
                                     output_scores=True,
                                     do_sample=True,
                                     num_return_sequences=n_sample,
                                     temperature=temperature,
                                     top_p=0.9)

    for n_sequence in range(n_sample):
        # Decode the continuation in one call: decoding token by token and joining the pieces
        # can garble characters whose bytes are split across several byte-level BPE tokens.
        output = tokenizer.decode(output_generate.sequences[n_sequence][length_input:],
                                  skip_special_tokens=True)

        # Calculating probability of sequence
        prob_output, n_output_tokens = calculate_probability_sequence(model, tokenizer, output_generate, length_input,
                                                                      idx=n_sequence,
                                                                      print_scores=False)

        # Print out result (line breaks removed so that every sequence stays on one line)
        output_string = output.replace("\n", "")
        print(f"Sequence ({n_sequence}): {output_string} (P: {prob_output:.6}, Length output: {n_output_tokens})")
    print("-------------------------")

Question: Who was the man behind The Chipmunks?
True Answer: David Seville
Sequence (0): Harrison Bergeron (P: 0.00175879, Length output: 5)
Sequence (1): Morten (P: 0.00174914, Length output: 4)
Sequence (2): Chip (P: 0.0737231, Length output: 2)
Sequence (3): Bender (P: 0.00250416, Length output: 3)
Sequence (4): Kris Kristofferson (P: 0.00443387, Length output: 6)
Sequence (5): Spencer (P: 0.0198019, Length output: 4)
Sequence (6): Thomas Middleditch (P: 0.0646235, Length output: 5)
Sequence (7): Thomas Middleditch (P: 0.0646235, Length output: 5)
Sequence (8): Hoffman (P: 0.007357, Length output: 4)
Sequence (9): Steven Spielberg (P: 0.0162728, Length output: 3)
-------------------------
Question: Which Lloyd Webber musical premiered in the US on 10th December 1993?
True Answer: Sunset Boulevard
Sequence (0): Les Miserables (P: 0.0623636, Length output: 5)
Sequence (1): The Phantom Of The Opera (P: 0.810635, Length output: 6)
Sequence (2): The Phantom Of The Opera (P: 0.810635, Len

### Multinomial Beam Sampling
n_sample highest scoring beams are returned

In [16]:
# Fix the random seed so that the sampled beams are identical on every run of this cell
torch.manual_seed(42)

# Return the `n_sample` highest scoring beams per question, using beam search with sampling
for i in range(n_questions):
    question = ten_shot_prompt + "QUESTION:" + data_trivia_val[i]["question"] + "ANSWER:"
    answer = data_trivia_val[i]["answer"]["value"]

    # As mentioned above, only print question
    print(f"Question: {data_trivia_val[i]['question']}")
    print(f"True Answer: {answer}")

    inputs = tokenizer(question, padding=False, truncation=False, return_tensors="pt").to(device)
    # Number of prompt tokens; everything after this position is newly generated
    length_input = inputs["input_ids"].shape[1]

    # Sample sequences with twice as many beams as sequences that are returned.
    # The whole encoding is unpacked so that the attention mask is passed along with the input ids.
    output_generate = model.generate(**inputs,
                                     max_new_tokens=max_new_tokens,
                                     eos_token_id=eos_token,
                                     bad_words_ids=stop_tokens,
                                     return_dict_in_generate=True,
                                     output_scores=True,
                                     do_sample=True,
                                     num_beams=2 * n_sample,
                                     num_return_sequences=n_sample)

    for n_sequence in range(n_sample):
        # Decode the continuation in one call: decoding token by token and joining the pieces
        # can garble characters whose bytes are split across several byte-level BPE tokens.
        output = tokenizer.decode(output_generate.sequences[n_sequence][length_input:],
                                  skip_special_tokens=True)

        # Calculating probability of sequence
        prob_output, n_output_tokens = calculate_probability_sequence(model, tokenizer, output_generate, length_input,
                                                                      beam_sampling=True,
                                                                      idx=n_sequence,
                                                                      print_scores=False)

        # Print out result (line breaks removed so that every sequence stays on one line)
        output_string = output.replace("\n", "")
        print(f"Sequence ({n_sequence}): {output_string} (P: {prob_output:.6}, Length output: {n_output_tokens})")
    print("-------------------------")

Question: Who was the man behind The Chipmunks?
True Answer: David Seville
Sequence (0): Werner HerzogI don't know if this is the right place to post this, but I was wondering if anyone would be able to help me (P: 4.54025e-15, Length output: 31)
Sequence (1): Werner Herzog (P: 0.00127962, Length output: 6)
Sequence (2): Werner Herzog (P: 0.00127962, Length output: 6)
Sequence (3): Werner HerzogI don't know if this is the right place to post this, but I was wondering if anyone would be able to tell me (P: 2.35571e-15, Length output: 31)
Sequence (4): Wyatt Earp (P: 0.00163115, Length output: 6)
Sequence (5): Werner HerzogI don't know if this is the right place to post this, but I was wondering if anyone could help me out with a (P: 4.927e-15, Length output: 31)
Sequence (6): Werner HerzogI don't know if this is the right place to ask this, but does anyone know where I can get a copy of the (P: 4.18717e-15, Length output: 31)
Sequence (7): Thomas Middleditch (P: 0.00711988, Length outpu

Overall, the quality of generations with beam search looks worse than the one for multinomial sampling. This has mainly three reasons:
1) Beam search suffers from repetitive generation. This could be solved by setting no_repeat_ngram_size and early_stopping, however sometimes repetition is desirable.
2) The generations are longer than the ones for multinomial sampling, however the true answers of the TriviaQA dataset are quite short.
3) The generations with beam search seem less diverse. This is also noted in the paper (page 15) and will be visualized later by calculating the diversity scores.


## Save generations
To not have to run the generation of sequences every time, I save the generations. There are 

In [17]:
# Number of questions in the validation split (completes the sentence in the surrounding text)
len(data_trivia_val)

17944

number of samples (=questions with sample answers) in the validation set. For measuring the uncertainty, diversity of generations, ... I don't use the whole validation set due to computational limitations, but will randomly sample 5 groups of 500 samples each and then report the mean and standard deviation (SD) across those groups.

In [18]:
import os

In [19]:
# Size and number of the question groups that are used for the evaluation
n_samples_per_group = 500
n_groups = 5

# One line per group, each holding the comma-separated validation set indices of that group
group_indices_path = os.path.join("sampled_sequences", "group_indices.txt")

# Only draw the groups once, so that later runs keep working on the same questions
if not os.path.exists(group_indices_path):
    # The output directory may not exist yet; open(..., "w") does not create it
    os.makedirs(os.path.dirname(group_indices_path), exist_ok=True)

    # Seeded generator: the same groups can be recreated if the file gets lost
    rng = np.random.default_rng(42)

    with open(group_indices_path, "w") as f:
        for _ in range(n_groups):
            # Draw WITHOUT replacement: every group must consist of `n_samples_per_group` distinct
            # questions, otherwise duplicate indices would collapse into a single entry of the
            # per-group dict that is keyed by the index
            sampled_indices = rng.choice(len(data_trivia_val), size=n_samples_per_group, replace=False)
            f.write(",".join([str(i) for i in sampled_indices]) + "\n")

Per group, a pickle file containing a dict with the following structure should be created:

**For multinomial sampling:**
```python 
{ 1131: {"question": ..., 
         "true_answer": ..., 
         "temperature_0.25": {"answers": [...], "probabilities": [...], "length_sequences": [...]},
         "temperature_0.5": {"answers": [...], "probabilities": [...], "length_sequences": [...]},
         "temperature_1": {"answers": [...], "probabilities": [...], "length_sequences": [...]},
         "temperature_1.5": {"answers": [...], "probabilities": [...], "length_sequences": [...]}
        }, 
  4295: ...
}
```

**For multinomial beam sampling:**
```python 
{ 1131: {"question": ..., 
         "true_answer": ..., 
         "beam_20": {"answers": [...], "probabilities": [...], "length_output": [...]}
        }, 
  4295: ...
}
```
whereby 1131, 4295 are indices belonging to the group. In the following the created pickle file is visualized to see that the structure is correct, the full code of how to create it can be found in the file `save_generations.py`.

(Maybe later on for multinomial beam sampling different number of beams are tried out, that's why it is saved like this).

In [24]:
import pickle

# NOTE: pickle.load can execute arbitrary code - only open files that were created locally
# with `save_generations.py`
with open("sampled_sequences/multinomial_sampling/group0.pkl", "rb") as f:
    content = pickle.load(f)

# Show the size of the group and a single entry only: printing the whole dict would flood
# the notebook, and one entry is enough to check the structure
first_index = next(iter(content))
print(f"Number of questions in group: {len(content)}")
print({first_index: content[first_index]})

{12541: {'question': "Abraham Maslow's 'Hierarchy of Needs' theory explains?", 'true_answer': 'Motivation', 'temperature_0.25': {'answers': ['The need for self-actualization\n', 'The need for self-actualization\n', 'Hierarchy of Needs\n', 'The need for self-esteem, self-actualization, and self-actualization\n', 'The need for self-actualization\n', 'The need for self-actualization\n', 'Hierarchy of Needs\n', 'The need for self-esteem, self-actualization, self-reliance, self-esteem, self-actualization, self-esteem, self-', 'The need for self-actualization\n', 'The need for self-actualization\n'], 'probabilities': [0.678553236435347, 0.678553236435347, 0.11729502165704453, 0.02018552522669472, 0.678553236435347, 0.678553236435347, 0.11729502165704453, 0.003271788409672518, 0.678553236435347, 0.678553236435347], 'length_sequences': [8, 8, 6, 18, 8, 8, 6, 32, 8, 8]}, 'temperature_0.5': {'answers': ['Needs, needs, needs, needs, needs, needs, needs, needs, needs, needs, needs, needs, needs, n