# Group semantically equivalent answers into bins for each question
Model: DeBERTa (https://huggingface.co/sileod/deberta-v3-large-tasksource-nli)

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import yaml
import os
import pickle
import glob
from tqdm import tqdm

# Read the notebook settings from config.yaml; the cells below take the model cache
# directory, the folder with the saved generations, the temperatures and the beam sizes from it.
with open("config.yaml", "r") as f:
    config = yaml.safe_load(f)

In [2]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [6]:
# Directory passed as cache_dir when the tokenizer and the model are loaded.
model_dir = config["model_dir"]
# Folder containing the pickled generation groups (group*.pkl) that this notebook reads and rewrites.
save_path = config["path_to_saved_generations"]

In [6]:
# NLI model used below to decide whether one question/answer statement entails another.
model_name = "sileod/deberta-v3-large-tasksource-nli"
# Tokenizer and classifier both use model_dir as their cache directory.
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=model_dir)
# The classifier is moved to the selected device right after loading.
model = AutoModelForSequenceClassification.from_pretrained(model_name, cache_dir=model_dir).to(device)

In [4]:
def load_pickle_files(folder):
    """
    Load every pickled group file ``group*.pkl`` found in ``folder``.

    The files are loaded in ascending order of the number in their name
    (group0, group1, group2, ..., group10). glob itself returns paths in an
    arbitrary, filesystem-dependent order; a deterministic numeric order is
    required because callers write the i-th loaded group back to
    ``group{i}.pkl`` and would otherwise swap the contents of the files.

    :param folder: directory that contains the ``group*.pkl`` files
    :return: list with the unpickled content of each file, ordered by group number
    """

    def group_number(path):
        # "group12.pkl" -> 12; names without a purely numeric suffix are sorted last, alphabetically
        suffix = os.path.splitext(os.path.basename(path))[0][len("group"):]
        if suffix.isascii() and suffix.isdigit():
            return (0, int(suffix), "")
        return (1, 0, suffix)

    data_groups = []
    for pickle_file in sorted(glob.glob(f"{folder}/group*.pkl"), key=group_number):
        # NOTE: unpickling can execute arbitrary code; only load files created by this project
        with open(pickle_file, "rb") as f:
            data_groups.append(pickle.load(f))

    return data_groups

## Example
Two answers are semantically equivalent if 
- "Question: *question* Answer: *generated answer*" $\Rightarrow$ "Question: *question* Answer: *true answer*" **and** 
- "Question: *question* Answer: *true answer*" $\Rightarrow$ "Question: *question* Answer: *generated answer*"

= Bidirectional entailment

In [11]:
question = "Question: How many continents does the world have?"
sequence1 = "Answer: There are seven continents."
sequence2 = "Answer: Seven."
# Names for the three output classes (index 0: entailment, 1: neutral, 2: contradiction)
label_names = ["entailment", "neutral", "contradiction"]

# Direction 1: sequence1 => sequence2, Direction 2: sequence2 => sequence1
directions = [(sequence1, sequence2), (sequence2, sequence1)]
for direction_nr, (first, second) in enumerate(directions):
    premise = question + " " + first
    hypothesis = question + " " + second
    print(premise + " => " + hypothesis)

    # Named `encoded` so that the builtin input() is not overwritten in the notebook namespace
    encoded = tokenizer(premise, hypothesis, return_tensors="pt").to(device)
    # Inference only: no gradients have to be tracked
    with torch.no_grad():
        output = model(**encoded)
    prediction = torch.softmax(output["logits"][0], -1).tolist()
    prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
    # Blank line between the two directions, none after the last one
    print(prediction, end="\n\n" if direction_nr == 0 else "\n")

Question: How many continents does the world have? Answer: There are seven continents. => Question: How many continents does the world have? Answer: Seven.
{'entailment': 99.4, 'neutral': 0.6, 'contradiction': 0.0}

Question: How many continents does the world have? Answer: Seven. => Question: How many continents does the world have? Answer: There are seven continents.
{'entailment': 97.1, 'neutral': 2.9, 'contradiction': 0.0}


## Save lexical equivalence groups
Used to see which distinct wordings occur within an equivalence class

Structure to save them
```python 
{ 1131: {"question": ..., 
         "true_answer": ..., 
         "temperature_0.25": {"answers": [...], "probabilities": [...], "length_output": [...], "lexical_eq_groups": [0, 1, 0, 2, 3, ...]},
         "temperature_0.5": {"answers": [...], "probabilities": [...], "length_output": [...], "lexical_eq_groups": [...]},
         "temperature_1": {"answers": [...], "probabilities": [...], "length_output": [...], "lexical_eq_groups": [...]},
         "temperature_1.5": {"answers": [...], "probabilities": [...], "length_output": [...], "lexical_eq_groups": [...]},
         "beam_20": {"answers": [...], "probabilities": [...], "length_output": [...], "lexical_eq_groups": [...]}
        }, 
  4295: ...
}
```
where the numbers in lexical_eq_groups stand for the group they belong to.

In [36]:
# Load all generation groups that were saved in save_path.
data = load_pickle_files(save_path)
# One dictionary key per sampling configuration, built as "temperature_<t>" and "beam_<b>"
config_keys = [f"temperature_{t}" for t in config["temperatures"]] + [f"beam_{b}" for b in config["n_beams"]]

In [38]:
for group_nr, group in enumerate(data):
    for question_id in tqdm(group):
        entry = group[question_id]
        for k in config_keys:
            # Identical answer strings share one group id; ids are handed out in order of first appearance
            first_seen = {}
            entry[k]["lexical_eq_groups"] = [
                first_seen.setdefault(answer, len(first_seen)) for answer in entry[k]["answers"]
            ]

    # Save result
    with open(os.path.join(save_path, f"group{group_nr}.pkl"), "wb") as f:
        pickle.dump(group, f)

100%|██████████| 1000/1000 [00:00<00:00, 16643.20it/s]
100%|██████████| 1000/1000 [00:00<00:00, 22476.31it/s]
100%|██████████| 1000/1000 [00:00<00:00, 22723.87it/s]
100%|██████████| 1000/1000 [00:00<00:00, 22207.71it/s]
100%|██████████| 1000/1000 [00:00<00:00, 21737.55it/s]


## Bidirectional Entailment Clustering
Used to calculate semantic entropy

Structure to save them
```python 
{ 1131: {"question": ..., 
         "true_answer": ..., 
         "temperature_0.25": {"answers": [...], "probabilities": [...], "length_output": [...], "lexical_eq_groups": [...], "semantic_eq_groups": [...]},
         "temperature_0.5": {"answers": [...], "probabilities": [...], "length_output": [...], "lexical_eq_groups": [...], "semantic_eq_groups": [...]},
         "temperature_1": {"answers": [...], "probabilities": [...], "length_output": [...], "lexical_eq_groups": [...], "semantic_eq_groups": [...]},
         "temperature_1.5": {"answers": [...], "probabilities": [...], "length_output": [...], "lexical_eq_groups": [...], "semantic_eq_groups": [...]},
         "beam_20": {"answers": [...], "probabilities": [...], "length_output": [...], "lexical_eq_groups": [...], "semantic_eq_groups": [...]}
        }, 
  4295: ...
}
```
where the numbers in semantic_eq_groups stand for the semantic equivalence class they belong to.

The algorithm is given as pseudocode on page 15 of the paper that introduces semantic entropy.

In [8]:
def bidirectional_entailment(question, answer1, answer2):
    """
    Tests whether bidirectional entailment Question answer1 <=> Question answer2 holds.

    Each answer is turned into the statement "Question: <question> Answer: <answer>".
    The direction answer2 => answer1 is checked first; the opposite direction is only
    evaluated if the first one is classified as entailment.

    :param question: question text without the "Question: " prefix
    :param answer1: first answer text without the "Answer: " prefix
    :param answer2: second answer text without the "Answer: " prefix
    :return: True for bidirectional entailment, False otherwise
    """

    def entails(premise, hypothesis):
        # Hand the complete encoding (incl. attention mask) to the model instead of only the input ids
        encoded = tokenizer(premise, hypothesis, return_tensors="pt").to(device)
        # Inference only: no gradients have to be tracked
        with torch.no_grad():
            logits = model(**encoded)["logits"][0]
        # 0: entail, 1: neutral, 2: contradiction
        return torch.argmax(logits, dim=-1).item() == 0

    question = "Question: " + question
    statement1 = question + " " + "Answer: " + answer1
    statement2 = question + " " + "Answer: " + answer2

    # `and` short-circuits: the second direction is only computed if the first one is an entailment
    return entails(statement2, statement1) and entails(statement1, statement2)

In [9]:
def bidirectional_entailment_clustering(question, answers):
    """
    Groups the answers to a question into semantic equivalence classes.

    Every answer is compared to the first member of each existing class (in order of
    creation) and joins the first class for which bidirectional entailment holds;
    if there is none, it starts a new class.

    :param question: question text the answers belong to
    :param answers: sequence of answer texts
    :return: list of classes, each class being a list of indices into answers;
             an empty list if there are no answers
    """
    # Save as list as order is needed ("Use first sequence for each semantic-class")
    # Only save indices of answers
    meanings = []

    for m, answer in enumerate(answers):
        for equivalence_class in meanings:
            # Use first sequence for each semantic class
            s_c = answers[equivalence_class[0]]
            if bidirectional_entailment(question, s_c, answer):
                equivalence_class.append(m)
                break
        else:
            # No existing class matched (always the case for the very first answer)
            meanings.append([m])

    return meanings

In [10]:
# Reload the groups from disk before the semantic equivalence classes are added.
data = load_pickle_files(save_path)
# One dictionary key per sampling configuration, built as "temperature_<t>" and "beam_<b>"
config_keys = [f"temperature_{t}" for t in config["temperatures"]] + [f"beam_{b}" for b in config["n_beams"]]

In [16]:
for group_nr, group in enumerate(data):
    group_path = os.path.join(save_path, f"group{group_nr}.pkl")
    for question_id in tqdm(group):
        # Tracks whether this question received new results that have to be written to disk
        changed = False
        for k in config_keys:
            # Already created
            if "semantic_eq_groups" in group[question_id][k]:
                continue

            answers = group[question_id][k]["answers"]
            question = group[question_id]["question"]
            semantic_eq_groups = [-1] * len(answers)

            # Convert the list of clusters into one cluster id per answer
            semantic_clusters = bidirectional_entailment_clustering(question, answers)
            for cluster_id, cluster in enumerate(semantic_clusters):
                for index in cluster:
                    semantic_eq_groups[index] = cluster_id

            group[question_id][k]["semantic_eq_groups"] = semantic_eq_groups
            changed = True

        # Save result after each question so that an interrupted run can be resumed,
        # but skip the dump of the whole group if nothing was added for this question
        if changed:
            with open(group_path, "wb") as f:
                pickle.dump(group, f)


100%|██████████| 1000/1000 [00:13<00:00, 72.26it/s]
100%|██████████| 1000/1000 [00:13<00:00, 73.04it/s]
100%|██████████| 1000/1000 [00:13<00:00, 72.49it/s]
100%|██████████| 1000/1000 [00:13<00:00, 73.66it/s]
100%|██████████| 1000/1000 [00:13<00:00, 73.39it/s]


## Test if correctly saved

In [17]:
data = load_pickle_files(save_path)

# Each line of group_indices.txt holds the comma-separated question ids of one group
indices_groups = []
with open(os.path.join(save_path, "group_indices.txt"), "r") as f:
    for line in f:
        indices_groups.append([int(i) for i in line.strip().split(",")])

for group_nr, group in enumerate(data):
    # Every group has to contain 1000 questions, namely exactly the ids listed for it
    assert len(group) == 1000
    assert set(group.keys()) == set(indices_groups[group_nr])
    # Every sampling configuration needs one semantic group id per generated answer
    for entry in group.values():
        for k in config_keys:
            assert len(entry[k]["semantic_eq_groups"]) == config["n_generations_per_answer"]
    print(f"group{group_nr} - Yes.")

group0 - Yes.
group1 - Yes.
group2 - Yes.
group3 - Yes.
group4 - Yes.


## Analysis of result
Most of the time, this approach correctly identifies the equivalence classes. Examples:

In [7]:
# Reload the saved groups; the example cells below read the equivalence groups stored in them.
data = load_pickle_files(save_path)

In [178]:
# (index of the group in data, question id, sampling configuration) of the examples to show
examples = [(3, 76, "temperature_0.5"), (0, 129, "temperature_1"), (2, 354, "temperature_0.25")]

for example_nr, (g, qid, key) in enumerate(examples):
    entry = data[g][qid]
    # Examples are separated by a blank line; nothing is printed before the first one
    separator = "\n" if example_nr else ""
    print(f"{separator}Question: {entry['question']}")
    for a, entail in zip(entry[key]["answers"], entry[key]["semantic_eq_groups"]):
        print(f"{a} (Group: {entail})")

Question: On what date in 1969 did Neil Armstrong first set foot on the Moon?
August 20 (Group: 0)
July 20 (Group: 1)
August 20th (Group: 0)
July 20th (Group: 1)
August 20th (Group: 0)
20th (Group: 2)
20th (Group: 2)
July 20 (Group: 1)
July 20 (Group: 1)
July 20th (Group: 1)

Question: The Naismith Award is presented in which sport?
The game of basketball (Group: 0)
All Sports (Group: 1)
basketball (Group: 0)
American Football, Basketball and Baseball (Group: 2)
Hockey (Group: 3)
basketball (Group: 0)
basketball (Group: 0)
NBA (Group: 4)
Baseball (Group: 5)
Soccer (Group: 6)

Question: In a standard deck of cards, how many Kings have a moustache?
Seven (Group: 0)
7 (Group: 0)
6 (Group: 1)
2 (Group: 2)
2 (Group: 2)
2 (Group: 2)
Seven (Group: 0)
Seven (Group: 0)
5 (Group: 3)
2 (Group: 2)


Even though answers like "July 20"/"July 20th" or "Seven"/"7" are not lexically equivalent, they still belong to the same equivalence class. Answers like "American Football, Basketball and Baseball" that contain parts of other answers ("basketball"/"Baseball") still form their own equivalence class.

There are two potential problems with the bidirectional entailment clustering algorithm:
1. Due to the transitivity of an equivalence relation, all elements in an equivalence class should entail each other. However, with this implementation, that does not always have to hold true. An answer is placed into an equivalence class by only comparing it to the first element. Despite this, all cases I had a look at seemed to be fine. In the paper, they observe the same thing (page 14).
2. The entailment determined by using a model is sometimes inaccurate. Especially in the beam sampling cases, more equivalence classes than actually needed are formed.

In [9]:
# Example for the second problem: print the semantic equivalence classes of a beam_20 sample.
# The semantic groups are printed because the lexical groups differ for any two distinct
# strings by construction and therefore say nothing about the entailment model.
print(f"Question: {data[0][12962]['question']}")
for a, entail in zip(data[0][12962]["beam_20"]["answers"], data[0][12962]["beam_20"]["semantic_eq_groups"]):
    print(f"{a} (Group: {entail})")

Question: "Which German social economist of Jewish descent, expelled from Germany and France, co-wrote in London ""The Communist Manifesto"" and ""Das Kapital""?"
Karl Marx (Group: 0)
Karl Marx (1818-1883) (Group: 1)
Karl Marx and Friedrich Engels (Group: 2)
Karl Marx (1818-1883) and Friedrich Engels (1817-1892) (Group: 3)
Karl Marx (1818-1883) and Friedrich Engels (1819-1892) (Group: 4)
Karl Marx (1818-1883) and Friedrich Engels (1817-1876) (Group: 5)
Karl Marx (1818-1883) and Friedrich Engels (1817-1896) (Group: 6)
Karl Marx (1818-1883) and Friedrich Engels (1819-1894) (Group: 7)
Karl Liebknecht (Group: 8)
Karl Marx (1818-1883) and Friedrich Engels (1817-1894) (Group: 9)


"Karl Marx" should be in the same equivalence class as "Karl Marx (1818-1883)". The majority of the other answers are also similar and differ only in the years, which were not even asked for.