The code below generates the Attention Head Redundancy (AHR) matrices that are used in the next steps.
Things that need to be modified to replicate the experiments on different tasks/models:
<ol>
<li> Task and model (loaded using HuggingFace) </li>
<li> Number of heads in the AHR visualization </li>
</ol>

In [1]:
from datasets import load_dataset
from tqdm import tqdm
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import pickle
import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm
To use data.metrics please install scikit-learn. See https://scikit-learn.org/stable/index.html


In [None]:
# Select the dataset for the experiments.
dataset = load_dataset("boolq")

# Build one [question, passage] pair per example of the 'train' split.
# Change the selected fields to fit the input sequence of the chosen dataset.
input_pairs = [
    [example['question'], example['passage']]
    for example in dataset['train']
]

## Supporting functions

In [None]:
# Computes the average cosine similarity between every pair of attention matrices of one input
def compute_distances_for_all_pairs(attention_matrices):
    """
    Compute the pairwise cosine similarities across the attention matrices of a single input.

    For every pair of matrices (i, j) with i < j, the cosine similarity between
    row t of matrix i and row t of matrix j is computed for each row t, and the
    per-row similarities are averaged over all rows.

    Despite the function name, the returned values are similarities (higher
    means more alike), not distances.

    Args:
    - attention_matrices: array-like of shape (num_matrices, n_tokens, n),
      e.g. (144, n, n). The input is not modified.

    Returns:
    - List of num_matrices * (num_matrices - 1) / 2 floats, ordered as
      (0, 1), (0, 2), ..., (0, H-1), (1, 2), ... (i.e. "H choose 2" values).
      A row whose norm is zero contributes a similarity of 0 instead of NaN.

    Raises:
    - ValueError: if the input is not 3-dimensional or has no token rows.
    """
    # Work on a float64 copy so the caller's array is left untouched and the
    # accumulation below does not lose precision on float32 inputs.
    rows = np.array(attention_matrices, dtype=np.float64)
    if rows.ndim != 3:
        raise ValueError(
            f"attention_matrices must be 3-dimensional, got shape {rows.shape}"
        )
    num_attention_matrices, n_tokens, _ = rows.shape
    if n_tokens == 0:
        raise ValueError("attention_matrices must contain at least one token row")

    # Normalise every row to unit length once, instead of once per pair.
    norms = np.linalg.norm(rows, axis=2, keepdims=True)
    norms[norms == 0] = 1.0  # leave all-zero rows as zeros -> similarity 0
    rows /= norms

    # With unit-length rows, the sum over tokens of the per-row dot products of
    # two matrices equals the dot product of the two flattened matrices, so a
    # single matrix product yields the summed similarity for all pairs at once.
    flat = rows.reshape(num_attention_matrices, -1)
    mean_similarity = (flat @ flat.T) / n_tokens

    # Strict upper triangle in row-major order == the (i, j > i) loop order.
    upper_i, upper_j = np.triu_indices(num_attention_matrices, k=1)
    return mean_similarity[upper_i, upper_j].tolist()

In [None]:
# Iterate through all input pairs and extract the attention matrices
def compute_cosine_sim_for_all_inputs(inputs_list):
    """
    Compute the pairwise attention-head similarity scores for every input.

    Each element of `inputs_list` is tokenized as a (first text, second text)
    pair, passed through the module-level `model`, and the attention matrices
    of all layers and heads are compared with `compute_distances_for_all_pairs`.

    Relies on the module-level `tokenizer` and `model`; the model must have
    been loaded with `output_attentions=True`.

    Args:
    - inputs_list: iterable of sequences whose first two items are the two
      texts passed to the tokenizer.

    Returns:
    - List with one entry per input; each entry is the list of scores
      returned by `compute_distances_for_all_pairs` for that input.
    """
    all_avg_similarities = []
    # Run the inputs on whichever device the model lives on (CPU or GPU).
    device = model.device

    # `text_pair` (not `input_pairs`) to avoid shadowing the module-level list.
    for text_pair in tqdm(inputs_list, desc='Pairs of inputs'):
        tokenized_input = tokenizer(
            text_pair[0],
            text_pair[1],
            truncation=True,
            padding=True,
            max_length=512,
            return_tensors='pt',
        ).to(device)
        with torch.no_grad():
            outputs = model(**tokenized_input)

        # Stack the per-layer attention tensors into one 5-D tensor and bring
        # it back to the CPU: `.numpy()` below raises for CUDA tensors.
        attention_outputs = torch.stack(outputs.attentions).cpu()
        # Collapse all leading dimensions so that every attention matrix
        # (last two dimensions) becomes one entry along the first axis.
        flattened_attention_matrices = attention_outputs.reshape(
            -1, attention_outputs.size(3), attention_outputs.size(4)
        )

        final_scalar_values = compute_distances_for_all_pairs(flattened_attention_matrices.numpy())
        all_avg_similarities.append(final_scalar_values)

    return all_avg_similarities

## Load model

In [None]:
# Loading models from HF; the checkpoints below are custom models fine-tuned on the BoolQ task.
# Set `model_link` to the checkpoint to analyse; it is used for both the tokenizer and the model.

# Fine-tuned BERT BoolQ
model_link = "rycecorn/bert-fine-tuned-boolq"

# Fine-tuned DistilBERT (uncomment to use it instead)
# model_link = "rycecorn/distil-bert-fine-tuned-boolq"

tokenizer = AutoTokenizer.from_pretrained(model_link)
# output_attentions=True is required: the attention matrices are read from the model outputs.
model = AutoModelForSequenceClassification.from_pretrained(model_link, output_attentions=True)

## Generate the AHR matrices

In [None]:
from pathlib import Path

num_inputs = 1000  # change to desired number of inputs

# Create the output directory up front so a missing folder is detected
# before the (long) attention extraction instead of after it.
output_dir = Path('./outputs/attention_head_redundancy')
output_dir.mkdir(parents=True, exist_ok=True)

ahr_matrices = compute_cosine_sim_for_all_inputs(input_pairs[:num_inputs])
## output has dim (N, num_pairs)

# Saving the output so the AHR matrices can be visualized later
with open(output_dir / f'boolq_cosine_sim_BERT_{num_inputs}.pkl', 'wb') as file:
    pickle.dump(ahr_matrices, file)

# AHR visualization

In [None]:
from pathlib import Path

# Average every head-pair score over all inputs -> one value per head pair
average_correlation_across_inputs = np.mean(np.asarray(ahr_matrices, dtype=float), axis=0)

# Number of attention heads
num_heads = 144 # Change to 72 for DistilBERT variants

# Fail early with a clear message if num_heads does not match the stored scores;
# otherwise the matrix below would be filled with misaligned values.
expected_num_pairs = num_heads * (num_heads - 1) // 2
if average_correlation_across_inputs.shape != (expected_num_pairs,):
    raise ValueError(
        f"num_heads={num_heads} implies {expected_num_pairs} head pairs, but the "
        f"averaged scores have shape {average_correlation_across_inputs.shape}"
    )

# The scores computed above are cosine similarities already (higher = more
# redundant), so they are plotted as-is. Applying exp(-x) to them would invert
# the scale and contradict the diagonal of 1 used for "maximum similarity".
# Set the flag to True only when the stored values are distances
# (lower = more alike); they are then mapped to similarities by exponential decay.
values_are_distances = False
if values_are_distances:
    similarities = np.exp(-average_correlation_across_inputs)
else:
    similarities = average_correlation_across_inputs

# Fill the symmetric redundancy matrix. The strict upper triangle in row-major
# order matches the (i, j > i) order in which the pair scores were produced.
redundancy_matrix = np.zeros((num_heads, num_heads))
upper_triangle = np.triu_indices(num_heads, k=1)
redundancy_matrix[upper_triangle] = similarities
redundancy_matrix += redundancy_matrix.T  # mirror onto the lower triangle

# Set the diagonal to the maximum similarity score
np.fill_diagonal(redundancy_matrix, 1)


# Visualizing the attention redundancy matrix
figure_path = Path('./outputs/attention_head_redundancy/ahr_boolq_cosine_sim_BERT-Base-BoolQ_1000.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)  # savefig does not create folders

plt.figure(figsize=(10, 8))
sns.heatmap(redundancy_matrix, cmap='viridis', annot=False, square=True, cbar_kws={'shrink': .5})
plt.title('Attention Redundancy Matrix for BoolQ - Cosine similarity / 1000 inputs / BERT-base')
plt.xlabel('Attention Head')
plt.ylabel('Attention Head')
plt.savefig(figure_path)
plt.show()