In [1]:
import os
import sys

# Extend the import path so the project-local `src` package imported in the
# next cell can be found. The relative path is resolved against the kernel's
# working directory.
# NOTE(review): presumably five levels up is the repository root — confirm.
extra_import_path = "../../../../../"
if extra_import_path not in sys.path:  # re-running the cell must not stack duplicates
    sys.path.append(extra_import_path)

# Turn off the tokenizers library's own parallelism before any tokenizer is used.
# NOTE(review): presumably to avoid fork-related warnings when DataLoader
# workers are spawned (num_workers > 0) — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import copy
import torch
from datetime import datetime
from src.utils.helper import Config, color_print
from src.utils.load import load_model, load_data, save_checkpoint
from src.models.evaluate import (
    evaluate_model,
    get_sparsity,
    get_similarity,
    get_perplexity,
)
from src.utils.sampling import SamplingDataset
from src.pruning.prune_head import head_importance_prunning
from src.pruning.prune import (
    prune_concern_identification,
)
from src.utils.helper import report_to_df, append_nth_row

In [3]:
# Experiment configuration: every value a reader might want to tune.
name = "bert-6-128-yahoo"  # identifier handed to Config(...)
# Prefer the first GPU, but fall back to CPU so the notebook does not
# require CUDA to be present.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
checkpoint = None  # NOTE(review): not referenced by any visible cell
batch_size = 16  # batch size passed to load_data
num_workers = 4  # worker count passed to load_data
num_samples = 16  # passed to SamplingDataset and get_similarity
ratio = 0.6  # used for head pruning and as sparsity_ratio of concern identification
seed = 44  # NOTE(review): not referenced by any visible cell; the loop calls config.init_seed()
# Layer-name filters forwarded to prune_concern_identification.
include_layers = ["intermediate", "output"]
exclude_layers = ["attention"]

In [4]:
# Capture the wall-clock time once, at the start of the run, and echo it.
script_start_time = datetime.now()
print(
    f"Script started at: {script_start_time.strftime('%Y-%m-%d %H:%M:%S')}"
)

Script started at: 2024-10-22 20:12:40


In [5]:
# Build the experiment configuration for the chosen model name and device.
config = Config(name, device)
# Number of labels as stored in the loaded config dictionary.
# NOTE(review): the pruning loop iterates over config.num_labels rather than
# this variable — confirm both always agree.
num_labels = config.config["num_labels"]
model = load_model(config)  # load the model described by `config`

Loading the model.




{

'architectures'

: 

'bert'

,
 

'dataset_name'

: 

'YahooAnswersTopics'

,
 

'model_name'

: 

'models/bert-6-128-yahoo'

,
 

'num_labels'

: 

10

,
 

'tokenizer_name'

: 

'fabriceyhc/bert-base-uncased-yahoo_answers_topics'

}




The model models/bert-6-128-yahoo is loaded.




In [6]:
# Load the train / validation / test dataloaders for the dataset named in
# `config`, using the batch size and worker count from the config cell.
# NOTE(review): do_cache=True presumably reuses pickled splits (the cell
# output reports train/valid/test.pkl loaded from cache) — confirm.
train_dataloader, valid_dataloader, test_dataloader = load_data(
    config,
    batch_size=batch_size,
    num_workers=num_workers,
    do_cache=True,
)

Loading cached dataset YahooAnswersTopics.




train.pkl is loaded from cache.




valid.pkl is loaded from cache.




test.pkl is loaded from cache.




The dataset YahooAnswersTopics is loaded




{

'config_name'

: 

'yahoo_answers_topics'

,
 

'features'

: 

{'first_column': 'question_title', 'second_column': 'topic'}

,
 

'path'

: 

'yahoo_answers_topics'

}




In [7]:
from src.utils.load import load_cache
from src.utils.data_class import CustomEmbeddingDataset
from torch.utils.data import DataLoader

# Load the pre-generated, embedding-based dataset from the on-disk cache.
# NOTE(review): the cache is named "4_128-yahoo" while the model configured
# above is "bert-6-128-yahoo" — confirm this pairing is intentional.
# NOTE(security): the file is a .pkl, presumably unpickled by load_cache;
# only load pickle files from trusted sources.
generated = load_cache(
    "datasets/generated_dataset/embedding_based/4_128-yahoo",
    "4_128-yahoo_top1.pkl",
)

4_128-yahoo_top1.pkl is loaded from cache.




In [8]:
# Rename the cached keys in place.
# NOTE(review): presumably these are the key names CustomEmbeddingDataset
# expects — confirm.
# The rename is guarded so that re-running this cell (after the old keys have
# already been popped) is a no-op instead of raising KeyError.
for old_key, new_key in (
    ("example_list", "embeddings"),
    ("example_label", "labels"),
    ("attn_list", "attention_mask"),
):
    if old_key in generated:
        generated[new_key] = generated.pop(old_key)
    elif new_key not in generated:
        # Neither spelling is present: fail loudly, as the plain pop() did.
        raise KeyError(old_key)

In [9]:
# Wrap the generated dict in the project's dataset class, then batch it.
generated_data = CustomEmbeddingDataset(generated)
# NOTE(review): "generated_dataloder" is misspelled, but later cells refer to
# this exact name, so it is kept as is.
generated_dataloder = DataLoader(generated_data, batch_size=4)

In [10]:
# Optional baseline (currently disabled): evaluate the unpruned model.
# print("Evaluate the original model")
# result = evaluate_model(model, config, test_dataloader)

In [11]:
# Per-concern pruning experiment: for every label ("concern") a fresh copy of
# the model is pruned, evaluated on the test set and compared with the
# untouched original.

sampler_batch_size = 4  # sixth positional argument of every SamplingDataset call
# Concern id used for the "all_samples" set.
# NOTE(review): 200 lies outside the 10 labels reported by the config;
# presumably chosen so that no class is excluded — confirm against SamplingDataset.
all_concern_id = 200

# Keyword options shared by every prune_concern_identification call.
prune_options = dict(
    include_layers=include_layers,
    exclude_layers=exclude_layers,
    sparsity_ratio=ratio,
    keep_dim=False,
    method="structed",
)


def make_sampler(concern_id, positive):
    """Build a SamplingDataset over the generated data for one concern id.

    `positive` is forwarded as the fifth positional argument: True for the
    positive sample set, False for the negative and the "all" sample sets.
    """
    return SamplingDataset(
        generated_dataloder,
        config,
        concern_id,
        num_samples,
        positive,
        sampler_batch_size,
        resample=False,
    )


result_list = []

for concern in range(config.num_labels):
    # Re-seed at the start of every iteration, before any sampling happens.
    config.init_seed()

    # Construction order is kept: positive, negative, then the "all" set.
    positive_samples = make_sampler(concern, True)
    negative_samples = make_sampler(concern, False)
    all_samples = make_sampler(all_concern_id, False)

    # Work on a copy so the original model stays available for comparison.
    module = copy.deepcopy(model)

    # Head pruning first, then concern identification on the selected layers.
    head_importance_prunning(module, config, all_samples, ratio)
    prune_concern_identification(
        module, config, positive_samples, negative_samples, **prune_options
    )

    print(f"Evaluate the pruned model {concern}")
    result = evaluate_model(module, config, test_dataloader, verbose=True)
    result_list.append(result)
    get_sparsity(module)

    # Compare original and pruned model on the validation data.
    get_similarity(model, module, valid_dataloader, concern, num_samples, config)
    for heading, network in (
        ("original model's perplexity", model),
        ("pruned model's perplexity", module),
    ):
        print(heading)
        get_perplexity(network, valid_dataloader, config)

Total heads to prune: 7




tensor([[0.5141, 0.4859],
        [0.5482, 0.4518],
        [0.5592, 0.4408],
        [0.4687, 0.5313],
        [0.4890, 0.5110],
        [0.4927, 0.5073]])




{(0, 1), (4, 0), (2, 1), (1, 1), (5, 1), (3, 0), (5, 0)}




Evaluate the pruned model 0




Evaluating the model:   0%|                                                                                   …

0.3137975944227381




{'bert.encoder.layer.0.attention.self.query.weight': 0.5, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 0.0, 'bert.encoder.l




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.9660701427945384




CCA coefficients mean non-concern: 0.9622365751064134




Linear CKA concern: 0.6264528688692976




Linear CKA non-concern: 0.6367184679241555




Kernel CKA concern: 0.6567704224904459




Kernel CKA non-concern: 0.6293524958853169




original model's perplexity




3.187649726867676




pruned model's perplexity




3.8163065910339355




Total heads to prune: 7




tensor([[0.5141, 0.4859],
        [0.5482, 0.4518],
        [0.5592, 0.4408],
        [0.4687, 0.5313],
        [0.4890, 0.5110],
        [0.4927, 0.5073]])




{(0, 1), (4, 0), (2, 1), (1, 1), (5, 1), (3, 0), (5, 0)}




Evaluate the pruned model 1




Evaluating the model:   0%|                                                                                   …

0.3137975944227381




{'bert.encoder.layer.0.attention.self.query.weight': 0.5, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 0.0, 'bert.encoder.l




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.97533093778815




CCA coefficients mean non-concern: 0.9616737184317661




Linear CKA concern: 0.5771223097195451




Linear CKA non-concern: 0.6411191367738074




Kernel CKA concern: 0.5818264707410163




Kernel CKA non-concern: 0.6253003623635202




original model's perplexity




3.187649726867676




pruned model's perplexity




3.8245787620544434




Total heads to prune: 7




tensor([[0.5141, 0.4859],
        [0.5482, 0.4518],
        [0.5592, 0.4408],
        [0.4687, 0.5313],
        [0.4890, 0.5110],
        [0.4927, 0.5073]])




{(0, 1), (4, 0), (2, 1), (1, 1), (5, 1), (3, 0), (5, 0)}




Evaluate the pruned model 2




Evaluating the model:   0%|                                                                                   …

0.3137975944227381




{'bert.encoder.layer.0.attention.self.query.weight': 0.5, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 0.0, 'bert.encoder.l




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.9619441456315962




CCA coefficients mean non-concern: 0.9598923568793193




Linear CKA concern: 0.6331118285548982




Linear CKA non-concern: 0.6417347849337616




Kernel CKA concern: 0.7252340440854086




Kernel CKA non-concern: 0.6292327501602856




original model's perplexity




3.187649726867676




pruned model's perplexity




3.8247833251953125




Total heads to prune: 7




tensor([[0.5141, 0.4859],
        [0.5482, 0.4518],
        [0.5592, 0.4408],
        [0.4687, 0.5313],
        [0.4890, 0.5110],
        [0.4927, 0.5073]])




{(0, 1), (4, 0), (2, 1), (1, 1), (5, 1), (3, 0), (5, 0)}




Evaluate the pruned model 3




Evaluating the model:   0%|                                                                                   …

0.3137975944227381




{'bert.encoder.layer.0.attention.self.query.weight': 0.5, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 0.0, 'bert.encoder.l




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.9650663856505287




CCA coefficients mean non-concern: 0.9651043768734336




Linear CKA concern: 0.673771659897624




Linear CKA non-concern: 0.6231023403338511




Kernel CKA concern: 0.6407567664506834




Kernel CKA non-concern: 0.6271648191391606




original model's perplexity




3.187649726867676




pruned model's perplexity




3.816075563430786




Total heads to prune: 7




tensor([[0.5141, 0.4859],
        [0.5482, 0.4518],
        [0.5592, 0.4408],
        [0.4687, 0.5313],
        [0.4890, 0.5110],
        [0.4927, 0.5073]])




{(0, 1), (4, 0), (2, 1), (1, 1), (5, 1), (3, 0), (5, 0)}




Evaluate the pruned model 4




Evaluating the model:   0%|                                                                                   …

0.3137975944227381




{'bert.encoder.layer.0.attention.self.query.weight': 0.5, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 0.0, 'bert.encoder.l




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.9660917689060151




CCA coefficients mean non-concern: 0.9644919880922714




Linear CKA concern: 0.7172852925342434




Linear CKA non-concern: 0.639024674512436




Kernel CKA concern: 0.7563734437334042




Kernel CKA non-concern: 0.6097295978105107




original model's perplexity




3.187649726867676




pruned model's perplexity




3.8154327869415283




Total heads to prune: 7




tensor([[0.5141, 0.4859],
        [0.5482, 0.4518],
        [0.5592, 0.4408],
        [0.4687, 0.5313],
        [0.4890, 0.5110],
        [0.4927, 0.5073]])




{(0, 1), (4, 0), (2, 1), (1, 1), (5, 1), (3, 0), (5, 0)}




Evaluate the pruned model 5




Evaluating the model:   0%|                                                                                   …

0.3137975944227381




{'bert.encoder.layer.0.attention.self.query.weight': 0.5, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 0.0, 'bert.encoder.l




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.9638581472235609




CCA coefficients mean non-concern: 0.9673955085063685




Linear CKA concern: 0.5300387992215106




Linear CKA non-concern: 0.6202908742114989




Kernel CKA concern: 0.44159379329730614




Kernel CKA non-concern: 0.6320813756309549




original model's perplexity




3.187649726867676




pruned model's perplexity




3.8277299404144287




Total heads to prune: 7




tensor([[0.5141, 0.4859],
        [0.5482, 0.4518],
        [0.5592, 0.4408],
        [0.4687, 0.5313],
        [0.4890, 0.5110],
        [0.4927, 0.5073]])




{(0, 1), (4, 0), (2, 1), (1, 1), (5, 1), (3, 0), (5, 0)}




Evaluate the pruned model 6




Evaluating the model:   0%|                                                                                   …

0.3137975944227381




{'bert.encoder.layer.0.attention.self.query.weight': 0.5, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 0.0, 'bert.encoder.l




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.9710783912778719




CCA coefficients mean non-concern: 0.9612040112971446




Linear CKA concern: 0.6532028591339621




Linear CKA non-concern: 0.6297160124480621




Kernel CKA concern: 0.6613751365045812




Kernel CKA non-concern: 0.6272122018546845




original model's perplexity




3.187649726867676




pruned model's perplexity




3.8198554515838623




Total heads to prune: 7




tensor([[0.5141, 0.4859],
        [0.5482, 0.4518],
        [0.5592, 0.4408],
        [0.4687, 0.5313],
        [0.4890, 0.5110],
        [0.4927, 0.5073]])




{(0, 1), (4, 0), (2, 1), (1, 1), (5, 1), (3, 0), (5, 0)}




Evaluate the pruned model 7




Evaluating the model:   0%|                                                                                   …

0.3137975944227381




{'bert.encoder.layer.0.attention.self.query.weight': 0.5, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 0.0, 'bert.encoder.l




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.9691943182513182




CCA coefficients mean non-concern: 0.9677787154667755




Linear CKA concern: 0.6912249625675996




Linear CKA non-concern: 0.6061608071158973




Kernel CKA concern: 0.5811771183456991




Kernel CKA non-concern: 0.6254871501560914




original model's perplexity




3.187649726867676




pruned model's perplexity




3.8165974617004395




Total heads to prune: 7




tensor([[0.5141, 0.4859],
        [0.5482, 0.4518],
        [0.5592, 0.4408],
        [0.4687, 0.5313],
        [0.4890, 0.5110],
        [0.4927, 0.5073]])




{(0, 1), (4, 0), (2, 1), (1, 1), (5, 1), (3, 0), (5, 0)}




Evaluate the pruned model 8




Evaluating the model:   0%|                                                                                   …

0.3137975944227381




{'bert.encoder.layer.0.attention.self.query.weight': 0.5, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 0.0, 'bert.encoder.l




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.9712296511930616




CCA coefficients mean non-concern: 0.9690371480680267




Linear CKA concern: 0.6594515230572425




Linear CKA non-concern: 0.6223011117676158




Kernel CKA concern: 0.7188993003237171




Kernel CKA non-concern: 0.6306306264096602




original model's perplexity




3.187649726867676




pruned model's perplexity




3.8261232376098633




Total heads to prune: 7




tensor([[0.5141, 0.4859],
        [0.5482, 0.4518],
        [0.5592, 0.4408],
        [0.4687, 0.5313],
        [0.4890, 0.5110],
        [0.4927, 0.5073]])




{(0, 1), (4, 0), (2, 1), (1, 1), (5, 1), (3, 0), (5, 0)}




Evaluate the pruned model 9




Evaluating the model:   0%|                                                                                   …

0.3137975944227381




{'bert.encoder.layer.0.attention.self.query.weight': 0.5, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 0.0, 'bert.encoder.l




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.9779330340984289




CCA coefficients mean non-concern: 0.9661293804100023




Linear CKA concern: 0.6011532868264693




Linear CKA non-concern: 0.6187654340515846




Kernel CKA concern: 0.5439155469997654




Kernel CKA non-concern: 0.6252877252151864




original model's perplexity




3.187649726867676




pruned model's perplexity




3.8280863761901855




In [12]:
# Convert each per-concern evaluation report into a DataFrame and combine them.
# NOTE(review): append_nth_row presumably keeps the row of class i from the
# i-th report (the displayed table has one row per class) — confirm.
df_list = [report_to_df(report) for report in result_list]
new_df = append_nth_row(df_list)

# Timestamped file name so separate runs write to separate files.
csv_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists before writing.
os.makedirs("results", exist_ok=True)
new_df.to_csv(f"results/{csv_name}.csv", index=False)
print(csv_name)
new_df

2024-10-22_20-26-54




Unnamed: 0,class,precision,recall,f1-score,support
0,0,0.4325,0.5715,0.4924,2992
1,1,0.6528,0.5227,0.5805,2992
2,2,0.6786,0.6086,0.6417,3012
3,3,0.3244,0.5874,0.4179,2998
4,4,0.7674,0.6391,0.6974,2973
5,5,0.806,0.5658,0.6649,3054
6,6,0.6584,0.3633,0.4682,3003
7,7,0.6745,0.4595,0.5466,3012
8,8,0.6147,0.6559,0.6347,2982
9,9,0.5379,0.7237,0.6171,2982
