In [1]:
import os
import sys

# Make the project importable from this notebook: the `src.*` imports below rely on
# the directory five levels up (presumably the repository root) being on sys.path.
# The path is resolved to an absolute one and added only once, so re-running this
# cell does not pile up duplicates and a later change of working directory cannot
# invalidate the entry.
project_root = os.path.abspath("../../../../../")
if project_root not in sys.path:
    sys.path.append(project_root)

# Disable the Hugging Face tokenizers' internal thread pool.
# NOTE(review): presumably set because the dataloaders use worker processes
# (num_workers > 0), which otherwise triggers fork warnings — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import copy
import torch
from datetime import datetime
from src.utils.helper import Config, color_print
from src.utils.load import load_model, load_data, save_checkpoint
from src.models.evaluate import (
    evaluate_model,
    get_sparsity,
    get_similarity,
    get_perplexity,
)
from src.utils.sampling import SamplingDataset
from src.pruning.prune_head import head_importance_prunning
from src.pruning.prune import prune_concern_identification
from src.utils.helper import report_to_df, append_nth_row

In [3]:
# --- Experiment configuration -------------------------------------------------
# Dataset / model configuration key handed to Config(name, device).
name = "YahooAnswersTopics"

# Prefer the first CUDA device, but fall back to the CPU so the notebook can still
# be executed (slowly) on a machine without a usable GPU instead of failing later.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Not referenced by the cells visible in this notebook.
# NOTE(review): presumably a path to a saved checkpoint when resuming — confirm.
checkpoint = None

# Forwarded to load_data(...) when building the dataloaders.
batch_size = 16
num_workers = 4

# Number of samples requested from SamplingDataset and get_similarity.
num_samples = 16

# Passed both to head_importance_prunning and as `sparsity_ratio` to
# prune_concern_identification.
ratio = 0.4

# Not referenced directly by the visible cells (seeding goes through
# config.init_seed()). NOTE(review): verify whether Config picks this up.
seed = 44

# Layer-name filters forwarded to prune_concern_identification.
include_layers = ["intermediate", "output"]
exclude_layers = [
    "attention",
]

In [4]:
# Record when this run began and echo it, so the captured output carries a
# timestamp for the experiment.
script_start_time = datetime.now()
start_stamp = script_start_time.strftime("%Y-%m-%d %H:%M:%S")
print(f"Script started at: {start_stamp}")

Script started at: 2024-10-20 02:17:01


In [5]:
# Build the experiment configuration for the dataset key `name` on `device`.
config = Config(name, device)
# Label count read from the configuration mapping (the run output above shows 10
# for YahooAnswersTopics).
num_labels = config.config["num_labels"]
# Load the model described by `config`.
# NOTE(review): presumably this also places the model on `device` — confirm in load_model.
model = load_model(config)

Loading the model.




{

'architectures'

: 

'bert'

,
 

'dataset_name'

: 

'YahooAnswersTopics'

,
 

'model_name'

: 

'fabriceyhc/bert-base-uncased-yahoo_answers_topics'

,
 

'num_labels'

: 

10

,
 

'tokenizer_name'

: 

'fabriceyhc/bert-base-uncased-yahoo_answers_topics'

}




The model fabriceyhc/bert-base-uncased-yahoo_answers_topics is loaded.




In [6]:
# Build the train / validation / test dataloaders for the configured dataset.
# The loader settings are gathered in one mapping and forwarded as keyword arguments.
loader_options = {
    "batch_size": batch_size,
    "num_workers": num_workers,
    "do_cache": True,
}
train_dataloader, valid_dataloader, test_dataloader = load_data(config, **loader_options)

Loading cached dataset YahooAnswersTopics.




train.pkl is loaded from cache.




valid.pkl is loaded from cache.




test.pkl is loaded from cache.




The dataset YahooAnswersTopics is loaded




{

'config_name'

: 

'yahoo_answers_topics'

,
 

'features'

: 

{'first_column': 'question_title', 'second_column': 'topic'}

,
 

'path'

: 

'yahoo_answers_topics'

}




In [7]:
# Baseline evaluation of the unpruned model — currently disabled (left commented out).
# NOTE(review): presumably skipped to save run time; re-enable to obtain reference
# metrics for `model` on the test set before comparing against the pruned modules.
# print("Evaluate the original model")
# result = evaluate_model(model, config, test_dataloader)

In [8]:
# Concern id used to build the "all samples" set that drives head-importance scoring.
# NOTE(review): presumably 200 lies outside the real label range, so requesting its
# non-matching samples draws from every class — confirm against SamplingDataset.
ALL_SAMPLES_CONCERN = 200


def _build_samples(concern_id, is_positive):
    """Create a SamplingDataset over the training dataloader for one concern.

    Args:
        concern_id: Concern (label) id forwarded to SamplingDataset.
        is_positive: Flag forwarded as the fifth positional argument.
            Presumably True selects samples of the concern and False the
            remaining ones — TODO confirm against SamplingDataset.

    Returns:
        The constructed SamplingDataset, built with `num_samples` samples and
        `resample=False`.
    """
    return SamplingDataset(
        train_dataloader,
        config,
        concern_id,
        num_samples,
        is_positive,
        4,  # NOTE(review): meaning of this positional argument is not visible here — confirm
        resample=False,
    )


# One evaluation report per concern, consumed by the result-export cell below.
result_list = []

for concern in range(config.num_labels):
    # Re-seed at the start of every concern so each iteration starts from the
    # same random state.
    config.init_seed()
    positive_samples = _build_samples(concern, True)
    negative_samples = _build_samples(concern, False)
    all_samples = _build_samples(ALL_SAMPLES_CONCERN, False)

    # Prune a private copy so the original `model` stays intact for the
    # similarity / perplexity comparisons below.
    module = copy.deepcopy(model)

    # Step 1: attention-head pruning driven by the "all samples" set.
    head_importance_prunning(module, config, all_samples, ratio)

    # Step 2: concern-specific pruning of the intermediate/output layers
    # (attention layers are excluded via `exclude_layers`).
    prune_concern_identification(
        module,
        config,
        positive_samples,
        negative_samples,
        include_layers=include_layers,
        exclude_layers=exclude_layers,
        sparsity_ratio=ratio,
        keep_dim=False,
        method="structed",
    )

    print(f"Evaluate the pruned model {concern}")
    result = evaluate_model(module, config, test_dataloader, verbose=True)
    result_list.append(result)
    get_sparsity(module)

    # Compare the pruned module against the original model on validation data.
    get_similarity(model, module, valid_dataloader, concern, num_samples, config)
    print("original model's perplexity")
    get_perplexity(model, valid_dataloader, config)
    print("pruned model's perplexity")
    get_perplexity(module, valid_dataloader, config)

Total heads to prune: 57




tensor([[0.6076, 0.4772, 0.3568, 0.3846, 0.4560, 0.3766, 0.5436, 0.4334, 0.6432,
         0.5125, 0.4895, 0.4496],
        [0.5521, 0.6671, 0.3830, 0.3551, 0.5890, 0.4487, 0.3329, 0.3896, 0.4603,
         0.4626, 0.4191, 0.5129],
        [0.7132, 0.4412, 0.3058, 0.3228, 0.2868, 0.3049, 0.4221, 0.3026, 0.3443,
         0.6358, 0.3968, 0.3584],
        [0.3185, 0.2584, 0.2544, 0.3329, 0.4143, 0.7929, 0.2071, 0.3755, 0.3678,
         0.5597, 0.2876, 0.3968],
        [0.5160, 0.3828, 0.3549, 0.6017, 0.6927, 0.4596, 0.3273, 0.3173, 0.3073,
         0.3367, 0.4392, 0.6130],
        [0.4848, 0.3186, 0.5481, 0.2955, 0.5394, 0.2806, 0.2502, 0.4210, 0.2485,
         0.7515, 0.5052, 0.2697],
        [0.6288, 0.4500, 0.3530, 0.3226, 0.4115, 0.5442, 0.6367, 0.3214, 0.4035,
         0.4223, 0.3378, 0.6786],
        [0.5168, 0.3592, 0.4791, 0.3492, 0.6916, 0.3457, 0.3678, 0.4930, 0.3084,
         0.3526, 0.4262, 0.6160],
        [0.2873, 0.2877, 0.7601, 0.3070, 0.4546, 0.4570, 0.4629, 0.2402, 0.2399,




{(3, 1), (4, 9), (4, 6), (3, 10), (8, 0), (5, 1), (8, 3), (10, 0), (8, 9), (9, 8), (11, 5), (10, 3), (10, 9), (9, 11), (2, 2), (2, 5), (11, 8), (1, 6), (2, 8), (6, 2), (4, 2), (3, 0), (3, 3), (5, 6), (4, 8), (3, 6), (5, 3), (10, 2), (9, 4), (9, 10), (8, 8), (2, 4), (10, 5), (11, 1), (11, 10), (10, 8), (2, 7), (7, 3), (7, 9), (6, 7), (6, 10), (3, 2), (4, 7), (9, 0), (5, 5), (5, 11), (5, 8), (8, 7), (8, 1), (11, 3), (10, 1), (10, 7), (8, 10), (2, 3), (7, 5), (6, 3), (7, 8)}




Evaluate the pruned model 0




Evaluating the model:   0%|                                                                               | 0/…

0.17797764372092212




{'bert.encoder.layer.0.attention.self.query.weight': 0.0, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.0, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.0, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.0, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.08333333333333333, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.08333333333333333, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.08333333333333333, 'bert.encoder.layer.1.




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.5970696040459635




CCA coefficients mean non-concern: 0.6054373364075742




Linear CKA concern: 0.5227961420777701




Linear CKA non-concern: 0.540140207483476




Kernel CKA concern: 0.16689341080753098




Kernel CKA non-concern: 0.13236224995852983




original model's perplexity




2.6398401260375977




pruned model's perplexity




9.96669864654541




Total heads to prune: 57




tensor([[0.6076, 0.4772, 0.3568, 0.3846, 0.4560, 0.3766, 0.5436, 0.4334, 0.6432,
         0.5125, 0.4895, 0.4496],
        [0.5521, 0.6671, 0.3830, 0.3551, 0.5890, 0.4487, 0.3329, 0.3896, 0.4603,
         0.4626, 0.4191, 0.5129],
        [0.7132, 0.4412, 0.3058, 0.3228, 0.2868, 0.3049, 0.4221, 0.3026, 0.3443,
         0.6358, 0.3968, 0.3584],
        [0.3185, 0.2584, 0.2544, 0.3329, 0.4143, 0.7929, 0.2071, 0.3755, 0.3678,
         0.5597, 0.2876, 0.3968],
        [0.5160, 0.3828, 0.3549, 0.6017, 0.6927, 0.4596, 0.3273, 0.3173, 0.3073,
         0.3367, 0.4392, 0.6130],
        [0.4848, 0.3186, 0.5481, 0.2955, 0.5394, 0.2806, 0.2502, 0.4210, 0.2485,
         0.7515, 0.5052, 0.2697],
        [0.6288, 0.4500, 0.3530, 0.3226, 0.4115, 0.5442, 0.6367, 0.3214, 0.4035,
         0.4223, 0.3378, 0.6786],
        [0.5168, 0.3592, 0.4791, 0.3492, 0.6916, 0.3457, 0.3678, 0.4930, 0.3084,
         0.3526, 0.4262, 0.6160],
        [0.2873, 0.2877, 0.7601, 0.3070, 0.4546, 0.4570, 0.4629, 0.2402, 0.2399,




{(3, 1), (4, 9), (4, 6), (3, 10), (8, 0), (5, 1), (8, 3), (10, 0), (8, 9), (9, 8), (11, 5), (10, 3), (10, 9), (9, 11), (2, 2), (2, 5), (11, 8), (1, 6), (2, 8), (6, 2), (4, 2), (3, 0), (3, 3), (5, 6), (4, 8), (3, 6), (5, 3), (10, 2), (9, 4), (9, 10), (8, 8), (2, 4), (10, 5), (11, 1), (11, 10), (10, 8), (2, 7), (7, 3), (7, 9), (6, 7), (6, 10), (3, 2), (4, 7), (9, 0), (5, 5), (5, 11), (5, 8), (8, 7), (8, 1), (11, 3), (10, 1), (10, 7), (8, 10), (2, 3), (7, 5), (6, 3), (7, 8)}




Evaluate the pruned model 1




Evaluating the model:   0%|                                                                               | 0/…

0.17797764372092212




{'bert.encoder.layer.0.attention.self.query.weight': 0.0, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.0, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.0, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.0, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.08333333333333333, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.08333333333333333, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.08333333333333333, 'bert.encoder.layer.1.




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.5987482085512843




CCA coefficients mean non-concern: 0.6040240469585337




Linear CKA concern: 0.37535533219194894




Linear CKA non-concern: 0.5448252992397373




Kernel CKA concern: 0.0724475907272264




Kernel CKA non-concern: 0.15011659660990595




original model's perplexity




2.6398401260375977




pruned model's perplexity




9.969001770019531




Total heads to prune: 57




tensor([[0.6076, 0.4772, 0.3568, 0.3846, 0.4560, 0.3766, 0.5436, 0.4334, 0.6432,
         0.5125, 0.4895, 0.4496],
        [0.5521, 0.6671, 0.3830, 0.3551, 0.5890, 0.4487, 0.3329, 0.3896, 0.4603,
         0.4626, 0.4191, 0.5129],
        [0.7132, 0.4412, 0.3058, 0.3228, 0.2868, 0.3049, 0.4221, 0.3026, 0.3443,
         0.6358, 0.3968, 0.3584],
        [0.3185, 0.2584, 0.2544, 0.3329, 0.4143, 0.7929, 0.2071, 0.3755, 0.3678,
         0.5597, 0.2876, 0.3968],
        [0.5160, 0.3828, 0.3549, 0.6017, 0.6927, 0.4596, 0.3273, 0.3173, 0.3073,
         0.3367, 0.4392, 0.6130],
        [0.4848, 0.3186, 0.5481, 0.2955, 0.5394, 0.2806, 0.2502, 0.4210, 0.2485,
         0.7515, 0.5052, 0.2697],
        [0.6288, 0.4500, 0.3530, 0.3226, 0.4115, 0.5442, 0.6367, 0.3214, 0.4035,
         0.4223, 0.3378, 0.6786],
        [0.5168, 0.3592, 0.4791, 0.3492, 0.6916, 0.3457, 0.3678, 0.4930, 0.3084,
         0.3526, 0.4262, 0.6160],
        [0.2873, 0.2877, 0.7601, 0.3070, 0.4546, 0.4570, 0.4629, 0.2402, 0.2399,




{(3, 1), (4, 9), (4, 6), (3, 10), (8, 0), (5, 1), (8, 3), (10, 0), (8, 9), (9, 8), (11, 5), (10, 3), (10, 9), (9, 11), (2, 2), (2, 5), (11, 8), (1, 6), (2, 8), (6, 2), (4, 2), (3, 0), (3, 3), (5, 6), (4, 8), (3, 6), (5, 3), (10, 2), (9, 4), (9, 10), (8, 8), (2, 4), (10, 5), (11, 1), (11, 10), (10, 8), (2, 7), (7, 3), (7, 9), (6, 7), (6, 10), (3, 2), (4, 7), (9, 0), (5, 5), (5, 11), (5, 8), (8, 7), (8, 1), (11, 3), (10, 1), (10, 7), (8, 10), (2, 3), (7, 5), (6, 3), (7, 8)}




Evaluate the pruned model 2




Evaluating the model:   0%|                                                                               | 0/…

0.17797764372092212




{'bert.encoder.layer.0.attention.self.query.weight': 0.0, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.0, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.0, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.0, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.08333333333333333, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.08333333333333333, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.08333333333333333, 'bert.encoder.layer.1.




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.5901774539991682




CCA coefficients mean non-concern: 0.6062304872115293




Linear CKA concern: 0.19947449727382577




Linear CKA non-concern: 0.5721971378032544




Kernel CKA concern: 0.036511947675072834




Kernel CKA non-concern: 0.16002807806519526




original model's perplexity




2.6398401260375977




pruned model's perplexity




9.979424476623535




Total heads to prune: 57




tensor([[0.6076, 0.4772, 0.3568, 0.3846, 0.4560, 0.3766, 0.5436, 0.4334, 0.6432,
         0.5125, 0.4895, 0.4496],
        [0.5521, 0.6671, 0.3830, 0.3551, 0.5890, 0.4487, 0.3329, 0.3896, 0.4603,
         0.4626, 0.4191, 0.5129],
        [0.7132, 0.4412, 0.3058, 0.3228, 0.2868, 0.3049, 0.4221, 0.3026, 0.3443,
         0.6358, 0.3968, 0.3584],
        [0.3185, 0.2584, 0.2544, 0.3329, 0.4143, 0.7929, 0.2071, 0.3755, 0.3678,
         0.5597, 0.2876, 0.3968],
        [0.5160, 0.3828, 0.3549, 0.6017, 0.6927, 0.4596, 0.3273, 0.3173, 0.3073,
         0.3367, 0.4392, 0.6130],
        [0.4848, 0.3186, 0.5481, 0.2955, 0.5394, 0.2806, 0.2502, 0.4210, 0.2485,
         0.7515, 0.5052, 0.2697],
        [0.6288, 0.4500, 0.3530, 0.3226, 0.4115, 0.5442, 0.6367, 0.3214, 0.4035,
         0.4223, 0.3378, 0.6786],
        [0.5168, 0.3592, 0.4791, 0.3492, 0.6916, 0.3457, 0.3678, 0.4930, 0.3084,
         0.3526, 0.4262, 0.6160],
        [0.2873, 0.2877, 0.7601, 0.3070, 0.4546, 0.4570, 0.4629, 0.2402, 0.2399,




{(3, 1), (4, 9), (4, 6), (3, 10), (8, 0), (5, 1), (8, 3), (10, 0), (8, 9), (9, 8), (11, 5), (10, 3), (10, 9), (9, 11), (2, 2), (2, 5), (11, 8), (1, 6), (2, 8), (6, 2), (4, 2), (3, 0), (3, 3), (5, 6), (4, 8), (3, 6), (5, 3), (10, 2), (9, 4), (9, 10), (8, 8), (2, 4), (10, 5), (11, 1), (11, 10), (10, 8), (2, 7), (7, 3), (7, 9), (6, 7), (6, 10), (3, 2), (4, 7), (9, 0), (5, 5), (5, 11), (5, 8), (8, 7), (8, 1), (11, 3), (10, 1), (10, 7), (8, 10), (2, 3), (7, 5), (6, 3), (7, 8)}




Evaluate the pruned model 3




Evaluating the model:   0%|                                                                               | 0/…

0.17797764372092212




{'bert.encoder.layer.0.attention.self.query.weight': 0.0, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.0, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.0, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.0, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.08333333333333333, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.08333333333333333, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.08333333333333333, 'bert.encoder.layer.1.




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.6018388325031656




CCA coefficients mean non-concern: 0.6051193187387528




Linear CKA concern: 0.433537145141622




Linear CKA non-concern: 0.5266360988429228




Kernel CKA concern: 0.11262001960002976




Kernel CKA non-concern: 0.13822772519262058




original model's perplexity




2.6398401260375977




pruned model's perplexity




9.961028099060059




Total heads to prune: 57




tensor([[0.6076, 0.4772, 0.3568, 0.3846, 0.4560, 0.3766, 0.5436, 0.4334, 0.6432,
         0.5125, 0.4895, 0.4496],
        [0.5521, 0.6671, 0.3830, 0.3551, 0.5890, 0.4487, 0.3329, 0.3896, 0.4603,
         0.4626, 0.4191, 0.5129],
        [0.7132, 0.4412, 0.3058, 0.3228, 0.2868, 0.3049, 0.4221, 0.3026, 0.3443,
         0.6358, 0.3968, 0.3584],
        [0.3185, 0.2584, 0.2544, 0.3329, 0.4143, 0.7929, 0.2071, 0.3755, 0.3678,
         0.5597, 0.2876, 0.3968],
        [0.5160, 0.3828, 0.3549, 0.6017, 0.6927, 0.4596, 0.3273, 0.3173, 0.3073,
         0.3367, 0.4392, 0.6130],
        [0.4848, 0.3186, 0.5481, 0.2955, 0.5394, 0.2806, 0.2502, 0.4210, 0.2485,
         0.7515, 0.5052, 0.2697],
        [0.6288, 0.4500, 0.3530, 0.3226, 0.4115, 0.5442, 0.6367, 0.3214, 0.4035,
         0.4223, 0.3378, 0.6786],
        [0.5168, 0.3592, 0.4791, 0.3492, 0.6916, 0.3457, 0.3678, 0.4930, 0.3084,
         0.3526, 0.4262, 0.6160],
        [0.2873, 0.2877, 0.7601, 0.3070, 0.4546, 0.4570, 0.4629, 0.2402, 0.2399,




{(3, 1), (4, 9), (4, 6), (3, 10), (8, 0), (5, 1), (8, 3), (10, 0), (8, 9), (9, 8), (11, 5), (10, 3), (10, 9), (9, 11), (2, 2), (2, 5), (11, 8), (1, 6), (2, 8), (6, 2), (4, 2), (3, 0), (3, 3), (5, 6), (4, 8), (3, 6), (5, 3), (10, 2), (9, 4), (9, 10), (8, 8), (2, 4), (10, 5), (11, 1), (11, 10), (10, 8), (2, 7), (7, 3), (7, 9), (6, 7), (6, 10), (3, 2), (4, 7), (9, 0), (5, 5), (5, 11), (5, 8), (8, 7), (8, 1), (11, 3), (10, 1), (10, 7), (8, 10), (2, 3), (7, 5), (6, 3), (7, 8)}




Evaluate the pruned model 4




Evaluating the model:   0%|                                                                               | 0/…

0.17797764372092212




{'bert.encoder.layer.0.attention.self.query.weight': 0.0, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.0, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.0, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.0, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.08333333333333333, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.08333333333333333, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.08333333333333333, 'bert.encoder.layer.1.




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.5962319900996272




CCA coefficients mean non-concern: 0.6060034024050107




Linear CKA concern: 0.280452940237152




Linear CKA non-concern: 0.5365525026262534




Kernel CKA concern: 0.11635769375583184




Kernel CKA non-concern: 0.14109326531422464




original model's perplexity




2.6398401260375977




pruned model's perplexity




9.964201927185059




Total heads to prune: 57




tensor([[0.6076, 0.4772, 0.3568, 0.3846, 0.4560, 0.3766, 0.5436, 0.4334, 0.6432,
         0.5125, 0.4895, 0.4496],
        [0.5521, 0.6671, 0.3830, 0.3551, 0.5890, 0.4487, 0.3329, 0.3896, 0.4603,
         0.4626, 0.4191, 0.5129],
        [0.7132, 0.4412, 0.3058, 0.3228, 0.2868, 0.3049, 0.4221, 0.3026, 0.3443,
         0.6358, 0.3968, 0.3584],
        [0.3185, 0.2584, 0.2544, 0.3329, 0.4143, 0.7929, 0.2071, 0.3755, 0.3678,
         0.5597, 0.2876, 0.3968],
        [0.5160, 0.3828, 0.3549, 0.6017, 0.6927, 0.4596, 0.3273, 0.3173, 0.3073,
         0.3367, 0.4392, 0.6130],
        [0.4848, 0.3186, 0.5481, 0.2955, 0.5394, 0.2806, 0.2502, 0.4210, 0.2485,
         0.7515, 0.5052, 0.2697],
        [0.6288, 0.4500, 0.3530, 0.3226, 0.4115, 0.5442, 0.6367, 0.3214, 0.4035,
         0.4223, 0.3378, 0.6786],
        [0.5168, 0.3592, 0.4791, 0.3492, 0.6916, 0.3457, 0.3678, 0.4930, 0.3084,
         0.3526, 0.4262, 0.6160],
        [0.2873, 0.2877, 0.7601, 0.3070, 0.4546, 0.4570, 0.4629, 0.2402, 0.2399,




{(3, 1), (4, 9), (4, 6), (3, 10), (8, 0), (5, 1), (8, 3), (10, 0), (8, 9), (9, 8), (11, 5), (10, 3), (10, 9), (9, 11), (2, 2), (2, 5), (11, 8), (1, 6), (2, 8), (6, 2), (4, 2), (3, 0), (3, 3), (5, 6), (4, 8), (3, 6), (5, 3), (10, 2), (9, 4), (9, 10), (8, 8), (2, 4), (10, 5), (11, 1), (11, 10), (10, 8), (2, 7), (7, 3), (7, 9), (6, 7), (6, 10), (3, 2), (4, 7), (9, 0), (5, 5), (5, 11), (5, 8), (8, 7), (8, 1), (11, 3), (10, 1), (10, 7), (8, 10), (2, 3), (7, 5), (6, 3), (7, 8)}




Evaluate the pruned model 5




Evaluating the model:   0%|                                                                               | 0/…

In [None]:
# Convert every per-concern evaluation report into a DataFrame and combine them
# with the project helper append_nth_row.
df_list = [report_to_df(report) for report in result_list]
new_df = append_nth_row(df_list)

# Name the CSV after the current timestamp so successive runs do not overwrite
# each other.
csv_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing (a missing folder would otherwise discard
# the results of the long pruning loop with an OSError).
os.makedirs("results", exist_ok=True)
new_df.to_csv(f"results/{csv_name}.csv", index=False)
print(csv_name)
new_df