In [1]:
import os
import sys

# Make the project's `src` package importable from this notebook's location.
# NOTE(review): the relative path is resolved against the kernel's working
# directory — presumably it points at the repository root; confirm.
_project_root = "../../../../../"
# Guard the append so re-running this cell does not pile up duplicate entries.
if _project_root not in sys.path:
    sys.path.append(_project_root)

# Set before any tokenizer is created. Presumably this silences the Hugging Face
# tokenizers fork/parallelism warning when DataLoader workers are used — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import copy
import torch
from datetime import datetime
from src.utils.helper import Config, color_print
from src.utils.load import load_model, load_data, save_checkpoint
from src.models.evaluate import (
    evaluate_model,
    get_sparsity,
    get_similarity,
    get_perplexity,
)
from src.utils.sampling import SamplingDataset
from src.pruning.prune_head import head_importance_prunning
from src.pruning.prune import prune_concern_identification
from src.utils.helper import report_to_df, append_nth_row

In [3]:
# --- Experiment configuration: everything a reader might tune ---------------

# Identifier of the model/config to load; passed to Config(name, device).
name = "bert-small-yahoo"

# Prefer the first GPU, but fall back to CPU so the notebook still runs on a
# machine without CUDA instead of failing at model-load time.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# NOTE(review): `checkpoint` is not referenced by any visible cell — confirm
# whether it is still needed.
checkpoint = None

# Forwarded to load_data(...) when the dataloaders are built.
batch_size = 16
num_workers = 4

# Forwarded to every SamplingDataset(...) and to get_similarity(...).
num_samples = 16

# Used both as the head-pruning ratio and as `sparsity_ratio` for
# prune_concern_identification(...).
ratio = 0.6

# NOTE(review): `seed` is not referenced by any visible cell;
# config.init_seed() is called without arguments — confirm the seed in effect.
seed = 44

# Layer-name filters forwarded to prune_concern_identification(...).
include_layers = ["intermediate", "output"]
exclude_layers = [
    "attention",
]

In [4]:
# Record when the run began and echo it in "YYYY-MM-DD HH:MM:SS" form.
script_start_time = datetime.now()
print(f"Script started at: {script_start_time:%Y-%m-%d %H:%M:%S}")

Script started at: 2024-10-22 04:04:56


In [5]:
# Build the experiment configuration for the model `name` on `device`.
config = Config(name, device)
# Number of classes, read from the raw config mapping.
# NOTE(review): the pruning loop iterates over `config.num_labels` rather than
# this variable — confirm the two always agree, or use one consistently.
num_labels = config.config["num_labels"]
# Load the model described by `config`; this is the unpruned reference model
# that later cells deep-copy before pruning.
model = load_model(config)

Loading the model.




{

'architectures'

: 

'bert'

,
 

'dataset_name'

: 

'YahooAnswersTopics'

,
 

'model_name'

: 

'models/bert-small-yahoo'

,
 

'num_labels'

: 

10

,
 

'tokenizer_name'

: 

'fabriceyhc/bert-base-uncased-yahoo_answers_topics'

}




The model models/bert-small-yahoo is loaded.




In [6]:
# Build the three dataloaders (train / validation / test) for the configured
# dataset. The keyword options are gathered first so they are easy to tweak.
_loader_options = {
    "batch_size": batch_size,
    "num_workers": num_workers,
    "do_cache": True,  # presumably reuses the cached *.pkl splits — confirm
}
_dataloaders = load_data(config, **_loader_options)
train_dataloader, valid_dataloader, test_dataloader = _dataloaders

Loading cached dataset YahooAnswersTopics.




train.pkl is loaded from cache.




valid.pkl is loaded from cache.




test.pkl is loaded from cache.




The dataset YahooAnswersTopics is loaded




{

'config_name'

: 

'yahoo_answers_topics'

,
 

'features'

: 

{'first_column': 'question_title', 'second_column': 'topic'}

,
 

'path'

: 

'yahoo_answers_topics'

}




In [7]:
# Baseline evaluation of the unpruned model is currently disabled; uncomment
# the two lines below to run it before pruning.
# print("Evaluate the original model")
# result = evaluate_model(model, config, test_dataloader)

In [8]:
result_list = []


def _make_sampler(target_label, positive):
    """Create a ``SamplingDataset`` over the training data.

    Only the target label and the positive/negative flag differ between the
    samplers built in this cell; every other argument is forwarded unchanged.
    """
    # NOTE(review): the positional literal 4 is undocumented here — presumably
    # a batch size; confirm against SamplingDataset's signature.
    return SamplingDataset(
        train_dataloader,
        config,
        target_label,
        num_samples,
        positive,
        4,
        resample=False,
    )


# Keyword options shared by every concern-identification pruning call.
# NOTE(review): "structed" looks like a misspelling of "structured", but it is
# the value the project API is called with — check prune_concern_identification
# before changing it.
_pruning_options = dict(
    include_layers=include_layers,
    exclude_layers=exclude_layers,
    sparsity_ratio=ratio,
    keep_dim=True,
    method="structed",
)

# One pass per class label ("concern"): prune a fresh copy of the model for
# that concern, evaluate it, and compare it against the unpruned model.
for concern in range(config.num_labels):
    # Re-seed at the start of every pass so each concern is sampled reproducibly.
    config.init_seed()

    # Samplers are created in a fixed order (positive, negative, all) because
    # construction may consume random state.
    positive_samples = _make_sampler(concern, True)
    negative_samples = _make_sampler(concern, False)
    # NOTE(review): 200 is not a valid label for this task; presumably, with the
    # positive flag False, this draws from every class — confirm.
    all_samples = _make_sampler(200, False)

    # Prune a private copy so the reference `model` stays untouched.
    module = copy.deepcopy(model)

    # NOTE(review): the inputs to head pruning do not depend on `concern`, and
    # the original model's perplexity below is recomputed on every pass —
    # candidates for hoisting if these helpers are side-effect free.
    head_importance_prunning(module, config, all_samples, ratio)

    prune_concern_identification(
        module, config, positive_samples, negative_samples, **_pruning_options
    )

    print(f"Evaluate the pruned model {concern}")
    result = evaluate_model(module, config, test_dataloader, verbose=True)
    result_list.append(result)
    get_sparsity(module)

    # Representation similarity between the original and the pruned model.
    get_similarity(model, module, valid_dataloader, concern, num_samples, config)

    # Perplexity of the reference model first, then of the pruned copy.
    for _caption, _network in (
        ("original model's perplexity", model),
        ("pruned model's perplexity", module),
    ):
        print(_caption)
        get_perplexity(_network, valid_dataloader, config)

Total heads to prune: 9




tensor([[0.4593, 0.4443, 0.4726, 0.5557],
        [0.4624, 0.4484, 0.5516, 0.4916],
        [0.4027, 0.5485, 0.5973, 0.4291],
        [0.4493, 0.5507, 0.4512, 0.4729]])




{(0, 1), (0, 0), (1, 1), (2, 0), (3, 0), (2, 3), (0, 2), (1, 0), (3, 2)}




Evaluate the pruned model 0




Evaluating the model:   0%|                                                                               | 0/…

0.5741789849041854




{'bert.encoder.layer.0.attention.self.query.weight': 0.75, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.75, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.75, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.75, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.599609375, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.599609375, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.9451092065825311




CCA coefficients mean non-concern: 0.944591265257189




Linear CKA concern: 0.9327895868462415




Linear CKA non-concern: 0.928766031422348




Kernel CKA concern: 0.8348133827312767




Kernel CKA non-concern: 0.8215221074710094




original model's perplexity




3.168053388595581




pruned model's perplexity




3.5918400287628174




Total heads to prune: 9




tensor([[0.4593, 0.4443, 0.4726, 0.5557],
        [0.4624, 0.4484, 0.5516, 0.4916],
        [0.4027, 0.5485, 0.5973, 0.4291],
        [0.4493, 0.5507, 0.4512, 0.4729]])




{(0, 1), (0, 0), (1, 1), (2, 0), (3, 0), (2, 3), (0, 2), (1, 0), (3, 2)}




Evaluate the pruned model 1




Evaluating the model:   0%|                                                                               | 0/…

0.5741789849041854




{'bert.encoder.layer.0.attention.self.query.weight': 0.75, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.75, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.75, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.75, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.599609375, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.599609375, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.95238592185948




CCA coefficients mean non-concern: 0.9437758537475839




Linear CKA concern: 0.9192559097995138




Linear CKA non-concern: 0.9275115161371361




Kernel CKA concern: 0.7945102219761108




Kernel CKA non-concern: 0.8193848431478896




original model's perplexity




3.168053388595581




pruned model's perplexity




3.62343168258667




Total heads to prune: 9




tensor([[0.4593, 0.4443, 0.4726, 0.5557],
        [0.4624, 0.4484, 0.5516, 0.4916],
        [0.4027, 0.5485, 0.5973, 0.4291],
        [0.4493, 0.5507, 0.4512, 0.4729]])




{(0, 1), (0, 0), (1, 1), (2, 0), (3, 0), (2, 3), (0, 2), (1, 0), (3, 2)}




Evaluate the pruned model 2




Evaluating the model:   0%|                                                                               | 0/…

0.5741789849041854




{'bert.encoder.layer.0.attention.self.query.weight': 0.75, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.75, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.75, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.75, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.599609375, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.599609375, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.9433163977688322




CCA coefficients mean non-concern: 0.945492645423758




Linear CKA concern: 0.9135321542380246




Linear CKA non-concern: 0.9262294718052335




Kernel CKA concern: 0.7886783752054096




Kernel CKA non-concern: 0.8166277437219008




original model's perplexity




3.168053388595581




pruned model's perplexity




3.612149238586426




Total heads to prune: 9




tensor([[0.4593, 0.4443, 0.4726, 0.5557],
        [0.4624, 0.4484, 0.5516, 0.4916],
        [0.4027, 0.5485, 0.5973, 0.4291],
        [0.4493, 0.5507, 0.4512, 0.4729]])




{(0, 1), (0, 0), (1, 1), (2, 0), (3, 0), (2, 3), (0, 2), (1, 0), (3, 2)}




Evaluate the pruned model 3




Evaluating the model:   0%|                                                                               | 0/…

0.5741789849041854




{'bert.encoder.layer.0.attention.self.query.weight': 0.75, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.75, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.75, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.75, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.599609375, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.599609375, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.9473978588486183




CCA coefficients mean non-concern: 0.9449840335467216




Linear CKA concern: 0.9320501253767597




Linear CKA non-concern: 0.9259139892597399




Kernel CKA concern: 0.8254392748621532




Kernel CKA non-concern: 0.8165263441691788




original model's perplexity




3.168053388595581




pruned model's perplexity




3.6154491901397705




Total heads to prune: 9




tensor([[0.4593, 0.4443, 0.4726, 0.5557],
        [0.4624, 0.4484, 0.5516, 0.4916],
        [0.4027, 0.5485, 0.5973, 0.4291],
        [0.4493, 0.5507, 0.4512, 0.4729]])




{(0, 1), (0, 0), (1, 1), (2, 0), (3, 0), (2, 3), (0, 2), (1, 0), (3, 2)}




Evaluate the pruned model 4




Evaluating the model:   0%|                                                                               | 0/…

0.5741789849041854




{'bert.encoder.layer.0.attention.self.query.weight': 0.75, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.75, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.75, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.75, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.599609375, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.599609375, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.9353947542619935




CCA coefficients mean non-concern: 0.9462815360282444




Linear CKA concern: 0.8709802910455733




Linear CKA non-concern: 0.9258199983457316




Kernel CKA concern: 0.7728508956064629




Kernel CKA non-concern: 0.8098735267183187




original model's perplexity




3.168053388595581




pruned model's perplexity




3.6069672107696533




Total heads to prune: 9




tensor([[0.4593, 0.4443, 0.4726, 0.5557],
        [0.4624, 0.4484, 0.5516, 0.4916],
        [0.4027, 0.5485, 0.5973, 0.4291],
        [0.4493, 0.5507, 0.4512, 0.4729]])




{(0, 1), (0, 0), (1, 1), (2, 0), (3, 0), (2, 3), (0, 2), (1, 0), (3, 2)}




Evaluate the pruned model 5




Evaluating the model:   0%|                                                                               | 0/…

0.5741789849041854




{'bert.encoder.layer.0.attention.self.query.weight': 0.75, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.75, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.75, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.75, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.599609375, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.599609375, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.9245657423412652




CCA coefficients mean non-concern: 0.9507459806623803




Linear CKA concern: 0.7788254880945809




Linear CKA non-concern: 0.9378332863319632




Kernel CKA concern: 0.6314078549414717




Kernel CKA non-concern: 0.8404578746964703




original model's perplexity




3.168053388595581




pruned model's perplexity




3.6140315532684326




Total heads to prune: 9




tensor([[0.4593, 0.4443, 0.4726, 0.5557],
        [0.4624, 0.4484, 0.5516, 0.4916],
        [0.4027, 0.5485, 0.5973, 0.4291],
        [0.4493, 0.5507, 0.4512, 0.4729]])




{(0, 1), (0, 0), (1, 1), (2, 0), (3, 0), (2, 3), (0, 2), (1, 0), (3, 2)}




Evaluate the pruned model 6




Evaluating the model:   0%|                                                                               | 0/…

0.5741789849041854




{'bert.encoder.layer.0.attention.self.query.weight': 0.75, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.75, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.75, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.75, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.599609375, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.599609375, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.9522209457857601




CCA coefficients mean non-concern: 0.9429108536861761




Linear CKA concern: 0.932589062721881




Linear CKA non-concern: 0.9243496277000398




Kernel CKA concern: 0.8282715189125063




Kernel CKA non-concern: 0.8171446976406074




original model's perplexity




3.168053388595581




pruned model's perplexity




3.6040866374969482




Total heads to prune: 9




tensor([[0.4593, 0.4443, 0.4726, 0.5557],
        [0.4624, 0.4484, 0.5516, 0.4916],
        [0.4027, 0.5485, 0.5973, 0.4291],
        [0.4493, 0.5507, 0.4512, 0.4729]])




{(0, 1), (0, 0), (1, 1), (2, 0), (3, 0), (2, 3), (0, 2), (1, 0), (3, 2)}




Evaluate the pruned model 7




Evaluating the model:   0%|                                                                               | 0/…

0.5741789849041854




{'bert.encoder.layer.0.attention.self.query.weight': 0.75, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.75, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.75, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.75, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.599609375, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.599609375, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.9412421900858284




CCA coefficients mean non-concern: 0.9449352359062217




Linear CKA concern: 0.9260705279892142




Linear CKA non-concern: 0.9288441115513072




Kernel CKA concern: 0.8194908655164377




Kernel CKA non-concern: 0.8263961796955028




original model's perplexity




3.168053388595581




pruned model's perplexity




3.5977768898010254




Total heads to prune: 9




tensor([[0.4593, 0.4443, 0.4726, 0.5557],
        [0.4624, 0.4484, 0.5516, 0.4916],
        [0.4027, 0.5485, 0.5973, 0.4291],
        [0.4493, 0.5507, 0.4512, 0.4729]])




{(0, 1), (0, 0), (1, 1), (2, 0), (3, 0), (2, 3), (0, 2), (1, 0), (3, 2)}




Evaluate the pruned model 8




Evaluating the model:   0%|                                                                               | 0/…

0.5741789849041854




{'bert.encoder.layer.0.attention.self.query.weight': 0.75, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.75, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.75, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.75, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.599609375, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.599609375, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.9433603280302595




CCA coefficients mean non-concern: 0.9440054253721375




Linear CKA concern: 0.9180242822198438




Linear CKA non-concern: 0.9226928099700171




Kernel CKA concern: 0.8010156843479126




Kernel CKA non-concern: 0.8126362281629937




original model's perplexity




3.168053388595581




pruned model's perplexity




3.580552816390991




Total heads to prune: 9




tensor([[0.4593, 0.4443, 0.4726, 0.5557],
        [0.4624, 0.4484, 0.5516, 0.4916],
        [0.4027, 0.5485, 0.5973, 0.4291],
        [0.4493, 0.5507, 0.4512, 0.4729]])




{(0, 1), (0, 0), (1, 1), (2, 0), (3, 0), (2, 3), (0, 2), (1, 0), (3, 2)}




Evaluate the pruned model 9




Evaluating the model:   0%|                                                                               | 0/…

0.5741789849041854




{'bert.encoder.layer.0.attention.self.query.weight': 0.75, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.75, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.75, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.75, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.599609375, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.599609375, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5, 'bert.encoder.layer.1.attention.self.value.bias': 




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.9475149923709938




CCA coefficients mean non-concern: 0.9431104512531895




Linear CKA concern: 0.903429114251966




Linear CKA non-concern: 0.9254886521308517




Kernel CKA concern: 0.7840841339654013




Kernel CKA non-concern: 0.8189706962164248




original model's perplexity




3.168053388595581




pruned model's perplexity




3.6123688220977783




In [9]:
# Convert each per-concern evaluation report into a table, then combine them
# into a single summary table via append_nth_row.
df_list = [report_to_df(report) for report in result_list]
new_df = append_nth_row(df_list)

# Timestamped file name so repeated runs never overwrite each other.
csv_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# to_csv does not create missing directories; make sure the output folder
# exists so a fresh checkout does not lose the whole run at the final step.
os.makedirs("results", exist_ok=True)
new_df.to_csv(f"results/{csv_name}.csv", index=False)
print(csv_name)

# Last expression: display the summary table as the cell output.
new_df

2024-10-22_04-43-34




Unnamed: 0,class,precision,recall,f1-score,support
0,0,0.4997,0.5214,0.5103,2992
1,1,0.7269,0.4198,0.5322,2992
2,2,0.7012,0.5415,0.6111,3012
3,3,0.3026,0.6258,0.408,2998
4,4,0.8142,0.5896,0.684,2973
5,5,0.7638,0.7485,0.7561,3054
6,6,0.662,0.381,0.4836,3003
7,7,0.5864,0.6149,0.6003,3012
8,8,0.5382,0.7059,0.6108,2982
9,9,0.7191,0.6439,0.6794,2982
