In [1]:
import os
import sys

sys.path.append("../../../../../")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import copy
import torch
from datetime import datetime
from src.utils.helper import Config, color_print
from src.utils.load import load_model, load_data, save_checkpoint
from src.models.evaluate import (
    evaluate_model,
    get_sparsity,
    get_similarity,
    get_perplexity,
)
from src.utils.sampling import SamplingDataset
from src.pruning.prune_head import head_importance_prunning
from src.pruning.prune import prune_concern_identification
from src.utils.helper import report_to_df, append_nth_row

In [3]:
name = "OSDG"
device = torch.device("cuda:0")
checkpoint = None
batch_size = 16
num_workers = 4
num_samples = 16
ratio = 0.5
seed = 44
include_layers = ["intermediate", "output"]
exclude_layers = [
    "attention",
]

In [4]:
script_start_time = datetime.now()
print(f"Script started at: {script_start_time.strftime('%Y-%m-%d %H:%M:%S')}")

Script started at: 2024-10-23 19:19:45


In [5]:
config = Config(name, device)
num_labels = config.config["num_labels"]
model = load_model(config)

Loading the model.




{

'architectures'

: 

'bert'

,
 

'dataset_name'

: 

'OSDG'

,
 

'model_name'

: 

'sadickam/sdg-classification-bert'

,
 

'num_labels'

: 

16

,
 

'tokenizer_name'

: 

'sadickam/sdg-classification-bert'

}




The model sadickam/sdg-classification-bert is loaded.




In [6]:
train_dataloader, valid_dataloader, test_dataloader = load_data(
    config,
    batch_size=batch_size,
    num_workers=num_workers,
    do_cache=True,
)

Loading cached dataset OSDG.




train.pkl is loaded from cache.




valid.pkl is loaded from cache.




test.pkl is loaded from cache.




The dataset OSDG is loaded




{

'config_name'

: 

'2024-01-01'

,
 

'features'

: 

{'first_column': 'text', 'second_column': 'labels'}

,
 

'path'

: 

'albertmartinez/OSDG'

}




In [7]:
# print("Evaluate the original model")
# result = evaluate_model(model, config, test_dataloader)

In [8]:
result_list = []

for concern in range(config.num_labels):
    config.init_seed()
    positive_samples = SamplingDataset(
        train_dataloader,
        config,
        concern,
        num_samples,
        True,
        4,
        resample=False,
    )
    negative_samples = SamplingDataset(
        train_dataloader,
        config,
        concern,
        num_samples,
        False,
        4,
        resample=False,
    )
    all_samples = SamplingDataset(
        train_dataloader,
        config,
        200,
        num_samples,
        False,
        4,
        resample=False,
    )

    module = copy.deepcopy(model)

    head_importance_prunning(module, config, all_samples, ratio)

    prune_concern_identification(
        module,
        config,
        positive_samples,
        negative_samples,
        include_layers=include_layers,
        exclude_layers=exclude_layers,
        sparsity_ratio=ratio,
        keep_dim=False,
        method="structed",
    )

    print(f"Evaluate the pruned model {concern}")
    result = evaluate_model(module, config, test_dataloader, verbose=True)
    result_list.append(result)
    get_sparsity(module)

    get_similarity(model, module, valid_dataloader, concern, num_samples, config)
    print("original model's perplexity")
    get_perplexity(model, valid_dataloader, config)
    print("pruned model's perplexity")
    get_perplexity(module, valid_dataloader, config)

Total heads to prune: 72




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(3, 4), (4, 9), (3, 7), (3, 1), (4, 6), (5, 1), (3, 10), (11, 2), (10, 0), (8, 9), (11, 5), (2, 2), (10, 3), (9, 11), (11, 8), (2, 5), (1, 3), (2, 11), (1, 9), (2, 8), (0, 11), (1, 0), (6, 8), (4, 2), (5, 0), (5, 6), (4, 8), (3, 6), (11, 4), (10, 2), (9, 4), (11, 1), (8, 8), (2, 4), (10, 11), (1, 2), (11, 10), (10, 8), (2, 7), (9, 10), (2, 1), (2, 10), (1, 8), (1, 5), (6, 1), (6, 4), (7, 3), (6, 7), (7, 6), (6, 10), (3, 2), (4, 1), (4, 7), (5, 2), (3, 8), (5, 5), (5, 11), (9, 3), (5, 8), (8, 7), (10, 4), (8, 1), (11, 3), (11, 9), (10, 7), (0, 9), (11, 6), (2, 3), (1, 7), (2, 6), (10, 5), (0, 1)}




Evaluate the pruned model 0




Evaluating the model:   0%|                                                                                   …

0.24708761548509114




{'bert.encoder.layer.0.attention.self.query.weight': 0.25, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.25, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.25, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.25, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.1




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4731964075042635




CCA coefficients mean non-concern: 0.4719973886257267




Linear CKA concern: 0.12917873528303853




Linear CKA non-concern: 0.3497458661021983




Kernel CKA concern: 0.05634224462504734




Kernel CKA non-concern: 0.12585450880317406




original model's perplexity




2.445301055908203




pruned model's perplexity




15.150482177734375




Total heads to prune: 72




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(3, 4), (4, 9), (3, 7), (3, 1), (4, 6), (5, 1), (3, 10), (11, 2), (10, 0), (8, 9), (11, 5), (2, 2), (10, 3), (9, 11), (11, 8), (2, 5), (1, 3), (2, 11), (1, 9), (2, 8), (0, 11), (1, 0), (6, 8), (4, 2), (5, 0), (5, 6), (4, 8), (3, 6), (11, 4), (10, 2), (9, 4), (11, 1), (8, 8), (2, 4), (10, 11), (1, 2), (11, 10), (10, 8), (2, 7), (9, 10), (2, 1), (2, 10), (1, 8), (1, 5), (6, 1), (6, 4), (7, 3), (6, 7), (7, 6), (6, 10), (3, 2), (4, 1), (4, 7), (5, 2), (3, 8), (5, 5), (5, 11), (9, 3), (5, 8), (8, 7), (10, 4), (8, 1), (11, 3), (11, 9), (10, 7), (0, 9), (11, 6), (2, 3), (1, 7), (2, 6), (10, 5), (0, 1)}




Evaluate the pruned model 1




Evaluating the model:   0%|                                                                                   …

0.24708761548509114




{'bert.encoder.layer.0.attention.self.query.weight': 0.25, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.25, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.25, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.25, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.1




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.47807033630746937




CCA coefficients mean non-concern: 0.47106519208324565




Linear CKA concern: 0.1412765847637699




Linear CKA non-concern: 0.3428170472852865




Kernel CKA concern: 0.049962164818435705




Kernel CKA non-concern: 0.12084179699417051




original model's perplexity




2.445301055908203




pruned model's perplexity




15.137909889221191




Total heads to prune: 72




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(3, 4), (4, 9), (3, 7), (3, 1), (4, 6), (5, 1), (3, 10), (11, 2), (10, 0), (8, 9), (11, 5), (2, 2), (10, 3), (9, 11), (11, 8), (2, 5), (1, 3), (2, 11), (1, 9), (2, 8), (0, 11), (1, 0), (6, 8), (4, 2), (5, 0), (5, 6), (4, 8), (3, 6), (11, 4), (10, 2), (9, 4), (11, 1), (8, 8), (2, 4), (10, 11), (1, 2), (11, 10), (10, 8), (2, 7), (9, 10), (2, 1), (2, 10), (1, 8), (1, 5), (6, 1), (6, 4), (7, 3), (6, 7), (7, 6), (6, 10), (3, 2), (4, 1), (4, 7), (5, 2), (3, 8), (5, 5), (5, 11), (9, 3), (5, 8), (8, 7), (10, 4), (8, 1), (11, 3), (11, 9), (10, 7), (0, 9), (11, 6), (2, 3), (1, 7), (2, 6), (10, 5), (0, 1)}




Evaluate the pruned model 2




Evaluating the model:   0%|                                                                                   …

0.24708761548509114




{'bert.encoder.layer.0.attention.self.query.weight': 0.25, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.25, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.25, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.25, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.1




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x7210e049a840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x7210e049a840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x7210e049a840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4788947788306308




CCA coefficients mean non-concern: 0.4714957471397565




Linear CKA concern: 0.13683815649596967




Linear CKA non-concern: 0.3606772917434649




Kernel CKA concern: 0.06706404769533873




Kernel CKA non-concern: 0.12359324566653541




original model's perplexity




2.445301055908203




pruned model's perplexity




15.149694442749023




Total heads to prune: 72




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(3, 4), (4, 9), (3, 7), (3, 1), (4, 6), (5, 1), (3, 10), (11, 2), (10, 0), (8, 9), (11, 5), (2, 2), (10, 3), (9, 11), (11, 8), (2, 5), (1, 3), (2, 11), (1, 9), (2, 8), (0, 11), (1, 0), (6, 8), (4, 2), (5, 0), (5, 6), (4, 8), (3, 6), (11, 4), (10, 2), (9, 4), (11, 1), (8, 8), (2, 4), (10, 11), (1, 2), (11, 10), (10, 8), (2, 7), (9, 10), (2, 1), (2, 10), (1, 8), (1, 5), (6, 1), (6, 4), (7, 3), (6, 7), (7, 6), (6, 10), (3, 2), (4, 1), (4, 7), (5, 2), (3, 8), (5, 5), (5, 11), (9, 3), (5, 8), (8, 7), (10, 4), (8, 1), (11, 3), (11, 9), (10, 7), (0, 9), (11, 6), (2, 3), (1, 7), (2, 6), (10, 5), (0, 1)}




Evaluate the pruned model 3




Evaluating the model:   0%|                                                                                   …

0.24708761548509114




{'bert.encoder.layer.0.attention.self.query.weight': 0.25, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.25, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.25, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.25, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.1




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4749307968575815




CCA coefficients mean non-concern: 0.4736278270948295




Linear CKA concern: 0.14526079826541763




Linear CKA non-concern: 0.3632176388808346




Kernel CKA concern: 0.04518649529203125




Kernel CKA non-concern: 0.126914677254809




original model's perplexity




2.445301055908203




pruned model's perplexity




15.088980674743652




Total heads to prune: 72




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(3, 4), (4, 9), (3, 7), (3, 1), (4, 6), (5, 1), (3, 10), (11, 2), (10, 0), (8, 9), (11, 5), (2, 2), (10, 3), (9, 11), (11, 8), (2, 5), (1, 3), (2, 11), (1, 9), (2, 8), (0, 11), (1, 0), (6, 8), (4, 2), (5, 0), (5, 6), (4, 8), (3, 6), (11, 4), (10, 2), (9, 4), (11, 1), (8, 8), (2, 4), (10, 11), (1, 2), (11, 10), (10, 8), (2, 7), (9, 10), (2, 1), (2, 10), (1, 8), (1, 5), (6, 1), (6, 4), (7, 3), (6, 7), (7, 6), (6, 10), (3, 2), (4, 1), (4, 7), (5, 2), (3, 8), (5, 5), (5, 11), (9, 3), (5, 8), (8, 7), (10, 4), (8, 1), (11, 3), (11, 9), (10, 7), (0, 9), (11, 6), (2, 3), (1, 7), (2, 6), (10, 5), (0, 1)}




Evaluate the pruned model 4




Evaluating the model:   0%|                                                                                   …

0.24708761548509114




{'bert.encoder.layer.0.attention.self.query.weight': 0.25, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.25, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.25, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.25, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.1




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.47888276150459613




CCA coefficients mean non-concern: 0.47368806057912816




Linear CKA concern: 0.2641234867440071




Linear CKA non-concern: 0.33824342411867364




Kernel CKA concern: 0.16807641775222124




Kernel CKA non-concern: 0.10920537804211718




original model's perplexity




2.445301055908203




pruned model's perplexity




15.153274536132812




Total heads to prune: 72




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(3, 4), (4, 9), (3, 7), (3, 1), (4, 6), (5, 1), (3, 10), (11, 2), (10, 0), (8, 9), (11, 5), (2, 2), (10, 3), (9, 11), (11, 8), (2, 5), (1, 3), (2, 11), (1, 9), (2, 8), (0, 11), (1, 0), (6, 8), (4, 2), (5, 0), (5, 6), (4, 8), (3, 6), (11, 4), (10, 2), (9, 4), (11, 1), (8, 8), (2, 4), (10, 11), (1, 2), (11, 10), (10, 8), (2, 7), (9, 10), (2, 1), (2, 10), (1, 8), (1, 5), (6, 1), (6, 4), (7, 3), (6, 7), (7, 6), (6, 10), (3, 2), (4, 1), (4, 7), (5, 2), (3, 8), (5, 5), (5, 11), (9, 3), (5, 8), (8, 7), (10, 4), (8, 1), (11, 3), (11, 9), (10, 7), (0, 9), (11, 6), (2, 3), (1, 7), (2, 6), (10, 5), (0, 1)}




Evaluate the pruned model 5




Evaluating the model:   0%|                                                                                   …

0.24708761548509114




{'bert.encoder.layer.0.attention.self.query.weight': 0.25, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.25, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.25, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.25, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.1




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x7210e049a840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x7210e049a840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.46942314089359216




CCA coefficients mean non-concern: 0.4704977538571534




Linear CKA concern: 0.17304858141494095




Linear CKA non-concern: 0.337763334968919




Kernel CKA concern: 0.05967838192562706




Kernel CKA non-concern: 0.11579757547390584




original model's perplexity




2.445301055908203




pruned model's perplexity




15.163834571838379




Total heads to prune: 72




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(3, 4), (4, 9), (3, 7), (3, 1), (4, 6), (5, 1), (3, 10), (11, 2), (10, 0), (8, 9), (11, 5), (2, 2), (10, 3), (9, 11), (11, 8), (2, 5), (1, 3), (2, 11), (1, 9), (2, 8), (0, 11), (1, 0), (6, 8), (4, 2), (5, 0), (5, 6), (4, 8), (3, 6), (11, 4), (10, 2), (9, 4), (11, 1), (8, 8), (2, 4), (10, 11), (1, 2), (11, 10), (10, 8), (2, 7), (9, 10), (2, 1), (2, 10), (1, 8), (1, 5), (6, 1), (6, 4), (7, 3), (6, 7), (7, 6), (6, 10), (3, 2), (4, 1), (4, 7), (5, 2), (3, 8), (5, 5), (5, 11), (9, 3), (5, 8), (8, 7), (10, 4), (8, 1), (11, 3), (11, 9), (10, 7), (0, 9), (11, 6), (2, 3), (1, 7), (2, 6), (10, 5), (0, 1)}




Evaluate the pruned model 6




Evaluating the model:   0%|                                                                                   …

0.24708761548509114




{'bert.encoder.layer.0.attention.self.query.weight': 0.25, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.25, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.25, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.25, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.1




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.46816631499737194




CCA coefficients mean non-concern: 0.4737669363497539




Linear CKA concern: 0.09527932154542028




Linear CKA non-concern: 0.35778253924475567




Kernel CKA concern: 0.03293000013058182




Kernel CKA non-concern: 0.12769149048971612




original model's perplexity




2.445301055908203




pruned model's perplexity




14.984041213989258




Total heads to prune: 72




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(3, 4), (4, 9), (3, 7), (3, 1), (4, 6), (5, 1), (3, 10), (11, 2), (10, 0), (8, 9), (11, 5), (2, 2), (10, 3), (9, 11), (11, 8), (2, 5), (1, 3), (2, 11), (1, 9), (2, 8), (0, 11), (1, 0), (6, 8), (4, 2), (5, 0), (5, 6), (4, 8), (3, 6), (11, 4), (10, 2), (9, 4), (11, 1), (8, 8), (2, 4), (10, 11), (1, 2), (11, 10), (10, 8), (2, 7), (9, 10), (2, 1), (2, 10), (1, 8), (1, 5), (6, 1), (6, 4), (7, 3), (6, 7), (7, 6), (6, 10), (3, 2), (4, 1), (4, 7), (5, 2), (3, 8), (5, 5), (5, 11), (9, 3), (5, 8), (8, 7), (10, 4), (8, 1), (11, 3), (11, 9), (10, 7), (0, 9), (11, 6), (2, 3), (1, 7), (2, 6), (10, 5), (0, 1)}




Evaluate the pruned model 7




Evaluating the model:   0%|                                                                                   …

0.24708761548509114




{'bert.encoder.layer.0.attention.self.query.weight': 0.25, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.25, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.25, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.25, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.1




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x7210e049a840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x7210e049a840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4838356991097999




CCA coefficients mean non-concern: 0.47025839276870507




Linear CKA concern: 0.19386050253983844




Linear CKA non-concern: 0.3524091028990015




Kernel CKA concern: 0.06862496178217033




Kernel CKA non-concern: 0.12164048242892063




original model's perplexity




2.445301055908203




pruned model's perplexity




15.166807174682617




Total heads to prune: 72




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(3, 4), (4, 9), (3, 7), (3, 1), (4, 6), (5, 1), (3, 10), (11, 2), (10, 0), (8, 9), (11, 5), (2, 2), (10, 3), (9, 11), (11, 8), (2, 5), (1, 3), (2, 11), (1, 9), (2, 8), (0, 11), (1, 0), (6, 8), (4, 2), (5, 0), (5, 6), (4, 8), (3, 6), (11, 4), (10, 2), (9, 4), (11, 1), (8, 8), (2, 4), (10, 11), (1, 2), (11, 10), (10, 8), (2, 7), (9, 10), (2, 1), (2, 10), (1, 8), (1, 5), (6, 1), (6, 4), (7, 3), (6, 7), (7, 6), (6, 10), (3, 2), (4, 1), (4, 7), (5, 2), (3, 8), (5, 5), (5, 11), (9, 3), (5, 8), (8, 7), (10, 4), (8, 1), (11, 3), (11, 9), (10, 7), (0, 9), (11, 6), (2, 3), (1, 7), (2, 6), (10, 5), (0, 1)}




Evaluate the pruned model 8




Evaluating the model:   0%|                                                                                   …

0.24708761548509114




{'bert.encoder.layer.0.attention.self.query.weight': 0.25, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.25, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.25, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.25, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.1




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4744754112269863




CCA coefficients mean non-concern: 0.4712236642338554




Linear CKA concern: 0.15679176887464447




Linear CKA non-concern: 0.3538475549711056




Kernel CKA concern: 0.07520995328823521




Kernel CKA non-concern: 0.11754727467454411




original model's perplexity




2.445301055908203




pruned model's perplexity




15.146126747131348




Total heads to prune: 72




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(3, 4), (4, 9), (3, 7), (3, 1), (4, 6), (5, 1), (3, 10), (11, 2), (10, 0), (8, 9), (11, 5), (2, 2), (10, 3), (9, 11), (11, 8), (2, 5), (1, 3), (2, 11), (1, 9), (2, 8), (0, 11), (1, 0), (6, 8), (4, 2), (5, 0), (5, 6), (4, 8), (3, 6), (11, 4), (10, 2), (9, 4), (11, 1), (8, 8), (2, 4), (10, 11), (1, 2), (11, 10), (10, 8), (2, 7), (9, 10), (2, 1), (2, 10), (1, 8), (1, 5), (6, 1), (6, 4), (7, 3), (6, 7), (7, 6), (6, 10), (3, 2), (4, 1), (4, 7), (5, 2), (3, 8), (5, 5), (5, 11), (9, 3), (5, 8), (8, 7), (10, 4), (8, 1), (11, 3), (11, 9), (10, 7), (0, 9), (11, 6), (2, 3), (1, 7), (2, 6), (10, 5), (0, 1)}




Evaluate the pruned model 9




Evaluating the model:   0%|                                                                                   …

0.24708761548509114




{'bert.encoder.layer.0.attention.self.query.weight': 0.25, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.25, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.25, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.25, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.1




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4828334822777383




CCA coefficients mean non-concern: 0.47154121217968653




Linear CKA concern: 0.19963559612469778




Linear CKA non-concern: 0.35008565185327023




Kernel CKA concern: 0.07567937734848999




Kernel CKA non-concern: 0.11904530466367998




original model's perplexity




2.445301055908203




pruned model's perplexity




15.104119300842285




Total heads to prune: 72




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(3, 4), (4, 9), (3, 7), (3, 1), (4, 6), (5, 1), (3, 10), (11, 2), (10, 0), (8, 9), (11, 5), (2, 2), (10, 3), (9, 11), (11, 8), (2, 5), (1, 3), (2, 11), (1, 9), (2, 8), (0, 11), (1, 0), (6, 8), (4, 2), (5, 0), (5, 6), (4, 8), (3, 6), (11, 4), (10, 2), (9, 4), (11, 1), (8, 8), (2, 4), (10, 11), (1, 2), (11, 10), (10, 8), (2, 7), (9, 10), (2, 1), (2, 10), (1, 8), (1, 5), (6, 1), (6, 4), (7, 3), (6, 7), (7, 6), (6, 10), (3, 2), (4, 1), (4, 7), (5, 2), (3, 8), (5, 5), (5, 11), (9, 3), (5, 8), (8, 7), (10, 4), (8, 1), (11, 3), (11, 9), (10, 7), (0, 9), (11, 6), (2, 3), (1, 7), (2, 6), (10, 5), (0, 1)}




Evaluate the pruned model 10




Evaluating the model:   0%|                                                                                   …

0.24708761548509114




{'bert.encoder.layer.0.attention.self.query.weight': 0.25, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.25, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.25, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.25, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.1




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x7210e049a840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x7210e049a840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x7210e049a840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4838872052056058




CCA coefficients mean non-concern: 0.4703876405540807




Linear CKA concern: 0.1881968605164181




Linear CKA non-concern: 0.34309314829676973




Kernel CKA concern: 0.08135283081463253




Kernel CKA non-concern: 0.1209630152159468




original model's perplexity




2.445301055908203




pruned model's perplexity




15.179258346557617




Total heads to prune: 72




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(3, 4), (4, 9), (3, 7), (3, 1), (4, 6), (5, 1), (3, 10), (11, 2), (10, 0), (8, 9), (11, 5), (2, 2), (10, 3), (9, 11), (11, 8), (2, 5), (1, 3), (2, 11), (1, 9), (2, 8), (0, 11), (1, 0), (6, 8), (4, 2), (5, 0), (5, 6), (4, 8), (3, 6), (11, 4), (10, 2), (9, 4), (11, 1), (8, 8), (2, 4), (10, 11), (1, 2), (11, 10), (10, 8), (2, 7), (9, 10), (2, 1), (2, 10), (1, 8), (1, 5), (6, 1), (6, 4), (7, 3), (6, 7), (7, 6), (6, 10), (3, 2), (4, 1), (4, 7), (5, 2), (3, 8), (5, 5), (5, 11), (9, 3), (5, 8), (8, 7), (10, 4), (8, 1), (11, 3), (11, 9), (10, 7), (0, 9), (11, 6), (2, 3), (1, 7), (2, 6), (10, 5), (0, 1)}




Evaluate the pruned model 11




Evaluating the model:   0%|                                                                                   …

0.24708761548509114




{'bert.encoder.layer.0.attention.self.query.weight': 0.25, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.25, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.25, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.25, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.1




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x7210e049a840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x7210e049a840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4924290779434991




CCA coefficients mean non-concern: 0.4705429894207803




Linear CKA concern: 0.1518261882337815




Linear CKA non-concern: 0.34597888310747005




Kernel CKA concern: 0.051434070838544454




Kernel CKA non-concern: 0.12170900474999505




original model's perplexity




2.445301055908203




pruned model's perplexity




15.120271682739258




Total heads to prune: 72




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(3, 4), (4, 9), (3, 7), (3, 1), (4, 6), (5, 1), (3, 10), (11, 2), (10, 0), (8, 9), (11, 5), (2, 2), (10, 3), (9, 11), (11, 8), (2, 5), (1, 3), (2, 11), (1, 9), (2, 8), (0, 11), (1, 0), (6, 8), (4, 2), (5, 0), (5, 6), (4, 8), (3, 6), (11, 4), (10, 2), (9, 4), (11, 1), (8, 8), (2, 4), (10, 11), (1, 2), (11, 10), (10, 8), (2, 7), (9, 10), (2, 1), (2, 10), (1, 8), (1, 5), (6, 1), (6, 4), (7, 3), (6, 7), (7, 6), (6, 10), (3, 2), (4, 1), (4, 7), (5, 2), (3, 8), (5, 5), (5, 11), (9, 3), (5, 8), (8, 7), (10, 4), (8, 1), (11, 3), (11, 9), (10, 7), (0, 9), (11, 6), (2, 3), (1, 7), (2, 6), (10, 5), (0, 1)}




Evaluate the pruned model 12




Evaluating the model:   0%|                                                                                   …

0.24708761548509114




{'bert.encoder.layer.0.attention.self.query.weight': 0.25, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.25, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.25, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.25, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.1




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.47628949710897894




CCA coefficients mean non-concern: 0.4738201372798488




Linear CKA concern: 0.20292176246040466




Linear CKA non-concern: 0.35932176326278653




Kernel CKA concern: 0.0870940789106234




Kernel CKA non-concern: 0.121162276598823




original model's perplexity




2.445301055908203




pruned model's perplexity




15.15401840209961




Total heads to prune: 72




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(3, 4), (4, 9), (3, 7), (3, 1), (4, 6), (5, 1), (3, 10), (11, 2), (10, 0), (8, 9), (11, 5), (2, 2), (10, 3), (9, 11), (11, 8), (2, 5), (1, 3), (2, 11), (1, 9), (2, 8), (0, 11), (1, 0), (6, 8), (4, 2), (5, 0), (5, 6), (4, 8), (3, 6), (11, 4), (10, 2), (9, 4), (11, 1), (8, 8), (2, 4), (10, 11), (1, 2), (11, 10), (10, 8), (2, 7), (9, 10), (2, 1), (2, 10), (1, 8), (1, 5), (6, 1), (6, 4), (7, 3), (6, 7), (7, 6), (6, 10), (3, 2), (4, 1), (4, 7), (5, 2), (3, 8), (5, 5), (5, 11), (9, 3), (5, 8), (8, 7), (10, 4), (8, 1), (11, 3), (11, 9), (10, 7), (0, 9), (11, 6), (2, 3), (1, 7), (2, 6), (10, 5), (0, 1)}




Evaluate the pruned model 13




Evaluating the model:   0%|                                                                                   …

0.24708761548509114




{'bert.encoder.layer.0.attention.self.query.weight': 0.25, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.25, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.25, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.25, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.1




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4844177830411879




CCA coefficients mean non-concern: 0.4698490050783351




Linear CKA concern: 0.14016182482888317




Linear CKA non-concern: 0.3562070211758475




Kernel CKA concern: 0.06109990172439917




Kernel CKA non-concern: 0.11982608711912095




original model's perplexity




2.445301055908203




pruned model's perplexity




15.219446182250977




Total heads to prune: 72




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(3, 4), (4, 9), (3, 7), (3, 1), (4, 6), (5, 1), (3, 10), (11, 2), (10, 0), (8, 9), (11, 5), (2, 2), (10, 3), (9, 11), (11, 8), (2, 5), (1, 3), (2, 11), (1, 9), (2, 8), (0, 11), (1, 0), (6, 8), (4, 2), (5, 0), (5, 6), (4, 8), (3, 6), (11, 4), (10, 2), (9, 4), (11, 1), (8, 8), (2, 4), (10, 11), (1, 2), (11, 10), (10, 8), (2, 7), (9, 10), (2, 1), (2, 10), (1, 8), (1, 5), (6, 1), (6, 4), (7, 3), (6, 7), (7, 6), (6, 10), (3, 2), (4, 1), (4, 7), (5, 2), (3, 8), (5, 5), (5, 11), (9, 3), (5, 8), (8, 7), (10, 4), (8, 1), (11, 3), (11, 9), (10, 7), (0, 9), (11, 6), (2, 3), (1, 7), (2, 6), (10, 5), (0, 1)}




Evaluate the pruned model 14




Evaluating the model:   0%|                                                                                   …

0.24708761548509114




{'bert.encoder.layer.0.attention.self.query.weight': 0.25, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.25, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.25, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.25, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.1




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x7210e049a840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x7210e049a840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.47517670528518685




CCA coefficients mean non-concern: 0.47177165379038555




Linear CKA concern: 0.16276468110887787




Linear CKA non-concern: 0.34765872768742656




Kernel CKA concern: 0.07785392211525216




Kernel CKA non-concern: 0.12439248989898397




original model's perplexity




2.445301055908203




pruned model's perplexity




15.048724174499512




Total heads to prune: 72




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(3, 4), (4, 9), (3, 7), (3, 1), (4, 6), (5, 1), (3, 10), (11, 2), (10, 0), (8, 9), (11, 5), (2, 2), (10, 3), (9, 11), (11, 8), (2, 5), (1, 3), (2, 11), (1, 9), (2, 8), (0, 11), (1, 0), (6, 8), (4, 2), (5, 0), (5, 6), (4, 8), (3, 6), (11, 4), (10, 2), (9, 4), (11, 1), (8, 8), (2, 4), (10, 11), (1, 2), (11, 10), (10, 8), (2, 7), (9, 10), (2, 1), (2, 10), (1, 8), (1, 5), (6, 1), (6, 4), (7, 3), (6, 7), (7, 6), (6, 10), (3, 2), (4, 1), (4, 7), (5, 2), (3, 8), (5, 5), (5, 11), (9, 3), (5, 8), (8, 7), (10, 4), (8, 1), (11, 3), (11, 9), (10, 7), (0, 9), (11, 6), (2, 3), (1, 7), (2, 6), (10, 5), (0, 1)}




Evaluate the pruned model 15




Evaluating the model:   0%|                                                                                   …

0.24708761548509114




{'bert.encoder.layer.0.attention.self.query.weight': 0.25, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.25, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.25, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.25, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.1




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4749827260310002




CCA coefficients mean non-concern: 0.4721399777440637




Linear CKA concern: 0.2043347624312855




Linear CKA non-concern: 0.3493511874317582




Kernel CKA concern: 0.11078993892049079




Kernel CKA non-concern: 0.11000682997140776




original model's perplexity




2.445301055908203




pruned model's perplexity




15.192314147949219




In [9]:
df_list = [report_to_df(df) for df in result_list]
new_df = append_nth_row(df_list)
csv_name = f"{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}"
new_df.to_csv(f"results/{csv_name}.csv", index=False)
print(csv_name)
new_df

2024-10-23_20-24-30




Unnamed: 0,class,precision,recall,f1-score,support
0,0,0.0,0.0,0.0,797
1,1,0.0,0.0,0.0,775
2,2,1.0,0.0025,0.005,795
3,3,0.0,0.0,0.0,1110
4,4,0.1224,0.9849,0.2178,1260
5,5,0.0,0.0,0.0,882
6,6,0.0,0.0,0.0,940
7,7,0.0,0.0,0.0,473
8,8,0.0,0.0,0.0,746
9,9,0.0,0.0,0.0,689
