In [1]:
import os
import sys

# Make the directory five levels above the working directory importable
# (presumably the repository root, so the `src.*` imports in the next cell
# resolve -- TODO confirm against the repo layout).
# The path is resolved to an absolute one so later `os.chdir` calls cannot
# change what it points to, and it is only appended when missing so that
# re-running this cell does not pile up duplicate `sys.path` entries.
PROJECT_ROOT = os.path.abspath("../../../../../")
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Must be set before any tokenizer is created. NOTE(review): presumably this
# silences the Hugging Face tokenizers fork/parallelism warning triggered by
# multi-worker DataLoaders -- confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import copy
import torch
from datetime import datetime
from src.utils.helper import Config, color_print
from src.utils.load import load_model, load_data, save_checkpoint
from src.models.evaluate import (
    evaluate_model,
    get_sparsity,
    get_similarity,
    get_perplexity,
)
from src.utils.sampling import SamplingDataset
from src.pruning.prune_head import head_importance_prunning
from src.pruning.prune import prune_concern_identification
from src.utils.helper import report_to_df, append_nth_row

In [3]:
# --- Experiment configuration -------------------------------------------
# Identifier handed to `Config(name, device)` in the next code cell.
name = "OSDG"
# Prefer the first CUDA device, but fall back to CPU so the notebook can
# still be executed (slowly) on a machine without a GPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Not referenced by any cell visible in this part of the notebook.
checkpoint = None
# Forwarded to `load_data(...)`.
batch_size = 16
num_workers = 4
# Forwarded to `SamplingDataset(...)` and `get_similarity(...)`.
num_samples = 16
# Passed to `head_importance_prunning(...)` and, as `sparsity_ratio`, to
# `prune_concern_identification(...)`.
ratio = 0.6
# NOTE(review): not referenced by the visible cells; seeding is done through
# `config.init_seed()` -- confirm whether this value is actually consumed.
seed = 44
# Layer-name filters forwarded to `prune_concern_identification(...)`.
include_layers = ["intermediate", "output"]
exclude_layers = [
    "attention",
]

In [4]:
# Capture the wall-clock start of the run and echo it in a human-readable form.
script_start_time = datetime.now()
started_at = script_start_time.strftime('%Y-%m-%d %H:%M:%S')
print(f"Script started at: {started_at}")

Script started at: 2024-10-24 04:34:48


In [5]:
# Build the experiment configuration for `name` on `device`, then load the
# model that configuration describes.
config = Config(name, device)
# Label count read from the underlying config mapping (the cell output below
# shows 'num_labels': 16 for OSDG).
# NOTE(review): the pruning loop iterates over `config.num_labels` rather than
# this variable -- confirm both always agree.
num_labels = config.config["num_labels"]
model = load_model(config)

Loading the model.




{

'architectures'

: 

'bert'

,
 

'dataset_name'

: 

'OSDG'

,
 

'model_name'

: 

'sadickam/sdg-classification-bert'

,
 

'num_labels'

: 

16

,
 

'tokenizer_name'

: 

'sadickam/sdg-classification-bert'

}




The model sadickam/sdg-classification-bert is loaded.




In [6]:
# Load the dataset splits for this configuration; `do_cache=True` is passed
# through to `load_data` (the cell output shows the splits being read from
# cached .pkl files).
dataloaders = load_data(
    config, batch_size=batch_size, num_workers=num_workers, do_cache=True
)
train_dataloader, valid_dataloader, test_dataloader = dataloaders

Loading cached dataset OSDG.




train.pkl is loaded from cache.




valid.pkl is loaded from cache.




test.pkl is loaded from cache.




The dataset OSDG is loaded




{

'config_name'

: 

'2024-01-01'

,
 

'features'

: 

{'first_column': 'text', 'second_column': 'labels'}

,
 

'path'

: 

'albertmartinez/OSDG'

}




In [7]:
# Baseline evaluation of the unpruned model is intentionally disabled here;
# uncomment the two lines below to run it.
# print("Evaluate the original model")
# result = evaluate_model(model, config, test_dataloader)

In [8]:
def _draw_samples(concern_id, is_positive):
    """Build a `SamplingDataset` over the training loader for one concern.

    `concern_id` and `is_positive` are forwarded positionally together with
    the notebook-level `num_samples`; the literal `4` and `resample=False`
    are passed exactly as in every call of this experiment.
    NOTE(review): the meaning of the positional `4` is not visible here
    (it matches `num_workers`) -- confirm against `SamplingDataset`.
    """
    return SamplingDataset(
        train_dataloader,
        config,
        concern_id,
        num_samples,
        is_positive,
        4,
        resample=False,
    )


# One evaluation result per concern, in concern order.
result_list = []

for concern in range(config.num_labels):
    # Re-seed at the start of every iteration; the three datasets are then
    # built in a fixed order (positive, negative, all).
    config.init_seed()
    positive_samples = _draw_samples(concern, True)
    negative_samples = _draw_samples(concern, False)
    # NOTE(review): 200 is not a concern index produced by this loop;
    # presumably it acts as a "no such label" sentinel so the negative draw
    # covers every class -- confirm.
    all_samples = _draw_samples(200, False)

    # Prune a private copy so the reference `model` stays untouched for the
    # similarity / perplexity comparisons below.
    module = copy.deepcopy(model)

    # Step 1: head pruning driven by `all_samples`.
    # NOTE(review): the logged head-importance tensor and pruned-head set are
    # identical for every concern, so this step looks loop-invariant.
    head_importance_prunning(module, config, all_samples, ratio)

    # Step 2: concern-specific pruning of the included layers.
    pruning_options = dict(
        include_layers=include_layers,
        exclude_layers=exclude_layers,
        sparsity_ratio=ratio,
        keep_dim=True,
        method="structed",
    )
    prune_concern_identification(
        module, config, positive_samples, negative_samples, **pruning_options
    )

    print(f"Evaluate the pruned model {concern}")
    result = evaluate_model(module, config, test_dataloader, verbose=True)
    result_list.append(result)
    get_sparsity(module)

    # Compare the pruned module against the original on the validation split.
    get_similarity(model, module, valid_dataloader, concern, num_samples, config)
    # NOTE(review): the original model's perplexity is logged with the same
    # value on every iteration; it could be computed once outside the loop.
    for label, candidate in (
        ("original model's perplexity", model),
        ("pruned model's perplexity", module),
    ):
        print(label)
        get_perplexity(candidate, valid_dataloader, config)

Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 0




Evaluating the model:   0%|                                                                                   …

0.5942322969023679




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.e




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.47089693561630314




CCA coefficients mean non-concern: 0.4669276884378736




Linear CKA concern: 0.11381794529166957




Linear CKA non-concern: 0.3349831771836485




Kernel CKA concern: 0.03436172509034394




Kernel CKA non-concern: 0.09908450075424742




original model's perplexity




2.445301055908203




pruned model's perplexity




16.17376136779785




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 1




Evaluating the model:   0%|                                                                                   …

0.5942322969023679




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.e




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4728501684487353




CCA coefficients mean non-concern: 0.4674277210920287




Linear CKA concern: 0.12990899425901575




Linear CKA non-concern: 0.32965657683847605




Kernel CKA concern: 0.03672699640771266




Kernel CKA non-concern: 0.09806579831675892




original model's perplexity




2.445301055908203




pruned model's perplexity




16.165218353271484




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 2




Evaluating the model:   0%|                                                                                   …

0.5942322969023679




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.e




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x70a2d318e8e0>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x70a2d318e8e0>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x70a2d318e8e0>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4756403257848613




CCA coefficients mean non-concern: 0.46716609461047576




Linear CKA concern: 0.11611139854522849




Linear CKA non-concern: 0.34578562812003993




Kernel CKA concern: 0.033883865400751834




Kernel CKA non-concern: 0.10010439904628568




original model's perplexity




2.445301055908203




pruned model's perplexity




16.15528678894043




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 3




Evaluating the model:   0%|                                                                                   …

0.5942322969023679




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.e




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4712464035198533




CCA coefficients mean non-concern: 0.47061847805204793




Linear CKA concern: 0.13244856615134515




Linear CKA non-concern: 0.3483825781860345




Kernel CKA concern: 0.029772276478012772




Kernel CKA non-concern: 0.1025803194452803




original model's perplexity




2.445301055908203




pruned model's perplexity




16.143627166748047




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 4




Evaluating the model:   0%|                                                                                   …

0.5942322969023679




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.e




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4719130519103713




CCA coefficients mean non-concern: 0.4694828021663245




Linear CKA concern: 0.21909973442126995




Linear CKA non-concern: 0.3286949309357355




Kernel CKA concern: 0.09380065500025032




Kernel CKA non-concern: 0.08963778842489152




original model's perplexity




2.445301055908203




pruned model's perplexity




16.17348861694336




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 5




Evaluating the model:   0%|                                                                                   …

0.5942322969023679




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.e




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x70a2d318e8e0>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x70a2d318e8e0>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.46288301214069144




CCA coefficients mean non-concern: 0.46600103737452253




Linear CKA concern: 0.16253120110162605




Linear CKA non-concern: 0.3263556107815702




Kernel CKA concern: 0.04566173571235617




Kernel CKA non-concern: 0.09496962171218332




original model's perplexity




2.445301055908203




pruned model's perplexity




16.193973541259766




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 6




Evaluating the model:   0%|                                                                                   …

0.5942322969023679




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.e




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.46241766980147764




CCA coefficients mean non-concern: 0.46920467667098925




Linear CKA concern: 0.08639443683744084




Linear CKA non-concern: 0.3450917726701919




Kernel CKA concern: 0.022345034675797536




Kernel CKA non-concern: 0.1024170355410524




original model's perplexity




2.445301055908203




pruned model's perplexity




16.15776252746582




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 7




Evaluating the model:   0%|                                                                                   …

0.5942322969023679




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.e




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4770974987438995




CCA coefficients mean non-concern: 0.4667690184699892




Linear CKA concern: 0.17285518104054498




Linear CKA non-concern: 0.3398014741704248




Kernel CKA concern: 0.04344841968793674




Kernel CKA non-concern: 0.09895556072722611




original model's perplexity




2.445301055908203




pruned model's perplexity




16.16157341003418




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 8




Evaluating the model:   0%|                                                                                   …

0.5942322969023679




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.e




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x70a2d318e8e0>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4691751890920333




CCA coefficients mean non-concern: 0.46713825538640036




Linear CKA concern: 0.1414013870002991




Linear CKA non-concern: 0.3410215184593425




Kernel CKA concern: 0.05164869323834582




Kernel CKA non-concern: 0.09528177756622835




original model's perplexity




2.445301055908203




pruned model's perplexity




16.162498474121094




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 9




Evaluating the model:   0%|                                                                                   …

0.5942322969023679




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.e




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4789744174686207




CCA coefficients mean non-concern: 0.4677983454791633




Linear CKA concern: 0.17315889095010203




Linear CKA non-concern: 0.33554469950596194




Kernel CKA concern: 0.044952094467267695




Kernel CKA non-concern: 0.09544523327193215




original model's perplexity




2.445301055908203




pruned model's perplexity




16.169710159301758




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 10




Evaluating the model:   0%|                                                                                   …

0.5942322969023679




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.e




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x70a2d318e8e0>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x70a2d318e8e0>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x70a2d318e8e0>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4793570016455176




CCA coefficients mean non-concern: 0.46638830747185045




Linear CKA concern: 0.16655314251879277




Linear CKA non-concern: 0.3278563925761156




Kernel CKA concern: 0.05021553643191946




Kernel CKA non-concern: 0.09667740753909788




original model's perplexity




2.445301055908203




pruned model's perplexity




16.199722290039062




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 11




Evaluating the model:   0%|                                                                                   …

0.5942322969023679




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.e




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.48705637070080127




CCA coefficients mean non-concern: 0.46633925319440095




Linear CKA concern: 0.1437525252743308




Linear CKA non-concern: 0.3321737936755964




Kernel CKA concern: 0.03873962877246574




Kernel CKA non-concern: 0.0982572280547239




original model's perplexity




2.445301055908203




pruned model's perplexity




16.163986206054688




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 12




Evaluating the model:   0%|                                                                                   …

0.5942322969023679




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.e




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.47136180479342826




CCA coefficients mean non-concern: 0.47024293193365646




Linear CKA concern: 0.18013367220509538




Linear CKA non-concern: 0.34648334432384353




Kernel CKA concern: 0.055070475281511495




Kernel CKA non-concern: 0.09766907828819252




original model's perplexity




2.445301055908203




pruned model's perplexity




16.1757755279541




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 13




Evaluating the model:   0%|                                                                                   …

0.5942322969023679




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.e




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x70a2d318e8e0>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x70a2d318e8e0>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.48038685745087867




CCA coefficients mean non-concern: 0.46525648963737515




Linear CKA concern: 0.1248864375049657




Linear CKA non-concern: 0.3437925998198338




Kernel CKA concern: 0.039085942032688135




Kernel CKA non-concern: 0.09993251430186988




original model's perplexity




2.445301055908203




pruned model's perplexity




16.18681526184082




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 14




Evaluating the model:   0%|                                                                                   …

0.5942322969023679




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.e




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.470845059807765




CCA coefficients mean non-concern: 0.4663886659085059




Linear CKA concern: 0.14161889194804883




Linear CKA non-concern: 0.33282463903273274




Kernel CKA concern: 0.040516474231047886




Kernel CKA non-concern: 0.09716727345125949




original model's perplexity




2.445301055908203




pruned model's perplexity




16.182300567626953




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 15




Evaluating the model:   0%|                                                                                   …

0.5942322969023679




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.5999348958333334, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.e




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4668828025423072




CCA coefficients mean non-concern: 0.4686059267031002




Linear CKA concern: 0.18688382653096902




Linear CKA non-concern: 0.33763168143648425




Kernel CKA concern: 0.08805758797635842




Kernel CKA non-concern: 0.08675299094504672




original model's perplexity




2.445301055908203




pruned model's perplexity




16.19196128845215




In [9]:
# Collect the per-iteration evaluation reports gathered in `result_list` and
# persist them as a single timestamped CSV under results/.
# NOTE(review): `report_to_df` and `append_nth_row` are project helpers from
# src.utils.helper; presumably the first turns one report into a DataFrame and
# the second merges the frames into one table — confirm against the helpers.
df_list = [report_to_df(report) for report in result_list]
new_df = append_nth_row(df_list)

# Timestamp (second resolution) used as the file stem so separate runs
# write to distinct files.
csv_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# DataFrame.to_csv does not create parent directories and raises OSError when
# the directory is missing (fresh clone / clean workspace), so make sure it
# exists before writing. `os` is imported in the first cell of the notebook.
os.makedirs("results", exist_ok=True)
new_df.to_csv(f"results/{csv_name}.csv", index=False)

# Echo the stem so the saved file can be matched to this run, then let the
# notebook render the table as the cell's output.
print(csv_name)
new_df

2024-10-24_05-45-00




Unnamed: 0,class,precision,recall,f1-score,support
0,0,0.0,0.0,0.0,797
1,1,0.0,0.0,0.0,775
2,2,0.0,0.0,0.0,795
3,3,0.0,0.0,0.0,1110
4,4,0.1391,0.9421,0.2425,1260
5,5,0.0,0.0,0.0,882
6,6,0.0,0.0,0.0,940
7,7,0.0,0.0,0.0,473
8,8,0.0,0.0,0.0,746
9,9,0.0,0.0,0.0,689
