In [1]:
import os
import sys

# Make the project's `src` package (imported in the next cell) resolvable from
# this notebook's working directory.
# NOTE(review): the relative path presumably points at the repository root — confirm.
# The membership test keeps the cell idempotent: re-running it does not pile up
# duplicate entries on sys.path.
PROJECT_ROOT = "../../../../../"
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Switch off the parallelism flag read by the Hugging Face `tokenizers` library.
# Presumably set to avoid fork-related warnings once DataLoader workers are
# spawned (num_workers is non-zero later in this notebook) — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
import copy
import torch
from datetime import datetime
from src.utils.helper import Config, color_print
from src.utils.load import load_model, load_data, save_checkpoint
from src.models.evaluate import (
    evaluate_model,
    get_sparsity,
    get_similarity,
    get_perplexity,
)
from src.utils.sampling import SamplingDataset
from src.pruning.prune_head import head_importance_prunning
from src.pruning.prune import prune_concern_identification
from src.utils.helper import report_to_df, append_nth_row

In [3]:
# --- Experiment configuration ------------------------------------------------
# Everything a reader might want to tune lives in this cell.

# Dataset/config key handed to Config(...) in a later cell.
name = "OSDG"

# Prefer the first CUDA GPU; fall back to the CPU so the notebook still runs
# (slowly) on machines without a GPU instead of failing at model load time.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# NOTE(review): `checkpoint` is not referenced by the cells visible here.
checkpoint = None

# Data-loading options forwarded to load_data(...).
batch_size = 16
num_workers = 4

# Sample count passed to every SamplingDataset and to get_similarity(...).
num_samples = 16

# Used both as the ratio for head_importance_prunning(...) and as the
# sparsity_ratio for prune_concern_identification(...).
ratio = 0.6

# NOTE(review): `seed` is not referenced directly by the visible cells;
# config.init_seed() is called without arguments — confirm where it is consumed.
seed = 44

# Layer-name filters forwarded to prune_concern_identification(...).
include_layers = ["intermediate", "output"]
exclude_layers = ["attention"]

In [4]:
# Record the wall-clock start of the run and echo it in a human-readable form.
script_start_time = datetime.now()
start_stamp = script_start_time.strftime("%Y-%m-%d %H:%M:%S")
print(f"Script started at: {start_stamp}")

Script started at: 2024-10-23 20:24:32


In [5]:
# Build the experiment configuration for the selected dataset and device.
config = Config(name, device)

# Read the label count from the raw settings mapping held by the config.
# NOTE(review): the pruning loop below iterates over `config.num_labels`
# rather than this variable — confirm the two always agree.
settings = config.config
num_labels = settings["num_labels"]

# Load the model described by the configuration.
model = load_model(config)

Loading the model.




{
    'architectures': 'bert',
    'dataset_name': 'OSDG',
    'model_name': 'sadickam/sdg-classification-bert',
    'num_labels': 16,
    'tokenizer_name': 'sadickam/sdg-classification-bert'
}




The model sadickam/sdg-classification-bert is loaded.




In [6]:
# Load the train/validation/test dataloaders for the configured dataset.
# do_cache=True is forwarded to load_data; the captured output shows the
# splits being read from cached .pkl files.
loader_options = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    do_cache=True,
)
dataloaders = load_data(config, **loader_options)
train_dataloader, valid_dataloader, test_dataloader = dataloaders

Loading cached dataset OSDG.




train.pkl is loaded from cache.




valid.pkl is loaded from cache.




test.pkl is loaded from cache.




The dataset OSDG is loaded




{
    'config_name': '2024-01-01',
    'features': {'first_column': 'text', 'second_column': 'labels'},
    'path': 'albertmartinez/OSDG'
}




In [7]:
# Optional baseline: evaluate the unpruned model on the test dataloader.
# Currently disabled; uncomment both lines below to run it before pruning.
# print("Evaluate the original model")
# result = evaluate_model(model, config, test_dataloader)

In [8]:
def _draw_samples(target, flag):
    """Build a SamplingDataset over the training dataloader.

    Args:
        target: concern (label index) forwarded as the third positional argument.
        flag: boolean forwarded as the fifth positional argument.
            NOTE(review): presumably selects positive (True) vs. negative
            (False) samples for `target` — confirm against SamplingDataset.

    Returns:
        The constructed SamplingDataset.
    """
    # NOTE(review): the meaning of the positional `4` is not visible from this
    # notebook — confirm against SamplingDataset's signature.
    return SamplingDataset(
        train_dataloader,
        config,
        target,
        num_samples,
        flag,
        4,
        resample=False,
    )


# Keyword arguments shared by every prune_concern_identification call.
concern_pruning_options = dict(
    include_layers=include_layers,
    exclude_layers=exclude_layers,
    sparsity_ratio=ratio,
    keep_dim=False,
    method="structed",
)

# One evaluation result per concern, in concern order.
result_list = []

for concern in range(config.num_labels):
    # Re-seed at the top of every iteration, before any sampling happens.
    config.init_seed()

    positive_samples = _draw_samples(concern, True)
    negative_samples = _draw_samples(concern, False)
    # NOTE(review): 200 is outside the 16 labels reported for this dataset, so
    # with the flag False this presumably samples across all classes — confirm.
    all_samples = _draw_samples(200, False)

    # Prune a private copy so the original `model` stays available for the
    # similarity and perplexity comparisons below.
    module = copy.deepcopy(model)

    # Stage 1: head pruning driven by the concern-independent sample set.
    head_importance_prunning(module, config, all_samples, ratio)

    # Stage 2: concern identification using the positive/negative samples.
    prune_concern_identification(
        module,
        config,
        positive_samples,
        negative_samples,
        **concern_pruning_options,
    )

    print(f"Evaluate the pruned model {concern}")
    result = evaluate_model(module, config, test_dataloader, verbose=True)
    result_list.append(result)
    get_sparsity(module)

    # Compare the original and pruned models on the validation dataloader.
    get_similarity(model, module, valid_dataloader, concern, num_samples, config)

    # NOTE(review): the original model's perplexity is recomputed on every
    # iteration although only the copy is handed to the pruning calls; the
    # captured output shows the same value each time. Consider computing it once.
    print("original model's perplexity")
    get_perplexity(model, valid_dataloader, config)
    print("pruned model's perplexity")
    get_perplexity(module, valid_dataloader, config)

Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 0




Evaluating the model:   0%|                                                                                   …

0.3275009915123137




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4708969409937056




CCA coefficients mean non-concern: 0.46692769087673214




Linear CKA concern: 0.11381794553526824




Linear CKA non-concern: 0.33498317888053686




Kernel CKA concern: 0.034361724103125275




Kernel CKA non-concern: 0.09908450241149283




original model's perplexity




2.445301055908203




pruned model's perplexity




16.17376136779785




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 1




Evaluating the model:   0%|                                                                                   …

0.3275009915123137




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4728501653687455




CCA coefficients mean non-concern: 0.46742772069169186




Linear CKA concern: 0.12990899333283787




Linear CKA non-concern: 0.3296565764997042




Kernel CKA concern: 0.03672699528362581




Kernel CKA non-concern: 0.0980657970872398




original model's perplexity




2.445301055908203




pruned model's perplexity




16.16522216796875




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 2




Evaluating the model:   0%|                                                                                   …

0.3275009915123137




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.




Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x75d1d02a2840>
Traceback (most recent call last):
  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__
    self._shutdown_workers()
  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process




Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x75d1d02a2840>
Traceback (most recent call last):
  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__
    self._shutdown_workers()
  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process




Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x75d1d02a2840>
Traceback (most recent call last):
  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__
    self._shutdown_workers()
  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.47564032562045316




CCA coefficients mean non-concern: 0.4671660984262911




Linear CKA concern: 0.11611139897249072




Linear CKA non-concern: 0.3457856287893219




Kernel CKA concern: 0.0338838675990777




Kernel CKA non-concern: 0.10010440035403301




original model's perplexity




2.445301055908203




pruned model's perplexity




16.15528678894043




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 3




Evaluating the model:   0%|                                                                                   …

0.3275009915123137




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.471246396840823




CCA coefficients mean non-concern: 0.47061847983613864




Linear CKA concern: 0.1324485658524211




Linear CKA non-concern: 0.34838257839545717




Kernel CKA concern: 0.029772276853229265




Kernel CKA non-concern: 0.10258031925579915




original model's perplexity




2.445301055908203




pruned model's perplexity




16.143627166748047




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 4




Evaluating the model:   0%|                                                                                   …

0.3275009915123137




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.




Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x75d1d02a2840>
Traceback (most recent call last):
  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__
    self._shutdown_workers()
  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process




Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x75d1d02a2840>
Traceback (most recent call last):
  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__
    self._shutdown_workers()
  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4719130555106952




CCA coefficients mean non-concern: 0.46948280015278043




Linear CKA concern: 0.21909973390174808




Linear CKA non-concern: 0.3286949309725113




Kernel CKA concern: 0.09380065884172684




Kernel CKA non-concern: 0.08963778635681849




original model's perplexity




2.445301055908203




pruned model's perplexity




16.17348861694336




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 5




Evaluating the model:   0%|                                                                                   …

0.3275009915123137




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4628830097909024




CCA coefficients mean non-concern: 0.4660010326860924




Linear CKA concern: 0.16253120174585217




Linear CKA non-concern: 0.32635561120562084




Kernel CKA concern: 0.04566173658610513




Kernel CKA non-concern: 0.09496962333101087




original model's perplexity




2.445301055908203




pruned model's perplexity




16.193973541259766




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 6




Evaluating the model:   0%|                                                                                   …

0.3275009915123137




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.46241766712992305




CCA coefficients mean non-concern: 0.4692046789203598




Linear CKA concern: 0.08639443660720812




Linear CKA non-concern: 0.34509177460167184




Kernel CKA concern: 0.022345034879415616




Kernel CKA non-concern: 0.10241703633130972




original model's perplexity




2.445301055908203




pruned model's perplexity




16.15776252746582




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 7




Evaluating the model:   0%|                                                                                   …

0.3275009915123137




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x75d1d02a2840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x75d1d02a2840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x75d1d02a2840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4770974995905292




CCA coefficients mean non-concern: 0.4667690123209644




Linear CKA concern: 0.17285518053384072




Linear CKA non-concern: 0.33980147239880104




Kernel CKA concern: 0.04344841845389864




Kernel CKA non-concern: 0.09895555407740197




original model's perplexity




2.445301055908203




pruned model's perplexity




16.16157341003418




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 8




Evaluating the model:   0%|                                                                                   …

0.3275009915123137




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4691751925128502




CCA coefficients mean non-concern: 0.4671382540861614




Linear CKA concern: 0.14140138722447423




Linear CKA non-concern: 0.3410215182691787




Kernel CKA concern: 0.051648695184009635




Kernel CKA non-concern: 0.0952817861206111




original model's perplexity




2.445301055908203




pruned model's perplexity




16.162498474121094




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 9




Evaluating the model:   0%|                                                                                   …

0.3275009915123137




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x75d1d02a2840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x75d1d02a2840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x75d1d02a2840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4789744146475934




CCA coefficients mean non-concern: 0.4677983503750431




Linear CKA concern: 0.1731588911987306




Linear CKA non-concern: 0.3355446993610756




Kernel CKA concern: 0.04495209420116505




Kernel CKA non-concern: 0.09544523330548821




original model's perplexity




2.445301055908203




pruned model's perplexity




16.169710159301758




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 10




Evaluating the model:   0%|                                                                                   …

0.3275009915123137




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x75d1d02a2840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x75d1d02a2840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4793570005590977




CCA coefficients mean non-concern: 0.4663883101864727




Linear CKA concern: 0.16655314252841372




Linear CKA non-concern: 0.32785639262837163




Kernel CKA concern: 0.05021553481674129




Kernel CKA non-concern: 0.09667740727899508




original model's perplexity




2.445301055908203




pruned model's perplexity




16.199722290039062




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 11




Evaluating the model:   0%|                                                                                   …

0.3275009915123137




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4870563748258579




CCA coefficients mean non-concern: 0.46633924355540346




Linear CKA concern: 0.14375252644026995




Linear CKA non-concern: 0.3321737936106253




Kernel CKA concern: 0.03873962996747433




Kernel CKA non-concern: 0.09825723117959256




original model's perplexity




2.445301055908203




pruned model's perplexity




16.163986206054688




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 12




Evaluating the model:   0%|                                                                                   …

0.3275009915123137




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.47136181074202027




CCA coefficients mean non-concern: 0.4702429295920276




Linear CKA concern: 0.18013367240187292




Linear CKA non-concern: 0.34648334446036977




Kernel CKA concern: 0.055070476747827515




Kernel CKA non-concern: 0.09766907824801321




original model's perplexity




2.445301055908203




pruned model's perplexity




16.175779342651367




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 13




Evaluating the model:   0%|                                                                                   …

0.3275009915123137




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x75d1d02a2840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




Exception ignored in: 

<function _MultiProcessingDataLoaderIter.__del__ at 0x75d1d02a2840>




Traceback (most recent call last):


  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1477, in __del__


    

self._shutdown_workers()




  File "/home/jieungkim/.cache/pypoetry/virtualenvs/decomposetransformer-UESb9BbT-py3.12/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1460, in _shutdown_workers


    

if w.is_alive():




 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^




  File "/home/jieungkim/anaconda3/lib/python3.12/multiprocessing/process.py", line 160, in is_alive


    

assert self._parent_pid == os.getpid(), 'can only test a child process'




 

 

 

 

 

 

 

 

 

 

 

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^

^




AssertionError

: 

can only test a child process




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4803868588913015




CCA coefficients mean non-concern: 0.46525648470622005




Linear CKA concern: 0.12488643682877312




Linear CKA non-concern: 0.34379259850463656




Kernel CKA concern: 0.03908594350284231




Kernel CKA non-concern: 0.09993251450587753




original model's perplexity




2.445301055908203




pruned model's perplexity




16.18681526184082




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 14




Evaluating the model:   0%|                                                                                   …

0.3275009915123137




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4708450603538039




CCA coefficients mean non-concern: 0.466388672745908




Linear CKA concern: 0.14161889283366824




Linear CKA non-concern: 0.33282463913279253




Kernel CKA concern: 0.040516476254554394




Kernel CKA non-concern: 0.09716727111551914




original model's perplexity




2.445301055908203




pruned model's perplexity




16.182300567626953




Total heads to prune: 86




tensor([[0.4747, 0.4317, 0.4891, 0.5093, 0.4567, 0.4666, 0.5067, 0.4785, 0.5582,
         0.4236, 0.5825, 0.4175],
        [0.4234, 0.6757, 0.3818, 0.3243, 0.6439, 0.4396, 0.6521, 0.3851, 0.3816,
         0.3963, 0.4775, 0.5047],
        [0.6747, 0.4075, 0.2934, 0.2980, 0.3167, 0.3683, 0.3579, 0.3163, 0.2893,
         0.7107, 0.3948, 0.3906],
        [0.4759, 0.4354, 0.3277, 0.4589, 0.3294, 0.6723, 0.3433, 0.3908, 0.4428,
         0.6593, 0.3811, 0.6148],
        [0.7708, 0.3074, 0.3783, 0.5660, 0.4790, 0.4956, 0.3807, 0.3298, 0.2292,
         0.4202, 0.5358, 0.5003],
        [0.4186, 0.3720, 0.3554, 0.5116, 0.4853, 0.2781, 0.3575, 0.5138, 0.3230,
         0.7219, 0.6544, 0.2794],
        [0.4631, 0.4400, 0.4843, 0.4607, 0.4504, 0.6346, 0.6040, 0.4176, 0.4031,
         0.6338, 0.3654, 0.5141],
        [0.6227, 0.5131, 0.4976, 0.3439, 0.6072, 0.5023, 0.3705, 0.5096, 0.4806,
         0.6561, 0.5942, 0.4583],
        [0.5007, 0.4401, 0.5599, 0.5067, 0.4545, 0.5377, 0.5388, 0.3886, 0.3313,




{(4, 9), (5, 1), (8, 9), (11, 5), (2, 2), (0, 5), (2, 11), (4, 2), (3, 6), (9, 10), (0, 7), (2, 4), (1, 8), (6, 4), (7, 3), (3, 8), (5, 5), (8, 4), (9, 3), (0, 0), (11, 9), (0, 9), (1, 10), (3, 1), (3, 10), (11, 2), (1, 3), (3, 3), (5, 0), (11, 4), (10, 8), (1, 5), (6, 1), (5, 2), (4, 4), (5, 11), (1, 7), (2, 6), (7, 11), (3, 7), (4, 6), (10, 3), (1, 0), (1, 9), (0, 11), (2, 8), (6, 8), (3, 0), (5, 6), (4, 8), (8, 8), (10, 5), (1, 2), (0, 4), (2, 1), (2, 10), (6, 10), (3, 2), (4, 1), (8, 1), (10, 7), (11, 6), (2, 3), (6, 3), (3, 4), (10, 0), (9, 11), (11, 8), (2, 5), (10, 2), (9, 4), (11, 1), (0, 1), (10, 11), (11, 10), (2, 7), (6, 7), (7, 6), (4, 7), (5, 8), (8, 7), (10, 4), (9, 6), (11, 3), (6, 0), (7, 8)}




Evaluate the pruned model 15




Evaluating the model:   0%|                                                                                   …

0.3275009915123137




{'bert.encoder.layer.0.attention.self.query.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.query.bias': 0.0, 'bert.encoder.layer.0.attention.self.key.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.key.bias': 0.0, 'bert.encoder.layer.0.attention.self.value.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.self.value.bias': 0.0, 'bert.encoder.layer.0.attention.output.dense.weight': 0.5833333333333334, 'bert.encoder.layer.0.attention.output.dense.bias': 0.0, 'bert.encoder.layer.0.intermediate.dense.weight': 0.0, 'bert.encoder.layer.0.intermediate.dense.bias': 0.0, 'bert.encoder.layer.0.output.dense.weight': 0.0, 'bert.encoder.layer.0.output.dense.bias': 0.0, 'bert.encoder.layer.1.attention.self.query.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.query.bias': 0.0, 'bert.encoder.layer.1.attention.self.key.weight': 0.6666666666666666, 'bert.encoder.layer.1.attention.self.key.bias': 0.0, 'bert.encoder.layer.1.attention.self.




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




adding eps to diagonal and taking inverse




taking square root




dot products...




trying to take final svd




computed everything!




CCA coefficients mean concern: 0.4668828079070941




CCA coefficients mean non-concern: 0.4686059211839192




Linear CKA concern: 0.18688382609881266




Linear CKA non-concern: 0.3376316805807016




Kernel CKA concern: 0.08805758534608732




Kernel CKA non-concern: 0.08675299425236516




original model's perplexity




2.445301055908203




pruned model's perplexity




16.19196128845215




In [9]:
# Gather the evaluation reports collected in `result_list` into a single table
# and persist it as a timestamped CSV.
# NOTE(review): report_to_df / append_nth_row are project helpers imported from
# src.utils.helper; presumably the first converts one report into a DataFrame
# and the second combines a row from each -- confirm against their definitions.
df_list = [report_to_df(report) for report in result_list]
new_df = append_nth_row(df_list)

# Timestamp (second resolution) used as the file name so separate runs get
# distinct output files.
csv_name = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# DataFrame.to_csv does not create missing parent directories; make sure the
# output folder exists so a fresh checkout does not fail at the final step
# after the long pruning/evaluation run above.
results_dir = "results"
os.makedirs(results_dir, exist_ok=True)
new_df.to_csv(os.path.join(results_dir, f"{csv_name}.csv"), index=False)

print(csv_name)
new_df

2024-10-23_21-26-28




Unnamed: 0,class,precision,recall,f1-score,support
0,0,0.0,0.0,0.0,797
1,1,0.0,0.0,0.0,775
2,2,0.0,0.0,0.0,795
3,3,0.0,0.0,0.0,1110
4,4,0.1391,0.9421,0.2425,1260
5,5,0.0,0.0,0.0,882
6,6,0.0,0.0,0.0,940
7,7,0.0,0.0,0.0,473
8,8,0.0,0.0,0.0,746
9,9,0.0,0.0,0.0,689
