In [2]:
import os
import sys

sys.path.append("../../../")
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
import copy
import torch
from datetime import datetime
from src.utils.helper import Config, color_print
from src.utils.load import load_model, load_data, save_checkpoint, load_checkpoint
from src.models.evaluate import evaluate_model, get_sparsity, get_similarity
from src.utils.sampling import SamplingDataset
from src.pruning.prune_head import head_importance_prunning

In [4]:
name = "bert-4-128-yahoo"
device = torch.device("cuda:0")
checkpoint = None
batch_size = 16
num_workers = 4
num_samples = 128
ci_ratio = 0.3
seed = 44

In [5]:
config = Config(name, device)

In [6]:
model = load_model(config=config)

Loading the model.
{'architectures': 'bert',
 'dataset_name': 'YahooAnswersTopics',
 'model_name': 'models/bert-4-128-yahoo',
 'num_labels': 10,
 'tokenizer_name': 'fabriceyhc/bert-base-uncased-yahoo_answers_topics'}
The model models/bert-4-128-yahoo is loaded.


In [7]:
train_dataloader, valid_dataloader, test_dataloader = load_data(
    config,
    batch_size=batch_size,
    num_workers=num_workers,
    do_cache=True,
)

Loading cached dataset YahooAnswersTopics.
train.pkl is loaded from cache.
valid.pkl is loaded from cache.
test.pkl is loaded from cache.
The dataset YahooAnswersTopics is loaded
{'config_name': 'yahoo_answers_topics',
 'features': {'first_column': 'question_title', 'second_column': 'topic'},
 'path': 'yahoo_answers_topics'}


In [8]:
positive_samples = SamplingDataset(
    train_dataloader,
    config,
    0,
    num_samples,
    True,
    4,
    resample=False,
)

In [9]:
negative_samples = SamplingDataset(
    train_dataloader,
    config,
    0,
    num_samples,
    False,
    4,
    resample=False,
)

In [10]:
import torch
import torch.nn as nn
from scipy.stats import norm
from typing import *
from torch import Tensor
from torch.nn import Module
import torch.nn.functional as F
from functools import partial
from src.utils.sampling import SamplingDataset
from src.pruning.propagate import propagate
from src.utils.helper import Config
import gc


class Methods:
    def __init__(self, ratio: float, axis: int = 0) -> None:
        self.ratio = ratio
        self.axis = axis
        self.coefficient = None

    def ci(self, layer, inputs, outputs):
        current_weight = layer.weight.data
        importance_score = torch.abs(current_weight) * torch.abs(self.coefficient)

        W_mask = torch.zeros_like(importance_score) == 1
        sort_res = torch.sort(importance_score, dim=self.axis, stable=True)

        num_prune = int(importance_score.shape[self.axis] * self.ratio)

        if self.axis == 0:
            indices_to_prune = sort_res[1][:num_prune, :]
        else:
            indices_to_prune = sort_res[1][:, :num_prune]
        W_mask.scatter_(self.axis, indices_to_prune, True)
        current_weight[W_mask] = 0


def find_layers(
    model: Module,
    layer_types: Optional[List[Type[Module]]] = None,
    include_layers: Optional[List[str]] = None,
    exclude_layers: Optional[List[str]] = None,
    prefix: str = "",
) -> Dict[str, Module]:
    if layer_types is None:
        layer_types = [nn.Linear]
    if include_layers is None:
        include_layers = []
    if exclude_layers is None:
        exclude_layers = []
    layers_dict: Dict[str, Module] = {}

    def recursive_find(module: Module, prefix: str) -> None:
        for name, layer in module.named_children():
            layer_name = f"{prefix}.{name}" if prefix else name
            if any(exclude in layer_name for exclude in exclude_layers):
                continue
            if include_layers and not any(
                include in layer_name for include in include_layers
            ):
                if not any(isinstance(layer, t) for t in layer_types):
                    recursive_find(layer, layer_name)
                continue
            if isinstance(layer, tuple(layer_types)):
                layers_dict[layer_name] = layer
            else:
                recursive_find(layer, layer_name)

    recursive_find(model, prefix)

    return layers_dict


def get_hook(method):
    def hook(module, input, output):
        method(module, input, output)

    return hook


def get_embeddings(model, dataloader):
    embeddings_list = {"embeddings": [], "labels": [], "attention_mask": []}

    for batch in dataloader:
        input_ids = batch["input_ids"]
        labels = batch["labels"]
        attention_mask = batch["attention_mask"]
        with torch.no_grad():
            input_embeddings = model.get_input_embeddings()(input_ids)
        embeddings_list["embeddings"].append(input_embeddings)
        embeddings_list["labels"].append(labels)
        embeddings_list["attention_mask"].append(attention_mask)

        from src.utils.data_class import CustomEmbeddingDataset
    return CustomEmbeddingDataset(embeddings_list)


def prune_concern_identification(
    model: Module,
    config: Config,
    dominant_concern: SamplingDataset,
    non_dominant_concern: SamplingDataset,
    sparsity_ratio: float = 0.6,
    include_layers: Optional[List[str]] = None,
    exclude_layers: Optional[List[str]] = None,
) -> None:
    layers = find_layers(
        model, include_layers=include_layers, exclude_layers=exclude_layers
    )
    handle_list = []

    method1 = Methods(sparsity_ratio, axis=0)
    method2 = Methods(sparsity_ratio, axis=1)

    for name, layer in layers.items():
        if "intermediate" in name:
            handle = layer.register_forward_hook(method1.ci)
        else:
            handle = layer.register_forward_hook(method2.ci)
        handle_list.append(handle)

    pos_embeddings = get_embeddings(model, dominant_concern)
    neg_embeddings = get_embeddings(model, non_dominant_concern)
    dominant_batches = list(pos_embeddings)
    non_dominant_batches = list(neg_embeddings)
    combined_batches = {}
    keys = dominant_batches[0].keys()
    for key in keys:
        combined_batches[key] = torch.cat(
            [batch[key] for batch in dominant_batches + non_dominant_batches]
        )

    combined_dataloader = [combined_batches]
    method1.coefficient = calc_coefficient(combined_dataloader, dim=0).to(config.device)
    print(method1.coefficient)
    method2.coefficient = calc_coefficient(combined_dataloader, dim=1).to(config.device)
    print(method2.coefficient)
    propagate(model, combined_dataloader, config)

    for handle in handle_list:
        handle.remove()


def calc_coefficient(combined_dataloader, dim=0):
    X = combined_dataloader[0]["embeddings"]

    batch_size = X.shape[0] // 2
    concern_inputs, non_concern_inputs = (
        X[:batch_size],
        X[batch_size:],
    )

    calc_norm = lambda tensors, dim: torch.norm(
        tensors.reshape((-1, tensors.shape[-1])), dim=dim
    )

    if dim == 0:
        new_shape = (1, -1)
    else:
        new_shape = (-1, 1)
    concern_norm = calc_norm(concern_inputs, dim=0).reshape(new_shape)
    non_concern_norm = calc_norm(non_concern_inputs, dim=0).reshape(new_shape)

    cosine_similarity = F.cosine_similarity(
        concern_inputs.reshape((-1, concern_inputs.shape[-1])),
        non_concern_inputs.reshape((-1, non_concern_inputs.shape[-1])),
        dim=0,
    ).reshape(new_shape)

    sine_similarity = torch.sign(cosine_similarity) * torch.sqrt(
        1 - cosine_similarity**2
    )
    euclidean_distance = torch.sqrt(concern_norm**2 + non_concern_norm**2)
    coefficient = (
        concern_norm
        + sine_similarity
        * torch.abs(concern_norm + non_concern_norm)
        / euclidean_distance
    )
    return coefficient

In [11]:
dominant_embeddings = get_embeddings(model, positive_samples)
non_dominant_embeddings = get_embeddings(model, negative_samples)

In [12]:
import numpy as np


def pca(X, n_components=2):
    """
    주성분 분석(PCA)을 수행하는 함수.

    Parameters:
    - X: 입력 데이터 행렬 (샘플 수 x 피처 수)
    - n_components: 추출할 주성분의 개수

    Returns:
    - X_pca: 주성분으로 변환된 데이터
    - explained_variance: 각 주성분의 설명된 분산 비율
    """
    # Step 1: 데이터 중심화
    X_mean = np.mean(X, axis=0)
    X_centered = X - X_mean

    # Step 2: 공분산 행렬 계산
    covariance_matrix = np.cov(X_centered, rowvar=False)

    # Step 3: 고유값 및 고유벡터 계산 (고유값 분해)
    eigenvalues, eigenvectors = np.linalg.eigh(covariance_matrix)

    # 고유값을 내림차순으로 정렬하고, 그에 맞게 고유벡터도 정렬
    sorted_indices = np.argsort(eigenvalues)[::-1]
    sorted_eigenvalues = eigenvalues[sorted_indices]
    sorted_eigenvectors = eigenvectors[:, sorted_indices]

    # Step 4: 주성분 선택 (상위 n_components 개수만큼)
    selected_eigenvectors = sorted_eigenvectors[:, :n_components]

    # Step 5: 데이터 변환 (주성분 축으로)
    X_pca = np.dot(X_centered, selected_eigenvectors)

    # 설명된 분산 비율 계산
    explained_variance = sorted_eigenvalues / np.sum(sorted_eigenvalues)

    return X_pca, explained_variance[:n_components]

In [13]:
for batch in dominant_embeddings:
    print(batch)
    break

{'embeddings': tensor([[[ 0.0276,  0.0207,  0.0063,  ...,  0.0008,  0.0820, -0.0162],
         [ 0.0156,  0.0078,  0.0037,  ...,  0.0462, -0.0004,  0.0003],
         [ 0.0010,  0.0036,  0.0041,  ..., -0.0162, -0.0136, -0.0167],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0276,  0.0207,  0.0063,  ...,  0.0008,  0.0820, -0.0162],
         [ 0.0292,  0.0436,  0.0439,  ...,  0.0255,  0.0244,  0.0213],
         [ 0.0182, -0.0137,  0.0317,  ...,  0.0715,  0.0414, -0.0087],
         ...,
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.0276,  0.0207,  0.0063,  ...,  0.0008,  0.0820, -0.0162],
         [ 0.0300, -0.0336, -0

In [15]:
X_pca_result, explained_variance = pca(dominant_embeddings, n_components=2)
print(X_pca_result)
print(explained_variance)

TypeError: list indices must be integers or slices, not str

In [20]:
result_list = []

for concern in range(config.num_labels):
    config.init_seed()
    positive_samples = SamplingDataset(
        train_dataloader,
        config,
        concern,
        num_samples,
        True,
        4,
        resample=False,
    )
    negative_samples = SamplingDataset(
        train_dataloader,
        config,
        concern,
        num_samples,
        False,
        4,
        resample=False,
    )
    all_samples = SamplingDataset(
        train_dataloader,
        config,
        200,
        num_samples,
        False,
        4,
        resample=False,
    )

    module = copy.deepcopy(model)

    prune_concern_identification(
        module,
        config,
        positive_samples,
        negative_samples,
        include_layers=["intermediate", "output"],
        exclude_layers=["attention"],
        sparsity_ratio=0.5,
    )

    print(f"Evaluate the pruned model {concern}")
    result = evaluate_model(module, config, test_dataloader)
    result_list.append(result)
    break

tensor([[ 2.7498,  2.6586,  2.5813,  2.4375,  2.4847,  2.5719,  2.9446,  2.6361,
          3.3812,  2.5169,  2.6274,  2.6549,  2.9706,  2.6312,  2.6641,  2.6347,
          2.7622,  2.7244,  2.8207,  2.5725,  2.5118,  2.5975,  2.5501,  2.8830,
          3.1511,  2.7206,  2.5714,  2.4353,  2.5364,  2.5585,  2.8917,  2.4120,
          2.6312,  2.6279,  2.5553,  2.5716,  2.6944,  2.6097,  2.6364,  2.4436,
          2.6042,  2.9986,  2.6202,  2.5144,  2.7494,  2.6859,  2.9495,  2.6391,
          2.6282,  2.6818,  2.7705,  2.6175,  2.5933,  2.8102,  2.4410,  2.7559,
          2.5457,  2.7021,  2.7265,  2.6974,  2.9685, -0.2956,  2.7330,  2.4625,
          2.9124,  2.8119,  2.8760,  2.6163,  2.4981, -0.3212,  2.8910,  2.5707,
          2.6873,  2.5257,  2.6587,  2.5876,  2.5620,  2.5683,  2.5285,  2.6468,
          2.7260,  2.5209, -0.3453,  2.4982,  2.9877,  2.4951,  2.6440,  2.8882,
          2.6748,  3.1047,  2.5927,  2.8648,  2.5715,  2.4317,  3.5274,  2.6261,
          2.6560,  2.6035,  

Evaluating the model:   0%|          | 0/1875 [00:00<?, ?it/s]

Loss: 1.2224
Precision: 0.6499, Recall: 0.6129, F1-Score: 0.6186
              precision    recall  f1-score   support

           0     0.5405    0.4836    0.5105      2992
           1     0.6909    0.4766    0.5641      2992
           2     0.7030    0.6036    0.6495      3012
           3     0.3329    0.6468    0.4396      2998
           4     0.7237    0.7753    0.7486      2973
           5     0.8502    0.7564    0.8006      3054
           6     0.6870    0.4006    0.5061      3003
           7     0.6198    0.6398    0.6296      3012
           8     0.5890    0.7113    0.6444      2982
           9     0.7619    0.6351    0.6928      2982

    accuracy                         0.6130     30000
   macro avg     0.6499    0.6129    0.6186     30000
weighted avg     0.6502    0.6130    0.6188     30000



In [21]:
from src.utils.helper import report_to_df, append_nth_row

df_list = [report_to_df(df) for df in result_list]
new_df = append_nth_row(df_list)
new_df

Unnamed: 0,class,precision,recall,f1-score,support
0,0,0.5405,0.4836,0.5105,2992
