In [1]:
import io
import logging
from collections import defaultdict
from typing import Dict, Tuple

import lightning as L
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F
from omegaconf import DictConfig, OmegaConf
from peft import LoraConfig, PeftModel, get_peft_model
from rich import print
from torch import Tensor
from tqdm.auto import tqdm
from transformers import CLIPModel, CLIPProcessor, CLIPVisionModel
from transformers.models.clip.modeling_clip import CLIPVisionTransformer

from fusion_bench.method import load_algorithm_from_config
from fusion_bench.models.linearized.vision_model import (
    load_l_lora_vision_model_hf,
    load_lora_vision_model_hf,
)
from fusion_bench.taskpool import load_taskpool_from_config
from fusion_bench.method.ties_merging.ties_merging_utils import state_dict_to_vector

# Configure the root logger at WARNING level (documented name; `logging.WARN` is an alias with the same value).
logging.basicConfig(level=logging.WARNING)

In [2]:
# Base CLIP checkpoint. This points at a local copy; replace it with the
# Hugging Face Hub id `openai/clip-vit-base-patch16` when running elsewhere.
BASE_MODEL = "/data0/users/tanganke/data/huggingface_models/openai/clip-vit-base-patch16"

# Task names; the list order is the order in which fine-tuned models are loaded
# and in which similarity rows are emitted.
# NOTE(review): "stanford-cars" is hyphenated here while the task pool below
# names the task "stanford_cars" -- confirm nothing joins on these names.
DATASET_NAMES = [
    "sun397", "stanford-cars", "resisc45", "eurosat",
    "svhn", "gtsrb", "mnist", "dtd",
]


def normalize(tensor: Tensor, dim: int = 0, eps: float = 1e-8) -> Tensor:
    """
    Scales a tensor to unit L2 norm along a given dimension.

    Slices whose norm is smaller than `eps` are divided by `eps` instead, so an
    all-zero slice stays all-zero rather than becoming NaN.

    Args:
        tensor (Tensor): The floating-point tensor to normalize.
        dim (int, optional): The dimension along which the L2 norm is taken. Defaults to 0.
        eps (float, optional): Lower bound applied to the norm to avoid division by zero. Defaults to 1e-8.

    Returns:
        Tensor: A tensor of the same shape as `tensor`, normalized along `dim`.
    """
    # `torch.norm` is deprecated; `torch.linalg.vector_norm` with ord=2 computes
    # the same L2 norm over a single dimension.
    norms = torch.linalg.vector_norm(tensor, ord=2, dim=dim, keepdim=True)
    return tensor / norms.clamp(min=eps)


def compute_cos_similarity_as_df(
    task_vectors_as_state_dicts: Dict[str, Dict[str, Tensor]]
) -> pd.DataFrame:
    """
    Computes the pairwise cosine similarity between task vectors.

    Each state dict is flattened with `state_dict_to_vector`, cast to float64,
    and scaled to unit length. The similarities of all ordered pairs are then
    read from the Gram matrix of the unit vectors.

    Args:
        task_vectors_as_state_dicts (Dict[str, Dict[str, Tensor]]): Mapping from
            dataset name to a task vector stored as a state dict. It must contain
            an entry for every name in `DATASET_NAMES`.

    Returns:
        pd.DataFrame: One row per ordered pair of tasks (self-pairs included), in
            `DATASET_NAMES` order, with columns "task:0", "task:1" and
            "cosine_similarity".

    Raises:
        KeyError: If a name in `DATASET_NAMES` has no entry in the mapping.
    """
    missing = [
        name for name in DATASET_NAMES if name not in task_vectors_as_state_dicts
    ]
    if missing:
        raise KeyError(f"missing task vectors for: {missing}")

    task_vectors = torch.stack(
        [
            state_dict_to_vector(task_vectors_as_state_dicts[dataset_name]).double()
            for dataset_name in DATASET_NAMES
        ]
    )
    unit_vectors = normalize(task_vectors, dim=1)

    # The rows are already unit length, so their dot products are the cosine
    # similarities. One matrix product replaces a per-pair cosine_similarity call
    # (which would normalize the vectors a second time); the clamp removes
    # rounding overshoot outside [-1, 1].
    similarity = (unit_vectors @ unit_vectors.T).clamp(-1.0, 1.0).tolist()

    records = [
        {
            "task:0": task_0,
            "task:1": task_1,
            "cosine_similarity": similarity[task_0_idx][task_1_idx],
        }
        for task_0_idx, task_0 in enumerate(DATASET_NAMES)
        for task_1_idx, task_1 in enumerate(DATASET_NAMES)
    ]
    return pd.DataFrame(records, columns=["task:0", "task:1", "cosine_similarity"])

Task arithmetic on LoRA models.

$$ \theta_{merged} = \theta_{pretrained} + \lambda \sum_{i=1}^{n} \left( \theta_{i} - \theta_{pretrained} \right) $$

Typically, there are two ways to do this:

1. Perform the arithmetic operation on the adapters, as in AdapterSoup, LoraHub, and the L-LoRA paper. However, this requires saving the initialization state of the adapters, which is inconvenient for deployment.
2. Perform the arithmetic operation in the original model weight space, i.e., after merging and unloading the adapters.

I choose the second way here.

In [3]:
# Build the `task_arithmetic` merging algorithm; it is used below to merge the
# pre-trained model with the fine-tuned models.
# `scaling_factor` is presumably the lambda of the formula above -- TODO confirm.
alg = load_algorithm_from_config(
    DictConfig(dict(name="task_arithmetic", scaling_factor=0.3))
)

The following taskpool is used to evaluate the performance of the merged model. Refer to `config/taskpool/clip-vit-classification_TA8.yaml`

In [4]:
# The datasets are read from local copies under a single root directory.
# To run elsewhere, change this root (and BASE_MODEL) instead of editing every path.
HF_DATASETS_ROOT = "/data0/users/tanganke/data/huggingface_datasets"

# YAML text of the task pool. It is kept under its own name so that
# `taskpool_config` always refers to the parsed config, never to a raw string.
taskpool_yaml = f"""
type: clip_vit_classification
name: clip-vit-classification_TA8

dataset_type: huggingface_image_classification
tasks:
  - name: sun397
    dataset:
      name: tanganke/sun397
      path: {HF_DATASETS_ROOT}/tanganke/sun397
      split: test
  - name: stanford_cars
    dataset:
      name: tanganke/stanford_cars
      path: {HF_DATASETS_ROOT}/tanganke/stanford_cars
      split: test
  - name: resisc45
    dataset:
      name: tanganke/resisc45
      path: {HF_DATASETS_ROOT}/tanganke/resisc45
      split: test
  - name: eurosat
    dataset:
      name: tanganke/eurosat
      path: {HF_DATASETS_ROOT}/tanganke/eurosat
      split: test
  - name: svhn
    dataset:
      type: instantiate
      name: svhn
      object:
        _target_: datasets.load_dataset
        _args_:
          - {HF_DATASETS_ROOT}/ufldl-stanford/svhn
          - cropped_digits
        split: test
  - name: gtsrb
    dataset:
      name: tanganke/gtsrb
      path: {HF_DATASETS_ROOT}/tanganke/gtsrb
      split: test
  - name: mnist
    dataset:
      name: mnist
      path: {HF_DATASETS_ROOT}/ylecun/mnist
      split: test
  - name: dtd
    dataset:
      name: tanganke/dtd
      path: {HF_DATASETS_ROOT}/tanganke/dtd
      split: test

clip_model: {BASE_MODEL}
batch_size: 128
num_workers: 16
fast_dev_run: false

"""
# Parse the YAML text into a config object, then build the task pool from it.
taskpool_config = OmegaConf.load(io.StringIO(taskpool_yaml))
taskpool = load_taskpool_from_config(taskpool_config)

## LoRA models

Load the pre-trained model and the fine-tuned models.

In [7]:
# Task arithmetic needs the pre-trained model plus one fine-tuned model per task.
# The pre-trained vision transformer is stored under the "_pretrained_" key.
models = {}
models["_pretrained_"] = CLIPVisionModel.from_pretrained(BASE_MODEL).vision_model

# Load each LoRA fine-tuned model on top of the base checkpoint, then merge and
# unload the adapter so the entry is a plain model in the original weight space.
for dataset_name in tqdm(DATASET_NAMES):
    models[dataset_name] = load_lora_vision_model_hf(
        BASE_MODEL,
        f"tanganke/clip-vit-base-patch16_{dataset_name.replace('_','-')}_lora-16",
    ).merge_and_unload()

  0%|          | 0/8 [00:00<?, ?it/s]

Merge them into a single `merged_model`.

In [8]:
# Merge the pre-trained and fine-tuned models with the algorithm built earlier.
# The models should be homogeneous; here they are all `CLIPVisionTransformer`.
merged_model = alg.run(models)

Profiler Report

--------------------------------------------------------------------------------------------------------------------------
|  Action       	|  Mean duration (s)	|  Num calls      	|  Total time (s) 	|  Percentage %   	|
--------------------------------------------------------------------------------------------------------------------------
|  Total        	|  -              	|  21             	|  63.559         	|  100 %          	|
--------------------------------------------------------------------------------------------------------------------------
|  load model   	|  0.03468        	|  11             	|  0.38148        	|  0.6002         	|
|  merge weights	|  0.024422       	|  10             	|  0.24422        	|  0.38423        	|
--------------------------------------------------------------------------------------------------------------------------



Evaluate the merged model on the task pool.

In [25]:
# Evaluate the merged model with the task pool built earlier and keep the returned report.
report = taskpool.evaluate(merged_model)

You are using a CUDA device ('NVIDIA GeForce RTX 4090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision


Evaluating tasks:   0%|          | 0/8 [00:00<?, ?it/s]



Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/18 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/156 [00:00<?, ?it/s]



Evaluating:   0%|          | 0/63 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/50 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/22 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/204 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/99 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/79 [00:00<?, ?it/s]

Evaluating:   0%|          | 0/15 [00:00<?, ?it/s]

In [28]:
# `print` here is `rich.print` (imported in the first cell), not the builtin.
print(report)