## Model Structure

In [None]:
import nltk
#nltk.download('punkt')
import argparse
import os
os.environ["HUGGINGFACE_HUB_URL"] = "https://hf-mirror.com"
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"

from ruamel.yaml import YAML
import numpy as np
import random
import time
import datetime
import json
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torch.distributed as dist
from torch.utils.data import DataLoader

from models.blip import blip_decoder
import utils

from utils import cosine_lr_schedule

from data import create_dataset, create_sampler, create_loader

from data.utils import save_result, coco_caption_eval

def _str2bool(value):
    """Convert a command-line token to a real boolean.

    `type=bool` must not be used with argparse: it calls `bool()` on the raw
    string, so every non-empty value (including "False" and "0") becomes True.

    Args:
        value: the raw argument string (or an actual bool).

    Returns:
        True or False.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string is kept as False because bool('') was False before.
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


# Command-line style options with notebook-friendly defaults.
parser = argparse.ArgumentParser()
parser.add_argument('--config', default='/GPFS/rhome/kejinli/workspace/multi-modal/BLIP/BLIP4caption/configs/caption_coco.yaml')
parser.add_argument('--output_dir', default='output/Caption_coco')
parser.add_argument('--evaluate', action='store_true')
parser.add_argument('--device', default='cuda')
parser.add_argument('--seed', default=42, type=int)
parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes')
parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')
parser.add_argument('--distributed', default=True, type=_str2bool)
# parse_known_args (not parse_args) so unrecognised arguments in sys.argv
# are ignored instead of aborting the cell.
args, _ = parser.parse_known_args()

# Read the experiment configuration with ruamel's round-trip ('rt') loader.
yaml = YAML(typ='rt')
with open(args.config, 'r') as config_file:
    config = yaml.load(config_file)

# Results live in a `result` sub-folder of the output directory; create both up front.
args.result_dir = os.path.join(args.output_dir, 'result')
for out_dir in (args.output_dir, args.result_dir):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

# Keep a copy of the loaded configuration next to the outputs.
yaml = YAML()
with open(os.path.join(args.output_dir, 'config.yaml'), 'w') as snapshot_file:
    yaml.dump(config, snapshot_file)

# model = blip_decoder(pretrained=config['pretrained'], image_size=config['image_size'], vit=config['vit'], 
#                            vit_grad_ckpt=config['vit_grad_ckpt'], vit_ckpt_layer=config['vit_ckpt_layer'], 
#                            prompt=config['prompt'])

# from torch.utils.data import Subset, DataLoader

# train_dataset, val_dataset, test_dataset = create_dataset('caption_coco', config)  
# train_subset = Subset(train_dataset, range(12))
# val_subset = Subset(val_dataset, range(120))
# test_subset = Subset(test_dataset, range(120))

# if args.distributed:
#     num_tasks = utils.get_world_size()
#     global_rank = utils.get_rank()            
#     samplers = create_sampler([train_subset,val_subset,test_subset], [True,False,False], num_tasks, global_rank)         
# else:
#     samplers = [None, None, None]
    
# train_loader, val_loader, test_loader = create_loader([train_subset, val_subset, test_subset],samplers,
#                                                           batch_size=[config['batch_size']]*3,num_workers=[4,4,4],
#                                                           is_trains=[True, False, False], collate_fns=[None,None,None])

reshape position embedding from 196 to 576
load checkpoint from /GPFS/rhome/kejinli/workspace/multi-modal/BLIP/BLIP4caption/pretrained_model/model_base_capfilt_large.pth
Using downloaded and verified file: /GPFS/rhome/kejinli/workspace/multi-modal/BLIP/BLIP4caption/caption_data/coco/annotations/coco_karpathy_train.json
Using downloaded and verified file: /GPFS/rhome/kejinli/workspace/multi-modal/BLIP/BLIP4caption/caption_data/coco/annotations/coco_karpathy_val.json
Using downloaded and verified file: /GPFS/rhome/kejinli/workspace/multi-modal/BLIP/BLIP4caption/caption_data/coco/annotations/coco_karpathy_test.json


In [None]:
# Smoke-test the forward pass on a single batch.
# NOTE(review): `model` and `train_loader` are not created by any live code
# above this cell (the construction code in the previous cell is commented
# out), so they must already exist in the kernel for this cell to run.

# Fetch only the first batch; looping over the whole loader just to keep one
# batch would read the entire training set.
batch = next(iter(train_loader))

image = batch[0]
caption = batch[1]

loss = model(image, caption)

torch.Size([12, 19])


## LoRA Fine-tuning

In [None]:
import nltk
#nltk.download('punkt')
import argparse
import os
os.environ["HUGGINGFACE_HUB_URL"] = "https://hf-mirror.com"
os.environ["CUDA_VISIBLE_DEVICES"] = "4,5,6,7"

from ruamel.yaml import YAML
import numpy as np
import random
import time
import datetime
import json
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torch.distributed as dist
from torch.utils.data import DataLoader

from models.blip import blip_decoder
import utils

from utils import cosine_lr_schedule

from data import create_dataset, create_sampler, create_loader

from data.utils import save_result, coco_caption_eval
from peft import get_peft_model, LoraConfig

def _str2bool(value):
    """Convert a command-line token to a real boolean.

    `type=bool` must not be used with argparse: it calls `bool()` on the raw
    string, so every non-empty value (including "False" and "0") becomes True.

    Args:
        value: the raw argument string (or an actual bool).

    Returns:
        True or False.

    Raises:
        argparse.ArgumentTypeError: if the token is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    token = str(value).strip().lower()
    if token in ('true', 't', 'yes', 'y', '1'):
        return True
    # The empty string is kept as False because bool('') was False before.
    if token in ('false', 'f', 'no', 'n', '0', ''):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


# Command-line style options with notebook-friendly defaults.
parser = argparse.ArgumentParser()
parser.add_argument('--config', default='/GPFS/rhome/kejinli/workspace/multi-modal/BLIP/BLIP4caption/configs/caption_coco.yaml')
parser.add_argument('--output_dir', default='output/Caption_coco')
parser.add_argument('--evaluate', action='store_true')
parser.add_argument('--device', default='cuda')
parser.add_argument('--seed', default=42, type=int)
parser.add_argument('--world_size', default=1, type=int, help='number of distributed processes')
parser.add_argument('--dist_url', default='env://', help='url used to set up distributed training')
parser.add_argument('--distributed', default=True, type=_str2bool)
# parse_known_args (not parse_args) so unrecognised arguments in sys.argv
# are ignored instead of aborting the cell.
args, _ = parser.parse_known_args()

# Load the run configuration through ruamel's round-trip ('rt') parser.
yaml = YAML(typ='rt')
with open(args.config, 'r') as config_file:
    config = yaml.load(config_file)

# Make sure the output directory and its `result` sub-folder both exist.
args.result_dir = os.path.join(args.output_dir, 'result')
for out_dir in (args.output_dir, args.result_dir):
    Path(out_dir).mkdir(parents=True, exist_ok=True)

# Write the configuration that was just loaded into the output directory.
yaml = YAML()
with open(os.path.join(args.output_dir, 'config.yaml'), 'w') as snapshot_file:
    yaml.dump(config, snapshot_file)

# Let the project helper configure distributed mode from `args`
# (the recorded run reported "Not using distributed mode").
utils.init_distributed_mode(args)

device = torch.device(args.device)
print('*******************************************************************************************************', 'Log:', sep='\n')
print('Device:', device)

# Seed every RNG in use; the base seed is offset by this process's rank.
seed = args.seed + utils.get_rank()
for seed_fn in (torch.manual_seed, np.random.seed, random.seed):
    seed_fn(seed)
cudnn.benchmark = True

#### Dataset ####
print("Creating captioning dataset")

#*******************************************************************************************************
# Full train / val / test splits (the recorded run loaded the COCO Karpathy annotation files).
train_dataset, val_dataset, test_dataset = create_dataset('caption_coco', config)

# Default to no samplers; build them only when distributed mode is enabled.
# The [True, False, False] list is presumably per-split shuffle flags: confirm in data.create_sampler.
samplers = [None, None, None]
if args.distributed:
    num_tasks = utils.get_world_size()
    global_rank = utils.get_rank()
    samplers = create_sampler(
        [train_dataset, val_dataset, test_dataset],
        [True, False, False],
        num_tasks,
        global_rank,
    )

# One loader per split, all sharing the configured batch size and 4 workers each.
train_loader, val_loader, test_loader = create_loader(
    [train_dataset, val_dataset, test_dataset],
    samplers,
    batch_size=[config['batch_size']] * 3,
    num_workers=[4, 4, 4],
    is_trains=[True, False, False],
    collate_fns=[None, None, None],
)
#*******************************************************************************************************
# from torch.utils.data import Subset, DataLoader

# train_dataset, val_dataset, test_dataset = create_dataset('caption_coco', config)  
# train_subset = Subset(train_dataset, range(12000))
# val_subset = Subset(val_dataset, range(2400))
# test_subset = Subset(test_dataset, range(2400))

# if args.distributed:
#     num_tasks = utils.get_world_size()
#     global_rank = utils.get_rank()            
#     samplers = create_sampler([train_subset,val_subset,test_subset], [True,False,False], num_tasks, global_rank)         
# else:
#     samplers = [None, None, None]

# train_loader, val_loader, test_loader = create_loader([train_subset, val_subset, test_subset],samplers,
#                                                       batch_size=[config['batch_size']]*3,num_workers=[4,4,4],
#                                                       is_trains=[True, False, False], collate_fns=[None,None,None])      
#*******************************************************************************************************


#### Model ####
print("Creating model")

# Each constructor argument is read from the config under the same key name.
blip_kwargs = {
    key: config[key]
    for key in ('pretrained', 'image_size', 'vit', 'vit_grad_ckpt', 'vit_ckpt_layer', 'prompt')
}
model = blip_decoder(**blip_kwargs)

# The model is not moved to `device` here; `model.to(device)` is called later
# in this cell, after the LoRA wrapping.
print('Model is loaded')

# Read the layer counts from the model that was just built rather than
# hard-coding 12, so a deeper backbone does not end up with adapters on
# only its first 12 layers.
num_vit_blocks = len(model.visual_encoder.blocks)
num_text_layers = len(model.text_decoder.bert.encoder.layer)

# Sub-modules of every vision-encoder block that receive a LoRA adapter.
target_modules_image = [
    f"blocks.{i}.{suffix}"
    for i in range(num_vit_blocks)
    for suffix in ("attn.qkv", "attn.proj", "mlp.fc1", "mlp.fc2")
]

# Sub-modules of every text-decoder layer that receive a LoRA adapter
# (self-attention, feed-forward and cross-attention projections).
target_modules_text = [
    f"bert.encoder.layer.{i}.{suffix}"
    for i in range(num_text_layers)
    for suffix in (
        "attention.self.query",
        "attention.self.key",
        "attention.self.value",
        "attention.output.dense",
        "intermediate.dense",
        "output.dense",
        "crossattention.self.query",
        "crossattention.self.key",
        "crossattention.self.value",
        "crossattention.output.dense",
    )
]

# Both towers use the same LoRA hyper-parameters; only the target lists differ.
lora_config_image = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=target_modules_image,
    lora_dropout=0.1,
)

lora_config_text = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=target_modules_text,
    lora_dropout=0.1,
)

# Freeze every existing weight before the adapters are attached. The recorded
# parameter listing for this notebook shows LoRA weights as trainable.
for param in model.parameters():
    param.requires_grad = False

from transformers import PretrainedConfig

# Placeholder config object attached to the vision encoder before wrapping.
# NOTE(review): presumably needed because PEFT reads a `config` attribute on
# the wrapped module; confirm against the installed peft version. The values
# are fixed literals and are not derived from `config` or from the model.
visual_encoder_config = PretrainedConfig(
    img_size=384,
    patch_size=16,
    in_chans=3,
    num_classes=1000,
    embed_dim=768,
    depth=12,
    num_heads=12,
    mlp_ratio=4.,
    qkv_bias=True,
    qk_scale=None,
    representation_size=None,
    drop_rate=0.,
    attn_drop_rate=0.,
    drop_path_rate=0.,
    norm_layer=None,
    use_grad_checkpointing=False,
    ckpt_layer=0
)

model.visual_encoder.config = visual_encoder_config

# Wrap each tower separately with its own LoRA configuration.
model.visual_encoder = get_peft_model(model.visual_encoder, lora_config_image)
model.text_decoder = get_peft_model(model.text_decoder, lora_config_text)

# Move to the target device only after wrapping, so the adapter weights move too.
model = model.to(device)

print('LoRA is loaded')
print('*******************************************************************************************************')
print(len(val_dataset))

Not using distributed mode
*******************************************************************************************************
Log:
Device: cuda
Creating captioning dataset
Using downloaded and verified file: /GPFS/rhome/kejinli/workspace/multi-modal/BLIP/BLIP4caption/caption_data/coco/annotations/coco_karpathy_train.json
Using downloaded and verified file: /GPFS/rhome/kejinli/workspace/multi-modal/BLIP/BLIP4caption/caption_data/coco/annotations/coco_karpathy_val.json
Using downloaded and verified file: /GPFS/rhome/kejinli/workspace/multi-modal/BLIP/BLIP4caption/caption_data/coco/annotations/coco_karpathy_test.json
Creating model
reshape position embedding from 196 to 576
load checkpoint from /GPFS/rhome/kejinli/workspace/multi-modal/BLIP/BLIP4caption/pretrained_model/model_base_capfilt_large.pth
Model is loaded
LoRA is loaded
*******************************************************************************************************
5000


: 

In [None]:
# Print the name of every parameter that is still trainable, one per line.
for name in (n for n, p in model.named_parameters() if p.requires_grad):
    print(name)

visual_encoder.base_model.model.blocks.0.attn.qkv.lora_A.weight
visual_encoder.base_model.model.blocks.0.attn.qkv.lora_B.weight
visual_encoder.base_model.model.blocks.0.attn.proj.lora_A.weight
visual_encoder.base_model.model.blocks.0.attn.proj.lora_B.weight
visual_encoder.base_model.model.blocks.0.mlp.fc1.lora_A.weight
visual_encoder.base_model.model.blocks.0.mlp.fc1.lora_B.weight
visual_encoder.base_model.model.blocks.0.mlp.fc2.lora_A.weight
visual_encoder.base_model.model.blocks.0.mlp.fc2.lora_B.weight
visual_encoder.base_model.model.blocks.1.attn.qkv.lora_A.weight
visual_encoder.base_model.model.blocks.1.attn.qkv.lora_B.weight
visual_encoder.base_model.model.blocks.1.attn.proj.lora_A.weight
visual_encoder.base_model.model.blocks.1.attn.proj.lora_B.weight
visual_encoder.base_model.model.blocks.1.mlp.fc1.lora_A.weight
visual_encoder.base_model.model.blocks.1.mlp.fc1.lora_B.weight
visual_encoder.base_model.model.blocks.1.mlp.fc2.lora_A.weight
visual_encoder.base_model.model.blocks.1.ml