## Examine BLIP-2 from LAVIS and Setup Fine-Tuning

In [18]:
import torch
from lavis.models import model_zoo
from PIL import Image

In [4]:
print(model_zoo)

Architectures                  Types
albef_classification           ve
albef_feature_extractor        base
albef_nlvr                     nlvr
albef_pretrain                 base
albef_retrieval                coco, flickr
albef_vqa                      vqav2
alpro_qa                       msrvtt, msvd
alpro_retrieval                msrvtt, didemo
blip_caption                   base_coco, large_coco
blip_classification            base
blip_feature_extractor         base
blip_image_text_matching       base, large
blip_nlvr                      nlvr
blip_pretrain                  base
blip_retrieval                 coco, flickr
blip_vqa                       vqav2, okvqa, aokvqa
blip2_opt                      pretrain_opt2.7b, pretrain_opt6.7b, caption_coco_opt2.7b, caption_coco_opt6.7b
blip2_t5                       pretrain_flant5xl, pretrain_flant5xl_vitL, pretrain_flant5xxl, caption_coco_flant5xl
blip2_feature_extractor        pretrain, pretrain_vitL, coco
blip2                      

We want the feature extractor. We will take the pre-trained model and fine-tune it.

In [19]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [20]:
from lavis.models import load_model_and_preprocess
model, vis_processors, txt_processors = load_model_and_preprocess(name="blip2_feature_extractor", model_type="pretrain", is_eval=False, device=device)

  state_dict = torch.load(cached_file, map_location="cpu")
  checkpoint = torch.load(cached_file, map_location="cpu")


In [6]:
model

Blip2Qformer(
  (visual_encoder): VisionTransformer(
    (patch_embed): PatchEmbed(
      (proj): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (pos_drop): Dropout(p=0.0, inplace=False)
    (blocks): ModuleList(
      (0-38): 39 x Block(
        (norm1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (attn): Attention(
          (qkv): Linear(in_features=1408, out_features=4224, bias=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=1408, out_features=1408, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (drop_path): Identity()
        (norm2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (mlp): Mlp(
          (fc1): Linear(in_features=1408, out_features=6144, bias=True)
          (act): GELU(approximate='none')
          (fc2): Linear(in_features=6144, out_features=1408, bias=True)
          (drop): Dropout(p=0.0, inplace=False)
        )
      )
    )


In [7]:
sum(p.numel() for p in model.parameters() if p.requires_grad)

186705470

The model provides image and text pre-processors that have both train and eval modes. The image processor behaves differently in the two modes, but the text processor appears to behave the same.

In [8]:
img = Image.open('../../assets/sofa.jpg').convert('RGB')

In [9]:
vis_train_process = vis_processors['train']

In [10]:
vis_train_process.__dict__

{'normalize': Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711)),
 'transform': Compose(
     RandomResizedCrop(size=(224, 224), scale=(0.5, 1.0), ratio=(0.75, 1.3333), interpolation=bicubic, antialias=True)
     RandomHorizontalFlip(p=0.5)
     <lavis.processors.randaugment.RandomAugment object at 0x0000016E91B15AD0>
     ToTensor()
     Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
 )}

In [21]:
vis_input = vis_processors['train'](img)
vis_input = vis_input.unsqueeze(0).to(device)

In [18]:
vis_processors['eval'](img)

tensor([[[ 0.9522,  0.9230,  0.9230,  ...,  0.7187,  0.7187,  0.7187],
         [ 0.9084,  0.8501,  0.8938,  ...,  0.7187,  0.7187,  0.7187],
         [ 0.8501,  0.7479,  0.9522,  ...,  0.7187,  0.7187,  0.7187],
         ...,
         [ 0.2515, -0.0550, -0.2156,  ...,  0.9084,  0.8647,  0.8355],
         [ 0.2807,  0.5435,  0.2953,  ...,  0.8792,  0.9376,  0.9814],
         [ 0.5289,  0.3975,  0.3975,  ...,  0.8501,  0.8355,  0.8063]],

        [[ 1.0844,  1.0544,  1.0544,  ...,  0.8593,  0.8593,  0.8593],
         [ 1.0393,  0.9793,  1.0243,  ...,  0.8593,  0.8593,  0.8593],
         [ 0.9793,  0.8743,  1.0844,  ...,  0.8593,  0.8593,  0.8593],
         ...,
         [-0.2663, -0.5815, -0.7316,  ...,  0.9193,  0.8743,  0.8442],
         [-0.2213,  0.0638, -0.1913,  ...,  0.8893,  0.9493,  0.9943],
         [ 0.0488, -0.0862, -0.0712,  ...,  0.8593,  0.8442,  0.8142]],

        [[ 1.2358,  1.2216,  1.2358,  ...,  0.8092,  0.8092,  0.8092],
         [ 1.1932,  1.1363,  1.2074,  ...,  0

In [13]:
txt_input = txt_processors['train']('Hello, world!')

In [15]:
txt_processors['eval']('Hello, world!')

'hello, world'

In [26]:
sample = {"image": vis_input, "text_input": txt_input}
torch.distributed.init_process_group(backend="nccl", world_size=1, rank=0)
with torch.autocast(device_type="cuda"):
    losses = model(sample)
losses



ValueError: Error initializing torch.distributed using env:// rendezvous: environment variable MASTER_ADDR expected, but not set

### Setup the Dataset and DataLoader

In [29]:
from lavis.models import load_model_and_preprocess
import torch
from torch.utils.data import Dataset, WeightedRandomSampler, DataLoader
from PIL import Image
import pandas as pd
import os
from tqdm import tqdm

In [None]:
class GoogleShoppingDataset(Dataset):
    def __init__(self, image_dir: str, annotations_file: str, image_processor: object, text_processor: object):
        self.annotations = pd.read_csv(annotations_file)
        self.image_dir = image_dir
        self.image_processor = image_processor
        self.text_processor = text_processor
        
    def __len__(self):
        return len(self.annotations)
    
    def __getitem__(self, idx: int):
        image_path = os.path.join(self.image_dir, self.annotations.loc[idx, 'image_local'])
        image = Image.open(image_path).convert('RGB')
        image = self.image_processor(image)
        label = self.annotations.loc[idx, 'query'] + ': ' + self.annotations.loc[idx, 'title']
        image = self.text_processor(label)
        return image, label

In [None]:
def get_sample_weights(annotations_file: str):
    annotations_df = pd.read_csv(annotations_file)
    query_counts = annotations_df['query_id'].value_counts()
    product_counts = annotations_df['product_id'].value_counts()
    weights = []
    for idx in range(len(annotations_df)):
        query_id = annotations_df.loc[idx, 'query_id']
        product_id = annotations_df.loc[idx, 'product_id']
        weight = 1 / (query_counts[query_id] * product_counts[product_id])
        weights.append(weight)
    return weights

In [None]:
def build_dataloader(images_dir: str, annotations_file: str, mode: str, seed=42, batch_size=64, num_workers=2) -> DataLoader:
    """Note: mode is either 'train' or 'eval'
    """
    dataset = GoogleShoppingDataset(image_dir=images_dir, annotations_file=annotations_file,
                                      image_processor=vis_processors[mode], text_processor=txt_processors[mode])
    weights = get_sample_weights(annotations_file)
    generator = torch.Generator().manual_seed(seed)
    sampler = WeightedRandomSampler(weights, len(weights), replacement=True, generator=generator)
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size, sampler=sampler, num_workers=num_workers)
    return dataloader

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
model, vis_processors, txt_processors = load_model_and_preprocess(name="blip2_feature_extractor", model_type="pretrain", is_eval=False, device=device)
images_dir = 'D:/marqo-gs-10m/marqo-gs-dataset/images'
train_annotations = 'D:/marqo-gs-10m/marqo-gs-dataset/marqo_gs_full_10m/query_0_product_id_0.csv'
val_annotations = 'D:/marqo-gs-10m/marqo-gs-dataset/marqo_gs_full_10m/query_1_product_id_1.csv'

train_dataloader = build_dataloader(images_dir=images_dir, annotations_file=train_annotations,
                                    mode='train', seed=42, batch_size=64, num_workers=2)
val_dataloader = build_dataloader(images_dir=images_dir, annotations_file=val_annotations,
                                    mode='eval', seed=42, batch_size=64, num_workers=2)

In [None]:
optimizer = torch.optim.AdamW(lr=1e-5, betas=(0.9, 0.999), weight_decay=0.05)

In [None]:
def train_one_epoch(model, dataloader, device, optimizer):
    model.train()
    for images, labels in tqdm(dataloader):
        images, labels = images.to(device), labels.to(device)
        samples = {"image": images, "text_input": labels}
        