In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [1]:
%pip install -qq -U datasets transformers pyarrow torchinfo
%pip install -qq --upgrade transformers ftfy accelerate regex tqdm
%pip install git+https://github.com/openai/CLIP.git
%pip install GPUtil


[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
cuml 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
dask-cudf 23.8.0 requires cupy-cuda11x>=12.0.0, which is not installed.
apache-beam 2.46.0 requires dill<0.3.2,>=0.3.1.1, but you have dill 0.3.7 which is incompatible.
apache-beam 2.46.0 requires pyarrow<10.0.0,>=3.0.0, but you have pyarrow 14.0.2 which is incompatible.
beatrix-jupyterlab 2023.814.150030 requires jupyter-server~=1.16, but you have jupyter-server 2.12.3 which is incompatible.
beatrix-jupyterlab 2023.814.150030 requires jupyterlab~=3.4, but you have jupyterlab 4.0.10 which is incompatible.
cudf 23.8.0 requires pandas<1.6.0dev0,>=1.3, but you have pandas 2.0.3 which is incompatible.
cudf 23.8.0 requires protobuf<5,>=4.21, but you have protobuf 3.20.3 which is inc

**All the imports**

In [26]:
import os
import gc
import torch
import pickle
import json
import torchinfo
import torch.nn as nn
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer
from torch.utils.data import Dataset, DataLoader
from datasets import load_dataset
from tqdm import tqdm
from pathlib import Path
from GPUtil import showUtilization as gpu_usage
from numba import cuda


**Helper utilities**

In [3]:

def free_gpu_cache():
    print("Initial GPU Usage")
    gpu_usage()                             

    torch.cuda.empty_cache()

    #cuda.select_device(0)
    #cuda.close()
    #cuda.select_device(0)
    

    print("GPU Usage after emptying the cache")
    gpu_usage()


First, the projection layer...

In [3]:
class IdentityMap(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x, *args, **kwargs):
        return x

    @property
    def config(self):
        return {"mm_projector_type": 'identity'}


class SimpleResBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.pre_norm = nn.LayerNorm(in_channels)

        self.proj = nn.Sequential(
            nn.Linear(in_channels, out_channels),
            nn.GELU(),
            nn.Linear(out_channels, out_channels)
        )
    def forward(self, x):
        x = self.pre_norm(x)
        return x + self.proj(x)


class SimpleLinearBlock(nn.Module):
    def __init__(self, in_size, out_size, hidden_size = 50, add_residual_connection=True):
        super().__init__()
        self.pre_norm = nn.LayerNorm(in_size)
        self.proj = nn.Sequential(nn.Linear(in_size, hidden_size),
                                  nn.GELU(),
                                  nn.Linear(hidden_size, out_size))
        self.add_residual_connection = add_residual_connection
        
    def forward(self,x):
        return self.proj(x)


def build_resnet_projection_layer(in_channels, out_channels, hidden_size = 50, mlp_depth=2):
    res_block = SimpleResBlock(in_channels, out_channels, hidden_size = hidden_size)
    for _ in range(1, mlp_depth):
        modules.append(res_block)
    return nn.Sequential(*modules)

and then the model....

In [34]:

def model_summary(model, input_size):
    torchinfo.summary(model,
                      input_size = input_size,
                      batch_dim=0,
                      col_names=("kernel_size",
                                 "input_size",
                                 "output_size",
                                 "num_params",
                                 "mult_adds"),
                       verbose=1,)



class MultiModalGPT(nn.Module):
    """
    Pytorch Lightning module for Transformer

    """
    def __init__(self,
                 llm_model,
                 tokenizer,
                 projection_layer_in_channels,
                 projection_layer_out_channels,
                 device,
                 hidden_size = 32,
                 ):
        super(MultiModalGPT, self).__init__()
        self.llm_model = None
        self.tokenizer = None
        self.tokenizer = tokenizer
        self.projection_layer = SimpleLinearBlock(projection_layer_in_channels,projection_layer_out_channels, hidden_size=hidden_size)
        self.llm_model = llm_model
        self.device = device
        
        # freeze the llm
        for param in self.llm_model.parameters():
            param.requires_grad = False
    
    
    def forward(self, x, max_length=1):
        print(f"beginning of projection: {x.shape}")
        x = self.projection_layer(x)
        print(f"end of projection: {x.shape}")
        with torch.no_grad():  
            x = self.llm_model(inputs_embeds = x, return_dict=False)
        print(f"end of llm: logits: {x[0].shape}")
        return x


**Data loader**

In [45]:
def get_absolute_paths(directory_path, max_files = None):
    absolute_paths = []
    image_ids = []

    # Check if the given path is a valid directory
    if os.path.isdir(directory_path):
        # Iterate over all files in the directory
        for root, _, files in os.walk(directory_path):
            for file in tqdm(files):
                # extract image ID
                image_ids.append(Path(file).stem)
                # Construct the absolute path for each file
                absolute_path = os.path.abspath(os.path.join(root, file))
                absolute_paths.append(absolute_path)
                if max_files is not None and len(absolute_paths) > max_files:
                    break
    return absolute_paths, image_ids


def parse_captions_file(captions_path, captions_key):
    """
    Read a JSON file and return its contents as a dictionary.

    Parameters:
    - file_path (str): The path to the JSON file.

    Returns:
    - dict: The contents of the JSON file as a dictionary.
    """
    try:
        with open(captions_path, 'r') as file:
            data = json.load(file)
        captions = {}
        annotations = data[captions_key]
        for annotation in annotations:
            captions[annotation['image_id']] = annotation['caption']
        return captions
    except FileNotFoundError:
        print(f"Error: File not found - {captions_path}")
    except json.JSONDecodeError:
        print(f"Error: Unable to decode JSON in file - {captions_path}")

        
def load_pickle_file(file_path):
    with open(file_path, 'rb') as fh:
        data = pickle.load(fh)
    keys = list(data.keys()) 
    assert len(keys) == 1
    return data[keys[0]]


class PickleDataset(Dataset):

    def __init__(self, 
                 all_images,
                 image_ids,
                 captions_path,
                 captions_key,
                 tokenizer, 
                 max_len_of_sentence=2048):
        super().__init__()
        self.tokenizer = tokenizer
        self.ds = None
        self.image_file_names = None
        self.captions_key = captions_key
        self.images_path = images_path
        self.bos_token = self.tokenizer.bos_token
        self.eos_token = self.tokenizer.eos_token
        self.pad_token = self.tokenizer.pad_token
        self.max_len_of_sentence = max_len_of_sentence
        self.all_images = all_images
        self.image_ids = image_ids
        self.captions = parse_captions_file(captions_path, captions_key)
        

    def __len__(self):
        return len(self.image_ids)


    def __getitem__(self, idx):

        # get image embeddings
        img_embds = load_pickle_file(self.all_images[idx])
        img_embds = torch.tensor(np.expand_dims(img_embds,1))
        this_img_id = self.image_ids[idx]
        
        # get caption
        caption = self.captions[int(this_img_id)]
        tokenized_caption = self.tokenize_caption(caption)
        
        return {
            "image_embeddings": img_embds,
            "image_id": this_img_id,
            "caption": caption,
            "tokenized_caption": tokenized_caption
        }
    
    def tokenize_caption(self, caption):
        tokens = self.tokenizer(caption)
        caption_encoded = self.tokenizer(caption, return_tensors="pt", return_attention_mask=False)
        num_padding_tokens = self.max_len_of_sentence - 2
        # Add <s> and </s> token
        tokenized_caption = torch.cat(
            [
                self.bos_token,
                caption_encoded,
                self.eos_token,
                torch.tensor([self.pad_token] * num_padding_tokens, dtype=torch.int64),
            ],
            dim=0,)

        ## Add only the <s>
        #y = torch.cat(
        #    [
        #        caption_encoded,
        #        self.eos_token,
        #        torch.tensor([self.pad_token] * num_padding_tokens, dtype=torch.int64),
        #    ],
        #    dim=0,
        #)

        return tokenized_caption


In [None]:
def train(
    config,
    model,
    state: dict,
    train_dataloader: DataLoader,
    val_dataloader: DataLoader,
) -> None:

    if val_dataloader is not None:
        validate(model, val_dataloader)  # sanity check

    for state["iter_num"], train_data in enumerate(train_dataloader, state["iter_num"]):
        if state["iter_num"] >= max_iters or state["iter_num"] % state["save_interval"] == 0:
            checkpoint_path = out_dir / f"iter-{state['iter_num']:06d}-ckpt.pth"
            print(f"Saving checkpoint to {str(checkpoint_path)!r}")
            save(checkpoint_path, state)
            break

        # determine and set the learning rate for this iteration
        lr = get_lr(state["iter_num"]) if decay_lr else learning_rate
        for param_group in optimizer.param_groups:
            param_group["lr"] = lr

        iter_t0 = time.perf_counter()

        image_embeddings = train_data['image_embeddings'][:, 0 : config["max_seq_length"].contiguous()
        targets = train_data['tokenized_caption'][:, 1 : config["max_seq_length"] + 1].contiguous()

        is_accumulating = (state["iter_num"] + 1) % gradient_accumulation_steps != 0
        with fabric.no_backward_sync(model, enabled=is_accumulating):
            logits = model(input_ids)
            loss = chunked_cross_entropy(logits, targets, chunk_size=0)
            fabric.backward(loss / gradient_accumulation_steps)

        # return

        if not is_accumulating:
            fabric.clip_gradients(model, optimizer, max_norm=grad_clip)
            optimizer.step()
            optimizer.zero_grad()
            state["step_count"] += 1

        t1 = time.perf_counter()
        total_lengths += input_ids.size(1)
        speed_monitor.on_train_batch_end(
            (state["iter_num"] + 1) * micro_batch_size,
            t1 - total_t0,
            # this assumes that device FLOPs are the same and that all devices have the same batch size
            fabric.world_size,
            flops_per_batch=measured_flops,
            lengths=total_lengths,
        )
        if state["iter_num"] % log_interval == 0:
            fabric.print(
                f"iter {state['iter_num']} step {state['step_count']}: loss {loss.item():.4f}, LR: {lr:.6f}, iter time:"
                f" {(t1 - iter_t0) * 1000:.2f}ms{' (optimizer.step)' if not is_accumulating else ''}"
            )

        if val_dataloader is not None and not is_accumulating and state["step_count"] % eval_interval == 0:
            t0 = time.perf_counter()
            val_loss = validate(fabric, model, val_dataloader)
            t1 = time.perf_counter() - t0
            speed_monitor.eval_end(t1)
            fabric.print(f"step {state['iter_num']}: val loss {val_loss.item():.4f}, val time: {t1 * 1000:.2f}ms")
            fabric.barrier()
        if not is_accumulating and state["step_count"] % save_interval == 0:
            checkpoint_path = out_dir / f"iter-{state['iter_num']:06d}-ckpt.pth"
            fabric.print(f"Saving checkpoint to {str(checkpoint_path)!r}")
            fabric.save(checkpoint_path, state)

In [None]:
def chunked_cross_entropy(
    logits: Union[torch.Tensor, List[torch.Tensor]], targets: torch.Tensor, chunk_size: int = 128
) -> torch.Tensor:
    # with large max_sequence_lengths, the beginning of `backward` allocates a large memory chunk which can dominate
    # the memory usage in fine-tuning settings with low number of parameters.
    # as a workaround hack, the cross entropy computation is chunked to force it to deallocate on the go, reducing
    # the memory spike's magnitude

    # lm_head was chunked (we are fine-tuning)
    if isinstance(logits, list):
        # don't want to chunk cross entropy
        if chunk_size == 0:
            logits = torch.cat(logits, dim=1)
            logits = logits.reshape(-1, logits.size(-1))
            targets = targets.reshape(-1)
            return torch.nn.functional.cross_entropy(logits, targets, ignore_index=-1)

        # chunk cross entropy
        logit_chunks = [logit_chunk.reshape(-1, logit_chunk.size(-1)) for logit_chunk in logits]
        target_chunks = [target_chunk.reshape(-1) for target_chunk in targets.split(logits[0].size(1), dim=1)]
        loss_chunks = [
            torch.nn.functional.cross_entropy(logit_chunk, target_chunk, ignore_index=-1, reduction="none")
            for logit_chunk, target_chunk in zip(logit_chunks, target_chunks)
        ]
        return torch.cat(loss_chunks).mean()

    # no chunking at all
    logits = logits.reshape(-1, logits.size(-1))
    targets = targets.reshape(-1)
    if chunk_size == 0:
        return torch.nn.functional.cross_entropy(logits, targets, ignore_index=-1)

    # lm_head wasn't chunked, chunk cross entropy
    logit_chunks = logits.split(chunk_size)
    target_chunks = targets.split(chunk_size)
    loss_chunks = [
        torch.nn.functional.cross_entropy(logit_chunk, target_chunk, ignore_index=-1, reduction="none")
        for logit_chunk, target_chunk in zip(logit_chunks, target_chunks)
    ]
    return torch.cat(loss_chunks).mean()

In [44]:
import torch
from pytorch_lightning import LightningModule

class LitMultiModalGPT(LightningModule):
    """
    Pytorch Lightning module for Transformer

    """
    def __init__(self,
                 multimodal_gpt,
                 loss_criterion,
                 tokenizer,
                 num_validation_examples=10,
                 num_training_steps=100000):
        super().__init__()
        self.loss_criterion = loss_criterion
        self.tokenizer = tokenizer
        self.num_validation_examples = num_validation_examples
        self.num_training_steps = num_training_steps
        self._vocab_len = tokenizer.get_vocab_size()
        self.model = multimodal_gpt
        self.scheduler = None
        self.scheduler_dict = {}
        self.optimizer = None
        self.this_step_train_loss = None
        self.predicted_list = []
        self.expected_list = []
        self.save_hyperparameters(ignore=['loss_criterion', 'epoch'])


    def set_optimizer(self, optimizer):
        self.optimizer = optimizer

    def set_scheduler_dict(self, scheduler, freq='step'):
        self.scheduler = scheduler
        self.scheduler_dict = {
            "scheduler": self.scheduler,
            "interval": freq,
        }

    def configure_optimizers(self):
        if self.scheduler_dict:
            return {"optimizer": self.optimizer, "lr_scheduler": self.scheduler_dict}
        return {"optimizer": self.optimizer}

    def forward(self, x):
        outputs = self.model(x, return_dict=False)
        return outputs

        
    def evaluate(self, batch, stage=None):
        """
        Evaluate the model on validation dataset.
        """
        #model_out = self.greedy_decode(encoder_input, encoder_mask)

        #model_out_text = self.tokenizer_tgt.decode(model_out.detach().cpu().numpy())


        #if stage:
        #    # print the source, target, and the model output
        #    print("*****************************************")
        #    print(f"{f'SOURCE: ' :>12}{source_text}")
        #    print(f"{f'TARGET: ' :>12}{target_text}")
        #    print(f"{f'PREDICTED: ' :>12}{model_out_text}")
        #    print("*****************************************\n")
        return None

    def training_step(self, batch):
        tokenized_caption = batch['tokenized_caption']  # (B, seq_len)
        logits, outputs = self(batch['image_embeddings'])
        loss = chunked_cross_entropy(logits, targets, chunk_size=0)
        self.log("train_loss", loss.item(), prog_bar=True)
        self.this_step_train_loss = loss.item()
        return loss

    def validation_step(self, batch, batch_idx):
        if batch_idx < self.num_validation_examples:
            predicted, expected = self.evaluate(batch, "val")
            self.predicted_list.append(predicted)
            self.expected_list.append(expected)


    def test_step(self, batch, batch_idx):
        if batch_idx < self.num_validation_examples:
            self.evaluate(batch, "test")




**Download the LLM and tokenizer**

In [7]:
phi_tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-2")
phi_model = AutoModelForCausalLM.from_pretrained("microsoft/phi-2")

tokenizer_config.json:   0%|          | 0.00/7.34k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/866 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/35.7k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/phi-2 were not used when initializing PhiForCausalLM: ['model.layers.0.self_attn.v_proj.weight', 'model.layers.25.self_attn.v_proj.weight', 'model.layers.19.self_attn.k_proj.bias', 'model.layers.7.self_attn.k_proj.weight', 'model.layers.19.self_attn.v_proj.weight', 'model.layers.31.self_attn.q_proj.bias', 'model.layers.18.self_attn.v_proj.weight', 'model.layers.1.self_attn.k_proj.bias', 'model.layers.8.self_attn.v_proj.weight', 'model.layers.22.self_attn.v_proj.weight', 'model.layers.24.self_attn.k_proj.weight', 'model.layers.31.self_attn.v_proj.bias', 'model.layers.13.self_attn.k_proj.weight', 'model.layers.8.self_attn.k_proj.weight', 'model.layers.28.self_attn.q_proj.bias', 'model.layers.13.self_attn.v_proj.bias', 'model.layers.2.self_attn.v_proj.weight', 'model.layers.18.self_attn.q_proj.bias', 'model.layers.8.self_attn.q_proj.bias', 'model.layers.9.self_attn.k_proj.bias', 'model.layers.9.self_attn.k_proj.weight', 'model.layers.22.se

generation_config.json:   0%|          | 0.00/74.0 [00:00<?, ?B/s]

**Define Hyperparameters**

In [49]:
train_dataset_path = '/kaggle/input/coco2017-clip-image-embeddings/coco_embeddings_clip_vision_1x768'
captions_path = '/kaggle/input/coco-2017-dataset/coco2017/annotations/captions_train2017.json'
captions_key = 'annotations'
batch_size = 1
device = 'cuda' if torch.cuda.is_available() else 'cpu'
val_split_size = 0.1


**Define train dataset and train dataloader**

In [None]:
files_list, images_ids_list = get_absolute_paths(train_dataset_path)
rand_indices = np.arange(len(all_filepaths))
np.random.shuffle(rand_indices)

val_split = int(len(all_filepaths)*val_split_size)

val_filepaths, train_filepaths = files_list[rand_indices[:split]], files_list[rand_indices[split:]] 
val_image_ids, train_image_ids = images_ids_list[rand_indices[:split]], images_ids_list[rand_indices[split:]]

print(f"Train dataset size: {len(train_filepaths)}")
print(f"Valid dataset size: {len(valid_filepaths)}")

In [9]:
train_ds = PickleDataset(train_dataset_path, captions_path, captions_key, phi_tokenizer)


100%|██████████| 118285/118285 [00:01<00:00, 68370.61it/s]


In [10]:
#val_ds = HindiAestheticsDataset(val_dataset_path, tokenizer, block_size=block_size)
train_dataloader = DataLoader(dataset = train_ds,
                              batch_size = batch_size,
                              num_workers = 1,
                              collate_fn = None,
                              shuffle = True)
#val_dataloader = DataLoader(dataset = val_ds,
#                            batch_size = 1,
#                            num_workers = 1,
#                            collate_fn = None,
#                            shuffle = False)

In [11]:
cc = next(iter(train_dataloader))
input_embeds = cc['image_embeddings'].to(device)

In [12]:
input_embeds.shape

torch.Size([1, 768, 1])

In [37]:
multimodal_gpt_model = MultiModalGPT(phi_model, phi_tokenizer, 1, 2560, device, hidden_size = 32)
multimodal_gpt_model = multimodal_gpt_model.to(device)

In [38]:
outputs = multimodal_gpt_model(input_embeds)


beginning of projection: torch.Size([1, 768, 1])
end of projection: torch.Size([1, 768, 2560])
end of llm: logits: torch.Size([1, 768, 51200])


In [42]:
multimodal_gpt_model.llm_model.max_seq_length

AttributeError: 'PhiForCausalLM' object has no attribute 'max_seq_length'

In [15]:
logits, preds = outputs

In [16]:
print(logits.shape) 

torch.Size([1, 768, 51200])


In [23]:
preds[0][0].shape

torch.Size([1, 32, 768, 80])

In [24]:
multimodal_gpt_model

MultiModalGPT(
  (llm_model): PhiForCausalLM(
    (model): PhiModel(
      (embed_tokens): Embedding(51200, 2560)
      (embed_dropout): Dropout(p=0.0, inplace=False)
      (layers): ModuleList(
        (0-31): 32 x PhiDecoderLayer(
          (self_attn): PhiAttention(
            (query_key_value): Linear(in_features=2560, out_features=7680, bias=True)
            (dense): Linear(in_features=2560, out_features=2560, bias=True)
            (attention_dropout): Dropout(p=0.0, inplace=False)
            (rotary_emb): PhiRotaryEmbedding()
          )
          (mlp): PhiMLP(
            (activation_fn): NewGELUActivation()
            (fc1): Linear(in_features=2560, out_features=10240, bias=True)
            (fc2): Linear(in_features=10240, out_features=2560, bias=True)
          )
          (input_layernorm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (final_layernorm): LayerNorm((2560,), eps=1e-

In [36]:
#del phi_model
print(torch.cuda.memory_allocated())
multimodal_gpt_model.to("cpu")
gc.collect()
#del input_embeds
#del phi_model
torch.cuda.empty_cache()
print(torch.cuda.memory_allocated())
#print(torch.cuda.memory_reserved())

13539339264
1172442112


In [None]:
input_embeds

In [None]:
output = multimodal_gpt_model(input_embeds)

In [None]:
!nvidia-smi