In [1]:
# !pip install -q -U trl transformers accelerate git+https://github.com/huggingface/peft.git
# !pip install -q datasets bitsandbytes einops wandb

In [2]:
import pandas as pd

In [3]:
import torch
from transformers import  AutoTokenizer
from transformers import AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig

device = 'cuda:1'

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    
)

model_name = "microsoft/phi-2"
phi2_model_pretrained = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,  
    quantization_config=bnb_config, 
    device_map={"": 1}
)

phi2_model_pretrained.config.use_cache = False


tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.bos_token = tokenizer.eos_token


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
from undecorated import undecorated
from types import MethodType

In [5]:
# generate_with_grad = undecorated(phi2_model_pretrained.generate)
# phi2_model_pretrained.generate_with_grad = MethodType(generate_with_grad, phi2_model_pretrained)

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CustomGELU(nn.Module):
    def forward(self, x):
        return F.gelu(x.clone())
    
class SimpleResBlock(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.pre_norm = nn.LayerNorm(input_size)
        self.proj = nn.Sequential(
            nn.Linear(input_size, input_size),
            nn.GELU(),
            nn.Linear(input_size, input_size)
        )
    def forward(self, x):
        x = self.pre_norm(x)
        return x + self.proj(x)
    
class Phi2wrapper(nn.Module):
    
    #This defines the structure of the NN.
    def __init__(self, input_dim_CLIP=768, input_dim_phi2=2560, 
                 phi2_model=phi2_model_pretrained, 
                tokenizer=tokenizer, teacher_forcing = 3, device=device):
        
        super(Phi2wrapper, self).__init__()

        self.input_dim_CLIP = input_dim_CLIP
        self.input_dim_phi2 = input_dim_phi2
        self.projection_img = nn.Linear(self.input_dim_CLIP, self.input_dim_phi2, 
                                        bias=False)
                                                                                                                                                           
        self.resblock = SimpleResBlock(self.input_dim_phi2)
        self.phi2_model = phi2_model
        self.tokenizer = tokenizer

        self.device = device
        self.teacher_forcing = teacher_forcing

        bos = self.tokenizer("Context: ", return_tensors="pt", return_attention_mask=False)
        eoi = self.tokenizer(" Question: ", return_tensors="pt", return_attention_mask=False)
        eoa = self.tokenizer(" Answer: ", return_tensors="pt", return_attention_mask=False)
    
        self.bos_embedding = self.phi2_model.get_input_embeddings()(bos.input_ids.to(self.device)).squeeze(0)
        self.eoi_embedding = self.phi2_model.get_input_embeddings()(eoi.input_ids.to(self.device)).squeeze(0)
        self.eoa_embedding = self.phi2_model.get_input_embeddings()(eoa.input_ids.to(self.device)).squeeze(0)
        
        self.eos_embedding = self.phi2_model.get_input_embeddings()(torch.tensor(self.tokenizer.eos_token_id).to(self.device)).unsqueeze(0)

    def forward(self, x, input_q, target_a):

        x = x.detach()

        x = self.projection_img(x)
        x = self.resblock(x)

        batch_size = x.shape[0]
        
        input_q_embedding = self.phi2_model.get_input_embeddings()(input_q)
        
        x = torch.cat((self.bos_embedding.repeat(batch_size,1,1), x, 
                    self.eoi_embedding.repeat(batch_size,1,1), input_q_embedding, 
                    self.eoa_embedding.repeat(batch_size,1,1)), dim=1).detach()
                
        loss = 0 
        word_output_pred_tokens = None
        
        # iterate over max_answer length
        for idx in range(target_a.shape[1]): 
            
            # print(f'Input shape {x.shape}')
            out_phi = self.phi2_model.base_model.model.model.layers[0](x.to(torch.float16))
            
            for layer_idx in range(1, 32): 
                out_phi = self.phi2_model.base_model.model.model.layers[layer_idx](out_phi[0])
                
            out_phi = self.phi2_model.base_model.model.model.final_layernorm(out_phi[0])
            out_phi = self.phi2_model.base_model.model.lm_head(out_phi) ## torch.Size([batch, 55, 50297])
            
            next_word = torch.argmax(out_phi[:, -1, :], dim=-1) ## [batch]
            
            caption_word_token = target_a[:,idx].detach()
            caption_word_embedding = self.phi2_model.get_input_embeddings()(next_word).unsqueeze(1)
            
            ## instead of append like instruct image output words.. instruct image w1 out, instruct image w2 output ..
            x = torch.cat((x, caption_word_embedding), dim=1)
            
            loss_val = F.cross_entropy(out_phi[:, -1, :], caption_word_token, 
                        ignore_index=self.tokenizer.pad_token_id)
            
            loss += loss_val
            
            if word_output_pred_tokens is None: 
                word_output_pred_tokens = next_word.unsqueeze(1) 
            else:

                word_output_pred_tokens = torch.cat((word_output_pred_tokens, next_word.unsqueeze(1)), dim=1)
                
        loss_result = loss/idx

        return loss_result, word_output_pred_tokens

In [7]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as F

# class CustomGELU(nn.Module):
#     def forward(self, x):
#         return F.gelu(x.clone())
    
# class SimpleResBlock(nn.Module):
#     def __init__(self, input_size):
#         super().__init__()
#         self.pre_norm = nn.LayerNorm(input_size)
#         self.proj = nn.Sequential(
#             nn.Linear(input_size, input_size),
#             nn.GELU(),
#             nn.Linear(input_size, input_size)
#         )
#     def forward(self, x):
#         x = self.pre_norm(x)
#         return x + self.proj(x)
    
# class Phi2wrapper(nn.Module):
    
#     #This defines the structure of the NN.
#     def __init__(self, input_dim_CLIP=768, input_dim_phi2=2560, 
#                  phi2_model=phi2_model_pretrained, 
#                 tokenizer=tokenizer, teacher_forcing = 3, device=device):
        
#         super(Phi2wrapper, self).__init__()

#         self.input_dim_CLIP = input_dim_CLIP
#         self.input_dim_phi2 = input_dim_phi2
#         self.projection_img = nn.Linear(self.input_dim_CLIP, self.input_dim_phi2, 
#                                         bias=False)
                                                                                                                                                           
#         self.resblock = SimpleResBlock(self.input_dim_phi2)
#         self.phi2_model = phi2_model
#         self.tokenizer = tokenizer

#         self.device = device
#         self.teacher_forcing = teacher_forcing

#         bos = self.tokenizer("Image: ", return_tensors="pt", return_attention_mask=False)
#         eoi = self.tokenizer(" Question: ", return_tensors="pt", return_attention_mask=False)
#         eoa = self.tokenizer(" Answer: ", return_tensors="pt", return_attention_mask=False)
    
#         self.bos_embedding = self.phi2_model.get_input_embeddings()(bos.input_ids.to(self.device)).squeeze(0)
#         self.eoi_embedding = self.phi2_model.get_input_embeddings()(eoi.input_ids.to(self.device)).squeeze(0)
#         self.eoa_embedding = self.phi2_model.get_input_embeddings()(eoa.input_ids.to(self.device)).squeeze(0)
        
#         self.eos_embedding = self.phi2_model.get_input_embeddings()(torch.tensor(self.tokenizer.eos_token_id).to(self.device)).unsqueeze(0)

#     def forward(self, x, input_q, target_a):

#         x = self.projection_img(x)
#         x = self.resblock(x)

#         batch_size = x.shape[0]
        
#         #### TODO: Remove pad from question
#         input_q_embedding = self.phi2_model.get_input_embeddings()(input_q)
        
#         x = torch.cat((self.bos_embedding.repeat(batch_size,1,1), x, 
#                     self.eoi_embedding.repeat(batch_size,1,1), input_q_embedding, 
#                     self.eoa_embedding.repeat(batch_size,1,1)), dim=1)
        
#         prediction = self.phi2_model.base_model.generate(inputs_embeds=x.to(torch.float16), 
#                                      max_new_tokens=target_a.shape[1]+1, 
#                                      output_scores=True, return_dict_in_generate = True, 
#                                      pad_token_id=self.tokenizer.eos_token_id,
#                                      bos_token_id=self.tokenizer.bos_token_id)
        
#         print(self.phi2_model.base_model.model.model.layers[0].self_attn.q_proj.lora_A.default.weight)
# #         print(f'Predict shape {len(prediction.scores)},{prediction.keys()}')
# #         print(f'q: {self.tokenizer.batch_decode(input_q)}, shape {input_q.shape}')
# #         print(f'gt: {self.tokenizer.batch_decode(target_a)}, shape {target_a.shape}')
# #         print(f't: {self.tokenizer.batch_decode(prediction.sequences)}')
        
#         prediction_scores = torch.cat(prediction['scores'], axis=0)[1:, :]
#         target_a = target_a[0, :]
        
#         loss = F.cross_entropy(prediction_scores, target_a, 
#                         ignore_index=self.tokenizer.pad_token_id)
#         prediction_tokens = torch.argmax(prediction_scores, dim=-1)
        
#         return loss, prediction_tokens

## Dataset

In [8]:
# ###### DATASET 
# import json 

# llava_json_fname = '/media/App/amaranth/lavanya/Capstone_data/llava_instruct_150k.json'

# # Opening JSON file
# f = open(llava_json_fname)
 
# # returns JSON object as 
# # a dictionary
# llava_json = json.load(f)
 
# # Closing file
# f.close()

In [9]:
# import pandas as pd 

# llava_json_df  = []

# for l_entry in llava_json:
#     img_id = l_entry['id']
#     conversations = l_entry['conversations']
#     len_conv = len(conversations)
    
#     ## the first one is human, the second one is gpt 
#     for idx in range(0, len_conv, 2): 
#         Q_human = conversations[idx]['value'].replace('<image>', '').replace('\n', '')
#         A_GPT = conversations[idx+1]['value']
#         llava_json_df.append([img_id, Q_human, A_GPT])
        
# #         if conversations[idx]['from'] != 'human': 
# #             print(conversations[idx])
# #         if conversations[idx+1]['from'] != 'gpt': 
# #             print(conversations[idx+1])

# llava_json_df = pd.DataFrame(data=llava_json_df, columns = ['image_id', 'Q_human', 'A_GPT'])

In [10]:
# def get_tokens_text(img_caption): 
#     img_caption_tokenized = tokenizer(img_caption, return_tensors="pt", 
#                                                return_attention_mask=False).input_ids.squeeze(0).tolist()
#     return img_caption_tokenized
                                           
# llava_json_df['Q_human_tokenized'] = llava_json_df['Q_human'].apply(lambda x: get_tokens_text(x))
# llava_json_df['A_GPT_tokenized'] = llava_json_df['A_GPT'].apply(lambda x: get_tokens_text(x))

In [11]:
# llava_json_df.to_csv('/media/App/amaranth/lavanya/Capstone_data/llava_instruct_150k_df.csv')

In [12]:
llava_json_df = pd.read_csv('/media/App/amaranth/lavanya/Capstone_data/llava_instruct_150k_df.csv')
# llava_json_df.head()

In [13]:
import ast

# llava_json_df['len_token_q'] = llava_json_df['Q_human_tokenized'].apply(lambda x: len(ast.literal_eval(x)))
# llava_json_df['len_token_a'] = llava_json_df['A_GPT_tokenized'].apply(lambda x: len(ast.literal_eval(x)))
# llava_json_df.to_csv('/media/App/amaranth/lavanya/Capstone_data/llava_instruct_150k_df.csv')

In [14]:
# hist = llava_json_df['len_token_q'].hist()
# hist = llava_json_df['len_token_a'].hist()
# max(llava_json_df['len_token_q']), max(llava_json_df['len_token_a'])

In [15]:
max_token_len_q = 15 
max_token_len_a = 40

In [16]:
llava_data_subset = llava_json_df[(llava_json_df['len_token_q'] <= max_token_len_q) & \
                                 (llava_json_df['len_token_a'] <= max_token_len_a)]

In [17]:
llava_data_subset = llava_data_subset.sort_values(by=['len_token_a'], ascending=True)

In [18]:
###### DATASET

from torch.utils.data import Dataset, DataLoader
import pandas as pd 
import json
import os 
import h5py

import h5py    
import numpy as np    

class LLAVA_150k_Dataset_Instruct(Dataset):

    def __init__(
        self, embedding_path, llava_json_df, 
        tokenizer=tokenizer, max_token_len_q=max_token_len_q, max_token_len_a=max_token_len_a):
        
        self.embedding_path = embedding_path
        self.llava_json_df = llava_json_df
        
        self.tokenizer = tokenizer
        self.max_token_len_q = max_token_len_q
        self.max_token_len_a = max_token_len_a

    def __len__(self):
        return len(self.llava_json_df)
    
    def __getitem__(self, index):

        row = self.llava_json_df.iloc[[index]]

        df_img = row['image_id'].values[0]
        img_base_name = '0'*(12-len(str(df_img))) + str(df_img)
        img_base_name = img_base_name.replace(' ', '0')
        img_clip_embedding_path = os.path.join(self.embedding_path, f'{img_base_name}.h5')

        np_array_embed_img = h5py.File(img_clip_embedding_path,'r+')['image_features'][()]
        
        Q_human_tokenized = ast.literal_eval(row['Q_human_tokenized'].values[0])
        A_gpt_tokenized = ast.literal_eval(row['A_GPT_tokenized'].values[0])
        
#         if len(Q_human_tokenized) != self.max_token_len_q: 
#             pad_q_len = self.max_token_len_q - len(Q_human_tokenized)
#             Q_human_tokenized = Q_human_tokenized + [self.tokenizer.pad_token_id]*pad_q_len
        
#         if len(A_gpt_tokenized) != self.max_token_len_a: 
#             pad_a_len = self.max_token_len_a - len(A_gpt_tokenized)
#             A_gpt_tokenized = A_gpt_tokenized + [self.tokenizer.pad_token_id]*pad_a_len
        
        return (torch.tensor(np_array_embed_img).squeeze(0), torch.tensor(Q_human_tokenized)), torch.tensor(A_gpt_tokenized)


dataset = LLAVA_150k_Dataset_Instruct('/media/App/amaranth/lavanya/Capstone_data/clip_features_base_patch32/', 
                            llava_data_subset)

# (x1, x2), y = dataset[0]
# print(x1.shape, x2.shape, y.shape)
# phi2_projection_model(x1.unsqueeze(0).to(device), x2.unsqueeze(0).to(device), y.unsqueeze(0).to(device))

In [19]:
batch_size_train = 1
train_dataloader = DataLoader(dataset, batch_size=batch_size_train, shuffle=False, num_workers=8)
print(f'Number of batches {len(train_dataloader)}')

Number of batches 187169


In [20]:
def train(model, num_epochs, train_dataloader, optimizer):
    
    model.train()
    
    for epoch in range(num_epochs):

        print(f"Working on epoch {epoch}")

        for iteration, batch in enumerate(train_dataloader):

            optimizer.zero_grad()

            input_, input_q_ = batch[0]
            gt = batch[1] 

            loss, output_pred_tokens = model(input_.to(device), 
                                             input_q_.to(device), 
                                             gt.to(device))
            
            # loss.backward(retain_graph=True)
            # loss.requires_grad = True
            loss.backward()
            optimizer.step()

            if (iteration % 1000) == 0: 
                print("Iteration:", iteration, " Loss:", loss)
                print("Question:", tokenizer.batch_decode(input_q_))
                print("Predictions:", tokenizer.batch_decode(output_pred_tokens))
                print("Gt answer:", tokenizer.batch_decode(gt))

        print("")
        print(f"Epoch {epoch} finished")
        print("")

In [21]:
import peft
from peft import LoraConfig

lora_alpha = 16
lora_dropout = 0.1
lora_r = 64

peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "dense",
        "fc1",
        "fc2",
    ]
)


In [22]:
import copy
torch.set_grad_enabled(True)  

peft_model = peft.get_peft_model(phi2_model_pretrained, peft_config)
peft_model.print_trainable_parameters()

trainable params: 94,371,840 || all params: 2,874,055,680 || trainable%: 3.2835773035545364


In [23]:
phi2_projection_model = Phi2wrapper(input_dim_CLIP=768, input_dim_phi2=2560, 
                 phi2_model=peft_model, 
                tokenizer=tokenizer, teacher_forcing = 3, device=device).to(device=device)

# load projection_img, resblock from stage 1 
phi2_projection_model.projection_img.load_state_dict(torch.load('/media/App/amaranth/lavanya/Capstone_data/weights/p1_epoch_1_iteration_5906.pth'))
phi2_projection_model.resblock.load_state_dict(torch.load('/media/App/amaranth/lavanya/Capstone_data/weights/p2_epoch_1_iteration_5906.pth'))

optimizer = optimizer = torch.optim.Adam(phi2_projection_model.parameters(), 
                            lr=1e-5, eps=1e-9) 

In [24]:
# for name, param in phi2_projection_model.named_parameters():
#     if param.requires_grad: 
#         print(name)

In [25]:
print(f'Len train_dataloader {len(train_dataloader)}')

Len train_dataloader 187169


In [26]:
%time train(phi2_projection_model, num_epochs=1, train_dataloader=train_dataloader, optimizer=optimizer)

Working on epoch 0
Iteration: 0  Loss: tensor(11.9609, device='cuda:1', dtype=torch.float16, grad_fn=<DivBackward0>)
Question: ['How many bears are in the image?']
Predictions: [' There']
Gt answer: ['There are']
Iteration: 1000  Loss: tensor(0.5420, device='cuda:1', dtype=torch.float16, grad_fn=<DivBackward0>)
Question: ['What color is the tennis court?']
Predictions: ['The tennis court is red.']
Gt answer: ['The tennis court is blue.']
Iteration: 2000  Loss: tensor(0.5215, device='cuda:1', dtype=torch.float16, grad_fn=<DivBackward0>)
Question: ['What colors are on the cow?']
Predictions: ['The cow is brown and brown.']
Gt answer: ['The cow is brown and white.']
Iteration: 3000  Loss: tensor(1.1006, device='cuda:1', dtype=torch.float16, grad_fn=<DivBackward0>)
Question: ['Is the motorcycle on a highway or a city street?']
Predictions: ['The motorcycle is on a city street']
Gt answer: ['The motorcycle is on a highway.']
Iteration: 4000  Loss: tensor(0.1503, device='cuda:1', dtype=torch

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 1; 47.54 GiB total capacity; 10.39 GiB already allocated; 2.00 MiB free; 10.80 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [46]:
len(llava_data_subset[34000:])

153169

In [47]:
dataset = LLAVA_150k_Dataset_Instruct('/media/App/amaranth/lavanya/Capstone_data/clip_features_base_patch32/', 
                            llava_data_subset[34000:])

In [48]:
batch_size_train = 1
train_dataloader = DataLoader(dataset, batch_size=batch_size_train, shuffle=False, num_workers=8)
print(f'Number of batches {len(train_dataloader)}')

Number of batches 153169


In [49]:
%time train(phi2_projection_model, num_epochs=1, train_dataloader=train_dataloader, optimizer=optimizer)

Working on epoch 0
Iteration: 0  Loss: tensor(0.0764, device='cuda:1', dtype=torch.float16, grad_fn=<DivBackward0>)
Question: ['How many bear cubs are in the image?']
Predictions: ['There are three bear cubs in the image.']
Gt answer: ['There are three bear cubs in the image.']
Iteration: 1000  Loss: tensor(3.0234, device='cuda:1', dtype=torch.float16, grad_fn=<DivBackward0>)
Question: ['What type of train is depicted in the image?']
Predictions: ['The train train a image train train in the image.']
Gt answer: ['The image shows a steam train traveling along the tracks.']
Iteration: 2000  Loss: tensor(3.0371, device='cuda:1', dtype=torch.float16, grad_fn=<DivBackward0>)
Question: ['What is the time shown on the clock in the image?']
Predictions: ['The clock shows the the the the the the image.']
Gt answer: ['The time shown on the clock is 3:50.']
Iteration: 3000  Loss: tensor(1.4189, device='cuda:1', dtype=torch.float16, grad_fn=<DivBackward0>)
Question: ['Is the truck moving legally th

## saving the model 


In [None]:
# for name, param in phi2_projection_model.phi2_model.base_model.named_parameters():
#     if "lora" not in name:
#         continue
#     print(f"New parameter {name:<13} | {param.numel():>5} parameters | updated")

In [None]:
f_name = f'/media/App/amaranth/lavanya/Capstone_data/weights/stage_2/run3_projection_img.pth'
torch.save(phi2_projection_model.projection_img.state_dict(), f_name) 

In [None]:
f_name = f'/media/App/amaranth/lavanya/Capstone_data/weights/stage_2/run3_resblock.pth'
torch.save(phi2_projection_model.resblock.state_dict(), f_name) 

In [None]:
f_name = f'/media/App/amaranth/lavanya/Capstone_data/weights/stage_2/run3_phi2.pth'
torch.save(phi2_projection_model.phi2_model.state_dict(), f_name) 

In [None]:
# from huggingface_hub import notebook_login
# token = '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'
# notebook_login()

In [None]:
user = "LN1996"  # put your user name here
model_name = "peft-qlora-run3"
model_id = f"{user}/{model_name}"

phi2_projection_model.phi2_model.push_to_hub(model_id)

adapter_model.safetensors:   0%|          | 0.00/378M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/LN1996/peft-qlora-run2/commit/fef582d446d2e4a92e58c825fc651f481c0e228e', commit_message='Upload model', commit_description='', oid='fef582d446d2e4a92e58c825fc651f481c0e228e', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
pass