In [10]:
import os
import av
import fsspec
import shutil
import numpy as np

from transformers import Trainer, TrainingArguments, Seq2SeqTrainingArguments, DataCollatorForLanguageModeling
from transformers import AutoProcessor, BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from transformers import BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor

import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from huggingface_hub import snapshot_download, hf_hub_download, HfFileSystem
from datasets import load_dataset, concatenate_datasets


MAX_LENGTH = 512
BATCH_SIZE = 4
NUM_FRAMES = 16 # more frames -> more VRAM needed
DATASET_PATH = ""
OUTPUT_DIR = "hamidreza_files/checkpoints/run_aug_dataset/"
MODEL_ID = "llava-hf/LLaVa-NeXT-Video-7b-hf"
REPO_ID = "hamidra"

USE_LORA = False
USE_QLORA = True

In [11]:
def collate_fn(example, caption):
    processor = AutoProcessor.from_pretrained(MODEL_ID, use_fast=False)
    processor.tokenizer.padding_side = "right" 
    video_clip = example # change to the video decoder you want
    answer_caption = caption['answer']
    # ped_look = caption['look']
    # ped_action = caption['action']
    # ped_bbox = caption['bbox']

    conversation = [
            {
          "role": "user",
          "content": [
              {"type": "text", "text": "Above are 16 frames of a driving scenario captured by the ego vehicle camera based on the video taken, in which the pedestrian of interest is located with a red bounding box "},
            #   {"type": "text", "text": "The normalized bounding boxes of the pedestrian in these consequetive 16 frames are provided as follows in 8 lists each containing 4 elements, with format of [x1, y1, x2, y2] in which x1 and y1 are coordinates of top left corner and x2 and y2 are coordinates of the bottom right corner of the bounding box: "},
            #   {"type": "text", "text": ped_bbox + ". "},
            #   {"type": "text", "text": ped_look + ". "},
            #   {"type": "text", "text": ped_action + ". "},
              {"type": "text", "text": "Using these frames, provided context and pedestrian bounding box, and your reasoning, answer the following question only with ‘Yes’ or ‘No’. You may use your knowledge if needed. DO NOT EXPLAIN your reasoning, be confident.\nQuestion: Does the indicated pedestrian intend to cross the intersection in future frames of this video?"},
              {"type": "video"},
              ],
            },
            {
                "role": "assistant",
                "content": [
                    {"type": "text", "text": answer_caption},
                     ],
            },
        ]
    # conversation = [
    #         {
    #             "role": "user",
    #             "content": [
    #                 {"type": "text", "text": "Provide a detailed caption for this video."},
    #                 {"type": "video"},
    #                 ],
    #         },
    #         {
    #             "role": "assistant",
    #             "content": [
    #                 {"type": "text", "text": caption},
    #                  ],
    #         },
    #     ]

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=False)

    batch = processor(
        text=prompt,
        videos=video_clip,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt"
    )

    return batch

In [12]:
processor = AutoProcessor.from_pretrained(MODEL_ID, use_fast=False)
processor.tokenizer.padding_side = "right" 

In [8]:
import json
file_path = 'hamidreza_files/new_ped_dict/ped_train_dict.json'
with open(file_path, 'r') as json_file:
    ped_dict = json.load(json_file)

In [8]:


# test_ped = ped_dict['1_1_1']
# ped_answer = test_ped['action_binary']

# ped_bbox_list = test_ped['ped_bbox_list']
# ped_bbox_list = np.array(ped_bbox_list)
# # down-sampling 
# total_frames = len(ped_bbox_list)
# indices = np.arange(0, total_frames, total_frames / NUM_FRAMES).astype(int)
# ped_bbox_list = ped_bbox_list[indices]
# w_scale = 960
# h_scale = 540
# ped_bbox_list[:, [0, 2]] /= w_scale
# ped_bbox_list[:, [1, 3]] /= h_scale

# ped_bbox_str = " ".join(map(str, ped_bbox_list))
# ped_answer_str = "Yes" if ped_answer == 1 else "No"

# # ped_gesture = test_ped['avg_gesture']

# # ped_gesture_str = "The gesture of the pedestrian in this video is " + ped_gesture

# ped_action = test_ped['avg_action']
# ped_action_str = "The indicated pedestrian is in the " + ped_action + " position"

# ped_look = test_ped['avg_look']
# ped_look_str = "Moreover, the pedestrian is " + ped_look + " in the direction of the ego-vehicle"

# caption = {'bbox': ped_bbox_str, 'answer': ped_answer_str, 'look': ped_look_str, 
#            'action': ped_action_str}

# print(test_caption)
# print(test_clip.shape)

# x = collate_fn(test_clip, caption)
# print(type(x))
# x.keys()

In [15]:
test_clip = np.load('hamidreza_files/dataset_new_16frames/train_clips/1_1_1.npy')

In [16]:
test_clip.shape

(16, 672, 672, 3)

## Converting .npy to huggingface dataset format for fine-tuning 

In [9]:
# converting the costum dataset into hf dataset format
import os
import pandas as pd
from datasets import Dataset, DatasetDict, Features, ClassLabel, Value
import json
from tqdm import tqdm

file_path = 'hamidreza_files/new_ped_dict/ped_val_dict.json'
with open(file_path, 'r') as json_file:
    ped_dict = json.load(json_file)

# reading npy clips from the dir, finding corresponding lable for that video, getting the output of collate_fn and concatenate them
base_dir = 'hamidreza_files/dataset_new_16frames/original/val_clips/'
npy_files_list = os.listdir(base_dir)

datasets_combined = []
for i, nfl in tqdm(enumerate(npy_files_list)):
    ped_id = nfl[:-4]
    cur_ped = ped_dict[ped_id]
    ped_answer = cur_ped['action_binary']
    
    # only capturing negative samples
    if ped_answer == 1.0:
        continue

    file_dir = base_dir + nfl
    pie_clip = np.load(file_dir)

    

    # ped_bbox_list = cur_ped['ped_bbox_list']
    # ped_bbox_list = np.array(ped_bbox_list)
    # down-sampling 
    # total_frames = len(ped_bbox_list)
    # indices = np.arange(0, total_frames, total_frames / NUM_FRAMES).astype(int)
    # ped_bbox_list = ped_bbox_list[indices]
    # w_scale = 672
    # h_scale = 672
    # ped_bbox_list[:, [0, 2]] /= w_scale
    # ped_bbox_list[:, [1, 3]] /= h_scale

    # ped_bbox_str = " ".join(map(str, ped_bbox_list))
    ped_answer_str = "Yes" if ped_answer == 1.0 else "No"

    # ped_gesture = test_ped['avg_gesture']

    # ped_gesture_str = "The gesture of the pedestrian in this video is " + ped_gesture

    # ped_action = cur_ped['avg_action']
    # ped_action_str = "The indicated pedestrian is in the " + ped_action + " position"

    # ped_look = cur_ped['avg_look']
    # ped_look_str = "Moreover, the pedestrian is " + ped_look + " in the direction of the ego-vehicle"

    # caption = {'bbox': ped_bbox_str, 'answer': ped_answer_str, 'look': ped_look_str, 
    #         'action': ped_action_str}
    caption = {'answer': ped_answer_str}
    # caption = ped_answer
    
    collate = collate_fn(pie_clip, caption)

    hf_dataset = Dataset.from_dict(collate)
    datasets_combined.append(hf_dataset)

val_n_original_dataset = concatenate_datasets(datasets_combined).with_format("torch")


0it [00:00, ?it/s]

243it [02:32,  1.59it/s]


20it [00:02,  7.41it/s]


In [16]:
x

Dataset({
    features: ['input_ids', 'attention_mask', 'pixel_values_videos'],
    num_rows: 1
})

In [None]:
x = x.map(unsqueeze_pixel_values)

In [2]:
# converting the costum dataset into hf dataset format
import os
import pandas as pd
from datasets import Dataset, DatasetDict, Features, ClassLabel, Value
import json
from tqdm import tqdm

file_path = 'hamidreza_files/new_ped_dict/ped_val_dict.json'
with open(file_path, 'r') as json_file:
    ped_dict = json.load(json_file)

# reading npy clips from the dir, finding corresponding lable for that video, getting the output of collate_fn and concatenate them
base_dir = 'hamidreza_files/dataset_new_16frames/original/val_clips/'
npy_files_list = os.listdir(base_dir)
p_ped_id = []
for i, nfl in tqdm(enumerate(npy_files_list)):
    ped_id = nfl[:-4]
    cur_ped = ped_dict[ped_id]
    ped_answer = cur_ped['action_binary']
    if ped_answer == 0.0:
        continue
    p_ped_id.append(ped_id)
print(p_ped_id)

243it [00:00, 1196262.76it/s]

['5_1_1749', '6_4_1845', '6_2_1770', '6_9_1970', '6_2_1765', '6_2_1777', '6_9_1986', '6_4_1842', '6_2_1778', '6_1_1757', '6_9_1975', '6_7_1905', '6_9_1977', '6_4_1844', '6_4_1835', '6_2_1780', '6_8_1919', '6_9_1983', '6_6_1891', '6_9_1944', '6_9_1971', '6_5_1862', '6_9_1940', '6_9_1976', '6_6_1893', '6_9_1981', '6_2_1771', '6_8_1920', '6_2_1803', '6_2_1764', '6_7_1899', '6_9_1984', '6_4_1829', '6_4_1833', '5_1_1746', '6_7_1911', '6_7_1910', '5_1_1744', '6_9_1950', '6_4_1851', '6_2_1786', '6_6_1884', '6_7_1909', '6_4_1823', '6_5_1861', '6_4_1826', '6_2_1801', '6_2_1782', '6_6_1885', '6_9_1985', '6_3_1815', '6_7_1917', '6_7_1906', '6_7_1903', '6_2_1789', '6_2_1790', '6_9_1955', '6_5_1863', '6_4_1836', '6_9_1962', '5_1_1739', '6_8_1922', '6_9_1978', '6_2_1784', '5_1_1732', '6_7_1898', '6_3_1809', '6_8_1925', '6_9_1953', '6_9_1980', '6_3_1814', '6_5_1857', '6_9_1966', '6_4_1830', '6_4_1832', '6_9_1979', '6_6_1887', '6_6_1879', '6_2_1797', '6_8_1934', '6_9_1967', '6_5_1864', '6_2_1805', '6_




In [4]:
p_ped_id[12], p_ped_id[19], p_ped_id[21]

('6_9_1977', '6_9_1944', '6_5_1862')

In [10]:
print(val_n_original_dataset[0].keys())
print(val_n_original_dataset[0]['pixel_values_videos'].shape)

dict_keys(['input_ids', 'attention_mask', 'pixel_values_videos'])
torch.Size([16, 3, 336, 336])


In [11]:
len(val_n_original_dataset)

62

In [17]:
from tqdm import tqdm

def unsqueeze_pixel_values(example):
    for k in example.keys():
        if example[k].shape[0] == 1:
            continue
        else:
            example[k] = example[k].unsqueeze(0)
    return example

x = x.map(unsqueeze_pixel_values)

# splitted_train = train_p_warp_dataset.train_test_split(test_size=0.5)
# t1 = splitted_train['train']
# t2 = splitted_train['test']


Map: 100%|██████████| 1/1 [00:00<00:00,  2.51 examples/s]


In [9]:
# t1 = t1.map(unsqueeze_pixel_values)
# t2 = t2.map(unsqueeze_pixel_values)
# train_p_warp_dataset = concatenate_datasets([t1, t2]).with_format("torch")

Map: 100%|██████████| 319/319 [01:50<00:00,  2.90 examples/s]
Map: 100%|██████████| 319/319 [01:50<00:00,  2.89 examples/s]


In [18]:
# saving the hf dataset in local dir
save_dir = 'hamidreza_files/hf_datasets_16frames/tmp/'
x.save_to_disk(save_dir)

Saving the dataset (1/1 shards): 100%|██████████| 1/1 [00:00<00:00, 66.58 examples/s]


In [7]:
# processor.batch_decode(val_dataset[0]["input_ids"])

In [67]:
from tqdm import tqdm
import torch

# # Assuming 'pixel_values_videos' is a tensor and you want to unsqueeze it
# for i in tqdm(range(len(dataset_list['pixel_values_videos']))):
#     dataset_list['pixel_values_videos'][i] = dataset_list['pixel_values_videos'][i].unsqueeze(0)

# test_dataset = Dataset.from_dict(dataset_list)

In [24]:
# saving the hf dataset in local dir
# save_dir = 'hamidreza_files/hf_datasets_16frames/val/'
# val_dataset.save_to_disk(save_dir)

Saving the dataset (11/11 shards): 100%|██████████| 243/243 [00:04<00:00, 50.06 examples/s]


In [17]:
# from datasets import load_from_disk
# test = load_from_disk('hamidreza_files/hf_datasets_16frames/val')


In [8]:

# processor.batch_decode(test[100]["input_ids"])

Map: 100%|██████████| 243/243 [00:37<00:00,  6.45 examples/s]


In [9]:
# splitted_train = train_dataset.train_test_split(test_size=0.5)

In [11]:
# t1, t2 = splitted_train['train'], splitted_train['test']
# t1 = t1.map(unsqueeze_pixel_values)
# t2 = t2.map(unsqueeze_pixel_values)

Map: 100%|██████████| 439/439 [01:11<00:00,  6.12 examples/s]
Map: 100%|██████████| 439/439 [01:09<00:00,  6.28 examples/s]


In [12]:
# train_dataset = concatenate_datasets([t1, t2]).with_format("torch")

dict_keys(['input_ids', 'attention_mask', 'pixel_values_videos'])
torch.Size([1, 8, 3, 336, 336])


In [23]:
save_dir = 'hamidreza_files/dataset_augmented/test/'
val_dataset.save_to_disk(save_dir)

Saving the dataset (6/6 shards): 100%|██████████| 243/243 [00:01<00:00, 185.06 examples/s]


In [42]:
class LlavaNextVideoDataCollatorWithPadding:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, features):
        padded_inputs = self.processor.tokenizer.pad(
            {
                "input_ids": [feat['input_ids'][0] for feat in features], # each element is one batch only so we slice [0]
                "attention_mask": [feat['attention_mask'][0] for feat in features],
            },
            padding=True,
            return_tensors="pt",
        )

        labels = padded_inputs["input_ids"].clone()
        labels[labels == self.processor.tokenizer.pad_token_id] = -100
        padded_inputs["labels"] = labels
        padded_inputs["pixel_values_videos"] = torch.cat([feat['pixel_values_videos'] for feat in features], dim=0)

        return padded_inputs

In [11]:
# example: seeing one of the videos in training set
from matplotlib import pyplot as plt
from matplotlib import animation
from IPython.display import HTML

# convert to image from proceessed tensors
example = x
clip = example["pixel_values_videos"][0] * 255
print(type(clip))
clip = clip.permute(0, 2, 3, 1).clamp(0, 255)

# np array with shape (frames, height, width, channels)
video = np.array(clip).astype(np.uint8)

fig = plt.figure()
im = plt.imshow(video[0,:,:,:])

plt.close() # this is required to not display the generated image

def init():
    im.set_data(video[0,:,:,:])

def animate(i):
    im.set_data(video[i,:,:,:])
    return im

anim = animation.FuncAnimation(fig, animate, init_func=init, frames=video.shape[0],
                               interval=100)
HTML(anim.to_html5_video())

IndexError: too many indices for tensor of dimension 5

In [43]:
## Load model
# Three options for training, from the lowest precision training to the highest precision training:
# QLoRA: model uses 4-bit quantization, which helps in reducing memory usage while maintaining performance.
# Standard LoRA:  model is loaded with standard LoRA adaptations.
# Full Fine-Tuning: no memory optimization are done. In that case Flash Attention is used to speed up training, if hardware supports it.

if USE_QLORA or USE_LORA:
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
    model = LlavaNextVideoForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
        device_map="auto",
    )
else:
    # for full fine-tuning, we can speed up the model using Flash Attention
    # only available on certain devices, see https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#installation-and-features
    model = LlavaNextVideoForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        _attn_implementation="flash_attention_2",
        device_map="auto",
    )

You are using a model of type llava_next to instantiate a model of type llava_next_video. This is not supported for all configurations of models and can yield errors.
Unrecognized keys in `rope_scaling` for 'rope_type'='linear': {'type'}
Downloading shards: 100%|██████████| 3/3 [02:49<00:00, 56.53s/it]
Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.05s/it]


In [44]:
# merging train and test hf datasets and change the split size
new_train_dataset = concatenate_datasets([train_dataset, test_dataset]).with_format("torch")


In [60]:
save_dir = 'hamidreza_files/hf_datasets/new_test/'
val_dataset.save_to_disk(save_dir)

Saving the dataset (6/6 shards): 100%|██████████| 243/243 [00:01<00:00, 133.97 examples/s]


: 

In [50]:
def find_all_linear_names(model):
    cls = torch.nn.Linear
    lora_module_names = set()
    multimodal_keywords = ['multi_modal_projector', 'vision_model']
    for name, module in model.named_modules():
        if any(mm_keyword in name for mm_keyword in multimodal_keywords):
            continue
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    if 'lm_head' in lora_module_names: # needed for 16-bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)


lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=find_all_linear_names(model),
    init_lora_weights="gaussian",
)

model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

In [53]:
OUTPUT_DIR = 'hamidreza_files/checkpoints/'
args = TrainingArguments(

    # args related to training
    output_dir = OUTPUT_DIR,
    eval_strategy = 'steps',
    eval_steps=100,
    num_train_epochs=2,
    per_device_train_batch_size = BATCH_SIZE,
    per_device_eval_batch_size = BATCH_SIZE,
    gradient_accumulation_steps = 8,
    learning_rate = 2e-05,
    max_steps = 1000, # adjust this depending on your dataset size
    lr_scheduler_type = 'cosine',
    warmup_ratio = 0.1,
    # num_train_epochs=2,
    # args related to eval/save
    logging_steps = 100,
    save_strategy = 'steps',
    save_steps=100,
    save_total_limit = 1,
    fp16 = True, # we have the model train and eval with fp16 precision
    fp16_full_eval = True,
    optim = 'adamw_bnb_8bit', # adam in lower-bits to save memory, consider changing to 'adamw_torch' if model is not converging
    report_to = "wandb", # install wand to use this
    hub_model_id = REPO_ID,
    push_to_hub = True, # wel'll push the model to hub after each epoch

    # model that was wrapped for QLORA training with peft will not have arguments listed in its signature
    # so we need to pass lable names explicitly to calculate val loss
    label_names=["labels"],
    dataloader_num_workers=4, # let's get more workers since iterating on video datasets might be slower in general
)

torch.Size([1, 62])

In [57]:
trainer = Trainer(
    model = model,
    tokenizer = processor,
    data_collator = LlavaNextVideoDataCollatorWithPadding(processor=processor),
    train_dataset = new_train_dataset['train'],
    eval_dataset = new_train_dataset['test'],
    args=args,
)

trainer.train()

max_steps is given, it will override any value given in num_train_epochs


[2024-08-14 16:29:47,488] [INFO] [real_accelerator.py:158:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mha-alikhani-k[0m ([33mha-alikhani-k-uc[0m). Use [1m`wandb login --relogin`[0m to force relogin


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Step,Training Loss,Validation Loss


KeyboardInterrupt: 

In [25]:
# base_model = AutoModelForCausalLM.from_pretrained(“base_model”, load_in_8bit=True, torch_dtype=torch.float16, device_map=“auto”)
from peft import PeftModel

lora_config = LoraConfig(
    r=8,
    lora_alpha=8,
    lora_dropout=0.1,
    target_modules=find_all_linear_names(model),
    init_lora_weights="gaussian",
)

base_model = LlavaNextVideoForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
        device_map="auto",
    )
model_to_merge = PeftModel.from_pretrained(LlavaNextVideoForConditionalGeneration.from_pretrained(base_model).to(“cuda”), lora_adapter)

NameError: name 'model' is not defined