# Imports

In [4]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "True"
os.environ['CUDA_VISIBLE_DEVICES'] = '1'
import torch
import shutil

from transformers import Trainer, TrainingArguments
from transformers import AutoProcessor, BitsAndBytesConfig, LlavaNextVideoForConditionalGeneration
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from huggingface_hub import hf_hub_download, HfFileSystem
from datasets import load_dataset, concatenate_datasets

from utils.utils import read_video_decord

# Token budget passed as `max_length` to the processor calls and to generation.
MAX_LENGTH = 256

# Hub identifiers: the base checkpoint that is loaded, and the repo the
# fine-tuned result is pushed to / reloaded from.
MODEL_ID, REPO_ID = (
    "llava-hf/LLaVa-NeXT-Video-7b-hf",
    "sm47466863/LLaVa-NeXT-Video-GENAI-FT-Project",
)

# Fine-tuning mode switches read by the model-loading cell.
USE_LORA, USE_QLORA = False, True

In [2]:
read_video_decord

<function utils.utils.read_video_decord(video_path, num_frames=16)>

## Prepare dataset

Load the dataset and use a small part of it for fine-tuning.

In [2]:
dataset = load_dataset("ShareGPT4Video/ShareGPT4Video", cache_dir='./cache/')

In [3]:
dataset

DatasetDict({
    train: Dataset({
        features: ['video_id', 'video_path', 'timestamp', 'keyframe', 'captions', 'zip_folder'],
        num_rows: 40178
    })
})

Let's create a collator function and a processor (tokenizer).

In [5]:
processor = AutoProcessor.from_pretrained(MODEL_ID, use_fast=False, cache_dir='./cache_processor/')
processor.tokenizer.padding_side = "right"

In [2]:
class LlavaNextVideoDataCollatorWithPadding:
    """Batch pre-processed LLaVA-NeXT-Video examples for causal-LM training.

    Every feature must provide ``input_ids`` and ``attention_mask`` with a
    leading singleton axis (only index ``[0]`` is read) and a
    ``pixel_values_videos`` tensor that can be concatenated along dim 0.
    """

    def __init__(self, processor):
        """Store the processor; only ``processor.tokenizer`` is used, for padding."""
        self.processor = processor

    def __call__(self, features):
        """Pad the text fields, derive the labels and stack the video tensors.

        Args:
            features: list of per-example mappings with the keys
                ``input_ids``, ``attention_mask`` and ``pixel_values_videos``.

        Returns:
            The padded batch produced by ``tokenizer.pad`` with two extra
            entries: ``labels`` (a copy of ``input_ids`` where padded
            positions are -100) and ``pixel_values_videos``.
        """
        padded_inputs = self.processor.tokenizer.pad(
            {
                "input_ids": [feat['input_ids'][0] for feat in features],
                "attention_mask": [feat['attention_mask'][0] for feat in features],
            },
            padding=True,
            return_tensors="pt",
        )

        labels = padded_inputs["input_ids"].clone()
        # Mask by attention mask rather than by token id: comparing against
        # pad_token_id would also blank out genuine tokens whenever the pad id
        # occurs inside a sequence (e.g. a tokenizer whose pad token aliases
        # another token). -100 is the default ignore_index of PyTorch's
        # cross-entropy loss.
        labels[padded_inputs["attention_mask"] == 0] = -100
        padded_inputs["labels"] = labels
        padded_inputs["pixel_values_videos"] = torch.cat(
            [feat['pixel_values_videos'] for feat in features], dim=0
        )

        return padded_inputs

def collate_fn(example, path):
    """
    Convert one dataset row into processor outputs (tokenized chat + video).

    The row's video file is looked up by file name inside ``path``, its
    whole-clip caption (the caption entry whose ``idx`` is ``'-1'``) becomes
    the assistant answer, and the resulting two-turn conversation is run
    through the global ``processor`` together with the decoded frames.

    Args:
        example: dataset row providing ``video_path`` and ``captions``.
        path: directory that contains the unpacked video files.

    Returns:
        The processor output, truncated to ``MAX_LENGTH`` tokens.

    Raises:
        IndexError: if the row has no caption entry with ``idx == '-1'``.
    """
    file_name = example["video_path"].rsplit("/", 1)[-1]
    frames = read_video_decord(f'{path}/{file_name}')

    # Only the overall caption is used; it summarises the whole clip.
    whole_clip_captions = [entry for entry in example['captions'] if entry['idx'] == '-1']
    summary_text = whole_clip_captions[0]['content']

    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Provide a detailed summary for this video."},
            {"type": "video"},
        ],
    }
    assistant_turn = {
        "role": "assistant",
        "content": [
            {"type": "text", "text": summary_text},
        ],
    }

    # No generation prompt: the assistant answer is already part of the text.
    chat_prompt = processor.apply_chat_template(
        [user_turn, assistant_turn],
        add_generation_prompt=False,
    )

    return processor(
        text=chat_prompt,
        videos=frames,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    )

Let's take a small slice of the dataset ($\approx$ 5 GB).

In [6]:
# Download the selected archives one by one, unpack each into a scratch
# directory, and pre-process the dataset rows whose video is in that archive.
datasets_combined = []
fs = HfFileSystem()
DATASET_PATH = "./dataset/"
# Scratch directory; it is wiped and refilled for every archive.
directory = f"{DATASET_PATH}/temp_dir"
# ego4d-32.2G, mixit-22.2G, bdd100k-12.1G, pixabay - 20.1G, pexels - 14.3G
zip_folders = { "mixit", "pexels" }

# Maps archive file name -> local path, so an archive is fetched only once.
downloaded_cache = {}

# Iterate in sorted order: a set has no stable iteration order between runs,
# which would change the order of `datasets_combined` and therefore the
# result of the seeded shuffle applied to the concatenated dataset.
for zip_folder in sorted(zip_folders):
    print(f"Working with {zip_folder}")
    zip_files = fs.ls(f"datasets/ShareGPT4Video/ShareGPT4Video/zip_folder/{zip_folder}", detail=False)

    for zip_file in zip_files:
        zip_file = zip_file.split("/")[-1]

        if zip_file in downloaded_cache:
            print(f"File {zip_file} is already downloaded, skipping...")
            path = downloaded_cache[zip_file]

        else:
            print(f"{zip_file} is downloading")
            path = hf_hub_download(
                repo_id='ShareGPT4Video/ShareGPT4Video',
                repo_type="dataset",
                filename=f"zip_folder/{zip_folder}/{zip_file}",
                local_dir=f"{DATASET_PATH}/{zip_folder}",
                cache_dir=DATASET_PATH,
            )
            downloaded_cache[zip_file] = path  # Add to cache dictionary

        # Only archives can be unpacked; leave the scratch directory alone otherwise.
        if not path.endswith(".zip"):
            continue

        # Start from an empty scratch directory so it holds this archive only.
        if os.path.exists(directory):
            shutil.rmtree(directory)
        os.makedirs(directory)

        shutil.unpack_archive(path, directory)
        print(f"combining {zip_folder} to one df")
        # A set gives constant-time membership tests; the filter below runs
        # once per dataset row, so a list here costs a linear scan per row.
        curr_video_files = set(os.listdir(directory))
        small_dataset = dataset.filter(lambda example: example["video_path"].split("/")[-1] in curr_video_files)

        small_dataset = small_dataset.map(
            collate_fn,
            batched=False,
            fn_kwargs={"path": directory},
            num_proc=24,
            remove_columns=["captions", "keyframe", "timestamp", "video_id", "video_path"],
            writer_batch_size=500,
        )
        datasets_combined.append(small_dataset['train'])
        print(f"finished for {zip_folder}")

In [17]:
# Merge the per-archive datasets, shuffle them, and hold out 20% for evaluation.
dataset_processed = concatenate_datasets(datasets_combined)
dataset_processed = dataset_processed.shuffle(seed=42)
# Seed the split too: train_test_split shuffles on its own, so without a seed
# the train/test membership would differ on every run of the notebook.
dataset = dataset_processed.train_test_split(test_size=0.2, seed=42)

In [18]:
train_dataset, test_dataset = dataset['train'].with_format("torch"), dataset['test'].with_format("torch")

In [19]:
train_dataset, test_dataset

(Dataset({
     features: ['zip_folder', 'input_ids', 'attention_mask', 'pixel_values_videos'],
     num_rows: 1082
 }),
 Dataset({
     features: ['zip_folder', 'input_ids', 'attention_mask', 'pixel_values_videos'],
     num_rows: 271
 }))

# Visualize Dataset

In [37]:
example = train_dataset[0]

In [19]:
processor.batch_decode(example["input_ids"]) # let's check a description

["<|im_start|> user \n <video> \n Provide a detailed summary for this video . <|im_end|> \n <|im_start|> assistant \n The video depicts a sequence of events on an urban street from the perspective of a vehicle's dashboard camera . Initially , a black SUV is seen directly ahead with its brake lights on , indicating it is either stopped or slowing down . A pedestrian is observed on the sidewalk , seemingly waiting to cross . The scene includes a black sedan to the left of the SUV , a yellow taxi farther ahead and to the left , an overhead pedestrian bridge , and various urban buildings under an overcast sky . The camera view is slightly obstructed at the top right , likely due to a mounted accessory or reflection . \n \n As the video progresses , the waiting pedestrian begins crossing the street , passing in front of the stationary black SUV , and continues his journey across the crosswalk at a consistent pace , moving from right to left across the viewer’s perspective . Traffic remains 

In [38]:
from matplotlib import pyplot as plt
from matplotlib import animation
from IPython.display import HTML

# Preview the frames stored in the selected training example as an inline video.
# NOTE(review): scaling by 255 and clamping is only a rough preview if the
# processor normalised the pixels with a mean/std — confirm before trusting colours.
clip = example["pixel_values_videos"][0] * 255
# Move axis 1 to the end so each frame has the layout imshow accepts
# (presumably channels-first -> channels-last).
clip = clip.permute(0, 2, 3, 1).clamp(0, 255)
# Cast inside torch: the previous `np.array(...).astype(np.uint8)` raised a
# NameError on a fresh kernel because `np` is never imported in this notebook.
video = clip.to(torch.uint8).numpy()

fig = plt.figure()
im = plt.imshow(video[0,:,:,:])
# Close the static figure so that only the animation is rendered.
plt.close()

def init():
    """Show the first frame before the animation starts."""
    im.set_data(video[0,:,:,:])
    return (im,)

def animate(i):
    """Show frame ``i``."""
    im.set_data(video[i,:,:,:])
    return (im,)

anim = animation.FuncAnimation(fig, animate, init_func=init, frames=video.shape[0],
                               interval=100)
HTML(anim.to_html5_video())

## Load the model and fine-tune it with QLoRA

In [24]:
# QLoRA is used because the 7B model is too large to fine-tune directly;
# adapters are trained on the linear layers.
if USE_QLORA or USE_LORA:
    # Plain LoRA loads the fp16 weights without quantization. Defining the
    # config up front avoids a NameError when USE_LORA is set without USE_QLORA.
    bnb_config = None
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
        )
    model = LlavaNextVideoForConditionalGeneration.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
        device_map="auto",
        cache_dir="./model_/"
    )

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
def find_all_linear_names(model):
    """Collect the attribute names of the ``torch.nn.Linear`` layers to adapt.

    Walks ``model.named_modules()`` and keeps the last component of each
    linear layer's dotted name. Layers whose full name contains
    ``'multi_modal_projector'`` or ``'vision_model'`` are skipped, and
    ``'lm_head'`` is always dropped from the result.

    Args:
        model: a ``torch.nn.Module``.

    Returns:
        list of unique layer names, in no particular order.
    """
    skipped_keywords = ('multi_modal_projector', 'vision_model')

    leaf_names = {
        qualified_name.rsplit('.', 1)[-1]
        for qualified_name, submodule in model.named_modules()
        if isinstance(submodule, torch.nn.Linear)
        and not any(keyword in qualified_name for keyword in skipped_keywords)
    }

    # The output head is never adapted.
    leaf_names.discard('lm_head')
    return list(leaf_names)


# Adapters are attached to the linear layers reported by find_all_linear_names.
lora_target_modules = find_all_linear_names(model)

lora_config = LoraConfig(
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=8,
    lora_dropout=0.1,
    init_lora_weights="gaussian",
)


In [25]:
# Pad on the right for training, on both the model and the tokenizer.
model.padding_side='right' 
processor.tokenizer.padding_side='right' # for training purposes

# Order matters: the loaded model is first prepared for k-bit training and
# only then wrapped with the adapters described by `lora_config`.
# Note that `model` is rebound here, so re-running this cell wraps it again.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)

## Train

Training is done with Huggingface Trainer

In [28]:
args = TrainingArguments(
    # Output location and Hub upload.
    output_dir="./output/",
    hub_model_id=REPO_ID,
    push_to_hub=True,

    # Optimisation schedule: 100 optimizer steps with cosine decay and warm-up.
    max_steps=100,
    learning_rate=2e-05,
    lr_scheduler_type="cosine",
    warmup_ratio=0.1,
    optim="adamw_bnb_8bit",

    # Batching: 16 samples x 8 accumulation steps per optimizer update on each device.
    per_device_train_batch_size=16,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    dataloader_num_workers=4,

    # Half precision for both training and evaluation.
    fp16=True,
    fp16_full_eval=True,

    # Evaluate, log and checkpoint every 20 steps; keep a single checkpoint.
    eval_strategy="steps",
    eval_steps=20,
    logging_steps=20,
    save_strategy="steps",
    save_steps=20,
    save_total_limit=1,

    # Name the label column explicitly: with QLoRA + PEFT it is not picked up
    # from the model signature.
    label_names=["labels"],
)

In [29]:
# The collator pads each batch and builds the labels from the padded input ids.
video_collator = LlavaNextVideoDataCollatorWithPadding(processor=processor)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=video_collator,
    tokenizer=processor,
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs


In [7]:
# trainer.train()

In [31]:
trainer.model.push_to_hub(REPO_ID) # let's push to hub the last ckpt

adapter_model.safetensors:   0%|          | 0.00/169M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/sm47466863/LLaVa-NeXT-Video-GENAI-FT-Project/commit/6f3599fbd7994a87da6d1cf1f943eb45c59d5e92', commit_message='Upload model', commit_description='', oid='6f3599fbd7994a87da6d1cf1f943eb45c59d5e92', pr_url=None, repo_url=RepoUrl('https://huggingface.co/sm47466863/LLaVa-NeXT-Video-GENAI-FT-Project', endpoint='https://huggingface.co', repo_type='model', repo_id='sm47466863/LLaVa-NeXT-Video-GENAI-FT-Project'), pr_revision=None, pr_num=None)

## Inference with tuned model

In [20]:
model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    REPO_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    cache_dir="./model_ft/"
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
video_clip = read_video_decord('./video_test.mp4', num_frames=16)

In [11]:
video_clip.shape # raw decoded frames, channels-last: (16, 464, 848, 3) for this file

(16, 464, 848, 3)

In [19]:
from matplotlib import pyplot as plt
from matplotlib import animation
from IPython.display import HTML

    
def run_inference(video_clip, model, max_new_tokens=None, do_sample=True):
    """Ask ``model`` to summarise an already pre-processed video clip.

    The text prompt is built with the global ``processor``; the video is not
    re-processed here but passed to ``generate`` as ``pixel_values_videos``.

    Args:
        video_clip: tensor handed to ``generate`` as ``pixel_values_videos``
            (callers pass the ``pixel_values_videos`` entry of a processor output).
        model: model exposing ``device`` and ``generate``.
        max_new_tokens: maximum number of generated tokens; ``None`` (default)
            uses ``MAX_LENGTH``.
        do_sample: forwarded to ``generate``. Sampling is on by default, so
            repeated calls can return different text.

    Returns:
        list of decoded strings, one per generated sequence; each string
        contains the prompt followed by the model's answer.
    """
    if max_new_tokens is None:
        max_new_tokens = MAX_LENGTH

    conversation = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Provide a summary for this video."},
                    {"type": "video"},
                    ],
            },
        ]

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    batch = processor(
        text=prompt,
        videos=None,
        return_tensors="pt"
    ).to(model.device)
    video_clip = video_clip.to(model.device)

    # Bound the *generated* tokens only. `max_length` counts the prompt as
    # well, so the answer budget would shrink with the prompt and disappear
    # entirely once the video placeholder is expanded into many tokens.
    out = model.generate(
        **batch,
        pixel_values_videos=video_clip,
        max_new_tokens=max_new_tokens,
        do_sample=do_sample,
    )
    generated_text = processor.batch_decode(out, skip_special_tokens=True)
    return generated_text
    
def return_video(path):
    """Decode 16 frames from ``path`` and run them through the global ``processor``.

    Args:
        path: location of the video file.

    Returns:
        The processor output for the summary prompt and the decoded frames,
        truncated to ``MAX_LENGTH`` tokens. Callers in this notebook read its
        ``pixel_values_videos`` entry.
    """
    frames = read_video_decord(path, num_frames=16)
    print("Video is loaded")

    user_turn = {
        "role": "user",
        "content": [
            {"type": "text", "text": "Provide a summary for this video."},
            {"type": "video"},
        ],
    }
    chat_prompt = processor.apply_chat_template([user_turn], add_generation_prompt=False)

    return processor(
        text=chat_prompt,
        videos=frames,
        truncation=True,
        max_length=MAX_LENGTH,
        return_tensors="pt",
    )

### Test custom examples

In [15]:
# Preview the first custom test video as an inline animation.
video_clip = read_video_decord('./video_test.mp4', num_frames=16)
# Clamp and cast inside torch: `np` is never imported in this notebook, so the
# previous `np.array(...).astype(np.uint8)` failed on a fresh kernel. The
# earlier permute to (0, 3, 1, 2) and straight back cancelled out and is dropped.
video = torch.from_numpy(video_clip).clamp(0, 255).to(torch.uint8).numpy()

fig = plt.figure()
im = plt.imshow(video[0,:,:,:])

# Close the static figure so that only the animation is rendered.
plt.close()

def init():
    """Show the first frame before the animation starts."""
    im.set_data(video[0,:,:,:])
    return (im,)

def animate(i):
    """Show frame ``i``."""
    im.set_data(video[i,:,:,:])
    return (im,)

anim = animation.FuncAnimation(fig, animate, init_func=init, frames=video.shape[0],
                               interval=100)
HTML(anim.to_html5_video())

In [16]:
# Preview the second custom test video as an inline animation.
video_clip = read_video_decord('./video_test_2.mp4', num_frames=16)
# Clamp and cast inside torch: `np` is never imported in this notebook, so the
# previous `np.array(...).astype(np.uint8)` failed on a fresh kernel. The
# earlier permute to (0, 3, 1, 2) and straight back cancelled out and is dropped.
video = torch.from_numpy(video_clip).clamp(0, 255).to(torch.uint8).numpy()

fig = plt.figure()
im = plt.imshow(video[0,:,:,:])

# Close the static figure so that only the animation is rendered.
plt.close()

def init():
    """Show the first frame before the animation starts."""
    im.set_data(video[0,:,:,:])
    return (im,)

def animate(i):
    """Show frame ``i``."""
    im.set_data(video[i,:,:,:])
    return (im,)

anim = animation.FuncAnimation(fig, animate, init_func=init, frames=video.shape[0],
                               interval=100)
HTML(anim.to_html5_video())

In [17]:
# Preview the third custom test video as an inline animation.
video_clip = read_video_decord('./video_test_3.mp4', num_frames=16)
# Clamp and cast inside torch: `np` is never imported in this notebook, so the
# previous `np.array(...).astype(np.uint8)` failed on a fresh kernel. The
# earlier permute to (0, 3, 1, 2) and straight back cancelled out and is dropped.
video = torch.from_numpy(video_clip).clamp(0, 255).to(torch.uint8).numpy()

fig = plt.figure()
im = plt.imshow(video[0,:,:,:])

# Close the static figure so that only the animation is rendered.
plt.close()

def init():
    """Show the first frame before the animation starts."""
    im.set_data(video[0,:,:,:])
    return (im,)

def animate(i):
    """Show frame ``i``."""
    im.set_data(video[i,:,:,:])
    return (im,)

anim = animation.FuncAnimation(fig, animate, init_func=init, frames=video.shape[0],
                               interval=100)
HTML(anim.to_html5_video())

In [11]:
test_video_1 = return_video('./video_test.mp4')
test_video_2 = return_video('./video_test_2.mp4')
test_video_3 = return_video('./video_test_3.mp4')

Expanding inputs for image/video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.


Video is loaded
Video is loaded
Video is loaded


In [26]:
run_inference(test_video_1["pixel_values_videos"], model)

['USER:  <video> \nProvide a summary for this video. ASSISTANT: The video showcases a man and a woman engaged in a casual conversation, specifically discussing elements related to a product or topic of interest, facilitated by a laptop placed on a table, with the man using a microphone. The setup appears to be a typical recording or production studio setup, featuring sound equipment such as microphones and a laptop connected to headphones, which is being used to control the recording process. The man wears a hat, likely representing his persona or style, and he speaks in an animated and engaged manner, suggesting enthusiasm for the discussion. The woman responds to his conversation, offering her perspectives or questions.\n\nBetween moments of engagement, both individuals appear to be checking their phone discreetly, possibly indicating a pause in the conversation or a change in the discussion. As the video progresses, the man continues to engage with the woman, sharing more informatio

In [27]:
run_inference(test_video_2["pixel_values_videos"], model)

["USER:  <video> \nProvide a summary for this video. ASSISTANT: The video captures a scene of a person working on a laptop in an office-like setting, with the focus on the task of writing or typing on a document, followed by a transition to a different document on the same screen. A series of close-up shots and long views of the laptop and the individual's movements are shown as the person interacts with the device. These include typing or writing text, adjusting the screen angle, and occasional glimpses of the person's face and the surrounding environment. The sequence of actions appears to emphasize the individual's interaction with technology and their engagement with the digital space, highlighting the process of composing or editing text, reflecting a commonplace work environment likely involving tasks such as drafting, revising, or data entry. Throughout, the surroundings remain consistent, suggesting no significant variation or change in setting, but the specific activity or det

In [28]:
run_inference(test_video_3["pixel_values_videos"], model)

["USER:  <video> \nProvide a summary for this video. ASSISTANT: The video captures a spontaneous moment of two musicians performing in an urban environment, where an underground subway station or transit area serves as their stage. The musicians, a cellist and a flautist, share a talent and a passion for playing their instruments in public spaces, blending seamlessly into the rhythm of city life. The cellist, positioned on the left side of the frame, is actively strumming their instrument while exuding a tranquil, focused expression. Meanwhile, the flautist stands on the right, leaning in with a dynamic posture as they play, their body leaning forward as if bending over their instrument to play with full concentration and engagement.\n\nThe setting's ambient lighting and the visible architecture create a scene in stark contrast to the traditional concert space, creating an intimate atmosphere that transforms the public into a makeshift audience. Despite the absence of seating, individu

## Test with the base (non-fine-tuned) model

In [9]:
old_model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    cache_dir="./model_/"
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [12]:
run_inference(test_video_1["pixel_values_videos"], old_model)

Expanding inputs for image.video tokens in LLaVa-NeXT-Video should be done in processing. Please add `patch_size` and `vision_feature_select_strategy` to the model's processing config or set directly with `processor.patch_size = {{patch_size}}` and processor.vision_feature_select_strategy = {{vision_feature_select_strategy}}`. Using processors without these attributes in the config is deprecated and will throw an error in v4.47.
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)


['USER:  <video> \nProvide a summary for this video. ASSISTANT: The video features two individuals sitting close together, engaged in conversation and laughing. One is wearing a hat and seems to be the focus of attention, while the other is facing away from them, sitting behind the person in the hat. The scene includes a laptop and some decorative furnishings, with lighting that highlights their expressions and creates a warm and cozy atmosphere. The nature of their interaction, body language, and the setting suggest they are perhaps a couple or close friends sharing a pleasant interaction.']

In [13]:
run_inference(test_video_2["pixel_values_videos"], old_model)

['USER:  <video> \nProvide a summary for this video. ASSISTANT: The video shows a person sitting at a workstation with a laptop and keyboard in front of them. The person is typing quickly on the laptop and seems focused on the task. There is a computer monitor nearby, displaying a desktop with various open windows that appear to be applications or documents being worked on. A chair is visible behind the person, positioned at a 90-degree angle suggesting the viewer is sitting on it and facing the laptop.']

In [14]:
run_inference(test_video_3["pixel_values_videos"], old_model)

['USER:  <video> \nProvide a summary for this video. ASSISTANT: This video features a musical performance in a subway setting. The musicians are playing instruments in a subway station, creating a unique and intimate atmosphere. The performers are arranged on the platform area, inviting the audience to join them or simply provide a serene musical interlude for subway passengers. One person plays a violin, captivating and delicate in its sound, while another plays the cello, adding a deep, resonant base to the musical progression. Their playing captures the passersby and subway passengers, filling their trip with moments of harmony and expression. The musicians share smiles and interactions with their audience, engaging both individuals in their performance and the environment. The station platform becomes their stage, offering a public space for music-loving travelers to enjoy and share a musical moment.']

In [8]:
# As we can see, our fine-tuned model gives more detailed and complex answers.