This notebook contains the code for downloading, installing, and running inference with the Video-LLaVA model.

# Imports and Installations

In [None]:
import os
import random
import numpy as np
import pandas as pd
from IPython.display import clear_output
from time import time

In [None]:
!git clone https://github.com/PKU-YuanGroup/Video-LLaVA

In [None]:
cd Video-LLaVA

In [None]:
!pip install --upgrade pip -q
!pip install -e . -q
!pip install -e ".[train]" -q

In [None]:
import numpy as np
import pandas as pd

import torch
from transformers import pipeline

from videollava.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN
from videollava.conversation import conv_templates, SeparatorStyle
from videollava.model.builder import load_pretrained_model
from videollava.utils import disable_torch_init
from videollava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria

disable_torch_init() # run only once

# Inference

In [None]:
# Placeholder: put the path to your data here.
DATA_PATH = ""

In [None]:
# Settings forwarded to the model loader below.
model_path = 'LanguageBind/Video-LLaVA-7B'
cache_dir = 'cache_dir'
device = 'cuda'

# Quantisation flags handed to the loader: 4-bit on, 8-bit off.
load_4bit = True
load_8bit = False

model_name = get_model_name_from_path(model_path)

# The first five arguments are positional and keep the original order;
# the loader's fourth return value is not used in this notebook.
tokenizer, model, processor, _ = load_pretrained_model(
    model_path,
    None,
    model_name,
    load_8bit,
    load_4bit,
    device=device,
    cache_dir=cache_dir,
)

# Clear whatever the loading step printed into the cell output.
clear_output()

In [None]:
# Class names; the position in this list is the numeric label id.
classes = [
    'Промо/Нет/Нет', 'Имидж/Нет/Нет', 'Имидж/Нет/Да',
    'Промо/Доставка/Нет', 'Промо/Нет/Да', 'Имидж/Доставка/Нет',
    'промо/Нет/Нет',
    'Имидж', 'Кредитование', 'Range',
    'Дебетовые карты', 'Услуги бизнесу', 'Кредитные карты',
    'Инвестиционные продукты', 'Экосистемные сервисы',
    'Музыка', 'Колонки+Голосовой помощник', 'Клипы', 'Соц сети',
]

# Lookup table: label id -> class name.
d = dict(enumerate(classes))

In [None]:
# Prefix of the per-class video folders; the folder name is appended to it as-is.
base_path = 'YOUR_PATH_HERE'

# Annotation table, reduced to the three columns used in this notebook.
df_resampled = pd.read_csv('/kaggle/input/hack-chunk-desc/resampled_data (1).csv')
df_new = pd.DataFrame({
    'Advertisement ID': df_resampled['Advertisement ID'],
    'text': df_resampled['text'],
    'label': df_resampled['Segment_num']})
# True where the text contains any of the listed keywords (case-insensitive);
# missing text counts as False.
df_new['no_sound'] = df_new['text'].str.contains('субтитры|динамичная|позитивная', case=False, na=False)

# Collect every file found (recursively) under each class folder.
all_files = []
for i in ('0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10-stranger', '11', '12-stranger', '13', '14', '15', '16', '17'):
    folder_path = f"{base_path}{i}"
    for root, _dirs, files in os.walk(folder_path):
        all_files.extend(os.path.join(root, file) for file in files)

# Sanity check: the original bare comparison was evaluated and thrown away,
# so a mismatch was never visible. Report it instead.
if len(all_files) != len(df_new):
    print(f"Warning: found {len(all_files)} files on disk but the table has {len(df_new)} rows")

# Map the numeric id taken from each file name ("<id>.<ext>") to its full path.
id_to_path = {}
for file_path in all_files:
    # basename() copes with the platform's path separator, unlike split('/').
    stem = os.path.basename(file_path).split('.')[0]
    try:
        file_id = int(stem)
    except ValueError:
        # File name does not start with an integer id (e.g. hidden files): skip it.
        continue
    id_to_path[file_id] = file_path


def get_path(ad_id):
    """Return the file path recorded for ``ad_id``, or ``None`` if there is none."""
    return id_to_path.get(ad_id, None)


df_new['path'] = df_new['Advertisement ID'].apply(get_path)

# Rows without a matching file cannot be fed to the model; make that visible.
n_missing = int(df_new['path'].isna().sum())
if n_missing:
    print(f"Warning: {n_missing} rows have no matching video file")

In [None]:
def getPrediction(video, inp):
    """Generate the model's text answer to a prompt about one video.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``processor``.

    Args:
        video: Passed unchanged to ``processor['video']``; the notebook
            supplies a file path.
        inp: Prompt text. The video placeholder tokens are prepended here.

    Returns:
        The decoded newly generated text, stripped of surrounding whitespace
        and of a trailing conversation stop string.
    """
    video_processor = processor['video']
    conv = conv_templates["llava_v1"].copy()

    # Move the processed video to the model's device in half precision.
    video_tensor = video_processor(video, return_tensors='pt')['pixel_values']
    if isinstance(video_tensor, list):
        tensor = [clip.to(model.device, dtype=torch.float16) for clip in video_tensor]
    else:
        tensor = video_tensor.to(model.device, dtype=torch.float16)

    # One image placeholder token per frame of the video tower, then the prompt.
    num_frames = model.get_video_tower().config.num_frames
    inp = ' '.join([DEFAULT_IMAGE_TOKEN] * num_frames) + '\n' + inp

    conv.append_message(conv.roles[0], inp)
    conv.append_message(conv.roles[1], None)
    prompt = conv.get_prompt()

    input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt')
    # Use the model's device rather than a hard-coded .cuda(), matching the
    # placement of the video tensor above.
    input_ids = input_ids.unsqueeze(0).to(model.device)

    stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2
    stopping_criteria = KeywordsStoppingCriteria([stop_str], tokenizer, input_ids)

    with torch.inference_mode():
        output_ids = model.generate(
            input_ids,
            images=tensor,
            do_sample=True,
            temperature=0.3,
            max_new_tokens=1000,
            use_cache=True,
            stopping_criteria=[stopping_criteria])

    # Decode only the tokens produced after the prompt.
    outputs = tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip()

    # Generation halts on the stop string, so it ends the decoded text; drop it.
    if stop_str and outputs.endswith(stop_str):
        outputs = outputs[:-len(stop_str)].strip()

    return outputs

In [None]:
# Saved prompts used to collect descriptions for possible later classification.
# NOTE(review): the prompt texts contain typos ("advertisment", "merketing",
# "in the from"); they are kept verbatim here.

inp_saved1 = (
    "Describe the AD. "
    "What's the marketing purpose of this ad? Avoid general words"
)
inp_saved2 = (
    "Your role is an advertisment specialist. "
    "Make the merketing analysis and describe the AD. "
    "What's the marketing purpose of this ad? Avoid general words"
)
# The class list is embedded through its default string representation.
inp_saved3 = "What is the product in the from the following " + str(classes)
inp_saved4 = "Describe the video. What do you think it advertises?"

In [None]:
# Prompt used for the spot check. The original `...` placeholder cannot be
# concatenated into the prompt string, so default to a saved prompt; swap in
# any other prompt here.
promt = inp_saved1

# Sample only from rows flagged `no_sound` that have a resolved video path.
candidates = df_new[df_new['no_sound'] & df_new['path'].notna()]

for i in range(5):
    # `time` is the function imported via `from time import time`,
    # so it is called directly (`time.time()` raises AttributeError).
    start = time()
    sample = candidates.sample()
    video = sample.path.values[0]
    label = sample.label
    print(getPrediction(video, promt))
    end = time()
    print(d[label.values[0]])
    print(end - start)
    print('*'*50)


In [None]:
# Label id -> class name used inside English prompts.
# Key 6 was missing, so looking up label 6 raised KeyError; class 6 differs
# from class 0 only by letter case, so it shares the same English name.
# NOTE(review): the values for keys 1-5 are still the Russian class names —
# confirm the intended English wording before translating them.
rus_to_eng_dict = {
    0: 'Promotion',
    1: 'Имидж/Нет/Нет',
    2: 'Имидж/Нет/Да',
    3: 'Промо/Доставка/Нет',
    4: 'Промо/Нет/Да',
    5: 'Имидж/Доставка/Нет',
    6: 'Promotion',
    7: 'Company Image',
    8: 'Lending',
    9: 'Range',
    10: 'Debit Cards',
    11: 'Business services',
    12: 'Credit Cards',
    13: 'Investment products',
    14: 'Ecosystem services',
    15: 'Music apps',
    16: 'Speakers+Voice Assistant',
    17: 'Short video app',
    18: 'Social Networks'
}

# Creative Advisor

In [None]:
# Sample only from rows flagged `no_sound` that have a resolved video path.
candidates = df_new[df_new['no_sound'] & df_new['path'].notna()]

# The prompt is the same on every iteration, so build it once.
promt = "What is attractive in this ad? Can you create a motto for such a product so I can use it."

for i in range(5):
    # `time` is the function imported via `from time import time`,
    # so it is called directly (`time.time()` raises AttributeError).
    start = time()
    sample = candidates.sample()
    video = sample.path.values[0]
    label = d[sample.label.values[0]]
    print(getPrediction(video, promt))
    end = time()
    print(label)
    print(end - start)
    print('*'*50)

In [None]:
# Sample only from rows flagged `no_sound` that have a resolved video path.
candidates = df_new[df_new['no_sound'] & df_new['path'].notna()]

for i in range(5):
    # `time` is the function imported via `from time import time`,
    # so it is called directly (`time.time()` raises AttributeError).
    start = time()
    sample = candidates.sample()
    video = sample.path.values[0]
    label_id = sample.label.values[0]
    # Fall back to the original class name when no English label is mapped,
    # instead of failing with KeyError.
    label = rus_to_eng_dict.get(label_id, d[label_id])
    promt = f"The ad purpose is to promote: {label}. What is attractive in this ad? Can you create a motto for such a product so I can use it for mine."
    print(getPrediction(video, promt))
    end = time()
    print(label)
    print(end - start)
    print('*'*50)