In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

MODEL_NAME = "Qwen/Qwen1.5-1.8B" 

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    device_map="auto"
)

def extract_subjects_actions(query: str) -> str:
    system = (
        "You are a helpful assistant that specializes in natural language. "
        "Try to extract and list out all the subject, its features and some actions inside of the query. "
        "Only return the compact list like in the example; no explanations; do not add any special character"

        "Example 1:\n"
        "- Input: On a white round plate is a glass of panna cotta. A hand places two more glasses of panna cotta on the plate. Each panna cotta has a smooth ivory cream layer, decorated with a few slices of red grapes and green mint leaves for a fresh highlight. Next to the plate are two edible flowers (red and yellow) to add to the beauty."
        "- Output: white round plate, panna cotta glass, there is hand, panna cotta glasses, placing on plate, panna cotta, ivory white smooth cream, topped with red grape slices and green mint leaves plate, edible flowers (red and yellow), enhance portion"

        "Example 2:\n"
        "- Input: Find a cycling video, shot from an aerial drone, showing a cyclist in a blue and white jersey passing three other cyclists and taking the lead. Then, know that this cyclist leads the rest of the way to the finish line."
        "- Output: bicycle racing video, overhead drone angle, athlete, blue and white shirt, overtaking three athletes, athlete, blue and white shirt, taking lead, athlete, blue and white shirt, led to finish"
    )

    prompt = f"{system}\nInput: {query}\nOutput:"

        
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    input_length = inputs.input_ids.shape[1]

    if tokenizer.pad_token_id is None:
        tokenizer.pad_token = tokenizer.eos_token
        
    outputs = model.generate(
        **inputs,
        max_new_tokens=50,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        top_k=0,
        return_dict_in_generate=True,
        output_scores=True
    )

    text = tokenizer.decode(
        outputs.sequences[0, input_length:],
        skip_special_tokens=True
    )
    
    return text.strip().splitlines()[0].strip()

ValueError: Using a `device_map`, `tp_plan`, `torch.device` context manager or setting `torch.set_default_device(device)` requires `accelerate`. You can install it with `pip install accelerate`

In [None]:
query = ("Video footage narrating a bicycle race. Find a scene with a head-on angle from above and follow the riders. In the frame, there are 3 riders pedaling in a straight line. All 3 riders are from the same team, with white uniforms and blue yellow pants. The first rider wears a white hat, the second rider wears a red hat, and the last rider wears a black hat.")
print(extract_subjects_actions(query))
torch.cuda.empty_cache()