In [None]:
!pip install -q opencv-python
!apt install -q ffmpeg
!pip install -q openai
!pip install opencv-python
!apt install ffmpeg
!pip install easydict
!pip install einops
!pip install peft
!pip install timm
!pip install -U bitsandbytes
!pip install moviepy
!pip install decord
!pip install av
!pip install -q bitsandbytes 
!pip install -q accelerate==0.34.2

In [None]:

import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import cv2
import os
import requests
from openai import OpenAI
import av
import torch
from transformers import BitsAndBytesConfig
from transformers import LlavaNextVideoForConditionalGeneration, LlavaNextVideoProcessor
from huggingface_hub import hf_hub_download


# Video Sequences

In [None]:
from IPython.display import Video
!ffmpeg -i /input/path/scenariobased/Scenario1.mp4 -vf "scale=800:450" -c:v libx264 -preset fast -crf 23 -c:a copy /output_path/Scenario_resized.mp4

Video('./Scenario1_resized.mp4',embed=True,width=800, height=450)

In [None]:
from IPython.display import Video
!ffmpeg -i /input/scenariobased/Scenario2.mp4 -vf "scale=800:450" -c:v libx264 -preset fast -crf 23 -c:a copy /output_path/Scenario2_.mp4

Video('./Scenario2_.mp4',embed=True,width=800, height=450)

from IPython.display import Video
!ffmpeg -i /input/scenariobased/Scenario3.mp4 -vf "scale=800:450" -c:v libx264 -preset fast -crf 23 -c:a copy /output_path/Scenario3_.mp4

Video('./Scenario3_.mp4',embed=True,width=800, height=450)

# Question Generation

https://colab.research.google.com/drive/1I2fenmHDFsTf6CuSIQm6ClA3QG1fA_aw?usp=sharing

 # LLaVA-NeXT-Video-7B-hf

In [None]:
# 4-bit NF4 quantization settings, with float16 as the compute dtype.
quantization_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

def read_video_pyav(container, indices):
    '''
    Decode selected frames of a video with the PyAV decoder.

    The container is rewound to the start and decoded sequentially; decoding stops
    as soon as the frame counter passes the last requested index.

    Args:
        container (`av.container.input.InputContainer`): PyAV container.
        indices (`List[int]`): Ascending frame indices to keep.
    Returns:
        result (np.ndarray): decoded RGB frames of shape (num_frames, height, width, 3).
    '''
    container.seek(0)
    first_wanted, last_wanted = indices[0], indices[-1]

    selected = []
    for position, frame in enumerate(container.decode(video=0)):
        if position > last_wanted:
            break
        if position < first_wanted or position not in indices:
            continue
        selected.append(frame)

    # Convert only the kept frames to arrays, then stack them along a new leading axis.
    arrays = [kept.to_ndarray(format="rgb24") for kept in selected]
    return np.stack(arrays)

# Load the quantized model; device_map="auto" leaves the GPU/CPU placement of the
# weights to be chosen automatically rather than specified by hand.
LLAVA_MODEL_ID = "llava-hf/LLaVA-NeXT-Video-7B-hf"

model = LlavaNextVideoForConditionalGeneration.from_pretrained(
    LLAVA_MODEL_ID,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=quantization_config,
)

# The processor is loaded from the same checkpoint as the model.
processor = LlavaNextVideoProcessor.from_pretrained(LLAVA_MODEL_ID)

# Load the video as an np.array, sampling 8 frames uniformly (sample more for longer videos).
video_path = "./Scenario1_resized.mp4"
# The context manager closes the PyAV container (and its file handle) once decoding is done.
with av.open(video_path) as container:
    total_frames = container.streams.video[0].frames
    # PyAV reports 0 when the container does not store a frame count; fail with a clear
    # message instead of letting np.arange blow up on a zero step.
    if total_frames <= 0:
        raise ValueError(f"Could not determine the number of frames in {video_path!r}")
    indices = np.arange(0, total_frames, total_frames / 8).astype(int)
    video = read_video_pyav(container, indices)
# Questions asked about the synthetic scenario; swap this list to evaluate another question set.
scenario_questions = [
    "Is the pedestrian using a crosswalk?",
    "How many cars can you see at the start?",
    "Is it day or night?",
    "How many yellow cars are there?",
    "How many traffic lights are visible?",
    "How many pedestrians cross the road in total?",
    "How many vehicles stopped for pedestrians crossing the road?",
    "Was the pedestrian crossing from left to right or right to left?",
    "Which car arrived first at the intersection?",
    "Was the pedestrian in the middle of the road at any point?",
    "What impact does the pedestrian's position have on vehicle flow in the first half of the video?",
    "How many vehicles stopped for the pedestrian crossing after turning?",
    "How many vehicles were present in the middle of the intersection after the pedestrian crossed?",
    "What is the longest gap between vehicle movement and pedestrian crossing?",
    "Was the traffic light timing optimal for the pedestrian crossing and vehicle flow? Analyze the situation.",
]

# Same structure as before: one tree node whose "sub_questions" carry 1-based ids.
question_tree = [
    {
        "sub_questions": [
            {"id": number, "question": text}
            for number, text in enumerate(scenario_questions, start=1)
        ]
    },
]

# One record per question: {"question_id", "question", "answer"}.
outputs = []

for question in question_tree[0]['sub_questions']:
    # Single-turn conversation: the question text followed by the video placeholder.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "text", "text": question["question"]},
                {"type": "video"},
            ],
        },
    ]

    # Render the conversation into the prompt string the model expects.
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)

    # Keep the inputs on the same device as the model.
    inputs = processor(text=prompt, videos=video, return_tensors="pt").to(model.device)

    out = model.generate(**inputs, max_new_tokens=60)

    # generate() returns the prompt tokens followed by the newly generated ones. Drop the
    # prompt part so that "answer" holds only the model's reply, not the echoed question.
    prompt_length = inputs["input_ids"].shape[1]
    new_tokens = out[:, prompt_length:]
    answer = processor.batch_decode(
        new_tokens, skip_special_tokens=True, clean_up_tokenization_spaces=True
    )[0].strip()

    outputs.append({
        "question_id": question["id"],
        "question": question["question"],
        "answer": answer
    })

    print(f"Question: {question['question']}\nAnswer: {answer}\n")


# Compact recap of every collected answer.
for output in outputs:
    print(f"Question ID: {output['question_id']}, Question: {output['question']}, Answer: {output['answer']}")


# InternVL

In [None]:
import os
import torch
from transformers import AutoTokenizer, AutoModel, BitsAndBytesConfig

# Hugging Face Hub id of the checkpoint to load.
path = "OpenGVLab/InternVL2-8B"

# Ask bitsandbytes for 8-bit weights.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the checkpoint with the quantization settings above, then switch it to eval mode.
model = AutoModel.from_pretrained(
    path,
    trust_remote_code=True,
    low_cpu_mem_usage=True,
    quantization_config=bnb_config,
    torch_dtype=torch.bfloat16,
)
model = model.eval()

Reference (Hugging Face): https://huggingface.co/spaces/OpenGVLab/InternVL

In [None]:
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
from transformers import AutoModel, AutoTokenizer

# Per-channel mean / std used to normalize image tensors.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)

def build_transform(input_size):
    """Build the per-tile preprocessing pipeline.

    The pipeline converts the image to RGB when needed, resizes it to
    ``input_size`` x ``input_size`` with bicubic interpolation, turns it into a
    tensor and normalizes it with IMAGENET_MEAN / IMAGENET_STD.

    Args:
        input_size: Side length of the square output.
    Returns:
        A ``torchvision.transforms.Compose`` callable.
    """
    def _ensure_rgb(img):
        # Leave images that are already RGB untouched.
        if img.mode != 'RGB':
            return img.convert('RGB')
        return img

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)

def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the (columns, rows) grid whose aspect ratio is closest to the image's.

    Candidates are scanned in the given order. On an exact tie in distance, the
    later candidate replaces the current choice only when the image area exceeds
    half of the area that candidate's grid would cover at ``image_size`` per tile.

    Args:
        aspect_ratio: Width / height of the source image.
        target_ratios: Iterable of (columns, rows) candidates.
        width, height: Source image size in pixels.
        image_size: Side length of one tile.
    Returns:
        The chosen (columns, rows) candidate, or (1, 1) when there are no candidates.
    """
    chosen = (1, 1)
    smallest_gap = float('inf')
    pixel_area = width * height

    for candidate in target_ratios:
        gap = abs(aspect_ratio - candidate[0] / candidate[1])
        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
            continue
        # Tie-break: prefer the later grid only for sufficiently large images.
        if gap == smallest_gap and pixel_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
            chosen = candidate

    return chosen

def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize an image to the best-matching tile grid and cut it into square tiles.

    Args:
        image: PIL image to split.
        min_num, max_num: Inclusive bounds on the number of tiles in the grid.
        image_size: Side length of every tile.
        use_thumbnail: When True and more than one tile was produced, append a
            whole-image thumbnail of size ``image_size`` x ``image_size``.
    Returns:
        List of PIL images: the tiles in row-major order, optionally followed by the thumbnail.
    """
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # Every (columns, rows) grid whose tile count lies in [min_num, max_num],
    # ordered by tile count.
    candidate_grids = set(
        (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
        i * j <= max_num and i * j >= min_num)
    candidate_grids = sorted(candidate_grids, key=lambda x: x[0] * x[1])

    # Grid whose aspect ratio is closest to the image's own.
    best_grid = find_closest_aspect_ratio(
        aspect_ratio, candidate_grids, orig_width, orig_height, image_size)

    target_width = image_size * best_grid[0]
    target_height = image_size * best_grid[1]
    blocks = best_grid[0] * best_grid[1]

    resized_img = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size

    tiles = []
    for tile_idx in range(blocks):
        # Row-major walk over the grid: divmod gives (row, column) of this tile.
        row, col = divmod(tile_idx, tiles_per_row)
        crop_box = (
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )
        tiles.append(resized_img.crop(crop_box))
    assert len(tiles) == blocks

    if use_thumbnail and len(tiles) != 1:
        tiles.append(image.resize((image_size, image_size)))
    return tiles




# Video multi-round conversation helpers
def get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
    """Choose ``num_segments`` frame indices spread over a time window of the video.

    Args:
        bound: Optional (start_seconds, end_seconds) window; a falsy value selects
            the widest possible window, which is then clamped to the video.
        fps: Frames per second used to convert seconds to frame numbers.
        max_frame: Largest valid frame index.
        first_idx: Smallest frame index that may be used as the window start.
        num_segments: Number of indices to return.
    Returns:
        np.ndarray of integer frame indices, one taken near the middle of each segment.
    """
    start, end = (bound[0], bound[1]) if bound else (-100000, 100000)

    # Clamp the window to [first_idx, max_frame].
    start_idx = max(first_idx, round(start * fps))
    end_idx = min(round(end * fps), max_frame)

    seg_size = float(end_idx - start_idx) / num_segments
    half_segment = seg_size / 2

    picks = []
    for idx in range(num_segments):
        picks.append(int(start_idx + half_segment + np.round(seg_size * idx)))
    return np.array(picks)

def load_video(video_path, bound=None, input_size=448, max_num=1, num_segments=32):
    """Sample frames from a video and turn them into a batch of normalized tiles.

    Args:
        video_path: Path of the video file, opened with decord on the CPU.
        bound: Optional (start_seconds, end_seconds) window passed to ``get_index``.
        input_size: Side length of each tile.
        max_num: Maximum number of tiles per frame passed to ``dynamic_preprocess``.
        num_segments: Number of frames to sample.
    Returns:
        (pixel_values, num_patches_list): the tiles of all sampled frames concatenated
        along the first dimension, and the number of tiles contributed by each frame.
    """
    reader = VideoReader(video_path, ctx=cpu(0), num_threads=1)
    last_frame = len(reader) - 1
    avg_fps = float(reader.get_avg_fps())

    transform = build_transform(input_size=input_size)
    sampled_indices = get_index(bound, avg_fps, last_frame, first_idx=0, num_segments=num_segments)

    per_frame_tiles = []
    num_patches_list = []
    for frame_index in sampled_indices:
        frame_img = Image.fromarray(reader[frame_index].asnumpy()).convert('RGB')
        tiles = dynamic_preprocess(frame_img, image_size=input_size, use_thumbnail=True, max_num=max_num)
        stacked = torch.stack([transform(tile) for tile in tiles])
        num_patches_list.append(stacked.shape[0])
        per_frame_tiles.append(stacked)

    return torch.cat(per_frame_tiles), num_patches_list

In [None]:
# Slow (non-fast) tokenizer for the checkpoint named by `path`, loaded with remote code enabled.
tokenizer = AutoTokenizer.from_pretrained(path, use_fast=False, trust_remote_code=True)
# NOTE(review): do_sample=True presumably makes answers vary between runs — confirm this is intended.
generation_config = {"max_new_tokens": 1024, "do_sample": True}

Synthetic Sequence 1

In [None]:
# Questions for the synthetic sequence, in the order they are asked.
scenario_questions = [
    "Is the pedestrian using a crosswalk?",
    "How many cars can you see at the start?",
    "Is it day or night?",
    "How many yellow cars are there?",
    "How many traffic lights are visible?",
    "How many pedestrians cross the road in total?",
    "How many vehicles stopped for pedestrians crossing the road?",
    "Was the pedestrian crossing from left to right or right to left?",
    "Which car arrived first at the intersection?",
    "Was the pedestrian in the middle of the road at any point?",
    "What impact does the pedestrian's position have on vehicle flow in the first half of the video?",
    "How many vehicles stopped for the pedestrian crossing after turning?",
    "How many vehicles were present in the middle of the intersection after the pedestrian crossed?",
    "What is the longest gap between vehicle movement and pedestrian crossing?",
    "Was the traffic light timing optimal for the pedestrian crossing and vehicle flow? Analyze the situation.",
]

# One tree node whose "sub_questions" carry 1-based ids.
question_tree = [
    {
        "sub_questions": [
            {"id": number, "question": text}
            for number, text in enumerate(scenario_questions, start=1)
        ]
    },
]
# Sample 8 frames (one tile each), then cast to bfloat16 and move them to the GPU.
video_path = './Scenario1_resized.mp4'
pixel_values, num_patches_list = load_video(video_path, max_num=1, num_segments=8)
pixel_values = pixel_values.to(torch.bfloat16).cuda()

# One "FrameN: <image>" line per sampled frame, prepended to every question.
video_prefix = ''.join(
    f'Frame{frame_no}: <image>\n' for frame_no in range(1, len(num_patches_list) + 1)
)

# Each question is asked in a fresh conversation (history=None).
for sub_question in question_tree[0]["sub_questions"]:
    question_text = sub_question["question"]
    question = f'{video_prefix}{question_text}'
    response, history = model.chat(
        tokenizer,
        pixel_values,
        question,
        generation_config,
        history=None,
        return_history=True,
        num_patches_list=num_patches_list,
    )
    print(f'User: {question}\nAssistant: {response}')

Real World Sequence 1

In [None]:
# Questions for real-world sequence 1 (cyclist scenario), in the order they are asked.
scenario_questions = [
    "Is the cyclist following the bike lane?",
    "Are there any pedestrians in the video?",
    "Is there a traffic light in the video?",
    "Is there a person walking a dog?",
    "How many cyclists are there?",
    "Was there any vehicle that signaled before turning?",
    "How many cars were in motion while the cyclist crossed?",
    "How many vehicles are stopped at the intersection when the cyclist crosses?",
    "Which vehicle is closest to the cyclist when they pass through the intersection?",
    "Was there a pedestrian near the cyclist when they crossed?",
    "Between 0:08 and 0:20, how many cars are moving?",
    "How does the cyclist's violation impact overall traffic safety at the intersection?",
    "What design elements of the intersection could be improved to better segregate bicycle and vehicle traffic?",
    "What impact does the cyclist's position have on vehicle flow through the intersection?",
    "Did the cyclist appear to slow down when approaching the intersection?",
]

# One tree node whose "sub_questions" carry 1-based ids.
question_tree = [
    {
        "sub_questions": [
            {"id": number, "question": text}
            for number, text in enumerate(scenario_questions, start=1)
        ]
    },
]
# Sample 8 frames (one tile each), then cast to bfloat16 and move them to the GPU.
video_path = './Scenario2_.mp4'
pixel_values, num_patches_list = load_video(video_path, max_num=1, num_segments=8)
pixel_values = pixel_values.to(torch.bfloat16).cuda()

# One "FrameN: <image>" line per sampled frame, prepended to every question.
video_prefix = ''.join(
    f'Frame{frame_no}: <image>\n' for frame_no in range(1, len(num_patches_list) + 1)
)

# Each question is asked in a fresh conversation (history=None).
for sub_question in question_tree[0]["sub_questions"]:
    question_text = sub_question["question"]
    question = f'{video_prefix}{question_text}'
    response, history = model.chat(
        tokenizer,
        pixel_values,
        question,
        generation_config,
        history=None,
        return_history=True,
        num_patches_list=num_patches_list,
    )
    print(f'User: {question}\nAssistant: {response}')

Real World Sequence 2

In [None]:
# Questions for real-world sequence 2 (cars entering the bike lane), in the order they are asked.
scenario_questions = [
    "How many lamp posts are visible?",
    "What color is the first car that turns onto the bike lane?",
    "How many pedestrians are there?",
    "Are there any bicycles?",
    "Are there any trucks?",
    "How many cars turn onto the bike lane?",
    "Is there a manhole cover next to the tracks?",
    "Do any vehicles use a signal before entering the bike lane?",
    "Does the bike lane have a clear sign for shared usage with vehicles?",
    "Do tram tracks pass through the middle of the road?",
    "Explain the make of each car turning onto the bike lane.",
    "Which color car is most commonly seen turning into the bike lane?",
    "What are the different types of vehicles?",
    "What is the current design of the intersection that causes the cars to turn into bike lanes?",
    "Do the pedestrians at the start return at the end?",
]

# One tree node whose "sub_questions" carry 1-based ids.
question_tree = [
    {
        "sub_questions": [
            {"id": number, "question": text}
            for number, text in enumerate(scenario_questions, start=1)
        ]
    },
]
# Sample 8 frames (one tile each), then cast to bfloat16 and move them to the GPU.
video_path = './Scenario3_.mp4'
pixel_values, num_patches_list = load_video(video_path, max_num=1, num_segments=8)
pixel_values = pixel_values.to(torch.bfloat16).cuda()

# One "FrameN: <image>" line per sampled frame, prepended to every question.
video_prefix = ''.join(
    f'Frame{frame_no}: <image>\n' for frame_no in range(1, len(num_patches_list) + 1)
)

# Each question is asked in a fresh conversation (history=None).
for sub_question in question_tree[0]["sub_questions"]:
    question_text = sub_question["question"]
    question = f'{video_prefix}{question_text}'
    response, history = model.chat(
        tokenizer,
        pixel_values,
        question,
        generation_config,
        history=None,
        return_history=True,
        num_patches_list=num_patches_list,
    )
    print(f'User: {question}\nAssistant: {response}')

# VideoLLaMA2

In [None]:
!git clone https://github.com/DAMO-NLP-SG/VideoLLaMA2
!cd VideoLLaMA2
!pip install -e .
!pip install flash-attn

In [None]:
import sys
# The repository is cloned into ./VideoLLaMA2 and the working directory is still the
# notebook's own (a `!cd` does not persist), so the clone is what has to be on sys.path.
sys.path.append('./VideoLLaMA2')
from videollama2 import model_init, mm_infer
from videollama2.utils import disable_torch_init

Test run

In [None]:
# From the GitHub README: https://github.com/DAMO-NLP-SG/VideoLLaMA2
import sys
sys.path.append('./')
from videollama2 import model_init, mm_infer
from videollama2.utils import disable_torch_init


def inference():
    """Smoke-test VideoLLaMA2 on the sample clip from the cloned repository and print the reply."""
    disable_torch_init()

    # Video Inference
    modal = 'video'
    # The repository is cloned into ./VideoLLaMA2 and the working directory is still the
    # notebook's own, so the sample asset is addressed through the clone directory.
    modal_path = 'VideoLLaMA2/assets/cat_and_chicken.mp4'
    instruct = 'What animals are in the video, what are they doing, and how does the video feel?'
    # Reply:
    # The video features a kitten and a baby chick playing together. The kitten is seen laying on the floor while the baby chick hops around. The two animals interact playfully with each other, and the video has a cute and heartwarming feel to it.

    model_path = 'DAMO-NLP-SG/VideoLLaMA2.1-7B-16F'
    # Base model inference (only need to replace model_path)
    # model_path = 'DAMO-NLP-SG/VideoLLaMA2.1-7B-16F-Base'
    model, processor, tokenizer = model_init(model_path)

    # processor[modal] turns the file at modal_path into the model's input for that modality.
    output = mm_infer(processor[modal](modal_path), instruct, model=model, tokenizer=tokenizer, do_sample=False, modal=modal)

    print(output)

if __name__ == "__main__":
    inference()

In [None]:
# Load the model once at notebook level; the scenario cells below reuse
# `model`, `processor` and `tokenizer`.
modal = 'video'
# The repository is cloned into ./VideoLLaMA2 and the working directory is still the
# notebook's own, so the sample asset is addressed through the clone directory.
modal_path = 'VideoLLaMA2/assets/cat_and_chicken.mp4'
instruct = 'What animals are in the video, what are they doing, and how does the video feel?'
model_path = 'DAMO-NLP-SG/VideoLLaMA2.1-7B-16F'
model, processor, tokenizer = model_init(model_path)
output = mm_infer(processor[modal](modal_path), instruct, model=model, tokenizer=tokenizer, do_sample=False, modal=modal)

Synthetic Video Sequence

In [None]:
# Input modality and clip for the synthetic sequence.
modal, modal_path = 'video', './Scenario1_resized.mp4'

# Questions for the synthetic sequence, in the order they are asked.
scenario_questions = [
    "Is the pedestrian using a crosswalk?",
    "How many cars can you see at the start?",
    "Is it day or night?",
    "How many yellow cars are there?",
    "How many traffic lights are visible?",
    "How many pedestrians cross the road in total?",
    "How many vehicles stopped for pedestrians crossing the road?",
    "Was the pedestrian crossing from left to right or right to left?",
    "Which car arrived first at the intersection?",
    "Was the pedestrian in the middle of the road at any point?",
    "What impact does the pedestrian's position have on vehicle flow in the first half of the video?",
    "How many vehicles stopped for the pedestrian crossing after turning?",
    "How many vehicles were present in the middle of the intersection after the pedestrian crossed?",
    "What is the longest gap between vehicle movement and pedestrian crossing?",
    "Was the traffic light timing optimal for the pedestrian crossing and vehicle flow? Analyze the situation.",
]

# One tree node whose "sub_questions" carry 1-based ids.
question_tree = [
    {
        "sub_questions": [
            {"id": number, "question": text}
            for number, text in enumerate(scenario_questions, start=1)
        ]
    },
]
# Decode and preprocess the clip once: it is the same for every question, so redoing it
# inside the loop only costs time.
# NOTE(review): assumes mm_infer does not modify its input in place — confirm against the library.
video_input = processor[modal](modal_path)

for question_data in question_tree[0]['sub_questions']:
    question_text = question_data["question"]

    # Greedy decoding (do_sample=False) for every question.
    output = mm_infer(video_input, question_text, model=model, tokenizer=tokenizer, do_sample=False, modal=modal)

    print(output)
    print("-------------------------------------------")

Real World Sequence 1

In [None]:
# Input modality and clip for real-world sequence 1.
modal, modal_path = 'video', './Scenario2_.mp4'

# Questions for real-world sequence 1 (cyclist scenario), in the order they are asked.
scenario_questions = [
    "Is the cyclist following the bike lane?",
    "Are there any pedestrians in the video?",
    "Is there a traffic light in the video?",
    "Is there a person walking a dog?",
    "How many cyclists are there?",
    "Was there any vehicle that signaled before turning?",
    "How many cars were in motion while the cyclist crossed?",
    "How many vehicles are stopped at the intersection when the cyclist crosses?",
    "Which vehicle is closest to the cyclist when they pass through the intersection?",
    "Was there a pedestrian near the cyclist when they crossed?",
    "Between 0:08 and 0:20, how many cars are moving?",
    "How does the cyclist's violation impact overall traffic safety at the intersection?",
    "What design elements of the intersection could be improved to better segregate bicycle and vehicle traffic?",
    "What impact does the cyclist's position have on vehicle flow through the intersection?",
    "Did the cyclist appear to slow down when approaching the intersection?",
]

# One tree node whose "sub_questions" carry 1-based ids.
question_tree = [
    {
        "sub_questions": [
            {"id": number, "question": text}
            for number, text in enumerate(scenario_questions, start=1)
        ]
    },
]
# Decode and preprocess the clip once: it is the same for every question, so redoing it
# inside the loop only costs time.
# NOTE(review): assumes mm_infer does not modify its input in place — confirm against the library.
video_input = processor[modal](modal_path)

for question_data in question_tree[0]['sub_questions']:
    question_text = question_data["question"]

    # Greedy decoding (do_sample=False) for every question.
    output = mm_infer(video_input, question_text, model=model, tokenizer=tokenizer, do_sample=False, modal=modal)

    print(output)
    # Separator between answers, matching the other VideoLLaMA2 scenario loops.
    print("-------------------------------------------")

Real World Sequence 2

In [None]:
# Input modality and clip for real-world sequence 2.
modal = 'video'
# The resize cell writes this clip as './Scenario3_.mp4' (the name the InternVL cell also
# reads); no './Scenario3_resized.mp4' is ever created.
modal_path = './Scenario3_.mp4'

# Questions for real-world sequence 2 (cars entering the bike lane), in the order they are asked.
scenario_questions = [
    "How many lamp posts are visible?",
    "What color is the first car that turns onto the bike lane?",
    "How many pedestrians are there?",
    "Are there any bicycles?",
    "Are there any trucks?",
    "How many cars turn onto the bike lane?",
    "Is there a manhole cover next to the tracks?",
    "Do any vehicles use a signal before entering the bike lane?",
    "Does the bike lane have a clear sign for shared usage with vehicles?",
    "Do tram tracks pass through the middle of the road?",
    "Explain the make of each car turning onto the bike lane.",
    "Which color car is most commonly seen turning into the bike lane?",
    "What are the different types of vehicles?",
    "What is the current design of the intersection that causes the cars to turn into bike lanes?",
    "Do the pedestrians at the start return at the end?",
]

# One tree node whose "sub_questions" carry 1-based ids.
question_tree = [
    {
        "sub_questions": [
            {"id": number, "question": text}
            for number, text in enumerate(scenario_questions, start=1)
        ]
    },
]
# Decode and preprocess the clip once: it is the same for every question, so redoing it
# inside the loop only costs time.
# NOTE(review): assumes mm_infer does not modify its input in place — confirm against the library.
video_input = processor[modal](modal_path)

for question_data in question_tree[0]['sub_questions']:
    question_text = question_data["question"]

    # Greedy decoding (do_sample=False) for every question.
    output = mm_infer(video_input, question_text, model=model, tokenizer=tokenizer, do_sample=False, modal=modal)

    print(output)
    print("-------------------------------------------")

# GPT-4o

Supports video input on the website https://chatgpt.com/.
Upload each video together with its questions and prompt the model to answer them.

# Gemini 1.5 Pro

You will need to create an API key here: https://aistudio.google.com/apikey

In [None]:
import os
import time
import google.generativeai as genai


In [None]:

# Read the API key from the GEMINI_API_KEY environment variable (KeyError if it is unset)
# rather than hard-coding it in the notebook.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])


In [None]:

def upload_to_gemini(path, mime_type=None):
  """Upload a local file to Gemini and return the resulting file object.

  Prints the uploaded file's display name and URI.

  See https://ai.google.dev/gemini-api/docs/prompting_with_media

  Args:
    path: Path of the file to upload.
    mime_type: Optional MIME type forwarded to the upload call.
  Returns:
    The object returned by ``genai.upload_file``.
  """
  uploaded = genai.upload_file(path, mime_type=mime_type)
  print(f"Uploaded file '{uploaded.display_name}' as: {uploaded.uri}")
  return uploaded

def wait_for_files_active(files, poll_interval=10, timeout=None):
  """Blocks until every given file has left the "PROCESSING" state.

  Some files uploaded to the Gemini API need to be processed before they can be
  used as prompt inputs. The status is read from the "state" field of the object
  returned by `genai.get_file`.

  This is a simple blocking polling loop. Files are checked one after another,
  in the order given.

  Args:
    files: Iterable of uploaded file objects; only their `name` attribute is
      read here, the current state is always re-fetched with `genai.get_file`.
    poll_interval: Seconds to sleep between two status checks of a file that is
      still processing. Defaults to 10, the previously hard-coded value.
    timeout: Optional overall limit, in seconds, for the whole call (all files
      together). `None` (the default) waits indefinitely, as before.

  Raises:
    TimeoutError: If `timeout` is set and elapses while a file is still in the
      "PROCESSING" state.
    Exception: If a file ends in any state other than "ACTIVE".
  """
  print("Waiting for file processing...")
  # A monotonic clock keeps the deadline immune to wall-clock adjustments.
  deadline = None if timeout is None else time.monotonic() + timeout
  for name in (uploaded.name for uploaded in files):
    file = genai.get_file(name)
    while file.state.name == "PROCESSING":
      # Checked before sleeping so an already-expired deadline fails fast
      # instead of waiting one more full interval.
      if deadline is not None and time.monotonic() >= deadline:
        raise TimeoutError(
            f"Timed out waiting for file {file.name} to finish processing")
      print(".", end="", flush=True)
      time.sleep(poll_interval)
      file = genai.get_file(name)
    if file.state.name != "ACTIVE":
      raise Exception(f"File {file.name} failed to process")
  print("...all files ready")
  print()


In [None]:

# Build the Gemini client used by the chat cells below.
# NOTE(review): this rebinds the notebook-global `model`, which the earlier
# inference loop also passes to mm_infer; re-running that loop after this cell
# would hand it the Gemini client instead.
generation_config = dict(
  temperature=1,
  top_p=0.95,
  top_k=40,
  max_output_tokens=8192,
  response_mime_type="text/plain",
)

model = genai.GenerativeModel(
  generation_config=generation_config,
  model_name="gemini-1.5-pro",
)

# TODO Make these files available on the local file system
# You may need to update the file paths
# The upload order matters: later cells refer to the videos as files[0..2].
files = [
  upload_to_gemini(video_path, mime_type="video/mp4")
  for video_path in ("Scenario2.mp4", "Scenario3.mp4", "Scenario1.mp4")
]

# Uploaded files may still be processing; block until all of them are usable.
wait_for_files_active(files)


In [None]:

# Start a chat whose history is pre-seeded with three earlier exchanges, one per
# uploaded video: a user turn carrying the video file, a user turn carrying the
# questions, and the model's recorded answer.
#
# Each model reply is a single "parts" string. Three of them were previously
# split by a raw line break inside the quotes (an unterminated string literal,
# i.e. a SyntaxError). They are now written as adjacent string literals, which
# Python concatenates into the same single string without inserting anything.
chat_session = model.start_chat(
  history=[
    {
      "role": "user",
      "parts": [
        files[0],
      ],
    },
    {
      "role": "user",
      "parts": [
        "For the given input video Stream answer the following questions",
        "1.Is the cyclist following the bike lane?\n2.Are there any pedestrians in the video?\n3.Is there a traffic light in the video?\n4.Is there a person walking a dog?\n5.How many cyclists are there?\n1.Was there any vehicle that signaled before turning?\n\n2.How many cars were in motion while the cyclist crossed?\n\n\n\n3.How many vehicles are stopped at the intersection when the cyclist crosses?\n\n\n\n4.Which vehicle is closest to the cyclist when they pass through the intersection?\n\n\n\n5.Was there a pedestrian near the cyclist when they crossed?\n\n\n1.Between 0:08 and 0:20, how many cars are moving?\n\n\n\n2.How does the cyclist's violation impact overall traffic safety at the intersection?\n\n\n\n3.What design elements of the intersection could be improved to better segregate bicycle and vehicle traffic?\n\n\n\n\n\n4.What impact does the cyclist's position have on vehicle flow through the intersection?\n\n\n5.Did the cyclist appear to slow down when approaching the intersection?\n\n\n\n",
      ],
    },
    {
      "role": "model",
      "parts": [
        # One string, written as two adjacent literals (implicit concatenation).
        "Here are the answers to your questions:\n\n**First Set of Questions**\n\n1. **Is the cyclist following the bike lane?** No, the cyclist is riding against the flow of traffic in the bike lane.\n2. **Are there any pedestrians in the video?** Yes, there are two pedestrians visible. One walks across the road near the palm trees, and the other walks on the left sidewalk.\n3. **Is there a traffic light in the video?** Yes, there is a traffic light visible on the left side of the frame.\n4. **Is there a person walking a dog?** No.\n5. **How many cyclists are there?** One.\n6. **Was there any vehicle that signaled before turning?** No. The cars don't appear to signal.\n\n**Second Set of Questions**\n\n1. **How many cars were in motion while the cyclist crossed?** Three cars appear to be in motion as the cyclist crosses the intersection. The blue and gray cars are moving on the further street, and a red car passes on the closer street.\n2. **How many vehicles are stopped at the intersection when the cyclist crosses?** There are no vehicles stopped at the intersection at that moment.\n3. **Which vehicle is closest to the cyclist when they pass through the intersection?** The red car on the near street is closest to the cyclist as they cross.\n4. **Was there a pedestrian near the cyclist when they crossed?** Yes, there is a pedestrian on the left sidewalk that crosses paths with the cyclist as both pass through the intersection.\n5. **How many cars are moving between 0:08 and 0:20?** At least four cars are moving. The blue, gray, and red cars cross the intersection, and a white car makes a turn.\n6. **How does the cyclist's violation impact overall traffic safety at the intersection?** The cyclist's violation increases the risk of a collision with vehicles traveling through the intersection. Because the cyclist is moving against traffic in the bike lane, drivers may not anticipate their movement and have less time to react.\n7. "
        "**What design elements of the intersection could be improved to better segregate bicycle and vehicle traffic?**  A physically separated bike lane (with a barrier or raised curb) would improve safety, as would dedicated bicycle signals.  Improved signage and pavement markings would clearly communicate bicycle traffic rules.\n8. **What impact does the cyclist's position have on vehicle flow through the intersection?** The cyclist's position doesn't significantly impact the flow of traffic in this instance because there are no vehicles stopped. If other vehicles had been stopped at the light, however, the cyclist riding against traffic could have caused confusion or delayed their start, potentially impeding traffic flow. \n9. **Did the cyclist appear to slow down when approaching the intersection?** No, the cyclist appears to maintain a steady pace through the intersection without slowing down.\n\nLet me know if you have any other questions about this video.",
      ],
    },
    {
      "role": "user",
      "parts": [
        files[1],
      ],
    },
    {
      "role": "user",
      "parts": [
        "for the given video answer the following",
        "1.How many lamp posts are visible?\n\n\n\n2.What color is the first car that turns onto the bike lane?\n\n\n\n3.How many pedestrians are there in the video?\n\n\n\n4.Are there any bicycles in the video?\n\n\n\n\n\n5.Are there any trucks in the video?\n\n\n\n1.How many cars turn onto the bike lane in the video?\n\n\n\n2.Is there a manhole cover next to the tracks?\n\n\n\n3.Do any vehicles use a signal before entering the bike lane?\n\n\n\n4.Does the bike lane have a clear sign for shared usage with vehicles?\n\n\n\n5.Do tram tracks pass through the middle of the road?\n\n\n\n1.Explain the make of each car turning onto the bike lane.\n\n\n\n2.Which color car is most commonly seen turning into the bike lane?\n\n\n\n3.What are the different types of vehicles?\n\n\n\n4.What is the current design of the intersection that causes the cars to turn into bike lanes? \n\n\n5.Do the pedestrians at the start return at the end?\n\n\n\n\n\n",
      ],
    },
    {
      "role": "model",
      "parts": [
        # One string, written as two adjacent literals (implicit concatenation).
        "Here are the answers to your questions:\n\n**First Set of Questions:**\n1. **How many lamp posts are visible?** There are three lamp posts visible in the foreground on the left and several more on the far side of the intersection.\n2. **What color is the first car that turns onto the bike lane?** White.\n3. **How many pedestrians are there in the video?**  At least six different pedestrians are visible throughout the video. There are two at the beginning, two in the middle crossing the road, and two toward the end crossing the crosswalk.\n4. **Are there any bicycles in the video?** Yes, there is a bicycle parked at the bottom left of the frame.\n5. **Are there any trucks in the video?** No.\n\n**Second Set of Questions:**\n1. **How many cars turn onto the bike lane in the video?**  Six cars turn onto the bike lane, two turning off immediately.\n2. **Is there a manhole cover next to the tracks?** Yes, there's a circular manhole cover or access point visible in the roadway next to the tracks.\n3. **Do any vehicles use a signal before entering the bike lane?**  No, it does not appear so.\n4. **Does the bike lane have a clear sign for shared usage with vehicles?** No, there is no clear signage indicating shared usage.  The bike symbol implies it's exclusively for bicycles.\n5. **Do tram tracks pass through the middle of the road?** Yes, what appear to be tram or light rail tracks run through the center of the road.\n\n**Third Set of Questions:**\n1. **Explain the make of each car turning onto the bike lane.** Due to the distance and angle of the camera, it's challenging to definitively identify makes.  However, the vehicles turning onto the bike lane appear to be various SUV crossovers, and sedans. \n2. **Which color car is most commonly seen turning into the bike lane?** White cars.\n3. **What are the different types of vehicles?** Sedans, SUVs/crossovers, and a single parked bicycle are visible.\n4. "
        "**What is the current design of the intersection that causes the cars to turn into bike lanes?** It seems likely that the roadway markings and geometry are not clear enough to drivers, leading them to inadvertently turn into the bike lane. The left turn lane might be too short or poorly marked, and the bike lane might not be visually distinct enough from the adjacent traffic lane.\n5. **Do the pedestrians at the start return at the end?** No, the pedestrians at the start do not reappear at the end of the video. The two at the beginning walk toward the business on the right. The pedestrians near the end appear to be different people walking along the sidewalk to the left of the intersection.\n\nI hope this is helpful! Let me know if you have any other questions.\n",
      ],
    },
    {
      "role": "user",
      "parts": [
        files[2],
      ],
    },
    {
      "role": "user",
      "parts": [
        "for the given video answer the following",
        "Is the pedestrian using a crosswalk?\nHow many cars can you see at the start?\nIs it day or night?\nHow many yellow cars are there?\n\n\n\n5.How many traffic lights are visible?\nHow many pedestrians cross the road in total?\nHow many vehicles stopped for pedestrians crossing the road?\nWas the pedestrian crossing from left to right or right to left?\nWhich car arrived first at the intersection?\nWas the pedestrian in the middle of the road at any point\nWhat impact does the pedestrian's position have on vehicle flow in the first half of the video?\nHow many vehicles stopped for the pedestrian crossing after turning?\n.How many vehicles were present in the middle of the intersection after the pedestrian crossed?\nWhat is the longest gap between vehicle movement and pedestrian crossing?\nWas the traffic light timing optimal for the pedestrian crossing and vehicle flow? Analyze the situation.\n\n",
      ],
    },
    {
      "role": "model",
      "parts": [
        # One string, written as two adjacent literals (implicit concatenation).
        "Here are the answers to your questions:\n\n1. **Is the pedestrian using a crosswalk?** Yes, the pedestrian uses the designated crosswalk.\n2. **How many cars can you see at the start?** Five cars, including the bus, are initially visible. Two parked cars are seen to the left.\n3. **Is it day or night?** It is nighttime.\n4. **How many yellow cars are there?** One yellow car (a taxi) is visible.\n5. **How many traffic lights are visible?** Two traffic lights are partially visible.\n6. **How many pedestrians cross the road in total?** One pedestrian crosses the road.\n7. **How many vehicles stopped for pedestrians crossing the road?** One vehicle (the white bus) stops for the pedestrian.\n8. **Was the pedestrian crossing from left to right or right to left?** The pedestrian crosses from the bottom left to the top right of the frame.\n9. **Which car arrived first at the intersection?** The white bus appears to arrive at the intersection first.\n10. **Was the pedestrian in the middle of the road at any point?** Yes, the pedestrian is in the middle of the road while crossing in the crosswalk.\n11. **What impact does the pedestrian's position have on vehicle flow in the first half of the video?** The pedestrian's presence in the crosswalk causes the white bus to stop, briefly interrupting the flow of traffic from the right side of the intersection.\n12. **How many vehicles stopped for the pedestrian crossing after turning?** Zero. No vehicles turn and then stop for the pedestrian.\n13. **How many vehicles were present in the middle of the intersection after the pedestrian crossed?** One (the yellow taxi).\n14. **What is the longest gap between vehicle movement and pedestrian crossing?**  The longest gap appears to be after the bus moves through the intersection and before the taxi enters, but the pedestrian is already crossing by then. There's no significant gap where the pedestrian is waiting and no vehicles are present.\n15. "
        "**Was the traffic light timing optimal for the pedestrian crossing and vehicle flow? Analyze the situation.** It appears that traffic lights function as intended, as the bus stops when the pedestrian is crossing. However, due to the limited view and short duration of the video, we can not see the whole situation. It is not possible to fully assess the optimality of the timing.  We would need to see a longer video or have more information about the traffic light cycles to determine if the timing could be improved to minimize delays for both vehicles and pedestrians.  \n\nLet me know if you have any further questions!",
      ],
    },
  ]
)



In [None]:
# Send a follow-up prompt in the chat started above and print the reply text.
# "INSERT_INPUT_HERE" is a placeholder: replace it with the actual question
# before running this cell, otherwise the literal placeholder is sent.
response = chat_session.send_message("INSERT_INPUT_HERE")
print(response.text)