## OpenAI GPT-4 Vision Preview: Image -> Text Descriptions of First Frames

In [None]:
!pip install opencv-python
!pip install openai
!pip install pytube

In [None]:
from IPython.display import display, Image, Audio

import cv2
import base64
import time
import os
import requests

from openai import OpenAI
# Shared OpenAI client used by every API call in this notebook.
# The key is read from the OPENAI_API_KEY environment variable so the secret
# does not have to be pasted into (and saved with) the notebook; when the
# variable is unset this falls back to the same empty placeholder as before.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY", ""))

In [None]:
# Mount Google Drive at /content/drive; the CSVs and image folders used below
# are read from and written to paths under this mount point.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

In [None]:
import os
import base64
import pandas as pd
import numpy as np
import time
import requests
import json
from openai import BadRequestError, RateLimitError

def encode_image(image_path):
    """Return the contents of the file at *image_path* as a base64 text string.

    The file is read in binary mode, base64-encoded, and the resulting bytes
    are decoded as UTF-8 so the value can be embedded in a data URL.
    """
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')

# Directory containing images
# NOTE(review): this points at the *test* first frames, yet the "TRAIN SET"
# cell below also iterates `image_dir` -- presumably the path is edited by
# hand before running each cell; confirm before a full run.
image_dir = "/content/drive/My Drive/236/test_first_frames/"

In [None]:
# Load train.csv into a DataFrame
train_df = pd.read_csv("/content/drive/My Drive/236/train.csv")
# Start with an all-NaN caption column; the captioning cells treat NaN as
# "not captioned yet" and fill the column in row by row.
train_df["OpenAICaption"] = np.nan

In [None]:
# Load test.csv, at inference time
test_df = pd.read_csv("/content/drive/My Drive/236/test.csv")
# Start with an all-NaN caption column; the captioning cell treats NaN as
# "not captioned yet" and fills the column in row by row.
test_df["OpenAICaption"] = np.nan

### Batch calls

In [None]:
# TRAIN SET
#
# Caption every image in `image_dir` whose Id appears in `train_df` and has no
# `OpenAICaption` yet. Images are sent to the vision model in groups of up to
# `batch_size`, and the DataFrame is checkpointed to Drive after every
# successful batch so an interrupted run can simply be re-executed.

id_set = set(train_df['Id'].values)
progress = 0
batch_size = 10
max_retries = 3  # attempts per batch when the API reports a rate limit
output_csv = "/content/drive/My Drive/train_images_exp2_openai.csv"
prompt_text = "Each image is the first frame of a YouTube video. Generate an around 15-word description for each image that would be useful for creating a good thumbnail image. List each description on a new line without numbering. If you are unable to provide a description, return 'no description available'"

# The column is created as float NaN; hold captions as Python objects so that
# writing strings into it does not trip pandas' incompatible-dtype check.
train_df['OpenAICaption'] = train_df['OpenAICaption'].astype(object)

# Pass 1: work out which files still need a caption. Collecting them first
# lets the final, smaller-than-batch_size group be processed as well.
pending = []
for filename in os.listdir(image_dir):
    # File names are expected to look like "<prefix>_<Id>.jpg".
    if not filename.endswith(".jpg") or "_" not in filename:
        continue
    identifier = filename.split("_", 1)[1][:-4]  # Removes the ".jpg" at the end

    # An Id that is not in the DataFrame has no row to store a caption in, so
    # sending its image would only spend API quota.
    if identifier not in id_set:
        continue
    if not pd.isna(train_df.loc[train_df['Id'] == identifier, 'OpenAICaption'].iloc[0]):
        continue

    pending.append({"identifier": identifier, "filename": filename})

# Pass 2: send the pending images batch by batch.
for start in range(0, len(pending), batch_size):
    batch = pending[start:start + batch_size]

    message = {
        "role": "user",
        "content": [{"type": "text", "text": prompt_text}],
    }
    for item in batch:
        # os.path.join works whether or not image_dir ends with a slash.
        base64_image = encode_image(os.path.join(image_dir, item["filename"]))
        message["content"].append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image}"
            }
        })

    params = {
        "model": "gpt-4-vision-preview",
        "messages": [message],
        "max_tokens": 4000
    }

    # Retry the same batch after a rate limit instead of dropping it; any
    # other failure skips the batch (its rows stay NaN for a later re-run).
    result = None
    for _ in range(max_retries):
        try:
            result = client.chat.completions.create(**params)
            break
        except RateLimitError as e:
            print(f"Error: {e}, waiting...")
            time.sleep(90)
        except BadRequestError as e:
            print(f"Error: {e}, skipping batch.")
            break
        except Exception as e:
            print(f"Unexpected error: {e}, skipping batch.")
            break
    if result is None:
        continue

    # Blank lines between descriptions would shift every caption onto the
    # wrong image, so drop them and require exactly one line per image.
    response_text = result.choices[0].message.content or ""
    result_list = [line.strip() for line in response_text.split("\n") if line.strip()]
    if len(result_list) != len(batch):
        print(f"Expected {len(batch)} descriptions but got {len(result_list)}, skipping batch.")
        continue

    for item, description in zip(batch, result_list):
        identifier = item["identifier"]
        train_df.loc[train_df['Id'] == identifier, 'OpenAICaption'] = description
        print(f"{identifier}: {description}")

    progress += len(batch)
    print(progress)

    train_df.to_csv(output_csv, index=False)
    time.sleep(30)

# Save the updated DataFrame back to CSV
train_df.to_csv(output_csv, index=False)


In [None]:
#  TEST SET
#
# Caption every image in `image_dir` whose Id appears in `test_df` and has no
# `OpenAICaption` yet. Images are sent to the vision model in groups of up to
# `batch_size`, and the DataFrame is checkpointed to Drive after every
# successful batch so an interrupted run can simply be re-executed.

id_set = set(test_df['Id'].values)
progress = 0
batch_size = 10
max_retries = 3  # attempts per batch when the API reports a rate limit
output_csv = "/content/drive/My Drive/test_images_exp2_openai.csv"
prompt_text = "Each image is the first frame of a YouTube video. Generate an around 15-word description for each image that would be useful for creating a good thumbnail image. List each description on a new line without numbering. If you are unable to provide a description, return 'no description available'"

# The column is created as float NaN; hold captions as Python objects so that
# writing strings into it does not trip pandas' incompatible-dtype check.
test_df['OpenAICaption'] = test_df['OpenAICaption'].astype(object)

# Pass 1: work out which files still need a caption. Collecting them first
# lets the final, smaller-than-batch_size group be processed as well.
pending = []
for filename in os.listdir(image_dir):
    # File names are expected to look like "<prefix>_<Id>.jpg".
    if not filename.endswith(".jpg") or "_" not in filename:
        continue
    identifier = filename.split("_", 1)[1][:-4]  # Removes the ".jpg" at the end

    # An Id that is not in the DataFrame has no row to store a caption in, so
    # sending its image would only spend API quota.
    if identifier not in id_set:
        continue
    if not pd.isna(test_df.loc[test_df['Id'] == identifier, 'OpenAICaption'].iloc[0]):
        continue

    pending.append({"identifier": identifier, "filename": filename})

# Pass 2: send the pending images batch by batch.
for start in range(0, len(pending), batch_size):
    batch = pending[start:start + batch_size]

    message = {
        "role": "user",
        "content": [{"type": "text", "text": prompt_text}],
    }
    for item in batch:
        # os.path.join works whether or not image_dir ends with a slash.
        base64_image = encode_image(os.path.join(image_dir, item["filename"]))
        message["content"].append({
            "type": "image_url",
            "image_url": {
                "url": f"data:image/jpeg;base64,{base64_image}"
            }
        })

    params = {
        "model": "gpt-4-vision-preview",
        "messages": [message],
        "max_tokens": 4000
    }

    # Retry the same batch after a rate limit instead of dropping it; any
    # other failure skips the batch (its rows stay NaN for a later re-run).
    result = None
    for _ in range(max_retries):
        try:
            result = client.chat.completions.create(**params)
            break
        except RateLimitError as e:
            print(f"Error: {e}, waiting...")
            time.sleep(90)
        except BadRequestError as e:
            print(f"Error: {e}, skipping batch.")
            break
        except Exception as e:
            print(f"Unexpected error: {e}, skipping batch.")
            break
    if result is None:
        continue

    # Blank lines between descriptions would shift every caption onto the
    # wrong image, so drop them and require exactly one line per image.
    response_text = result.choices[0].message.content or ""
    result_list = [line.strip() for line in response_text.split("\n") if line.strip()]
    if len(result_list) != len(batch):
        print(f"Expected {len(batch)} descriptions but got {len(result_list)}, skipping batch.")
        continue

    for item, description in zip(batch, result_list):
        identifier = item["identifier"]
        test_df.loc[test_df['Id'] == identifier, 'OpenAICaption'] = description
        print(f"{identifier}: {description}")

    progress += len(batch)
    print(progress)

    test_df.to_csv(output_csv, index=False)
    time.sleep(30)

# Save the updated DataFrame back to CSV
test_df.to_csv(output_csv, index=False)


### Individual calls

In [None]:
import os
import base64
import pandas as pd
import time
from openai import BadRequestError, RateLimitError

def image_to_base64(image_path):
    """Read the file at *image_path* and return its bytes as base64 text.

    The file is opened in binary mode; the base64 output is decoded as UTF-8
    so the result is a plain ``str``.
    """
    with open(image_path, "rb") as image_file:
        file_bytes = image_file.read()
    return str(base64.b64encode(file_bytes), "utf-8")
# Caption the remaining training images one request per image, resuming from
# the checkpoint CSV written by earlier runs.

# Directory containing images
image_dir = "/content/drive/My Drive/236/train_first_frames"
train_df = pd.read_csv("/content/drive/My Drive/train_images_exp2_openai.csv")

# An all-empty caption column is read back as float NaN; hold captions as
# Python objects so writing strings does not trip pandas' dtype check.
train_df['OpenAICaption'] = train_df['OpenAICaption'].astype(object)

id_set = set(train_df['Id'].values)
progress = 0
max_retries = 3  # attempts per image when the API reports a rate limit

for filename in os.listdir(image_dir):
    # File names are expected to look like "<prefix>_<Id>.jpg".
    if not filename.endswith(".jpg") or "_" not in filename:
        continue

    # Extract the identifier from the filename
    identifier = filename.split("_", 1)[1][:-4]  # Removes the ".jpg" at the end

    # An Id that is not in the DataFrame has no row to store a caption in, so
    # sending its image would only spend API quota.
    if identifier not in id_set:
        continue
    if not pd.isna(train_df.loc[train_df['Id'] == identifier, 'OpenAICaption'].iloc[0]):
        continue

    image_path = os.path.join(image_dir, filename)
    base64_image = image_to_base64(image_path)

    # Create the prompt message
    # NOTE(review): this {"image", "resize"} content shape differs from the
    # {"type": "image_url"} parts used by the batch cells -- confirm the API
    # still accepts it.
    prompt_message = {
        "role": "user",
        "content": [
            "This is the first frame from a YouTube video. Generate a less than 20 word description of the video that would be useful for creating a good thumbnail image.",
            {"image": base64_image, "resize": 256}
        ],
    }

    # Prepare parameters for the API call
    params = {
        "model": "gpt-4-vision-preview",
        "messages": [prompt_message],
        "max_tokens": 200
    }

    # Retry the same image after a rate limit instead of moving on to the
    # next file; every other outcome ends the attempts for this image.
    for _ in range(max_retries):
        try:
            result = client.chat.completions.create(**params)
            description = result.choices[0].message.content

            # Treat a refusal from the model as "no caption".
            if "I'm sorry" in description or "I cannot" in description:
                train_df.loc[train_df['Id'] == identifier, 'OpenAICaption'] = "no description available"
            else:
                train_df.loc[train_df['Id'] == identifier, 'OpenAICaption'] = description

            progress += 1
            print(progress)

            if progress % 10 == 0:  # Save every 10 iterations
                train_df.to_csv("/content/drive/My Drive/train_images_exp2_openai.csv", index=False)

            time.sleep(30)
            break

        except RateLimitError as e:
            print(f"Error: {e}, waiting...")
            time.sleep(30)
        except BadRequestError as e:
            print(f"Error: {e}, skipping image.")
            # Record the placeholder so the rejected image is not retried on
            # the next run.
            train_df.loc[train_df['Id'] == identifier, 'OpenAICaption'] = "no description available"
            break
        except Exception as e:
            print(f"Unexpected error: {e}, skipping image.")
            break

# Save the updated DataFrame back to CSV
train_df.to_csv("/content/drive/My Drive/train_images_exp2_openai.csv", index=False)


### See Results

In [None]:
# Save the updated DataFrame back to CSV
# (standalone cell so the current in-memory captions can be written out by
# hand, e.g. after interrupting one of the captioning loops)
train_df.to_csv("/content/drive/My Drive/train_images_exp2_openai.csv", index=False)

In [None]:
# Print every non-empty caption, then report how many rows have one and the
# total number of rows.
num = 0
for index, row in train_df.iterrows():
    if pd.notna(row['OpenAICaption']) and row['OpenAICaption'] != '':
        print(f"Row {index}: {row['OpenAICaption']}")
        num += 1
# `num` was previously counted but never shown.
print(f"{num} rows have a caption")
print (len(train_df))

## Other Misc Code: Process a single video from link -> description

### 0. Process YouTube Videos

In [None]:
from pytube import YouTube
import os

# sample YouTube video URL
youtube_url = 'https://www.youtube.com/watch?v=UNo0TG9LwwI'

# Download video: pick the highest-resolution progressive (audio+video) mp4.
yt = YouTube(youtube_url)
stream = yt.streams.filter(progressive=True, file_extension='mp4').order_by('resolution').desc().first()
# first() yields None when no stream matches the filter; fail with a clear
# message instead of an AttributeError on the download call.
if stream is None:
    raise RuntimeError(f"No progressive mp4 stream available for {youtube_url}")
# exist_ok avoids the check-then-create race of a separate exists() test.
os.makedirs('videos', exist_ok=True)
download_path = stream.download('videos')

print(f"Downloaded video to {download_path}")

In [None]:
import cv2
import base64

# Sample roughly 100 evenly spaced frames from the downloaded video and keep
# each one as a base64-encoded JPEG string in `base64Frames`.
video = cv2.VideoCapture(download_path)

base64Frames = []
try:
    # Get the total number of frames in the video
    total_frames = int(video.get(cv2.CAP_PROP_FRAME_COUNT))

    # Calculate the interval for sampling frames. Clamp to at least 1: for a
    # video with fewer than 100 frames the integer division gives 0, which
    # would re-read frame 0 forever.
    interval = max(1, total_frames // 100)

    current_frame = 0

    while video.isOpened():
        # Set the position of the next frame to be read
        video.set(cv2.CAP_PROP_POS_FRAMES, current_frame)

        success, frame = video.read()
        if not success:
            break

        # Only keep frames that were actually encoded.
        encoded_ok, buffer = cv2.imencode(".jpg", frame)
        if encoded_ok:
            base64Frames.append(base64.b64encode(buffer).decode("utf-8"))

        # Move to the next interval
        current_frame += interval

        # Stop if we've reached the end of the video
        if current_frame >= total_frames:
            break
finally:
    # Release the capture even if reading or encoding raised.
    video.release()

print(len(base64Frames), "frames read.")

In [None]:
# Build one user message holding the instruction text followed by every 50th
# sampled frame, send it to the vision model, and print the reply.
frame_parts = [{"image": frame, "resize": 768} for frame in base64Frames[0::50]]
instruction = "These are frames from a video that I want to create a representative thumbnail image for. Generate a short description of the video that would be important for creating a good thumbnail image."

PROMPT_MESSAGES = [
    {
        "role": "user",
        "content": [instruction] + frame_parts,
    },
]
params = dict(
    model="gpt-4-vision-preview",
    messages=PROMPT_MESSAGES,
    max_tokens=200,
)

result = client.chat.completions.create(**params)
reply = result.choices[0].message
print(reply.content)
