- **Import libraries**

In [42]:
import os
from moviepy import VideoFileClip
import cv2
import pandas as pd
from math import ceil
import base64
from openai import OpenAI


In [43]:
def timestamp_to_seconds(timestamp, fps=30):
    hh, mm, ss, ff = map(int, timestamp.split(':'))
    total_seconds = hh * 3600 + mm * 60 + ss + ff / fps
    return total_seconds

- **Function for extracting screenshots in certain time**

In [44]:
def get_screenshots_and_processed_df(video_id):
    def extract_screenshots(video_path, timestamps, output_folder):
        # Check if the output folder exists, if not, create it
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        
        video = VideoFileClip(video_path)
        screenshot_paths = []

        # Iterate through timestamps
        for index, timestamp in enumerate(timestamps):
            # Set the video to the specified timestamp
            try:
                timestamp_float = timestamp_to_seconds(timestamp)
                frame = video.get_frame(timestamp_float)  # Ensure timestamp is a float
            except Exception as e:
                print(f"Error getting frame at timestamp {timestamp} : {e}")
                screenshot_paths.append(None)
                continue  # Skip to the next timestamp if there's an error

            # Convert the frame to BGR format (OpenCV uses BGR)
            frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            print(f"Frame Shape: {frame_bgr.shape}, Type: {frame_bgr.dtype}")

            # Save the screenshot
            output_path = f"{output_folder}/screenshot_{index+1}.png"
            success = cv2.imwrite(output_path, frame_bgr)
            if success:
                print(f"Screenshot saved: {output_path}")
                screenshot_paths.append(output_path)
            else:
                print(f"Failed to save screenshot: {output_path}")
                screenshot_paths.append(None)

        # Close the video file
        video.close()
        return screenshot_paths

    def process_num(n):
        return '0' + str(n) if n < 10 else str(n)

    def process_timestamps(timestamps):
        res = []
        for ts in timestamps: 
            ts = ts.split(' ')
            first = int(ts[0][-2:])
            second = int(ts[1][-2:])
            mid = round(abs(int(first) - int(second)) / 2)
            millisec = process_num(first+mid)
            res.append(ts[0][:-2] + millisec)
        return res

    # Load your dataset (assuming it's a CSV file with 'second' and 'video_link' columns)
    df = pd.read_csv(f'./data/{video_id}/transcript_{video_id}.csv')

    # Remove duplicate columns
    df = df.loc[:, ~df.columns.duplicated()]

    # Extract timestamps and video link
    timestamps_unprocessed = df['Zeit'].tolist()[1:]
    video_link = df['video_src'].iloc[0]  # Assuming all rows have the same video link

    # Specify the output folder for screenshots
    output_folder = f'./data/{video_id}/screenshots'

    processed_timestamps = process_timestamps(timestamps_unprocessed)
    
    # Ensure the lengths match
    if len(processed_timestamps) != len(timestamps_unprocessed):
        raise ValueError("Processed timestamps length does not match unprocessed timestamps length")

    screenshot_paths = extract_screenshots(video_link, processed_timestamps, output_folder)

    # Ensure the lengths match
    if len(screenshot_paths) != len(df) - 1:  # Adjust for the header row
        # Adjust the length of screenshot_paths to match the DataFrame
        screenshot_paths += [None] * ((len(df) - 1) - len(screenshot_paths))

    # Generate a DataFrame with the required information
    df_processed = df.iloc[1:][['Zeit', 'Übersetzung', 'Lexem/Gebärde', 'Lexem/Gebärde.1', 'Mund', 'video_src', 'video_id']].copy()
    df_processed['processed_timestamps'] = processed_timestamps
    df_processed['screenshot_path'] = screenshot_paths

    # Group by 'Übersetzung'
    df_grouped = df_processed.groupby('Übersetzung').agg({
        'Zeit': list,
        'Lexem/Gebärde': list,
        'Lexem/Gebärde.1': list,
        'Mund': list,
        'video_src': 'first',
        'video_id': 'first',
        'processed_timestamps': list,
        'screenshot_path': list
    }).reset_index()

    return df_grouped

In [100]:
video_id = '1176340'
df_grouped = get_screenshots_and_processed_df(video_id)

{'video_found': True, 'audio_found': False, 'metadata': {'major_brand': 'isom', 'minor_version': '512', 'compatible_brands': 'isomiso2avc1mp41', 'encoder': 'Lavf59.27.100', 'copyright': '2010-2023 DGS-Korpus, Universität Hamburg'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [1280, 360], 'bitrate': 574, 'fps': 50.0, 'codec_name': 'h264', 'profile': '(Constrained Baseline)', 'metadata': {'Metadata': '', 'handler_name': 'VideoHandler', 'vendor_id': '[0][0][0][0]', 'encoder': 'Lavc59.37.100 libx264'}}], 'input_number': 0}], 'duration': 718.74, 'bitrate': 576, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_name': 'h264', 'video_profile': '(Constrained Baseline)', 'video_size': [1280, 360], 'video_bitrate': 574, 'video_fps': 50.0, 'video_duration': 718.74, 'video_n_frames': 35937}
c:\Users\javie\Desktop\ABERDEEN\RESEARCH\sign-language-experiment\venv\Lib\site



Frame Shape: (360, 1280, 3), Type: uint8
Screenshot saved: ./data/1176340/screenshots/screenshot_3563.png




In [101]:
df_grouped.to_json(f'./data/{video_id}/processed_transcript_{video_id}.json', index=False)

- **Calculate tokens**
    - According to: https://community.openai.com/t/how-do-i-calculate-image-tokens-in-gpt4-vision/492318/2

In [45]:
from PIL import Image
from math import ceil

def resize(width, height):
    if width > 1024 or height > 1024:
        if width > height:
            height = int(height * 1024 / width)
            width = 1024
        else:
            width = int(width * 1024 / height)
            height = 1024
    return width, height

def count_image_tokens(width: int, height: int):
    width, height = resize(width, height)
    h = ceil(height / 512)
    w = ceil(width / 512)
    total = 85 + 170 * h * w
    return total

def calculate_tokens_from_image(src: str):
    with Image.open(src) as img:
        width, height = img.size
    return count_image_tokens(width, height)

# Example usage
src = './data/1176340/screenshots/screenshot_1.png'
tokens = calculate_tokens_from_image(src)
print(f'Tokens: {tokens}')
total_image_tokens = 3569*tokens
print(f'Total Image Tokens: {total_image_tokens}')

Tokens: 425
Total Image Tokens: 1516825


In [46]:
def calculate_transcript_stats(data_dir='./data'):
    total_rows = 0
    transcript_counts = []
    total_videos = 0

    for folder_name in os.listdir(data_dir):
        folder_path = os.path.join(data_dir, folder_name)
        if os.path.isdir(folder_path):
            transcript_path = os.path.join(folder_path, f'transcript_{folder_name}.csv')
            if os.path.exists(transcript_path):
                df = pd.read_csv(transcript_path)
                row_count = len(df)
                total_rows += row_count
                transcript_counts.append(row_count)
                total_videos += 1

    average_rows = total_rows / len(transcript_counts) if transcript_counts else 0
    return total_rows, average_rows, total_videos

# Example usage
total_rows, average_rows, total_videos = calculate_transcript_stats()
print(f'Total videos: {total_videos}')
print(f'Total rows: {total_rows}')
print(f'Average rows per transcript: {average_rows}')

Total videos: 423
Total rows: 746367
Average rows per transcript: 1764.4609929078015


- **Function to call open ai with the images**

In [67]:
def analyze_image(image_path, prompt):
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    with open(image_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode('utf-8')

    messages = [
        {
            "role": "system",
            "content": prompt
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Analyze the image and provide insights."},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}",
                        "detail": "high"
                    },
                },
            ],
        }
    ]

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=1024,
    )

    return response.choices[0].message.content


- **Prompt that will be used**
    - tokens: 114

In [48]:
prompt = """
    I have an image with two people using German Sign Language (DGS). 
    Please identify the DGS glosses based on the hand movements without 
    interpreting the meaning. I only want the gloss, the direct word 
    translation in German, the person (either 'left' or 'right'), and 
    the hand used (either 'left' or 'right'). Provide the information 
    in the following JSON format:

    {
        "gloss": "",
        "word": "",
        "person": "",
        "hand": ""
    }
"""

prompt_tokens = 114
total_prompt_tokens = 3569*prompt_tokens
print(f'Total Prompt Tokens: {total_prompt_tokens}')
print(f"Total tokens used for the image and prompt: {total_image_tokens + total_prompt_tokens}")

Total Prompt Tokens: 406866
Total tokens used for the image and prompt: 1923691


## Experiment with images
- Sample taken from video: 1176340

- **Import dataset with images and the sources to images**

In [49]:
grouped_images_df = pd.read_json('./data/1176340/processed_transcript_1176340.json')
grouped_images_df.head()

Unnamed: 0,Übersetzung,Zeit,Lexem/Gebärde,Lexem/Gebärde.1,Mund,video_src,video_id,processed_timestamps,screenshot_path
0,"Aber das ist eigentlich auch egal, weil ich de...","[00:00:34:01 00:00:34:09, 00:00:34:09 00:00:34...","[None, EGAL3*, None, $GEST-ABWINKEN1^*, None, ...","[None, EGAL3*, None, $GEST-ABWINKEN1^*, None, ...","[None, egal, None, None, None, da, None, mut, ...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:00:34:05, 00:00:34:12, 00:00:34:18, 00:00:...",[./data/1176340/screenshots/screenshot_198.png...
1,Aber was?,"[00:03:37:13 00:03:37:19, 00:03:37:19 00:03:38...","[None, $ORAL^, None]","[None, $ORAL^, None]","[None, ??, None]",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:03:37:16, 00:03:37:21, 00:03:38:16]",[./data/1176340/screenshots/screenshot_1148.pn...
2,"Ach ja, das war mit dir zusammen.","[00:02:58:28 00:02:58:41, 00:02:58:41 00:02:58...","[None, None, DU1, None, None, UNTER1A^*, None,...","[None, None, None, None, None, None, None, Non...","[None, None, None, None, None, None, None, Non...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:02:58:34, 00:02:58:45, 00:02:58:72, 00:02:...",[./data/1176340/screenshots/screenshot_958.png...
3,"Als meine Schwester dann wegging, kam daraufhi...","[00:06:35:17 00:06:35:21, 00:06:35:21 00:06:35...","[$INDEX1, None, SCHWESTER1A*, None, PLÖTZLICH4...","[None, None, SCHWESTER1A*, None, None, None, N...","[None, None, schwester, None, [MG], None, [MG]...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:06:35:19, 00:06:35:24, 00:06:35:31, 00:06:...",[./data/1176340/screenshots/screenshot_2016.pn...
4,Als meine Schwester und ich klein waren/,"[00:02:01:05 00:02:01:13, 00:02:01:13 00:02:01...","[None, GRUND4B*, None, $INDEX1, $INDEX1, None,...","[None, GRUND4B*, None, None, None, None, None,...","[None, grund, None, None, None, None, None, No...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:02:01:09, 00:02:01:18, 00:02:01:27, 00:02:...",[./data/1176340/screenshots/screenshot_658.png...


In [50]:
grouped_images_df.size

2349

- **Get sample of 10 sentences**
    - Use 4 as the seed number

In [51]:
seed_number = 4
sampled_df = grouped_images_df.sample(n=10, random_state=seed_number)
sampled_df

Unnamed: 0,Übersetzung,Zeit,Lexem/Gebärde,Lexem/Gebärde.1,Mund,video_src,video_id,processed_timestamps,screenshot_path
134,Ich kann bei der Arbeit nicht durchgehend so t...,"[00:01:35:00 00:01:35:08, 00:01:35:08 00:01:35...","[None, ICH1, None, ARBEITEN1*, None, IMMER1C, ...","[None, None, None, ARBEITEN1*, None, None, Non...","[None, None, None, arbeite, None, immer, None,...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:01:35:04, 00:01:35:10, 00:01:35:13, 00:01:...",[./data/1176340/screenshots/screenshot_550.png...
83,"Es ist auch eine schöne Erinnerung, wie wir fr...","[00:03:42:04 00:03:42:11, 00:03:42:11 00:03:42...","[None, SCHÖN1A*, None, WAR1*, None, JA1A, None...","[None, None, None, None, None, None, None, Non...","[None, None, None, war, None, ja, None, schön,...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:03:42:08, 00:03:42:15, 00:03:42:20, 00:03:...",[./data/1176340/screenshots/screenshot_1155.pn...
218,Sie wollten das zunächst erstmal unter sich be...,"[00:10:38:40 00:10:38:41, 00:10:38:41 00:10:39...","[None, $GEST^, None, WIMMELN1^*, None, BITTE1A...","[None, $GEST^, None, WIMMELN1^*, None, None, N...","[None, None, None, None, None, [MG], None, Non...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:10:38:40, 00:10:38:55, 00:10:39:19, 00:10:...",[./data/1176340/screenshots/screenshot_3207.pn...
118,Ich blieb zu Hause.,"[00:10:25:10 00:10:25:11, 00:10:25:11 00:10:25...","[None, ICH1, None, BLEIBEN2*, None]","[None, None, None, None, None]","[None, None, None, bleibe, None]",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:10:25:10, 00:10:25:15, 00:10:25:23, 00:10:...",[./data/1176340/screenshots/screenshot_3129.pn...
101,"Für mich war es wichtig, dass ich mich mit mei...","[00:06:09:19 00:06:09:27, 00:06:09:27 00:06:09...","[None, ICH1, None, WICHTIG1, None, ICH1, None,...","[None, None, None, WICHTIG1, None, None, None,...","[None, [MG], None, wichtig, None, schwester, s...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:06:09:23, 00:06:09:33, 00:06:09:42, 00:06:...",[./data/1176340/screenshots/screenshot_1849.pn...
242,"Wenn wir gebärdeten, machte er kleine Fehler u...","[00:06:48:40 00:06:48:48, 00:06:48:48 00:06:49...","[None, VERGEBÄRDEN2*, None, ICH1, None]","[None, VERGEBÄRDEN2*, None, None, None]","[None, [MG] schief, None, None, None]",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:06:48:44, 00:06:48:58, 00:06:49:31, 00:06:...",[./data/1176340/screenshots/screenshot_2109.pn...
215,Sie ist meine Schwester/,"[00:06:38:37 00:06:38:45, 00:06:38:45 00:06:38...","[None, SCHWESTER1A*, None, MEIN1, None, SCHWES...","[None, SCHWESTER1A*, None, None, None, SCHWEST...","[None, sch{wester}, None, meine, None, schwest...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:06:38:41, 00:06:38:47, 00:06:38:71, 00:06:...",[./data/1176340/screenshots/screenshot_2042.pn...
6,"Also über meine Firma, da habe ich 2006 zum er...","[00:00:12:21 00:00:12:29, 00:00:12:29 00:00:12...","[None, FIRMA1B*, None, ZUSAMMENHANG1A*, None, ...","[None, None, None, ZUSAMMENHANG1A*, None, None...","[None, zusammenhang, zusammenhang, zusammenhan...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:00:12:25, 00:00:12:33, 00:00:12:40, 00:00:...","[./data/1176340/screenshots/screenshot_69.png,..."
258,"Zuerst waren wir zusammen in Schleswig, bis si...","[00:02:09:26 00:02:09:33, 00:02:09:33 00:02:09...","[None, SCHLESWIG1*, None, ZUERST1A*, None, SCH...","[None, None, None, None, None, None, None, ZUS...","[None, schleswig, None, zuerst, None, schleswi...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:02:09:30, 00:02:09:39, 00:02:09:65, 00:02:...",[./data/1176340/screenshots/screenshot_721.png...
108,"Ich beginne das Thema von Anfang an, so wie bi...","[00:00:07:48 00:00:08:11, 00:00:08:11 00:00:08...","[None, ÜBER1, None, THEMA1*, None, ANFANG1A*, ...","[None, None, None, None, None, ANFANG1A*, None...","[None, über, None, thema, None, anfangen, anfa...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:00:07:66, 00:00:08:13, 00:00:08:18, 00:00:...","[./data/1176340/screenshots/screenshot_43.png,..."


- **Ungroup sampled dataset**

In [78]:
import pandas as pd

# Create an empty DataFrame with the desired columns
ungrouped_df = pd.DataFrame(columns=['Zeit', 'Lexem/Gebärde', 'Lexem/Gebärde.1', 'Mund', 'processed_timestamps', 'screenshot_path', 'Übersetzung', 'video_id'])
ungrouped_list = []

# Iterate over the sampled DataFrame and ungroup the data
for index, row in sampled_df.iterrows():
    group_cols = ['Zeit', 'Lexem/Gebärde', 'Lexem/Gebärde.1', 'Mund', 'processed_timestamps', 'screenshot_path']
    ubersetzung = row['Übersetzung']
    print("video: ", row['video_id'])
    print("Translation words number: ", len(ubersetzung.split(' ')))
    print("Zeit number: ", len(row['Zeit']))
    print("glosses number: ", len([x for x in row['Lexem/Gebärde']+row['Lexem/Gebärde.1'] if x is not None]))
    ungrouped_per_sentence = pd.DataFrame(columns=['Zeit', 'Lexem/Gebärde', 'Lexem/Gebärde.1', 'Mund', 'processed_timestamps', 'screenshot_path', 'Übersetzung', 'video_id'])
    for i in range(len(row['Zeit'])):
        group_data = {col: row[col][i] for col in group_cols}
        group_data['Übersetzung'] = row['Übersetzung']
        group_data['video_id'] = row['video_id']
        
        # Append the ungrouped data to the DataFrame
        ungrouped_df = pd.concat([ungrouped_df, pd.DataFrame([group_data])], ignore_index=True)
        ungrouped_per_sentence = pd.concat([ungrouped_per_sentence, pd.DataFrame([group_data])], ignore_index=True)
    ungrouped_list.append(ungrouped_per_sentence)

video:  1176340
Translation words number:  14
Zeit number:  14
glosses number:  9
video:  1176340
Translation words number:  18
Zeit number:  34
glosses number:  23
video:  1176340
Translation words number:  16
Zeit number:  20
glosses number:  14
video:  1176340
Translation words number:  4
Zeit number:  5
glosses number:  2
video:  1176340
Translation words number:  15
Zeit number:  16
glosses number:  12
video:  1176340
Translation words number:  10
Zeit number:  5
glosses number:  3
video:  1176340
Translation words number:  4
Zeit number:  9
glosses number:  6
video:  1176340
Translation words number:  12
Zeit number:  19
glosses number:  11
video:  1176340
Translation words number:  16
Zeit number:  19
glosses number:  12
video:  1176340
Translation words number:  10
Zeit number:  12
glosses number:  10


In [65]:
ungrouped_df.head()

Unnamed: 0,Zeit,Lexem/Gebärde,Lexem/Gebärde.1,Mund,processed_timestamps,screenshot_path,Übersetzung,video_id
0,00:01:35:00 00:01:35:08,,,,00:01:35:04,./data/1176340/screenshots/screenshot_550.png,Ich kann bei der Arbeit nicht durchgehend so t...,1176340
1,00:01:35:08 00:01:35:11,ICH1,,,00:01:35:10,./data/1176340/screenshots/screenshot_551.png,Ich kann bei der Arbeit nicht durchgehend so t...,1176340
2,00:01:35:11 00:01:35:15,,,,00:01:35:13,./data/1176340/screenshots/screenshot_552.png,Ich kann bei der Arbeit nicht durchgehend so t...,1176340
3,00:01:35:15 00:01:35:21,ARBEITEN1*,ARBEITEN1*,arbeite,00:01:35:18,./data/1176340/screenshots/screenshot_553.png,Ich kann bei der Arbeit nicht durchgehend so t...,1176340
4,00:01:35:21 00:01:35:25,,,,00:01:35:23,./data/1176340/screenshots/screenshot_554.png,Ich kann bei der Arbeit nicht durchgehend so t...,1176340


In [79]:
ungrouped_list

[                       Zeit Lexem/Gebärde Lexem/Gebärde.1        Mund  \
 0   00:01:35:00 00:01:35:08          None            None        None   
 1   00:01:35:08 00:01:35:11          ICH1            None        None   
 2   00:01:35:11 00:01:35:15          None            None        None   
 3   00:01:35:15 00:01:35:21    ARBEITEN1*      ARBEITEN1*     arbeite   
 4   00:01:35:21 00:01:35:25          None            None        None   
 5   00:01:35:25 00:01:35:31       IMMER1C            None       immer   
 6   00:01:35:31 00:01:35:38          None            None        None   
 7   00:01:35:38 00:01:35:48        $PROD*            None        [MG]   
 8   00:01:35:48 00:01:36:06          None            None        None   
 9   00:01:36:06 00:01:36:20        KANN1*          KANN1*  kann nicht   
 10  00:01:36:20 00:01:36:27          None            None  kann nicht   
 11  00:01:36:27 00:01:36:32          ICH2            None  kann nicht   
 12  00:01:36:32 00:01:36:38          

- **Experiment per sentence**

In [None]:
results = []
def analyze_sentence_image_sequences(df_sentence):
    for index, row in df_sentence.iterrows():
        lexem_gebarde = row['Lexem/Gebärde']
        lexem_gebarde_1 = row['Lexem/Gebärde.1']
        if lexem_gebarde is not None and lexem_gebarde_1 is not None:    
            ubersetzung = row['Übersetzung']
            video_id = row['video_id']
            image_path = row['screenshot_path']
            processed_timestamp = row['processed_timestamps']
            glosses = {
                "prueba": "prueba"
            }
            results.append({
                'video_id': video_id,
                'sentence': ubersetzung,
                'zeit': processed_timestamp,
                'image_path': image_path,
                'gtp_4o_result': glosses
            })
            print(f"Image Path: {image_path}")
            print(f"Glosses: {glosses}")
            print()

- **Experiment with each sentence**

In [80]:
len(ungrouped_list)

10