- **Import libraries**

In [102]:
import os
from moviepy import VideoFileClip
import cv2
import pandas as pd
from typing import Dict, Optional, List
import base64
from openai import OpenAI
import os
import pandas as pd
import random
from PIL import Image
from difflib import SequenceMatcher
import json

In [3]:
def timestamp_to_seconds(timestamp, fps=30):
    hh, mm, ss, ff = map(int, timestamp.split(':'))
    total_seconds = hh * 3600 + mm * 60 + ss + ff / fps
    return total_seconds

- **Function for extracting screenshots in certain time**

In [4]:
def get_screenshots_and_processed_df(video_id):
    def extract_screenshots(video_path, timestamps, output_folder):
        # Check if the output folder exists, if not, create it
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        
        video = VideoFileClip(video_path)
        screenshot_paths = []

        # Iterate through timestamps
        for index, timestamp in enumerate(timestamps):
            # Set the video to the specified timestamp
            try:
                timestamp_float = timestamp_to_seconds(timestamp)
                frame = video.get_frame(timestamp_float)  # Ensure timestamp is a float
            except Exception as e:
                print(f"Error getting frame at timestamp {timestamp} : {e}")
                screenshot_paths.append(None)
                continue  # Skip to the next timestamp if there's an error

            # Convert the frame to BGR format (OpenCV uses BGR)
            frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            print(f"Frame Shape: {frame_bgr.shape}, Type: {frame_bgr.dtype}")

            # Save the screenshot
            output_path = f"{output_folder}/screenshot_{index+1}.png"
            success = cv2.imwrite(output_path, frame_bgr)
            if success:
                print(f"Screenshot saved: {output_path}")
                screenshot_paths.append(output_path)
            else:
                print(f"Failed to save screenshot: {output_path}")
                screenshot_paths.append(None)

        # Close the video file
        video.close()
        return screenshot_paths

    def process_num(n):
        return '0' + str(n) if n < 10 else str(n)

    def process_timestamps(timestamps):
        res = []
        for ts in timestamps: 
            ts = ts.split(' ')
            first = int(ts[0][-2:])
            second = int(ts[1][-2:])
            mid = round(abs(int(first) - int(second)) / 2)
            millisec = process_num(first+mid)
            res.append(ts[0][:-2] + millisec)
        return res

    # Load your dataset (assuming it's a CSV file with 'second' and 'video_link' columns)
    df = pd.read_csv(f'./data/{video_id}/transcript_{video_id}.csv')

    # Remove duplicate columns
    df = df.loc[:, ~df.columns.duplicated()]

    # Extract timestamps and video link
    timestamps_unprocessed = df['Zeit'].tolist()[1:]
    video_link = df['video_src'].iloc[0]  # Assuming all rows have the same video link

    # Specify the output folder for screenshots
    output_folder = f'./data/{video_id}/screenshots'

    processed_timestamps = process_timestamps(timestamps_unprocessed)
    
    # Ensure the lengths match
    if len(processed_timestamps) != len(timestamps_unprocessed):
        raise ValueError("Processed timestamps length does not match unprocessed timestamps length")

    screenshot_paths = extract_screenshots(video_link, processed_timestamps, output_folder)

    # Ensure the lengths match
    if len(screenshot_paths) != len(df) - 1:  # Adjust for the header row
        # Adjust the length of screenshot_paths to match the DataFrame
        screenshot_paths += [None] * ((len(df) - 1) - len(screenshot_paths))

    # Generate a DataFrame with the required information
    df_processed = df.iloc[1:][['Zeit', 'Übersetzung', 'Lexem/Gebärde', 'Lexem/Gebärde.1', 'Mund', 'video_src', 'video_id']].copy()
    df_processed['processed_timestamps'] = processed_timestamps
    df_processed['screenshot_path'] = screenshot_paths

    # Group by 'Übersetzung'
    df_grouped = df_processed.groupby('Übersetzung').agg({
        'Zeit': list,
        'Lexem/Gebärde': list,
        'Lexem/Gebärde.1': list,
        'Mund': list,
        'video_src': 'first',
        'video_id': 'first',
        'processed_timestamps': list,
        'screenshot_path': list
    }).reset_index()

    return df_grouped

In [100]:
video_id = '1176340'
df_grouped = get_screenshots_and_processed_df(video_id)

{'video_found': True, 'audio_found': False, 'metadata': {'major_brand': 'isom', 'minor_version': '512', 'compatible_brands': 'isomiso2avc1mp41', 'encoder': 'Lavf59.27.100', 'copyright': '2010-2023 DGS-Korpus, Universität Hamburg'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [1280, 360], 'bitrate': 574, 'fps': 50.0, 'codec_name': 'h264', 'profile': '(Constrained Baseline)', 'metadata': {'Metadata': '', 'handler_name': 'VideoHandler', 'vendor_id': '[0][0][0][0]', 'encoder': 'Lavc59.37.100 libx264'}}], 'input_number': 0}], 'duration': 718.74, 'bitrate': 576, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_name': 'h264', 'video_profile': '(Constrained Baseline)', 'video_size': [1280, 360], 'video_bitrate': 574, 'video_fps': 50.0, 'video_duration': 718.74, 'video_n_frames': 35937}
c:\Users\javie\Desktop\ABERDEEN\RESEARCH\sign-language-experiment\venv\Lib\site



Frame Shape: (360, 1280, 3), Type: uint8
Screenshot saved: ./data/1176340/screenshots/screenshot_3563.png




In [None]:
df_grouped.to_json(f'./data/{video_id}/processed_transcript_{video_id}.json', index=False)

- **Calculate tokens**
    - According to: https://community.openai.com/t/how-do-i-calculate-image-tokens-in-gpt4-vision/492318/2

In [6]:
from PIL import Image
from math import ceil

def resize(width, height):
    if width > 1024 or height > 1024:
        if width > height:
            height = int(height * 1024 / width)
            width = 1024
        else:
            width = int(width * 1024 / height)
            height = 1024
    return width, height

def count_image_tokens(width: int, height: int):
    width, height = resize(width, height)
    h = ceil(height / 512)
    w = ceil(width / 512)
    total = 85 + 170 * h * w
    return total

def calculate_tokens_from_image(src: str):
    with Image.open(src) as img:
        width, height = img.size
    return count_image_tokens(width, height)

# Example usage
src = './data/1176340/screenshots/screenshot_1.png'
tokens = calculate_tokens_from_image(src)
print(f'Tokens: {tokens}')
total_image_tokens = 3569*tokens
print(f'Total Image Tokens: {total_image_tokens}')

Tokens: 425
Total Image Tokens: 1516825


In [7]:
def calculate_transcript_stats(data_dir='./data'):
    total_rows = 0
    transcript_counts = []
    total_videos = 0

    for folder_name in os.listdir(data_dir):
        folder_path = os.path.join(data_dir, folder_name)
        if os.path.isdir(folder_path):
            transcript_path = os.path.join(folder_path, f'transcript_{folder_name}.csv')
            if os.path.exists(transcript_path):
                df = pd.read_csv(transcript_path)
                row_count = len(df)
                total_rows += row_count
                transcript_counts.append(row_count)
                total_videos += 1

    average_rows = total_rows / len(transcript_counts) if transcript_counts else 0
    return total_rows, average_rows, total_videos

# Example usage
total_rows, average_rows, total_videos = calculate_transcript_stats()
print(f'Total videos: {total_videos}')
print(f'Total rows: {total_rows}')
print(f'Average rows per transcript: {average_rows}')

Total videos: 423
Total rows: 746367
Average rows per transcript: 1764.4609929078015


- **Function to call open ai with the images**

In [8]:
def analyze_image(image_path, prompt):
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    with open(image_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode('utf-8')

    messages = [
        {
            "role": "system",
            "content": prompt
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Analyze the image and provide insights."},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}",
                        "detail": "high"
                    },
                },
            ],
        }
    ]

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=messages,
        max_tokens=1024,
    )

    return response.choices[0].message.content


- **Prompt that will be used**
    - tokens: 114

In [9]:
prompt = """
    I have an image with two people using German Sign Language (DGS). 
    Please identify the DGS glosses based on the hand movements without 
    interpreting the meaning. I only want the gloss, the direct word 
    translation in German, the person (either 'left' or 'right'), and 
    the hand used (either 'left' or 'right'). Provide the information 
    in the following JSON format:

    {
        "gloss": "",
        "word": "",
        "person": "",
        "hand": ""
    }
"""

prompt_tokens = 114
total_prompt_tokens = 3569*prompt_tokens
print(f'Total Prompt Tokens: {total_prompt_tokens}')
print(f"Total tokens used for the image and prompt: {total_image_tokens + total_prompt_tokens}")

Total Prompt Tokens: 406866
Total tokens used for the image and prompt: 1923691


## Experiment with images
- Sample taken from video: 1176340

- **Import dataset with images and the sources to images**

In [10]:
grouped_images_df = pd.read_json('./data/1176340/processed_transcript_1176340.json')
grouped_images_df.head()

Unnamed: 0,Übersetzung,Zeit,Lexem/Gebärde,Lexem/Gebärde.1,Mund,video_src,video_id,processed_timestamps,screenshot_path
0,"Aber das ist eigentlich auch egal, weil ich de...","[00:00:34:01 00:00:34:09, 00:00:34:09 00:00:34...","[None, EGAL3*, None, $GEST-ABWINKEN1^*, None, ...","[None, EGAL3*, None, $GEST-ABWINKEN1^*, None, ...","[None, egal, None, None, None, da, None, mut, ...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:00:34:05, 00:00:34:12, 00:00:34:18, 00:00:...",[./data/1176340/screenshots/screenshot_198.png...
1,Aber was?,"[00:03:37:13 00:03:37:19, 00:03:37:19 00:03:38...","[None, $ORAL^, None]","[None, $ORAL^, None]","[None, ??, None]",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:03:37:16, 00:03:37:21, 00:03:38:16]",[./data/1176340/screenshots/screenshot_1148.pn...
2,"Ach ja, das war mit dir zusammen.","[00:02:58:28 00:02:58:41, 00:02:58:41 00:02:58...","[None, None, DU1, None, None, UNTER1A^*, None,...","[None, None, None, None, None, None, None, Non...","[None, None, None, None, None, None, None, Non...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:02:58:34, 00:02:58:45, 00:02:58:72, 00:02:...",[./data/1176340/screenshots/screenshot_958.png...
3,"Als meine Schwester dann wegging, kam daraufhi...","[00:06:35:17 00:06:35:21, 00:06:35:21 00:06:35...","[$INDEX1, None, SCHWESTER1A*, None, PLÖTZLICH4...","[None, None, SCHWESTER1A*, None, None, None, N...","[None, None, schwester, None, [MG], None, [MG]...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:06:35:19, 00:06:35:24, 00:06:35:31, 00:06:...",[./data/1176340/screenshots/screenshot_2016.pn...
4,Als meine Schwester und ich klein waren/,"[00:02:01:05 00:02:01:13, 00:02:01:13 00:02:01...","[None, GRUND4B*, None, $INDEX1, $INDEX1, None,...","[None, GRUND4B*, None, None, None, None, None,...","[None, grund, None, None, None, None, None, No...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:02:01:09, 00:02:01:18, 00:02:01:27, 00:02:...",[./data/1176340/screenshots/screenshot_658.png...


In [11]:
grouped_images_df.size

2349

- **Get sample of 10 sentences**
    - Use 4 as the seed number

In [12]:
seed_number = 4
sampled_df = grouped_images_df.sample(n=10, random_state=seed_number)
sampled_df

Unnamed: 0,Übersetzung,Zeit,Lexem/Gebärde,Lexem/Gebärde.1,Mund,video_src,video_id,processed_timestamps,screenshot_path
134,Ich kann bei der Arbeit nicht durchgehend so t...,"[00:01:35:00 00:01:35:08, 00:01:35:08 00:01:35...","[None, ICH1, None, ARBEITEN1*, None, IMMER1C, ...","[None, None, None, ARBEITEN1*, None, None, Non...","[None, None, None, arbeite, None, immer, None,...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:01:35:04, 00:01:35:10, 00:01:35:13, 00:01:...",[./data/1176340/screenshots/screenshot_550.png...
83,"Es ist auch eine schöne Erinnerung, wie wir fr...","[00:03:42:04 00:03:42:11, 00:03:42:11 00:03:42...","[None, SCHÖN1A*, None, WAR1*, None, JA1A, None...","[None, None, None, None, None, None, None, Non...","[None, None, None, war, None, ja, None, schön,...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:03:42:08, 00:03:42:15, 00:03:42:20, 00:03:...",[./data/1176340/screenshots/screenshot_1155.pn...
218,Sie wollten das zunächst erstmal unter sich be...,"[00:10:38:40 00:10:38:41, 00:10:38:41 00:10:39...","[None, $GEST^, None, WIMMELN1^*, None, BITTE1A...","[None, $GEST^, None, WIMMELN1^*, None, None, N...","[None, None, None, None, None, [MG], None, Non...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:10:38:40, 00:10:38:55, 00:10:39:19, 00:10:...",[./data/1176340/screenshots/screenshot_3207.pn...
118,Ich blieb zu Hause.,"[00:10:25:10 00:10:25:11, 00:10:25:11 00:10:25...","[None, ICH1, None, BLEIBEN2*, None]","[None, None, None, None, None]","[None, None, None, bleibe, None]",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:10:25:10, 00:10:25:15, 00:10:25:23, 00:10:...",[./data/1176340/screenshots/screenshot_3129.pn...
101,"Für mich war es wichtig, dass ich mich mit mei...","[00:06:09:19 00:06:09:27, 00:06:09:27 00:06:09...","[None, ICH1, None, WICHTIG1, None, ICH1, None,...","[None, None, None, WICHTIG1, None, None, None,...","[None, [MG], None, wichtig, None, schwester, s...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:06:09:23, 00:06:09:33, 00:06:09:42, 00:06:...",[./data/1176340/screenshots/screenshot_1849.pn...
242,"Wenn wir gebärdeten, machte er kleine Fehler u...","[00:06:48:40 00:06:48:48, 00:06:48:48 00:06:49...","[None, VERGEBÄRDEN2*, None, ICH1, None]","[None, VERGEBÄRDEN2*, None, None, None]","[None, [MG] schief, None, None, None]",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:06:48:44, 00:06:48:58, 00:06:49:31, 00:06:...",[./data/1176340/screenshots/screenshot_2109.pn...
215,Sie ist meine Schwester/,"[00:06:38:37 00:06:38:45, 00:06:38:45 00:06:38...","[None, SCHWESTER1A*, None, MEIN1, None, SCHWES...","[None, SCHWESTER1A*, None, None, None, SCHWEST...","[None, sch{wester}, None, meine, None, schwest...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:06:38:41, 00:06:38:47, 00:06:38:71, 00:06:...",[./data/1176340/screenshots/screenshot_2042.pn...
6,"Also über meine Firma, da habe ich 2006 zum er...","[00:00:12:21 00:00:12:29, 00:00:12:29 00:00:12...","[None, FIRMA1B*, None, ZUSAMMENHANG1A*, None, ...","[None, None, None, ZUSAMMENHANG1A*, None, None...","[None, zusammenhang, zusammenhang, zusammenhan...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:00:12:25, 00:00:12:33, 00:00:12:40, 00:00:...","[./data/1176340/screenshots/screenshot_69.png,..."
258,"Zuerst waren wir zusammen in Schleswig, bis si...","[00:02:09:26 00:02:09:33, 00:02:09:33 00:02:09...","[None, SCHLESWIG1*, None, ZUERST1A*, None, SCH...","[None, None, None, None, None, None, None, ZUS...","[None, schleswig, None, zuerst, None, schleswi...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:02:09:30, 00:02:09:39, 00:02:09:65, 00:02:...",[./data/1176340/screenshots/screenshot_721.png...
108,"Ich beginne das Thema von Anfang an, so wie bi...","[00:00:07:48 00:00:08:11, 00:00:08:11 00:00:08...","[None, ÜBER1, None, THEMA1*, None, ANFANG1A*, ...","[None, None, None, None, None, ANFANG1A*, None...","[None, über, None, thema, None, anfangen, anfa...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:00:07:66, 00:00:08:13, 00:00:08:18, 00:00:...","[./data/1176340/screenshots/screenshot_43.png,..."


- **Ungroup dataset**

In [53]:
import pandas as pd

# Create an empty DataFrame with the desired columns
ungrouped_df = pd.DataFrame(columns=['Zeit', 'Lexem/Gebärde', 'Lexem/Gebärde.1', 'Mund', 'processed_timestamps', 'screenshot_path', 'Übersetzung', 'video_id'])
ungrouped_list = []

# Iterate over the sampled DataFrame and ungroup the data
for index, row in grouped_images_df.iterrows():
    group_cols = ['Zeit', 'Lexem/Gebärde', 'Lexem/Gebärde.1', 'Mund', 'processed_timestamps', 'screenshot_path']
    ubersetzung = row['Übersetzung']
    print("video: ", row['video_id'])
    print("Translation words number: ", len(ubersetzung.split(' ')))
    print("Zeit number: ", len(row['Zeit']))
    print("glosses number: ", len([x for x in row['Lexem/Gebärde']+row['Lexem/Gebärde.1'] if x is not None]))
    ungrouped_per_sentence = pd.DataFrame(columns=['Zeit', 'Lexem/Gebärde', 'Lexem/Gebärde.1', 'Mund', 'processed_timestamps', 'screenshot_path', 'Übersetzung', 'video_id'])
    for i in range(len(row['Zeit'])):
        group_data = {col: row[col][i] for col in group_cols}
        group_data['Übersetzung'] = row['Übersetzung']
        group_data['video_id'] = row['video_id']
        
        # Append the ungrouped data to the DataFrame
        ungrouped_df = pd.concat([ungrouped_df, pd.DataFrame([group_data])], ignore_index=True)
        ungrouped_per_sentence = pd.concat([ungrouped_per_sentence, pd.DataFrame([group_data])], ignore_index=True)
    ungrouped_list.append(ungrouped_per_sentence)

video:  1176340
Translation words number:  17
Zeit number:  13
glosses number:  11
video:  1176340
Translation words number:  2
Zeit number:  3
glosses number:  2
video:  1176340
Translation words number:  7
Zeit number:  9
glosses number:  3
video:  1176340
Translation words number:  22
Zeit number:  26
glosses number:  17
video:  1176340
Translation words number:  7
Zeit number:  14
glosses number:  8
video:  1176340
Translation words number:  16
Zeit number:  20
glosses number:  15
video:  1176340
Translation words number:  12
Zeit number:  19
glosses number:  11
video:  1176340
Translation words number:  15
Zeit number:  19
glosses number:  13
video:  1176340
Translation words number:  13
Zeit number:  7
glosses number:  5
video:  1176340
Translation words number:  10
Zeit number:  6
glosses number:  4
video:  1176340
Translation words number:  13
Zeit number:  18
glosses number:  10
video:  1176340
Translation words number:  7
Zeit number:  8
glosses number:  4
video:  1176340
Tra

In [54]:
ungrouped_df.head()

Unnamed: 0,Zeit,Lexem/Gebärde,Lexem/Gebärde.1,Mund,processed_timestamps,screenshot_path,Übersetzung,video_id
0,00:00:34:01 00:00:34:09,,,,00:00:34:05,./data/1176340/screenshots/screenshot_198.png,"Aber das ist eigentlich auch egal, weil ich de...",1176340
1,00:00:34:09 00:00:34:15,EGAL3*,EGAL3*,egal,00:00:34:12,./data/1176340/screenshots/screenshot_199.png,"Aber das ist eigentlich auch egal, weil ich de...",1176340
2,00:00:34:15 00:00:34:21,,,,00:00:34:18,./data/1176340/screenshots/screenshot_200.png,"Aber das ist eigentlich auch egal, weil ich de...",1176340
3,00:00:34:21 00:00:34:30,$GEST-ABWINKEN1^*,$GEST-ABWINKEN1^*,,00:00:34:25,./data/1176340/screenshots/screenshot_201.png,"Aber das ist eigentlich auch egal, weil ich de...",1176340
4,00:00:34:30 00:00:34:35,,,,00:00:34:32,./data/1176340/screenshots/screenshot_202.png,"Aber das ist eigentlich auch egal, weil ich de...",1176340


In [55]:
ungrouped_list

[                       Zeit      Lexem/Gebärde    Lexem/Gebärde.1    Mund  \
 0   00:00:34:01 00:00:34:09               None               None    None   
 1   00:00:34:09 00:00:34:15             EGAL3*             EGAL3*    egal   
 2   00:00:34:15 00:00:34:21               None               None    None   
 3   00:00:34:21 00:00:34:30  $GEST-ABWINKEN1^*  $GEST-ABWINKEN1^*    None   
 4   00:00:34:30 00:00:34:35               None               None    None   
 5   00:00:34:35 00:00:34:39               DA1*               DA1*      da   
 6   00:00:34:39 00:00:34:47               None               None    None   
 7   00:00:34:47 00:00:35:01               None              MUT1A     mut   
 8   00:00:35:01 00:00:35:09               None               None    None   
 9   00:00:35:09 00:00:35:17   $GEST-ABWINKEN1^   $GEST-ABWINKEN1^  locker   
 10  00:00:35:17 00:00:35:20               None               None  locker   
 11  00:00:35:20 00:00:35:33            LOCKER1            LOCKE

- **Experiment per sentence**

In [16]:
results = []
def analyze_sentence_image_sequences(df_sentence):
    for index, row in df_sentence.iterrows():
        lexem_gebarde = row['Lexem/Gebärde']
        lexem_gebarde_1 = row['Lexem/Gebärde.1']
        if lexem_gebarde is not None and lexem_gebarde_1 is not None:    
            ubersetzung = row['Übersetzung']
            video_id = row['video_id']
            image_path = row['screenshot_path']
            processed_timestamp = row['processed_timestamps']
            glosses = {
                "prueba": "prueba"
            }
            results.append({
                'video_id': video_id,
                'sentence': ubersetzung,
                'zeit': processed_timestamp,
                'image_path': image_path,
                'gtp_4o_result': glosses
            })
            print(f"Image Path: {image_path}")
            print(f"Glosses: {glosses}")
            print()

In [56]:
ungrouped_df.size

27864

- Image without context: no successful

## Experiment with image sequencies
- Few sentences sample
- Send vocabulary dictioonary to model as context, then send sentence (sequence of images) for its translation
- Calculate or measure accuracy

- **Separate screenshots by left - right**
- Generate Vocabulary 

In [58]:
# Crear lista para el nuevo dataset
new_data = []

# Carpeta donde se guardarán las imágenes divididas
output_folder = "./data/1176340/screenshots_divided/"
os.makedirs(output_folder, exist_ok=True)

# Procesar cada fila del DataFrame
for index, row in ungrouped_df.iterrows():
    left_gloss = row["Lexem/Gebärde"]
    right_gloss = row["Lexem/Gebärde.1"]
    screenshot_path = row["screenshot_path"]
    timestamp = row["processed_timestamps"]
    sentence = row["Übersetzung"]

    # Verificar si el gloss existe
    left_exists = pd.notna(left_gloss)
    right_exists = pd.notna(right_gloss)

    # Si no hay gloss en ninguna columna, pasar a la siguiente fila
    if not left_exists and not right_exists:
        continue

    # Verificar si la imagen original existe
    if not os.path.exists(screenshot_path):
        print(f"Imagen no encontrada: {screenshot_path}")
        continue

    # Abrir imagen y dividir en dos
    with Image.open(screenshot_path) as img:
        width, height = img.size

        # Guardar solo si el gloss existe
        if left_exists:
            left_half = img.crop((0, 0, width // 2, height))
            left_path = os.path.join(output_folder, f"gloss_{index}_left.png")
            left_half.save(left_path)
            new_data.append([sentence, left_gloss, "left", screenshot_path, left_path, timestamp])

        if right_exists:
            right_half = img.crop((width // 2, 0, width, height))
            right_path = os.path.join(output_folder, f"gloss_{index}_right.png")
            right_half.save(right_path)
            new_data.append([sentence, right_gloss, "right", screenshot_path, right_path, timestamp])

# Crear DataFrame ordenado por timestamp
columns = ["sentence", "gloss", "position", "original_path", "new_path", "timestamp"]
sorted_df = pd.DataFrame(new_data, columns=columns).sort_values(by="timestamp")

# Guardar en CSV
sorted_df.to_csv("./data/1176340/gloss_dataset.csv", index=False, encoding="utf-8")

print("Gloss vocabulary: ./data/1176340/gloss_dataset.csv")

Gloss vocabulary: ./data/1176340/gloss_dataset.csv


In [59]:
sorted_df.size

14364

- **Create vocabulary dictionary**

In [67]:
def create_vocabulary_dictionary(df_ungrouped: pd.DataFrame) -> Dict[str, Dict]:
    """
    Create a vocabulary dictionary with glosses and associated images.
    
    Args:
        df_ungrouped: DataFrame with ungrouped sign language data
        
    Returns:
        Dictionary with glosses as keys and associated data (excluding the gloss column) as values.
    """
    vocabulary = {}

    # Obtener los glosses únicos
    unique_glosses = set(df_ungrouped["gloss"].dropna())

    for gloss in unique_glosses:
        # Filtrar DataFrame por gloss actual
        grouped_by_gloss = df_ungrouped[df_ungrouped["gloss"] == gloss]

        # Eliminar la columna "gloss" para evitar redundancia
        grouped_by_gloss = grouped_by_gloss.drop(columns=["gloss"]).reset_index(drop=True)

        # Convertir a diccionario
        vocabulary[gloss] = grouped_by_gloss.to_dict(orient="records")

    return vocabulary


- **Create sentences sample to send sequences**

In [81]:
def create_sentences_sample(df_ungrouped: pd.DataFrame, vocabulary: Dict[str, Dict], seed_number: int = 4, sample_n: int = 10) -> Dict[str, Dict]:
    """
    Create a sample of sentences with the sequence of glosses used.

    Args:
        df_ungrouped: DataFrame with ungrouped sign language data
        vocabulary: Dictionary containing glosses and their associated images
        seed_number: Random seed for reproducibility
        sample_n: Number of sentences to return as a sample

    Returns:
        Dictionary with sentences, their glosses, other data, and a vocabulary sample for the prompt.
    """
    random.seed(seed_number)
    result = {}

    # Obtener las oraciones únicas
    unique_sentences = list(df_ungrouped["sentence"].dropna().unique())

    # Seleccionar solo sample_n oraciones aleatorias
    sampled_sentences = random.sample(unique_sentences, min(sample_n, len(unique_sentences)))

    for sentence in sampled_sentences:
        # Filtrar DataFrame por la oración actual
        grouped_by_sentence = df_ungrouped[df_ungrouped["sentence"] == sentence].drop(columns=["sentence"]).reset_index(drop=True)

        # Obtener los glosses de la oración
        glosses = grouped_by_sentence["gloss"].dropna().tolist()

        vocabulary_sample_for_prompt = {}

        for gloss in glosses:
            if gloss in vocabulary:
                df_gloss = pd.DataFrame(vocabulary[gloss])
                if "new_path" in df_gloss.columns and not df_gloss.empty:
                    images_from_gloss = df_gloss["new_path"].dropna().tolist()
                    if images_from_gloss:
                        selected_image = random.choice(images_from_gloss)
                        vocabulary_sample_for_prompt[gloss] = selected_image

        # Añadir 4 glosses aleatorios para confundir al LLM
        all_glosses = list(vocabulary.keys())
        random_glosses = random.sample([g for g in all_glosses if g not in glosses], min(4, len(all_glosses)))

        for random_gloss in random_glosses:
            df_random_gloss = pd.DataFrame(vocabulary[random_gloss])
            if "new_path" in df_random_gloss.columns and not df_random_gloss.empty:
                images_from_random_gloss = df_random_gloss["new_path"].dropna().tolist()
                if images_from_random_gloss:
                    vocabulary_sample_for_prompt[random_gloss] = random.choice(images_from_random_gloss)

        # Shuffle vocabulary_sample_for_prompt to randomize order
        shuffled_vocab_sample = dict(random.sample(list(vocabulary_sample_for_prompt.items()), len(vocabulary_sample_for_prompt)))

        # Guardar los datos en el diccionario final
        result[sentence] = {
            "data": grouped_by_sentence.to_dict(orient="records"),
            "vocabulary_sample_for_prompt": shuffled_vocab_sample
        }

    return result


In [40]:
def encode_image_to_base64(image_path: str) -> Optional[str]:
    """
    Encode an image to base64 string.
    
    Args:
        image_path: Path to the image file
        
    Returns:
        Base64 encoded string of the image or None if the file doesn't exist
    """
    if not os.path.exists(image_path):
        print(f"Warning: Image file {image_path} does not exist.")
        return None
    
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

- **Generate vocabulary**

In [79]:
vocabulary_dict = create_vocabulary_dictionary(sorted_df)
print(f"Number of glosses: {len(list(vocabulary_dict))}")
print(vocabulary_dict)

Number of glosses: 675
{'MÜHE1': [{'sentence': 'Ich gebe mir nun mehr Mühe damit, alleine Dinge zu erledigen ohne einen Dolmetscher.', 'position': 'left', 'original_path': './data/1176340/screenshots/screenshot_2507.png', 'new_path': './data/1176340/screenshots_divided/gloss_1575_left.png', 'timestamp': '00:08:06:07'}, {'sentence': 'Mir ist es wichtig, dass Hörende auch gebärden und sich dabei Mühe geben.', 'position': 'left', 'original_path': './data/1176340/screenshots/screenshot_3547.png', 'new_path': './data/1176340/screenshots_divided/gloss_2628_left.png', 'timestamp': '00:11:56:12'}], 'WISSEN2B': [{'sentence': 'Weißt du das noch? Bayern München war immer ein Thema für uns.', 'position': 'left', 'original_path': './data/1176340/screenshots/screenshot_977.png', 'new_path': './data/1176340/screenshots_divided/gloss_3127_left.png', 'timestamp': '00:03:01:04'}, {'sentence': 'Dazu haben wir noch Essen organisiert, weil wir bereits an das Spiel gedacht haben und darauf gespannt waren.',

- **Generate sentences sample to experiment**

In [85]:
sample_to_experiment = create_sentences_sample(sorted_df, vocabulary_dict, seed_number=4, sample_n=5)
sample_to_experiment

{'Der Jugendwart kann z.B. initiieren, dass man einen Ausflug macht oder die verschiedenen Gruppen mehr zusammenkommen, sich kennenlernen und austauschen.': {'data': [{'gloss': 'JUNG1*',
    'position': 'left',
    'original_path': './data/1176340/screenshots/screenshot_1659.png',
    'new_path': './data/1176340/screenshots_divided/gloss_797_left.png',
    'timestamp': '00:05:16:16'},
   {'gloss': 'AUFPASSEN1B^*',
    'position': 'left',
    'original_path': './data/1176340/screenshots/screenshot_1661.png',
    'new_path': './data/1176340/screenshots_divided/gloss_799_left.png',
    'timestamp': '00:05:16:28'},
   {'gloss': 'AUFPASSEN1B^*',
    'position': 'right',
    'original_path': './data/1176340/screenshots/screenshot_1661.png',
    'new_path': './data/1176340/screenshots_divided/gloss_799_right.png',
    'timestamp': '00:05:16:28'},
   {'gloss': 'INHALT3*',
    'position': 'left',
    'original_path': './data/1176340/screenshots/screenshot_1663.png',
    'new_path': './data/1176

- **Experiment**

In [110]:
def compute_translation_accuracy(predicted: str, actual: str) -> float:
    """
    Compute similarity between predicted and actual sentences using SequenceMatcher.
    Returns a percentage match.
    """
    similarity = SequenceMatcher(None, predicted.lower(), actual.lower()).ratio()
    return similarity * 100

def evaluate_translation(result: Dict, expected_german: str) -> Dict:
    """
    Evaluate the translation accuracy against the expected German sentence.
    """
    predicted_german = result.get("german_translation", "")
    accuracy = compute_translation_accuracy(predicted_german, expected_german)

    return {
    "predicted_german": predicted_german,
    "expected_german": expected_german,
    "accuracy": accuracy,
    "success": accuracy >= 90 # Consider successful if accuracy is 90% or higher
    }

def translate_german_to_english(client: OpenAI, german_text: str) -> str:
    """
    Translate German text to English using OpenAI API.
    """
    prompt = f"Translate the following German text to English:\n\n{german_text}\n\nEnglish translation:"
    
    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=200
        )
        
        return response.choices[0].message.content.strip()
    except Exception as e:
        return f"Error translating to English: {str(e)}"

def translate_image_sequence(sample_of_sentences: Dict[str, Dict], sentence_index: int = 0) -> Dict:
    """
    Translate a sequence of images to glosses and sentences using OpenAI API.
    """
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    glosses = []
    sentence_experiment = list(sample_of_sentences)[sentence_index] 
    sentence = sample_of_sentences[sentence_experiment]["data"]
    image_vocabulary = sample_of_sentences[sentence_experiment]["vocabulary_sample_for_prompt"]
    content = [{"type": "text", "text": "VOCABULARY REFERENCE (Image → Gloss):"}]
    
    for i, (gloss, image_path) in enumerate(image_vocabulary.items(), 1):
        base64_image = encode_image_to_base64(image_path)
        if base64_image:
            content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}})
            content.append({"type": "text", "text": f"Vocabulary Image {i}: {gloss}"})
    
    content.append({"type": "text", "text": "\nSENTENCE TO TRANSLATE (Sequence of Images):"})
    
    for i, fragment in enumerate(sentence, 1):
        glosses.append(fragment["gloss"])
        base64_image = encode_image_to_base64(fragment["new_path"])
        if base64_image:
            content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}})
            content.append({"type": "text", "text": f"Sentence Image {i}"})
    
    prompt = """
    Based on the vocabulary reference images and their corresponding glosses provided above, 
    analyze the sequence of images representing a sentence and:
    
    1. Identify which glosses from the vocabulary each image in the sentence corresponds to
    2. Translate the sequence of glosses into a coherent German sentence
    
    Return your response in the following JSON format:
    {
        "predicted_glosses": ["gloss1", "gloss2", ...],
        "german_translation": "German sentence translation",
        "english_translation": "English sentence translation"
    }
    
    If you're uncertain about any image, make your best guess based on visual similarity to the vocabulary images.
    """
    
    content.append({"type": "text", "text": prompt})

    try:
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=[{"role": "user", "content": content}],
            max_tokens=1000,
            response_format={"type": "json_object"}
        )
        
        result = json.loads(response.choices[0].message.content)
        result["expected_glosses"] = glosses
        accuracy_result = evaluate_translation(result, sentence_experiment)
        result["accuracy"] = accuracy_result
        
        # Translate German to English
        english_translation = translate_german_to_english(client, sentence_experiment)
        result["expected_german_english_translation"] = english_translation
        
        return result
    
    except Exception as e:
        return {
            "error": str(e),
            "expected_glosses": glosses,
            "predicted_glosses": result["glosses"],
            "german_translation": "Error generating translation",
            "english_translation": "Error generating translation",
            "accuracy": {"success": False, "accuracy": 0},
            "accuracy_glosses": {"success": False, "accuracy": 0}
        }


In [111]:
translate_image_sequence(sample_to_experiment, 0)

{'predicted_glosses': ['BEISPIEL1*',
  'GRÜNDEN2',
  'REISE1C^',
  'BEISPIEL1*',
  'GRUPPE1A*',
  'BEIDE2A*',
  'GRÜNDEN2',
  'REISE1C^',
  'KENNENLERNEN1*'],
 'german_translation': 'Zum Beispiel Reise zu gründen, Gruppe beide kennenlernen',
 'english_translation': 'For example, to organize a trip, meet both groups',
 'expected_glosses': ['JUNG1*',
  'AUFPASSEN1B^*',
  'AUFPASSEN1B^*',
  'INHALT3*',
  'INHALT3*',
  'BEISPIEL1*',
  'BEISPIEL1*',
  'GRÜNDEN2',
  'GRÜNDEN2',
  '$GEST^',
  '$GEST^',
  'REISE1C^',
  'MACHEN1',
  'MACHEN1',
  'MEHR1*',
  'MEHR1*',
  'GRUPPE1A*',
  'GRUPPE1A*',
  'BEIDE2A*',
  'BEIDE2A*',
  'KENNENLERNEN1*',
  'KENNENLERNEN1*',
  'AUSTAUSCHEN-KOMMUNIKATION2*',
  'AUSTAUSCHEN-KOMMUNIKATION2*'],
 'accuracy': {'predicted_german': 'Zum Beispiel Reise zu gründen, Gruppe beide kennenlernen',
  'expected_german': 'Der Jugendwart kann z.B. initiieren, dass man einen Ausflug macht oder die verschiedenen Gruppen mehr zusammenkommen, sich kennenlernen und austauschen.',

In [112]:
translate_image_sequence(sample_to_experiment, 1)

{'predicted_glosses': ['BRUDER1A*',
  'GRUPPE1A^*',
  'BRUDER1A*',
  'DGS1',
  'GRUPPE1A^*'],
 'german_translation': 'Bruder Gruppe Bruder DGS Gruppe',
 'english_translation': 'Brother group brother sign language group',
 'expected_glosses': ['BRUDER1A*',
  'BRUDER1A*',
  'BESCHEID1A*',
  '$GEST^',
  '$GEST^'],
 'accuracy': {'predicted_german': 'Bruder Gruppe Bruder DGS Gruppe',
  'expected_german': 'Mein Bruder meinte zu mir: „Hey, warum hast du mir das nicht gesagt?“',
  'accuracy': 28.000000000000004,
  'success': False},
 'expected_german_english_translation': 'My brother said to me: "Hey, why didn\'t you tell me that?"'}

In [113]:
translate_image_sequence(sample_to_experiment, 2)

{'predicted_glosses': ['ICH1*', 'NUR2A'],
 'german_translation': 'Ich nur.',
 'english_translation': 'Only me.',
 'expected_glosses': ['$INDEX1*', 'ICH1*'],
 'accuracy': {'predicted_german': 'Ich nur.',
  'expected_german': 'Ich/',
  'accuracy': 50.0,
  'success': False},
 'expected_german_english_translation': 'I'}