- **Import libraries**

In [26]:
import os
from moviepy import VideoFileClip
import cv2
import pandas as pd
from typing import Dict, Optional, List
import base64
from openai import OpenAI
import os
import pandas as pd
import random
from PIL import Image
from difflib import SequenceMatcher
import json
from nltk.translate.bleu_score import sentence_bleu


In [2]:
def timestamp_to_seconds(timestamp, fps=30):
    hh, mm, ss, ff = map(int, timestamp.split(':'))
    total_seconds = hh * 3600 + mm * 60 + ss + ff / fps
    return total_seconds

- **Function for extracting screenshots in certain time**

In [3]:
def get_screenshots_and_processed_df(video_id):
    def extract_screenshots(video_path, timestamps, output_folder):
        # Check if the output folder exists, if not, create it
        if not os.path.exists(output_folder):
            os.makedirs(output_folder)
        
        video = VideoFileClip(video_path)
        screenshot_paths = []

        # Iterate through timestamps
        for index, timestamp in enumerate(timestamps):
            # Set the video to the specified timestamp
            try:
                timestamp_float = timestamp_to_seconds(timestamp)
                frame = video.get_frame(timestamp_float)  # Ensure timestamp is a float
            except Exception as e:
                print(f"Error getting frame at timestamp {timestamp} : {e}")
                screenshot_paths.append(None)
                continue  # Skip to the next timestamp if there's an error

            # Convert the frame to BGR format (OpenCV uses BGR)
            frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
            print(f"Frame Shape: {frame_bgr.shape}, Type: {frame_bgr.dtype}")

            # Save the screenshot
            output_path = f"{output_folder}/screenshot_{index+1}.png"
            success = cv2.imwrite(output_path, frame_bgr)
            if success:
                print(f"Screenshot saved: {output_path}")
                screenshot_paths.append(output_path)
            else:
                print(f"Failed to save screenshot: {output_path}")
                screenshot_paths.append(None)

        # Close the video file
        video.close()
        return screenshot_paths

    def process_num(n):
        return '0' + str(n) if n < 10 else str(n)

    def process_timestamps(timestamps):
        res = []
        for ts in timestamps: 
            ts = ts.split(' ')
            first = int(ts[0][-2:])
            second = int(ts[1][-2:])
            mid = round(abs(int(first) - int(second)) / 2)
            millisec = process_num(first+mid)
            res.append(ts[0][:-2] + millisec)
        return res

    # Load your dataset (assuming it's a CSV file with 'second' and 'video_link' columns)
    df = pd.read_csv(f'./data/{video_id}/transcript_{video_id}.csv')

    # Remove duplicate columns
    df = df.loc[:, ~df.columns.duplicated()]

    # Extract timestamps and video link
    timestamps_unprocessed = df['Zeit'].tolist()[1:]
    video_link = df['video_src'].iloc[0]  # Assuming all rows have the same video link

    # Specify the output folder for screenshots
    output_folder = f'./data/{video_id}/screenshots'

    processed_timestamps = process_timestamps(timestamps_unprocessed)
    
    # Ensure the lengths match
    if len(processed_timestamps) != len(timestamps_unprocessed):
        raise ValueError("Processed timestamps length does not match unprocessed timestamps length")

    screenshot_paths = extract_screenshots(video_link, processed_timestamps, output_folder)

    # Ensure the lengths match
    if len(screenshot_paths) != len(df) - 1:  # Adjust for the header row
        # Adjust the length of screenshot_paths to match the DataFrame
        screenshot_paths += [None] * ((len(df) - 1) - len(screenshot_paths))

    # Generate a DataFrame with the required information
    df_processed = df.iloc[1:][['Zeit', 'Übersetzung', 'Lexem/Gebärde', 'Lexem/Gebärde.1', 'Mund', 'video_src', 'video_id']].copy()
    df_processed['processed_timestamps'] = processed_timestamps
    df_processed['screenshot_path'] = screenshot_paths

    # Group by 'Übersetzung'
    df_grouped = df_processed.groupby('Übersetzung').agg({
        'Zeit': list,
        'Lexem/Gebärde': list,
        'Lexem/Gebärde.1': list,
        'Mund': list,
        'video_src': 'first',
        'video_id': 'first',
        'processed_timestamps': list,
        'screenshot_path': list
    }).reset_index()

    return df_grouped

In [None]:
video_id = '1176340'
df_grouped = get_screenshots_and_processed_df(video_id)

In [None]:
df_grouped.to_json(f'./data/{video_id}/processed_transcript_{video_id}.json', index=False)

- **Calculate tokens**
    - According to: https://community.openai.com/t/how-do-i-calculate-image-tokens-in-gpt4-vision/492318/2

In [4]:
from PIL import Image
from math import ceil

def resize(width, height):
    if width > 1024 or height > 1024:
        if width > height:
            height = int(height * 1024 / width)
            width = 1024
        else:
            width = int(width * 1024 / height)
            height = 1024
    return width, height

def count_image_tokens(width: int, height: int):
    width, height = resize(width, height)
    h = ceil(height / 512)
    w = ceil(width / 512)
    total = 85 + 170 * h * w
    return total

def calculate_tokens_from_image(src: str):
    with Image.open(src) as img:
        width, height = img.size
    return count_image_tokens(width, height)

# Example usage
src = './data/1176340/screenshots/screenshot_1.png'
tokens = calculate_tokens_from_image(src)
print(f'Tokens: {tokens}')
total_image_tokens = 3569*tokens
print(f'Total Image Tokens: {total_image_tokens}')

Tokens: 425
Total Image Tokens: 1516825


In [5]:
def calculate_transcript_stats(data_dir='./data'):
    total_rows = 0
    transcript_counts = []
    total_videos = 0

    for folder_name in os.listdir(data_dir):
        folder_path = os.path.join(data_dir, folder_name)
        if os.path.isdir(folder_path):
            transcript_path = os.path.join(folder_path, f'transcript_{folder_name}.csv')
            if os.path.exists(transcript_path):
                df = pd.read_csv(transcript_path)
                row_count = len(df)
                total_rows += row_count
                transcript_counts.append(row_count)
                total_videos += 1

    average_rows = total_rows / len(transcript_counts) if transcript_counts else 0
    return total_rows, average_rows, total_videos

# Example usage
total_rows, average_rows, total_videos = calculate_transcript_stats()
print(f'Total videos: {total_videos}')
print(f'Total rows: {total_rows}')
print(f'Average rows per transcript: {average_rows}')

Total videos: 423
Total rows: 746367
Average rows per transcript: 1764.4609929078015


- **Function to call open ai with the images**

In [6]:
def analyze_image(image_path, prompt, model="gpt-4o"):
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

    with open(image_path, "rb") as image_file:
        base64_image = base64.b64encode(image_file.read()).decode('utf-8')

    messages = [
        {
            "role": "system",
            "content": prompt
        },
        {
            "role": "user",
            "content": [
                {"type": "text", "text": "Analyze the image and provide insights."},
                {
                    "type": "image_url",
                    "image_url": {
                        "url": f"data:image/jpeg;base64,{base64_image}",
                        "detail": "high"
                    },
                },
            ],
        }
    ]

    response = client.chat.completions.create(
        model=model,
        messages=messages,
        max_tokens=1024,
    )

    return response.choices[0].message.content


- **Prompt that will be used**
    - tokens: 114

In [7]:
prompt = """
    I have an image with two people using German Sign Language (DGS). 
    Please identify the DGS glosses based on the hand movements without 
    interpreting the meaning. I only want the gloss, the direct word 
    translation in German, the person (either 'left' or 'right'), and 
    the hand used (either 'left' or 'right'). Provide the information 
    in the following JSON format:

    {
        "gloss": "",
        "word": "",
        "person": "",
        "hand": ""
    }
"""

prompt_tokens = 114
total_prompt_tokens = 3569*prompt_tokens
print(f'Total Prompt Tokens: {total_prompt_tokens}')
print(f"Total tokens used for the image and prompt: {total_image_tokens + total_prompt_tokens}")

Total Prompt Tokens: 406866
Total tokens used for the image and prompt: 1923691


## Experiment with images
- Sample taken from video: 1176340

- **Import dataset with images and the sources to images**

In [8]:
grouped_images_df = pd.read_json('./data/1176340/processed_transcript_1176340.json')
grouped_images_df.head()

Unnamed: 0,Übersetzung,Zeit,Lexem/Gebärde,Lexem/Gebärde.1,Mund,video_src,video_id,processed_timestamps,screenshot_path
0,"Aber das ist eigentlich auch egal, weil ich de...","[00:00:34:01 00:00:34:09, 00:00:34:09 00:00:34...","[None, EGAL3*, None, $GEST-ABWINKEN1^*, None, ...","[None, EGAL3*, None, $GEST-ABWINKEN1^*, None, ...","[None, egal, None, None, None, da, None, mut, ...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:00:34:05, 00:00:34:12, 00:00:34:18, 00:00:...",[./data/1176340/screenshots/screenshot_198.png...
1,Aber was?,"[00:03:37:13 00:03:37:19, 00:03:37:19 00:03:38...","[None, $ORAL^, None]","[None, $ORAL^, None]","[None, ??, None]",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:03:37:16, 00:03:37:21, 00:03:38:16]",[./data/1176340/screenshots/screenshot_1148.pn...
2,"Ach ja, das war mit dir zusammen.","[00:02:58:28 00:02:58:41, 00:02:58:41 00:02:58...","[None, None, DU1, None, None, UNTER1A^*, None,...","[None, None, None, None, None, None, None, Non...","[None, None, None, None, None, None, None, Non...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:02:58:34, 00:02:58:45, 00:02:58:72, 00:02:...",[./data/1176340/screenshots/screenshot_958.png...
3,"Als meine Schwester dann wegging, kam daraufhi...","[00:06:35:17 00:06:35:21, 00:06:35:21 00:06:35...","[$INDEX1, None, SCHWESTER1A*, None, PLÖTZLICH4...","[None, None, SCHWESTER1A*, None, None, None, N...","[None, None, schwester, None, [MG], None, [MG]...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:06:35:19, 00:06:35:24, 00:06:35:31, 00:06:...",[./data/1176340/screenshots/screenshot_2016.pn...
4,Als meine Schwester und ich klein waren/,"[00:02:01:05 00:02:01:13, 00:02:01:13 00:02:01...","[None, GRUND4B*, None, $INDEX1, $INDEX1, None,...","[None, GRUND4B*, None, None, None, None, None,...","[None, grund, None, None, None, None, None, No...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:02:01:09, 00:02:01:18, 00:02:01:27, 00:02:...",[./data/1176340/screenshots/screenshot_658.png...


In [9]:
grouped_images_df.size

2349

- **Get sample of 10 sentences**
    - Use 4 as the seed number

In [10]:
seed_number = 4
sampled_df = grouped_images_df.sample(n=10, random_state=seed_number)
sampled_df

Unnamed: 0,Übersetzung,Zeit,Lexem/Gebärde,Lexem/Gebärde.1,Mund,video_src,video_id,processed_timestamps,screenshot_path
134,Ich kann bei der Arbeit nicht durchgehend so t...,"[00:01:35:00 00:01:35:08, 00:01:35:08 00:01:35...","[None, ICH1, None, ARBEITEN1*, None, IMMER1C, ...","[None, None, None, ARBEITEN1*, None, None, Non...","[None, None, None, arbeite, None, immer, None,...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:01:35:04, 00:01:35:10, 00:01:35:13, 00:01:...",[./data/1176340/screenshots/screenshot_550.png...
83,"Es ist auch eine schöne Erinnerung, wie wir fr...","[00:03:42:04 00:03:42:11, 00:03:42:11 00:03:42...","[None, SCHÖN1A*, None, WAR1*, None, JA1A, None...","[None, None, None, None, None, None, None, Non...","[None, None, None, war, None, ja, None, schön,...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:03:42:08, 00:03:42:15, 00:03:42:20, 00:03:...",[./data/1176340/screenshots/screenshot_1155.pn...
218,Sie wollten das zunächst erstmal unter sich be...,"[00:10:38:40 00:10:38:41, 00:10:38:41 00:10:39...","[None, $GEST^, None, WIMMELN1^*, None, BITTE1A...","[None, $GEST^, None, WIMMELN1^*, None, None, N...","[None, None, None, None, None, [MG], None, Non...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:10:38:40, 00:10:38:55, 00:10:39:19, 00:10:...",[./data/1176340/screenshots/screenshot_3207.pn...
118,Ich blieb zu Hause.,"[00:10:25:10 00:10:25:11, 00:10:25:11 00:10:25...","[None, ICH1, None, BLEIBEN2*, None]","[None, None, None, None, None]","[None, None, None, bleibe, None]",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:10:25:10, 00:10:25:15, 00:10:25:23, 00:10:...",[./data/1176340/screenshots/screenshot_3129.pn...
101,"Für mich war es wichtig, dass ich mich mit mei...","[00:06:09:19 00:06:09:27, 00:06:09:27 00:06:09...","[None, ICH1, None, WICHTIG1, None, ICH1, None,...","[None, None, None, WICHTIG1, None, None, None,...","[None, [MG], None, wichtig, None, schwester, s...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:06:09:23, 00:06:09:33, 00:06:09:42, 00:06:...",[./data/1176340/screenshots/screenshot_1849.pn...
242,"Wenn wir gebärdeten, machte er kleine Fehler u...","[00:06:48:40 00:06:48:48, 00:06:48:48 00:06:49...","[None, VERGEBÄRDEN2*, None, ICH1, None]","[None, VERGEBÄRDEN2*, None, None, None]","[None, [MG] schief, None, None, None]",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:06:48:44, 00:06:48:58, 00:06:49:31, 00:06:...",[./data/1176340/screenshots/screenshot_2109.pn...
215,Sie ist meine Schwester/,"[00:06:38:37 00:06:38:45, 00:06:38:45 00:06:38...","[None, SCHWESTER1A*, None, MEIN1, None, SCHWES...","[None, SCHWESTER1A*, None, None, None, SCHWEST...","[None, sch{wester}, None, meine, None, schwest...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:06:38:41, 00:06:38:47, 00:06:38:71, 00:06:...",[./data/1176340/screenshots/screenshot_2042.pn...
6,"Also über meine Firma, da habe ich 2006 zum er...","[00:00:12:21 00:00:12:29, 00:00:12:29 00:00:12...","[None, FIRMA1B*, None, ZUSAMMENHANG1A*, None, ...","[None, None, None, ZUSAMMENHANG1A*, None, None...","[None, zusammenhang, zusammenhang, zusammenhan...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:00:12:25, 00:00:12:33, 00:00:12:40, 00:00:...","[./data/1176340/screenshots/screenshot_69.png,..."
258,"Zuerst waren wir zusammen in Schleswig, bis si...","[00:02:09:26 00:02:09:33, 00:02:09:33 00:02:09...","[None, SCHLESWIG1*, None, ZUERST1A*, None, SCH...","[None, None, None, None, None, None, None, ZUS...","[None, schleswig, None, zuerst, None, schleswi...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:02:09:30, 00:02:09:39, 00:02:09:65, 00:02:...",[./data/1176340/screenshots/screenshot_721.png...
108,"Ich beginne das Thema von Anfang an, so wie bi...","[00:00:07:48 00:00:08:11, 00:00:08:11 00:00:08...","[None, ÜBER1, None, THEMA1*, None, ANFANG1A*, ...","[None, None, None, None, None, ANFANG1A*, None...","[None, über, None, thema, None, anfangen, anfa...",https://www.sign-lang.uni-hamburg.de/meinedgs_...,1176340,"[00:00:07:66, 00:00:08:13, 00:00:08:18, 00:00:...","[./data/1176340/screenshots/screenshot_43.png,..."


- **Ungroup dataset**

In [None]:
import pandas as pd

# Create an empty DataFrame with the desired columns
ungrouped_df = pd.DataFrame(columns=['Zeit', 'Lexem/Gebärde', 'Lexem/Gebärde.1', 'Mund', 'processed_timestamps', 'screenshot_path', 'Übersetzung', 'video_id'])
ungrouped_list = []

# Iterate over the sampled DataFrame and ungroup the data
for index, row in grouped_images_df.iterrows():
    group_cols = ['Zeit', 'Lexem/Gebärde', 'Lexem/Gebärde.1', 'Mund', 'processed_timestamps', 'screenshot_path']
    ubersetzung = row['Übersetzung']
    print("video: ", row['video_id'])
    print("Translation words number: ", len(ubersetzung.split(' ')))
    print("Zeit number: ", len(row['Zeit']))
    print("glosses number: ", len([x for x in row['Lexem/Gebärde']+row['Lexem/Gebärde.1'] if x is not None]))
    ungrouped_per_sentence = pd.DataFrame(columns=['Zeit', 'Lexem/Gebärde', 'Lexem/Gebärde.1', 'Mund', 'processed_timestamps', 'screenshot_path', 'Übersetzung', 'video_id'])
    for i in range(len(row['Zeit'])):
        group_data = {col: row[col][i] for col in group_cols}
        group_data['Übersetzung'] = row['Übersetzung']
        group_data['video_id'] = row['video_id']
        
        # Append the ungrouped data to the DataFrame
        ungrouped_df = pd.concat([ungrouped_df, pd.DataFrame([group_data])], ignore_index=True)
        ungrouped_per_sentence = pd.concat([ungrouped_per_sentence, pd.DataFrame([group_data])], ignore_index=True)
    ungrouped_list.append(ungrouped_per_sentence)

In [12]:
ungrouped_df.head()

Unnamed: 0,Zeit,Lexem/Gebärde,Lexem/Gebärde.1,Mund,processed_timestamps,screenshot_path,Übersetzung,video_id
0,00:00:34:01 00:00:34:09,,,,00:00:34:05,./data/1176340/screenshots/screenshot_198.png,"Aber das ist eigentlich auch egal, weil ich de...",1176340
1,00:00:34:09 00:00:34:15,EGAL3*,EGAL3*,egal,00:00:34:12,./data/1176340/screenshots/screenshot_199.png,"Aber das ist eigentlich auch egal, weil ich de...",1176340
2,00:00:34:15 00:00:34:21,,,,00:00:34:18,./data/1176340/screenshots/screenshot_200.png,"Aber das ist eigentlich auch egal, weil ich de...",1176340
3,00:00:34:21 00:00:34:30,$GEST-ABWINKEN1^*,$GEST-ABWINKEN1^*,,00:00:34:25,./data/1176340/screenshots/screenshot_201.png,"Aber das ist eigentlich auch egal, weil ich de...",1176340
4,00:00:34:30 00:00:34:35,,,,00:00:34:32,./data/1176340/screenshots/screenshot_202.png,"Aber das ist eigentlich auch egal, weil ich de...",1176340


In [13]:
ungrouped_df.size

27864

## Experiment with image sequencies
- Few sentences sample
- Send vocabulary dictioonary to model as context, then send sentence (sequence of images) for its translation
- Calculate or measure accuracy

- **Separate screenshots by left - right**
- Generate Vocabulary 

In [58]:
# Crear lista para el nuevo dataset
new_data = []

# Carpeta donde se guardarán las imágenes divididas
output_folder = "./data/1176340/screenshots_divided/"
os.makedirs(output_folder, exist_ok=True)

# Procesar cada fila del DataFrame
for index, row in ungrouped_df.iterrows():
    left_gloss = row["Lexem/Gebärde"]
    right_gloss = row["Lexem/Gebärde.1"]
    screenshot_path = row["screenshot_path"]
    timestamp = row["processed_timestamps"]
    sentence = row["Übersetzung"]

    # Verificar si el gloss existe
    left_exists = pd.notna(left_gloss)
    right_exists = pd.notna(right_gloss)

    # Si no hay gloss en ninguna columna, pasar a la siguiente fila
    if not left_exists and not right_exists:
        continue

    # Verificar si la imagen original existe
    if not os.path.exists(screenshot_path):
        print(f"Imagen no encontrada: {screenshot_path}")
        continue

    # Abrir imagen y dividir en dos
    with Image.open(screenshot_path) as img:
        width, height = img.size

        # Guardar solo si el gloss existe
        if left_exists:
            left_half = img.crop((0, 0, width // 2, height))
            left_path = os.path.join(output_folder, f"gloss_{index}_left.png")
            left_half.save(left_path)
            new_data.append([sentence, left_gloss, "left", screenshot_path, left_path, timestamp])

        if right_exists:
            right_half = img.crop((width // 2, 0, width, height))
            right_path = os.path.join(output_folder, f"gloss_{index}_right.png")
            right_half.save(right_path)
            new_data.append([sentence, right_gloss, "right", screenshot_path, right_path, timestamp])

# Crear DataFrame ordenado por timestamp
columns = ["sentence", "gloss", "position", "original_path", "new_path", "timestamp"]
sorted_df = pd.DataFrame(new_data, columns=columns).sort_values(by="timestamp")

# Guardar en CSV
sorted_df.to_csv("./data/1176340/gloss_dataset.csv", index=False, encoding="utf-8")

print("Gloss vocabulary: ./data/1176340/gloss_dataset.csv")

Gloss vocabulary: ./data/1176340/gloss_dataset.csv


In [15]:
sorted_df = pd.read_csv("./data/1176340/gloss_dataset.csv")

In [16]:
sorted_df.size

14364

- **Create vocabulary dictionary**

In [17]:
def create_vocabulary_dictionary(df_ungrouped: pd.DataFrame) -> Dict[str, Dict]:
    """
    Create a vocabulary dictionary with glosses and associated images.
    
    Args:
        df_ungrouped: DataFrame with ungrouped sign language data
        
    Returns:
        Dictionary with glosses as keys and associated data (excluding the gloss column) as values.
    """
    vocabulary = {}

    # Obtener los glosses únicos
    unique_glosses = set(df_ungrouped["gloss"].dropna())

    for gloss in unique_glosses:
        # Filtrar DataFrame por gloss actual
        grouped_by_gloss = df_ungrouped[df_ungrouped["gloss"] == gloss]

        # Eliminar la columna "gloss" para evitar redundancia
        grouped_by_gloss = grouped_by_gloss.drop(columns=["gloss"]).reset_index(drop=True)

        # Convertir a diccionario
        vocabulary[gloss] = grouped_by_gloss.to_dict(orient="records")

    return vocabulary


- **Create sentences sample to send sequences**

In [18]:
def create_sentences_sample(df_ungrouped: pd.DataFrame, vocabulary: Dict[str, Dict], seed_number: int = 4, sample_n: int = 10) -> Dict[str, Dict]:
    """
    Create a sample of sentences with the sequence of glosses used.

    Args:
        df_ungrouped: DataFrame with ungrouped sign language data
        vocabulary: Dictionary containing glosses and their associated images
        seed_number: Random seed for reproducibility
        sample_n: Number of sentences to return as a sample

    Returns:
        Dictionary with sentences, their glosses, other data, and a vocabulary sample for the prompt.
    """
    random.seed(seed_number)
    result = {}

    # Obtener las oraciones únicas
    unique_sentences = list(df_ungrouped["sentence"].dropna().unique())

    # Seleccionar solo sample_n oraciones aleatorias
    sampled_sentences = random.sample(unique_sentences, min(sample_n, len(unique_sentences)))

    for sentence in sampled_sentences:
        # Filtrar DataFrame por la oración actual
        grouped_by_sentence = df_ungrouped[df_ungrouped["sentence"] == sentence].drop(columns=["sentence"]).reset_index(drop=True)

        # Obtener los glosses de la oración
        glosses = grouped_by_sentence["gloss"].dropna().tolist()

        vocabulary_sample_for_prompt = {}

        for gloss in glosses:
            if gloss in vocabulary:
                df_gloss = pd.DataFrame(vocabulary[gloss])
                if "new_path" in df_gloss.columns and not df_gloss.empty:
                    images_from_gloss = df_gloss["new_path"].dropna().tolist()
                    if images_from_gloss:
                        selected_image = random.choice(images_from_gloss)
                        vocabulary_sample_for_prompt[gloss] = selected_image

        # Añadir 4 glosses aleatorios para confundir al LLM
        all_glosses = list(vocabulary.keys())
        random_glosses = random.sample([g for g in all_glosses if g not in glosses], min(4, len(all_glosses)))

        for random_gloss in random_glosses:
            df_random_gloss = pd.DataFrame(vocabulary[random_gloss])
            if "new_path" in df_random_gloss.columns and not df_random_gloss.empty:
                images_from_random_gloss = df_random_gloss["new_path"].dropna().tolist()
                if images_from_random_gloss:
                    vocabulary_sample_for_prompt[random_gloss] = random.choice(images_from_random_gloss)

        # Shuffle vocabulary_sample_for_prompt to randomize order
        shuffled_vocab_sample = dict(random.sample(list(vocabulary_sample_for_prompt.items()), len(vocabulary_sample_for_prompt)))

        # Guardar los datos en el diccionario final
        result[sentence] = {
            "data": grouped_by_sentence.to_dict(orient="records"),
            "vocabulary_sample_for_prompt": shuffled_vocab_sample
        }

    return result


In [19]:
def encode_image_to_base64(image_path: str) -> Optional[str]:
    """
    Encode an image to base64 string.
    
    Args:
        image_path: Path to the image file
        
    Returns:
        Base64 encoded string of the image or None if the file doesn't exist
    """
    if not os.path.exists(image_path):
        print(f"Warning: Image file {image_path} does not exist.")
        return None
    
    with open(image_path, "rb") as image_file:
        return base64.b64encode(image_file.read()).decode('utf-8')

- **Generate vocabulary**

In [36]:
vocabulary_dict = create_vocabulary_dictionary(sorted_df)
print(f"Number of glosses: {len(list(vocabulary_dict))}")

Number of glosses: 675


- **Generate sentences sample to experiment**

In [35]:
sample_to_experiment = create_sentences_sample(sorted_df, vocabulary_dict, seed_number=4, sample_n=10)

- **Experiment**

In [29]:
def compute_glosses_accuracy_nltk(predicted: List[str], actual: List[str]) -> float:
    """
    Compute similarity between predicted and actual sentences using SequenceMatcher.
    Returns a percentage match.
    """
    similarity = SequenceMatcher(None, ' '.join(predicted).lower(), ' '.join(actual).lower()).ratio()
    return similarity * 100

def compute_glosses_accuracy_nltk_bleu(predicted: List[str], actual: List[str]) -> float:
    """
    Compute BLEU score between predicted and actual sentences.
    Returns a percentage match.
    """
    reference = [actual]
    candidate = predicted
    score = sentence_bleu(reference, candidate)
    return score * 100

def translate_german_to_english(client: OpenAI, german_text: str) -> str:
    """
    Translate German text to English using OpenAI API.
    """
    prompt = f"Translate the following German text to English:\n\n{german_text}\n\nEnglish translation:"
    
    try:
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[{"role": "user", "content": prompt}],
            max_tokens=200
        )
        
        return response.choices[0].message.content.strip()
    except Exception as e:
        return f"Error translating to English: {str(e)}"

def translate_image_sequence(sample_of_sentences: Dict[str, Dict], sentence_index: int = 0, model: str = "gpt-4o") -> Dict:
    """
    Translate a sequence of images to glosses and sentences using OpenAI API.
    """
    client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))
    glosses = []
    sentence_experiment = list(sample_of_sentences)[sentence_index] 
    sentence = sample_of_sentences[sentence_experiment]["data"]
    image_vocabulary = sample_of_sentences[sentence_experiment]["vocabulary_sample_for_prompt"]
    content = [{"type": "text", "text": "VOCABULARY REFERENCE (Image → Gloss):"}]
    
    for i, (gloss, image_path) in enumerate(image_vocabulary.items(), 1):
        base64_image = encode_image_to_base64(image_path)
        if base64_image:
            content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}})
            content.append({"type": "text", "text": f"Vocabulary Image {i}: {gloss}"})
    
    content.append({"type": "text", "text": "\nSENTENCE TO TRANSLATE (Sequence of Images):"})
    
    for i, fragment in enumerate(sentence, 1):
        glosses.append(fragment["gloss"])
        base64_image = encode_image_to_base64(fragment["new_path"])
        if base64_image:
            content.append({"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}})
            content.append({"type": "text", "text": f"Sentence Image {i}"})
    
    prompt = """
    Based on the vocabulary reference images and their corresponding glosses provided above, 
    analyze the sequence of images representing a sentence and:
    
    1. Identify which glosses from the vocabulary each image in the sentence corresponds to
    2. Translate the sequence of glosses into a coherent German sentence
    
    Return your response in the following JSON format:
    {
        "predicted_glosses": ["gloss1", "gloss2", ...],
        "german_translation": "German sentence translation",
        "english_translation": "English sentence translation"
    }
    
    If you're uncertain about any image, make your best guess based on visual similarity to the vocabulary images.
    """
    
    content.append({"type": "text", "text": prompt})

    try:
        response = client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": content}],
            max_tokens=1000,
            response_format={"type": "json_object"}
        )
        
        result = json.loads(response.choices[0].message.content)
        result["expected_glosses"] = glosses
        result["image_sequence"] = [fragment["new_path"] for fragment in sentence]
        accuracy_result_sequence_matcher = compute_glosses_accuracy_nltk(result["predicted_glosses"], glosses)
        accuracy_result_bleu = compute_glosses_accuracy_nltk_bleu(result["predicted_glosses"], glosses)
        result["sequence_matcher_accuracy"] = accuracy_result_sequence_matcher
        result["bleu_accuracy"] = accuracy_result_bleu
        
        # Translate German to English
        english_translation = translate_german_to_english(client, sentence_experiment)
        result["expected_german_english_translation"] = english_translation
        
        return result
    
    except Exception as e:
        return {
            "error": str(e),
            "expected_glosses": glosses,
            "predicted_glosses": result["glosses"],
            "german_translation": "Error generating translation",
            "english_translation": "Error generating translation",
            "sequence_matcher_accuracy": 0,
            "bleu_accuracy": 0,
        }


In [30]:
results_df = []
for i in range(len(sample_to_experiment)):
    result = translate_image_sequence(sample_to_experiment, sentence_index=i, model="gpt-4o-mini")
    result["sentence"] = list(sample_to_experiment)[i]
    results_df.append(result)
results_df = pd.DataFrame(results_df)

The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [34]:
results_df

Unnamed: 0,predicted_glosses,german_translation,english_translation,expected_glosses,image_sequence,sequence_matcher_accuracy,bleu_accuracy,expected_german_english_translation,sentence
0,"[MEHR1, NUR2A, INHALT3, AUSTAUSCHEN-KOMMUNIKAT...","Mehr nur Inhalt, Austausch und Kommunikation, ...","More just content, exchange and communication,...","[JUNG1*, AUFPASSEN1B^*, AUFPASSEN1B^*, INHALT3...",[./data/1176340/screenshots_divided/gloss_797_...,20.895522,0.0,"The youth leader, for example, can initiate a ...","Der Jugendwart kann z.B. initiieren, dass man ..."
1,"[$GEST^, BRUDER1A*, BESCHEID1A*, SCHLIMM3B*, S...",Ich spreche später mit meinem Bruder.,I will talk to my brother later.,"[BRUDER1A*, BRUDER1A*, BESCHEID1A*, $GEST^, $G...",[./data/1176340/screenshots_divided/gloss_2436...,60.869565,9.283143000000001e-153,"My brother said to me, ""Hey, why didn't you te...","Mein Bruder meinte zu mir: „Hey, warum hast du..."
2,"[ICH1, ALLE3]",Ich alle.,I all.,"[$INDEX1*, ICH1*]",[./data/1176340/screenshots_divided/gloss_2224...,33.333333,0.0,I,Ich/
3,"[ICH2, EINFACH3, NICHT3A, MÖGEN4]",Ich mag es nicht einfach.,I do not like it simply.,"[GRUND4B*, GRUND4B*, ICH2, KEIN1*, MÖGEN4, NIC...",[./data/1176340/screenshots_divided/gloss_778_...,22.5,1.495451e-230,The reason was that I did not want to sit arou...,"Der Grund war der, dass ich nicht rumsitzen un..."
4,"[ZUSAMMENHANG1A^, ICH1, VERLETZUNG1A, TRENNEN2...","Zusammenhang ich Verletzung trennen, wie sozus...","Context I injury separate, how to say.","[DARUM1*, DARUM1*, ICH1, WIE-SOZUSAGEN1*, WIE-...",[./data/1176340/screenshots_divided/gloss_1482...,36.666667,3.237241e-230,I burst into tears because I felt hurt.,"Ich bin in Tränen ausgebrochen, weil ich mich ..."
5,"[MEINUNG1A, STIMMT1B*, GEHÖREN1^*, MANNSCHAFT1...","Die Meinung stimmt, es gehört zur Mannschaft a...",The opinion is correct; it belongs to the team...,"[MANNSCHAFT1*, FUSSBALL1A, FUSSBALL1A, MANNSCH...",[./data/1176340/screenshots_divided/gloss_3462...,27.027027,2.5352400000000004e-153,We shared the same opinion about the football ...,Über die Fußballmannschaft Bayern München teil...
6,"[ICH2, AUCH1A, KANN2B, WICHTIG1*, GEBÄRDEN1A*]",Ich kann auch wichtig gebärden.,I can also sign important things.,"[ICH2, WICHTIG1*, WICHTIG1*, AUCH1A, CHEF1B, K...",[./data/1176340/screenshots_divided/gloss_2601...,44.827586,8.186019e-230,It is also important to me that my boss can us...,"Mir ist es auch wichtig, dass mein Chef gebärd..."
7,"[HOFFEN1A*, ICH2, WARUM1*, RICHTIG1A*, MANCHMAL1]","Ich hoffe, warum ist es manchmal richtig?","I hope, why is it sometimes right?","[ICH2, GEBÄRDEN1A*, GEBÄRDEN1A*, HOFFEN1A*, FÖ...",[./data/1176340/screenshots_divided/gloss_1592...,32.380952,1.186218e-229,I gestured with them and hoped to motivate the...,"Ich gebärdete mit ihnen und hoffte, sie dadurc..."
8,"[SCHWER1A, MEISTENS1B, DASSELBE2A, VERLETZUNG1...","Es ist meistens dasselbe und sehr schlimm, wen...",It is mostly the same and very bad when one is...,"[$GEST^, $GEST^, $PROD*, DASSELBE2A, DASSELBE2...",[./data/1176340/screenshots_divided/gloss_2955...,42.372881,9.853444999999999e-230,Should I hold back? That would be something si...,Soll ich mich zurückhalten? Das wäre so etwas ...
9,"[GLAUBEN2A, UNSICHER1, WIE-VERGLEICH1A, ZUSAMM...","Ich glaube, ich bin unsicher, wie diese Person...",I believe I am uncertain about how this person...,"[$INDEX1*, WISSEN2B^*, WIE-VERGLEICH1A*, WIE-V...",[./data/1176340/screenshots_divided/gloss_605_...,40.0,0.0,That was like/,Das war wie/


- **Export results**

In [33]:
results_df.to_json("./data/1176340/gloss_dataset_results_gpt4o_mini.json")