# SETUP

In [21]:
import os
import re
import ast
import json
import ollama
import random
import base64
import requests
import itertools
import numpy as np
import pandas as pd
from tqdm import tqdm
from PIL import Image
from scipy import stats
from itertools import product
from collections import Counter
import matplotlib.pyplot as plt
import matplotlib.patches as patches

from typing import List, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
from pydantic import BaseModel, Field

# DF

In [2]:
folder_path = r'C:\Users\David\Documents\AGEAI\Scripts\OUTPUTS\ANDREA\06_08_midjourney_activities_parejo'

description_list = []

prompt_pattern = re.compile(r'(.*?)(?= --personalize| --style|$)', re.DOTALL)

for file_name in os.listdir(folder_path):
    if file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff')):
        file_path = os.path.join(folder_path, file_name)
        
        image = Image.open(file_path)
        
        info = image.info
        
        description = info.get('Description', None)
        
        if description:
            match = prompt_pattern.match(description)
            cleaned_description = match.group(1) if match else description
        else:
            cleaned_description = None
        
        description_list.append({'filename': file_name, 'prompt': cleaned_description})

df = pd.DataFrame(description_list, columns=['filename', 'prompt'])

df['codi'] = df['filename'].str.split('_').str[0]

df['age_group'] = df['filename'].apply(lambda x: 'older' if 'OP' in x else 'neutral')

df['ID_num'] = df['codi'].str.extract('(\d+)')

df['ID_num'] = pd.to_numeric(df['ID_num'])
df=df.sort_values(by='ID_num', ascending=True)

df = df.reset_index(drop=True)

df_older = df[df['age_group'] == 'older']
df_neutral = df[df['age_group'] == 'neutral']

df_older['ID'] = ''
for prompt in df_older['ID_num'].unique():
    df_subset = df_older[df_older['ID_num'] == prompt]
    count = 1
    for index, row in df_subset.iterrows():
        df_older.loc[index, 'ID'] = f'o_p{row["ID_num"]}_f{count}'
        count += 1

#df_older

df_neutral['ID'] = ''
for prompt in df_neutral['ID_num'].unique():
    df_subset = df_neutral[df_neutral['ID_num'] == prompt]
    count = 1
    for index, row in df_subset.iterrows():
        df_neutral.loc[index, 'ID'] = f'a_p{row["ID_num"]}_f{count}'
        count += 1

#df_neutral

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_older['ID'] = ''
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_neutral['ID'] = ''


In [3]:
output_dir = "C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/ANDREA/06_08_midjourney_activities_parejo/CSV"

os.makedirs(output_dir, exist_ok=True)
df_older.to_csv(os.path.join(output_dir, "df_older.csv"), index=False)
df_neutral.to_csv(os.path.join(output_dir, "df_neutral.csv"), index=False)

In [4]:
# Comprimir imagenes
def reduce_image_size(image, max_width, max_height):
    width, height = image.size
    if width > max_width or height > max_height:
        image.thumbnail((max_width, max_height), Image.LANCZOS)
    return image

def process_images(input_folder, neutral_df, older_df, neutral_subfolder, older_subfolder):
    neutral_output_folder = os.path.join(input_folder, neutral_subfolder)
    older_output_folder = os.path.join(input_folder, older_subfolder)
    
    if not os.path.exists(neutral_output_folder):
        os.makedirs(neutral_output_folder)
    if not os.path.exists(older_output_folder):
        os.makedirs(older_output_folder)
    
    neutral_files = neutral_df['filename'].tolist()
    older_files = older_df['filename'].tolist()
    
    # Procesar imágenes
    for filename in os.listdir(input_folder):
        if filename.endswith('.png'):
            input_path = os.path.join(input_folder, filename)
            
            if filename in neutral_files:
                output_folder = neutral_output_folder
            elif filename in older_files:
                output_folder = older_output_folder
            else:
                print(f"Archivo no clasificado: {filename}")
                continue
            
            output_path = os.path.join(output_folder, os.path.splitext(filename)[0] + '.jpg')

            try:
                with Image.open(input_path) as img:
                    img = reduce_image_size(img, 1024, 1024)
                    img = img.convert('RGB')
                    img.save(output_path, 'JPEG', optimize=True, quality=85)
            except Exception as e:
                print(f"Error procesando {filename}: {e}")

    print("Proceso completado.")

input_folder = folder_path
neutral_subfolder = 'neutral_comp'
older_subfolder = 'older_comp'

process_images(input_folder, df_neutral, df_older, neutral_subfolder, older_subfolder)

Proceso completado.


# OLLAMA

In [13]:
prompts = {
  "gender": "Determine the gender of the person in the image. Choose one of the following: male, female, or none (if indeterminable). Return the answer in the following JSON format: {\"gender\": \"[gender]\"}.",
  "age_range": "Estimate the age range of the person in the image. Select from the following options: 18-24, 25-34, 35-44, 45-54, 55-64, 65-79, or 80+. Return the answer in the following JSON format: {\"age\": \"[range]\"}.",
  "race": "Identify the race of the person in the image. Choose from the following options: asian, black, white, hispanic, or other. Return the answer in the following JSON format: {\"race\": \"[race]\"}.",
  "emotions_short": "Identify the emotion expressed by the person in the image. Choose from the following: neutral, positive, negative, or exaggerated. Return the answer in the following JSON format: {\"emotion\": \"[emotion]\"}.",
  "personality_short": "Determine the personality trait expressed by the person in the image. Choose from the following options: Openness, Conscientiousness, Extraversion, Agreeableness, or Neuroticism. Return the answer in the following JSON format: {\"personality\": \"[personality]\"}.",
  "location": "Identify the setting of the image. Choose one of the following: indoors, outdoors, or none (if indeterminable). Return the answer in the following JSON format: {\"location\": \"[location]\"}.",
  "objects": "List up to 10 objects visible in the image. Return the list in the following JSON format: {\"objects\": [\"object\", ...]}.",
  "objects_assist_devices": "List any assistive devices visible in the image. Return the list in the following JSON format: {\"assistive_devices\": [\"object\", ...]}.",
  "objects_digi_devices": "List any digital devices visible in the image. Return the list in the following JSON format: {\"digital_devices\": [\"object\", ...]}.",
  "person_count": "Count the number of people visible in the image. Choose one of the following: 1, 2, 3, or +3. Return the answer in the following JSON format: {\"person_count\": \"[count]\"}.",
  "shot": "Identify the type of shot in the image. Choose one of the following options: close-up shot, medium shot, or full shot. Return the answer in the following JSON format: {\"shot\": \"[selected shot]\"}.",
  "position_short": "Describe the posture of the person in the image using one word. Examples: standing, sitting, lying. Return the answer in the following JSON format: {\"position\": \"[position]\"}."
}

In [4]:
# Define Pydantic models for each prompt output
class Gender(BaseModel):
    gender: str = Field(..., description="Gender of the person in the image (male, female, or none)")

class AgeRange(BaseModel):
    age: str = Field(..., description="Age range of the person in the image (18-24, 25-34, 35-44, 45-54, 55-64, 65-79, or 80+)")

class Race(BaseModel):
    race: str = Field(..., description="Race of the person in the image (asian, black, white, hispanic, or other)")

class Emotion(BaseModel):
    emotion: str = Field(..., description="Emotion expressed by the person in the image (neutral, positive, negative, or exaggerated)")

class Personality(BaseModel):
    personality: str = Field(..., description="Personality trait expressed by the person in the image (Openness, Conscientiousness, Extraversion, Agreeableness, or Neuroticism)")

class Location(BaseModel):
    location: str = Field(..., description="Setting of the image (indoors, outdoors, or none)")

class Objects(BaseModel):
    objects: List[str] = Field(..., description="List of up to 10 objects visible in the image")

class AssistiveDevices(BaseModel):
    assistive_devices: List[str] = Field(..., description="List of assistive devices visible in the image")

class DigitalDevices(BaseModel):
    digital_devices: List[str] = Field(..., description="List of digital devices visible in the image")

class PersonCount(BaseModel):
    person_count: str = Field(..., description="Number of people visible in the image (1, 2, 3, or +3)")

class Shot(BaseModel):
    shot: str = Field(..., description="Type of shot in the image (close-up shot, medium shot, or full shot)")

class Position(BaseModel):
    position: str = Field(..., description="Posture of the person in the image (e.g., standing, sitting, lying)")

class ImageAnalysisResults(BaseModel):
    ID_jpg: str = Field(..., description="Filename of the image")
    gender: Optional[Gender] = None
    age_range: Optional[AgeRange] = None
    race: Optional[Race] = None
    emotions_short: Optional[Emotion] = None
    personality_short: Optional[Personality] = None
    location: Optional[Location] = None
    objects: Optional[Objects] = None
    objects_assist_devices: Optional[AssistiveDevices] = None
    objects_digi_devices: Optional[DigitalDevices] = None
    person_count: Optional[PersonCount] = None
    shot: Optional[Shot] = None
    position_short: Optional[Position] = None
    error: Optional[str] = None 

In [19]:
def call_ollama(image_path, prompt):
    try:
        with open(image_path, 'rb') as file:
            image_bytes = file.read()
        
        response = ollama.chat(
            model='llava:7b-v1.6-mistral-q8_0',
            #format='json',
            messages=[
                {
                    'role': 'system',
                    'content': """You are a helpful visual assistant. Analyze the following image and provide answers to these prompts. 
                    Be concise and precise in your answers, and provide only the information requested. Do not provide any additional information."""
                },
                {
                    'role': 'user',
                    'content': prompt,
                    'images': [image_bytes],
                },
            ],
             options= {
                 'seed': 123,
                 'temperature': 0}
         )
        return response['message']['content']
    except Exception as e:
        print(f"Error calling Ollama: {e}")
        return "Error"

# def process_image(image_path):
#     try:
#         results = {"ID_jpg": os.path.basename(image_path)}
        
#         for prompt_name, prompt_text in prompts.items():
#             results[prompt_name] = call_ollama(image_path, prompt_text)
        
#         return results
#     except Exception as e:
#         print(f"Error processing {image_path}: {e}")
#         return {"ID_jpg": os.path.basename(image_path), "error": str(e)}

def process_image(image_path):
    try:
        results = {"ID_jpg": os.path.basename(image_path)}
        
        for prompt_name, prompt_text in prompts.items():
            response_str = call_ollama(image_path, prompt_text)
            
            try:
                # Clean up the response string (if needed)
                response_str = response_str.strip() 
                if response_str.startswith("```json") and response_str.endswith("```"):
                    response_str = response_str[7:-3].strip()

                response_json = json.loads(response_str) # Use json.loads() for safer parsing

                # Validate JSON using Pydantic models
                if prompt_name == "gender":
                    results[prompt_name] = Gender(**response_json)
                elif prompt_name == "age_range":
                    results[prompt_name] = AgeRange(**response_json)
                elif prompt_name == "race":
                    results[prompt_name] = Race(**response_json)
                elif prompt_name == "emotions_short":
                    results[prompt_name] = Emotion(**response_json)
                elif prompt_name == "personality_short":
                    results[prompt_name] = Personality(**response_json)
                elif prompt_name == "location":
                    results[prompt_name] = Location(**response_json)
                elif prompt_name == "objects":
                    results[prompt_name] = Objects(**response_json)
                elif prompt_name == "objects_assist_devices":
                    results[prompt_name] = AssistiveDevices(**response_json)
                elif prompt_name == "objects_digi_devices":
                    results[prompt_name] = DigitalDevices(**response_json)
                elif prompt_name == "person_count":
                    results[prompt_name] = PersonCount(**response_json)
                elif prompt_name == "shot":
                    results[prompt_name] = Shot(**response_json)
                elif prompt_name == "position_short":
                    results[prompt_name] = Position(**response_json)

            except (json.JSONDecodeError, SyntaxError) as e:
                print(f"Error parsing JSON for {prompt_name} in {image_path}: {e}")

                # Attempt to fix the missing closing brace
                if isinstance(e, json.JSONDecodeError) and "Expecting ',' delimiter" in str(e):
                    try:
                        response_str += "}"  # Add the missing closing brace
                        response_json = json.loads(response_str)

                        # Re-attempt Pydantic validation with the fixed JSON for all prompts
                        if prompt_name == "gender":
                            results[prompt_name] = Gender(**response_json)
                        elif prompt_name == "age_range":
                            results[prompt_name] = AgeRange(**response_json)
                        # ... (Handle all other prompts similarly)

                    except (json.JSONDecodeError, SyntaxError) as e2:
                        print(f"Failed to fix JSON for {prompt_name} in {image_path} after adding closing brace: {e2}")
                        results[prompt_name] = response_str  # Store raw string if fixing fails

                else:
                    results[prompt_name] = response_str  # Store raw string if other parsing errors occur
        
        return ImageAnalysisResults(**results).model_dump() # Use model_dump for dictionary conversion

    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return ImageAnalysisResults(ID_jpg=os.path.basename(image_path), error=str(e)).model_dump() 

def process_folder(folder_path):
    all_results = []
    image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    
    for image_file in tqdm(image_files, desc="Processing images"):
        image_path = os.path.join(folder_path, image_file)
        results = process_image(image_path)
        all_results.append(results)
    
    return pd.DataFrame(all_results)

In [22]:
def call_ollama(image_path: str, prompt: str) -> str:
    try:
        with open(image_path, 'rb') as file:
            image_bytes = file.read()
        
        response = ollama.chat(
            model='llava:7b-v1.6-mistral-q8_0',
            messages=[
                {
                    'role': 'system',
                    'content': """You are a helpful visual assistant. Analyze the following image and provide answers to these prompts. 
                    Be concise and precise in your answers, and provide only the information requested in the exact JSON format specified."""
                },
                {
                    'role': 'user',
                    'content': prompt,
                    'images': [image_bytes],
                },
            ],
            options={'seed': 123, 'temperature': 0}
        )
        return response['message']['content']
    except Exception as e:
        print(f"Error calling Ollama: {e}")
        return json.dumps({"error": str(e)})

def clean_json_string(json_str: str) -> str:
    # Remove any leading/trailing whitespace and non-JSON content
    json_str = json_str.strip()
    if json_str.startswith("```json"):
        json_str = json_str[7:]
    if json_str.endswith("```"):
        json_str = json_str[:-3]
    return json_str.strip()

def parse_json_safely(json_str: str) -> dict:
    try:
        return json.loads(json_str)
    except json.JSONDecodeError as e:
        print(f"JSON Decode Error: {e}")
        # Attempt to fix common JSON errors
        if "Expecting ',' delimiter" in str(e):
            json_str += "}"
        elif "Expecting property name enclosed in double quotes" in str(e):
            json_str = json_str.replace("'", '"')
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            return {"error": "Failed to parse JSON"}

def process_single_prompt(image_path: str, prompt_name: str, prompt_text: str) -> tuple:
    response_str = call_ollama(image_path, prompt_text)
    cleaned_response = clean_json_string(response_str)
    parsed_response = parse_json_safely(cleaned_response)
    
    try:
        if prompt_name == "gender":
            return prompt_name, Gender(**parsed_response)
        elif prompt_name == "age_range":
            return prompt_name, AgeRange(**parsed_response)
        # ... [Handle other prompt types] ...
        else:
            return prompt_name, parsed_response
    except Exception as e:
        print(f"Error validating {prompt_name} for {image_path}: {e}")
        return prompt_name, {"error": str(e)}

def process_image(image_path: str) -> ImageAnalysisResults:
    results = {"ID_jpg": os.path.basename(image_path)}
    
    with ThreadPoolExecutor(max_workers=5) as executor:
        future_to_prompt = {executor.submit(process_single_prompt, image_path, prompt_name, prompt_text): prompt_name 
                            for prompt_name, prompt_text in prompts.items()}
        
        for future in as_completed(future_to_prompt):
            prompt_name = future_to_prompt[future]
            try:
                prompt_name, result = future.result()
                results[prompt_name] = result
            except Exception as e:
                print(f"Error processing {prompt_name} for {image_path}: {e}")
                results[prompt_name] = {"error": str(e)}
    
    return ImageAnalysisResults(**results)

def process_folder(folder_path: str) -> pd.DataFrame:
    image_files = [f for f in os.listdir(folder_path) if f.lower().endswith(('.png', '.jpg', '.jpeg'))]
    all_results = []
    
    with ThreadPoolExecutor(max_workers=3) as executor:
        future_to_image = {executor.submit(process_image, os.path.join(folder_path, image_file)): image_file 
                           for image_file in image_files}
        
        for future in tqdm(as_completed(future_to_image), total=len(image_files), desc="Processing images"):
            image_file = future_to_image[future]
            try:
                result = future.result()
                all_results.append(result.model_dump())
            except Exception as e:
                print(f"Error processing {image_file}: {e}")
                all_results.append({"ID_jpg": image_file, "error": str(e)})
    
    return pd.DataFrame(all_results)

if __name__ == "__main__":
    input_folder = "C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/AGOST/ANDREA/demo_pydantic"
    output_file = "C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/AGOST/ANDREA/neutral_pydantic_improved.csv"
    
    df_results = process_folder(input_folder)
    df_results.to_csv(output_file, index=False)
    print(f"Results saved to {output_file}")

Processing images:   0%|          | 0/5 [00:00<?, ?it/s]

JSON Decode Error: Expecting ',' delimiter: line 1 column 21 (char 20)
JSON Decode Error: Expecting ',' delimiter: line 1 column 21 (char 20)


Processing images:  60%|██████    | 3/5 [01:56<00:55, 27.83s/it] 

JSON Decode Error: Expecting ',' delimiter: line 1 column 27 (char 26)
JSON Decode Error: Expecting ',' delimiter: line 1 column 21 (char 20)
JSON Decode Error: Expecting ',' delimiter: line 1 column 21 (char 20)


Processing images: 100%|██████████| 5/5 [03:21<00:00, 40.27s/it]

Results saved to C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/AGOST/ANDREA/neutral_pydantic_improved.csv





In [20]:
#NEUTRAL
#df_neutral_results = process_folder("C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/ANDREA/06_08_midjourney_activities_parejo/neutral_comp")
#df_neutral_results.to_csv("C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/ANDREA/06_08_midjourney_activities_parejo/CSV/neutral_analysis_llava.csv", index=False)

df_neutral_results = process_folder("C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/AGOST/ANDREA/demo_pydantic")
df_neutral_results.to_csv("C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/AGOST/ANDREA/neutral_pydantic.csv", index=False)
print(f"Results saved")

Processing images:   0%|          | 0/5 [00:00<?, ?it/s]

Error parsing JSON for person_count in C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/AGOST/ANDREA/demo_pydantic\103OP_Create_an_image_in_front_view_of_an_older_person_taking_98ee94f2-6af2-4a2c-a90a-d2b13e820f1e_1.png: Expecting ',' delimiter: line 1 column 21 (char 20)


Processing images:  20%|██        | 1/5 [00:33<02:13, 33.34s/it]

Error parsing JSON for location in C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/AGOST/ANDREA/demo_pydantic\103OP_Create_an_image_in_front_view_of_an_older_person_taking_98ee94f2-6af2-4a2c-a90a-d2b13e820f1e_2.png: Expecting ',' delimiter: line 1 column 24 (char 23)
Error parsing JSON for person_count in C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/AGOST/ANDREA/demo_pydantic\103OP_Create_an_image_in_front_view_of_an_older_person_taking_98ee94f2-6af2-4a2c-a90a-d2b13e820f1e_2.png: Expecting ',' delimiter: line 1 column 21 (char 20)


Processing images:  40%|████      | 2/5 [01:07<01:41, 33.67s/it]

Error parsing JSON for person_count in C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/AGOST/ANDREA/demo_pydantic\103OP_Create_an_image_in_front_view_of_an_older_person_taking_98ee94f2-6af2-4a2c-a90a-d2b13e820f1e_3.png: Expecting ',' delimiter: line 1 column 21 (char 20)


Processing images:  60%|██████    | 3/5 [01:47<01:13, 36.76s/it]

Error parsing JSON for person_count in C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/AGOST/ANDREA/demo_pydantic\103P_Create_an_image_in_front_view_of_a_person_taking_care_o_b53494f5-4010-4070-8751-3f1299e21435_0.png: Expecting ',' delimiter: line 1 column 21 (char 20)


Processing images:  80%|████████  | 4/5 [02:26<00:37, 37.48s/it]

Error parsing JSON for person_count in C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/AGOST/ANDREA/demo_pydantic\103P_Create_an_image_in_front_view_of_a_person_taking_care_o_b53494f5-4010-4070-8751-3f1299e21435_1.png: Expecting ',' delimiter: line 1 column 21 (char 20)


Processing images: 100%|██████████| 5/5 [03:01<00:00, 36.37s/it]

Results saved





In [5]:
#OLDER
df_older_results = process_folder("C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/ANDREA/06_08_midjourney_activities_parejo/older_comp")
df_older_results.to_csv("C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/ANDREA/06_08_midjourney_activities_parejo/CSV/older_analysis_llava.csv", index=False)
print(f"Results saved")

Processing images: 100%|██████████| 403/403 [4:25:07<00:00, 39.47s/it]  

Results saved





# MERGE FINAL

In [221]:
output_dir = "C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/ANDREA/06_08_midjourney_activities_parejo/CSV/"

df_neutral = pd.read_csv(f'{output_dir}/df_neutral.csv')
df_older = pd.read_csv(f'{output_dir}/df_older.csv')
df_merged = pd.concat([df_neutral, df_older], axis=0)
df_merged = df_merged.rename(columns={'filename': 'filename_png'})
df_merged['filename_merge'] = df_merged['filename_png'].str.replace('.png', '')

  df_merged['filename_merge'] = df_merged['filename_png'].str.replace('.png', '')


In [222]:
df_neutral_cv = pd.read_csv("C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/ANDREA/06_08_midjourney_activities_parejo/CSV/neutral_analysis_llava.csv")
df_older_cv = pd.read_csv("C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/ANDREA/06_08_midjourney_activities_parejo/CSV/older_analysis_llava.csv")

df_merged_2 = pd.concat([df_neutral_cv, df_older_cv], axis=0)
df_merged_2 = df_merged_2.rename(columns={'ID_jpg': 'filename_jpg'})   

In [223]:
df_merged_2['filename_merge'] = df_merged_2['filename_jpg'].str.replace('.jpg', '')
df_merged = pd.merge(df_merged, df_merged_2, on='filename_merge', how='left')
df_merged = df_merged.drop(columns=["ID_num", "filename_merge"])

  df_merged_2['filename_merge'] = df_merged_2['filename_jpg'].str.replace('.jpg', '')


In [224]:
def clean_json(text):
    if isinstance(text, str):
        # Eliminar delimitadores de código y la etiqueta 'json'
        cleaned_text = re.sub(r'```json', '', text)  # Eliminar '```json'
        cleaned_text = re.sub(r'```', '', cleaned_text)  # Eliminar '```'
        
        # Utilizamos expresiones regulares para eliminar espacios alrededor de '{' y '}'
        cleaned_text = re.sub(r'\{\s+', '{', cleaned_text)
        cleaned_text = re.sub(r'\s+\}', '}', cleaned_text)
        
        # Eliminamos '\n' y luego eliminamos cualquier espacio en blanco adicional
        cleaned_text = cleaned_text.replace('\n', '').strip()
        
        return cleaned_text
    return text

# Definimos la función para asegurar el formato de lista
def ensure_list_format(value):
    if isinstance(value, str):
        try:
            # Intenta convertir la cadena a un diccionario
            data = ast.literal_eval(value)
            if isinstance(data, dict):
                for key in data:
                    # Verifica si el valor asociado a cada clave no está en una lista
                    if not isinstance(data[key], list):
                        data[key] = [data[key]]
                # Convierte el diccionario de vuelta a cadena
                return str(data)
            return value
        except:
            return value
    return value

# Lista de columnas que queremos transformar
columns_to_transform = [
    'gender', 
    'age_range', 
    'race', 
    'emotions_short', 
    'location', 
    'personality_short', 
    'shot', 
    'position_short',
    'objects',
    'objects_assist_devices',
    'objects_digi_devices',
    'person_count'
]

#excepto 'filename_png' y 'filename_jpg'
for column in df_merged.columns:
    if column not in ['filename_png', 'filename_jpg']:
        df_merged[column] = df_merged[column].apply(clean_json)
        df_merged[column] = df_merged[column].apply(lambda x: x.lower() if isinstance(x, str) else x)
        df_merged[column] = df_merged[column].apply(ensure_list_format)

In [232]:
df_merged['age_range'].unique()

array(["{'age': ['25-34']}", "{'age': ['18-24']}", "{'age': ['35-44']}",
       "{'age': ['45-54']}", "{'age': ['65-79']}", "{'age': ['80+']}",
       "{'age': ['55-64']}", '{"age": "[80+"]"}', '{{"age": "[80+]"}',
       '{{"age": "[80+]"}.}'], dtype=object)

In [231]:
# Lista de valores válidos
valid_age_ranges = {'18-24', '25-34', '35-44', '45-54', '55-64', '65-79', '80+'}

def extract_and_format_age(age_str):
    # Limpiar espacios adicionales
    age_str = age_str.strip()
    
    # Intentar interpretar el string como un diccionario literal
    try:
        age_dict = ast.literal_eval(age_str)
    except (ValueError, SyntaxError):
        return age_str  # Si no se puede interpretar, devolver el string original
    
    # Extraer el valor de la clave 'age'
    if isinstance(age_dict, dict) and 'age' in age_dict:
        age_value = age_dict['age']
        
        # Si el valor es una lista, tomar el primer elemento
        if isinstance(age_value, list):
            age_value = age_value[0]
        
        # Limpiar corchetes y comillas adicionales
        if isinstance(age_value, str):
            # Usar expresiones regulares para extraer el texto deseado
            age_value = re.sub(r'[^\w\s\-+]', '', age_value)
            age_value = re.sub(r'\s+', '', age_value)  # Eliminar espacios adicionales
        
        # Verificar que el valor extraído esté en la lista de valores válidos
        if age_value in valid_age_ranges:
            return f"{{'age': ['{age_value}']}}"
    
    # Si no se puede formatear, devolver el valor original
    return age_str

df_merged['age_range'] = df_merged['age_range'].apply(extract_and_format_age)

In [233]:
#AGERANGE
mapping = {
    '{"age": "[80+"]"}': "{'age': ['80+']}",
    '{"age": "[80+"]"}': "{'age': ['80+']}",
    '{{"age": "[80+]"}': "{'age': ['80+']}",
    '{{"age": "[80+]"}.}': "{'age': ['80+']}",
}

df_merged['age_range'] = df_merged['age_range'].map(mapping).fillna(df_merged['age_range'])
df_merged['age_range'].unique()

array(["{'age': ['25-34']}", "{'age': ['18-24']}", "{'age': ['35-44']}",
       "{'age': ['45-54']}", "{'age': ['65-79']}", "{'age': ['80+']}",
       "{'age': ['55-64']}"], dtype=object)

In [163]:
df_merged

Unnamed: 0,filename_png,prompt,codi,age_group,ID,filename_jpg,gender,age_range,race,emotions_short,personality_short,location,objects,objects_assist_devices,objects_digi_devices,person_count,shot,position_short
0,01P_a_person_walking_to_the_bathroom_in_photor...,a person walking to the bathroom in photoreali...,01p,neutral,a_p1_f1,01P_a_person_walking_to_the_bathroom_in_photor...,{'gender': ['male']},{'age': ['25-34']},{'race': ['white']},{'emotion': ['neutral']},{'personality': ['extraversion']},"{""location"": ""indoor""","{'objects': ['bathroom sink', 'glass door', 'l...",{'assistive_devices': ['no assistive devices a...,{'digital_devices': []},"{""person_count"": ""1""","{""shot"": ""medium""",{'position': ['standing']}
1,01P_a_person_walking_to_the_bathroom_in_photor...,a person walking to the bathroom in photoreali...,01p,neutral,a_p1_f2,01P_a_person_walking_to_the_bathroom_in_photor...,{'gender': ['male']},{'age': ['18-24']},{'race': ['white']},{'emotion': ['neutral']},{'personality': ['openness']},{'location': ['indoor']},"{'objects': ['person', 'doorway', 'wall tile',...",{'assistive_devices': ['door']},{'digital_devices': []},"{""person_count"": ""1""",{'shot': ['medium']},{'position': ['standing']}
2,01P_a_person_walking_to_the_bathroom_in_photor...,a person walking to the bathroom in photoreali...,01p,neutral,a_p1_f3,01P_a_person_walking_to_the_bathroom_in_photor...,{'gender': ['male']},{'age': ['25-34']},{'race': ['white']},{'emotion': ['neutral']},{'personality': ['openness']},{'location': ['indoor']},"{'objects': ['person', 'sink', 'plant', 'doorw...",{'assistive_devices': ['sink']},{'digital_devices': []},"{""person_count"": ""1""",{'shot': ['full shot']},{'position': ['standing']}
3,01P_a_person_walking_to_the_bathroom_in_photor...,a person walking to the bathroom in photoreali...,01p,neutral,a_p1_f4,01P_a_person_walking_to_the_bathroom_in_photor...,{'gender': ['female']},{'age': ['35-44']},{'race': ['other']},{'emotion': ['neutral']},{'personality': ['agreeableness']},{'location': ['indoor']},"{'objects': ['person', 'sink', 'bathroom tiles...",{'assistive_devices': ['sink']},{'digital_devices': []},"{""person_count"": ""1""",{'shot': ['full shot']},{'position': ['standing']}
4,2P_a_front_photo_of_a_person_walking_to_a_stor...,a front photo of a person walking to a store i...,2p,neutral,a_p2_f1,2P_a_front_photo_of_a_person_walking_to_a_stor...,{'gender': ['male']},{'age': ['25-34']},{'race': ['white']},{'emotion': ['neutral']},{'personality': ['conscientiousness']},{'location': ['outdoors']},"{'objects': ['man walking on sidewalk', 'store...",{'assistive_devices': ['backpack']},{'digital_devices': ['notebook']},"{""person_count"": ""1""",{'shot': ['medium shot']},{'position': ['standing']}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
803,139OP_a_front_photo_of_an_older_person_doing_d...,a front photo of an older person doing diet an...,139op,older,o_p139_f4,139OP_a_front_photo_of_an_older_person_doing_d...,{'gender': ['male']},{'age': ['65-79']},{'race': ['white']},"{""emotion"": ""neutral""",{'personality': ['conscientiousness']},"{""location"": ""indoor""","{'objects': ['tablet', 'pen', 'book', 'wooden ...",{'assistive_devices': ['pen']},{'digital_devices': []},{'person_count': ['1']},{'shot': ['close-up shot']},{'position': ['sitting']}
804,140OP_an_older_person_at_the_hospital_in_photo...,an older person at the hospital in photorealis...,140op,older,o_p140_f1,140OP_an_older_person_at_the_hospital_in_photo...,{'gender': ['male']},{'age': ['80+']},{'race': ['white']},{'emotion': ['negative']},{'personality': ['agreeableness']},{'location': ['indoor']},"{'objects': ['man', 'chair', 'waiting area', '...",{'assistive_devices': ['cane']},{'digital_devices': []},"{""person_count"": ""1""",{'shot': ['full shot']},{'position': ['sitting']}
805,140OP_an_older_person_at_the_hospital_in_photo...,an older person at the hospital in photorealis...,140op,older,o_p140_f2,140OP_an_older_person_at_the_hospital_in_photo...,{'gender': ['male']},{'age': ['80+']},{'race': ['white']},{'emotion': ['negative']},{'personality': ['agreeableness']},{'location': ['indoors']},"{'objects': ['man', 'hospital bed', 'medical e...",{'assistive_devices': ['cane']},{'digital_devices': []},"{""person_count"": ""1""",{'shot': ['full shot']},{'position': ['sitting']}
806,140OP_an_older_person_at_the_hospital_in_photo...,an older person at the hospital in photorealis...,140op,older,o_p140_f3,140OP_an_older_person_at_the_hospital_in_photo...,{'gender': ['male']},"{""age"": ""[65-79]""",{'race': ['white']},{'emotion': ['neutral']},{'personality': ['agreeableness']},{'location': ['indoors']},"{'objects': ['hospital bed', 'white pillowcase...",{'assistive_devices': []},{'digital_devices': ['computer monitor']},"{""person_count"": ""1""",{'shot': ['full shot']},{'position': ['lying']}


In [64]:
#df_merged['gender'].unique()
#df_merged['race'].unique()
#df_merged['location'].unique()
#df_merged['emotions_short'].unique()
#df_merged['personality_short'].unique()
#df_merged['shot'].unique()
#df_merged['person_count'].unique()

array(['{"person_count": "1"', "{'person_count': ['1']}",
       '{"person_count": "1"}.', '{"person_count": "0"}.',
       "{'person_count': [1]}", '{"person_count": "2"',
       "{'person_count': [0]}", '{"person_count": "3"',
       "{'person_count': ['3']}", "{'person_count': [2]}",
       '{"person_count": "4"', "{'person_count': ['2']}",
       "{'person_count': [3]}", '{"person_count": "5"',
       "{'person_count': ['4']}", "{'person_count': [4]}",
       "{'person_count': ['9']}", "{'person_count': [6]}"], dtype=object)

In [235]:
# Función para estandarizar las entradas de la columna 'race'
def clean_race(race):
    # Encontrar la categoría dentro de la cadena usando regex
    match = re.search(r'["\']race["\']: ?\[?["\']?(\w+)["\']?\]?', race)
    if match:
        # Obtener la categoría encontrada
        category = match.group(1)
        # Retornar el formato estandarizado
        return f"{{'race': ['{category}']}}"
    else:
        # Si no se encuentra la categoría, retornar la entrada original
        return race

# Aplicar la función a la columna 'race'
df_merged['race'] = df_merged['race'].apply(clean_race)

# Verificar las categorías únicas después de la limpieza
df_merged['race'].unique()


array(["{'race': ['white']}", "{'race': ['other']}",
       "{'race': ['[hispanic]']}", "{'race': ['asian']}",
       "{'race': ['black']}", "{'race': ['hispanic']}",
       "{'race': ['[white]']}"], dtype=object)

In [236]:
location_mapping = {
    "indoor": "indoor",
    "indoors": "indoor",
    "inddoors": "indoor",
    "indors": "indoor",
    "indors": "indoor",
    "indoor market": "indoor",
    "ind indoors": "indoor",
    "indindoors": "indoor",
    "outdoor": "outdoors",
    "outdoors": "outdoors",
    "indeterminate": "none",
    "indeterminable": "none",
    "none": "none"
}

# Función para limpiar y mapear los valores de 'location'
def clean_location(location):
    # Encontrar la categoría dentro de la cadena usando regex
    match = re.search(r'["\']location["\']: ?\[?["\']?([\w\s]+)["\']?\]?', location)
    if match:
        # Obtener la categoría encontrada
        category = match.group(1).strip().lower()
        # Mapear la categoría a la estándar
        category = location_mapping.get(category, 'none')  # Por defecto 'none' si no se encuentra el match
        # Retornar el formato estandarizado
        return f"{{'location': ['{category}']}}"
    else:
        # Si no se encuentra la categoría, retornar 'none' por defecto
        return "{'location': ['none']}"

# Aplicar la función a la columna 'location'
df_merged['location'] = df_merged['location'].apply(clean_location)

# Verificar las categorías únicas después de la limpieza
df_merged['location'].unique()

array(["{'location': ['indoor']}", "{'location': ['outdoors']}",
       "{'location': ['none']}"], dtype=object)

In [237]:
import re

# Función para limpiar y estandarizar los valores en la columna 'personality_short'
def clean_personality(personality):
    # Encontrar la categoría dentro de la cadena usando regex
    match = re.search(r'["\']personality["\']: ?\[?["\']?(\w+)["\']?\]?', personality)
    if match:
        # Obtener la categoría encontrada
        category = match.group(1).strip().lower()
        # Retornar el formato estandarizado
        return f"{{'personality': ['{category}']}}"
    else:
        # Si no se encuentra la categoría, retornar la entrada original
        return personality

# Aplicar la función a la columna 'personality_short'
df_merged['personality_short'] = df_merged['personality_short'].apply(clean_personality)

# Verificar las categorías únicas después de la limpieza
df_merged['personality_short'].unique()


array(["{'personality': ['extraversion']}",
       "{'personality': ['openness']}",
       "{'personality': ['agreeableness']}",
       "{'personality': ['conscientiousness']}",
       "{'personality': ['neuroticism']}"], dtype=object)

In [238]:
# Diccionario para mapear los valores incorrectos a los correctos
shot_mapping = {
    "medium": "medium",
    "medium shot": "medium",
    "full": "full shot",
    "full shot": "full shot",
    "close-up": "close-up shot",
    "close-up shot": "close-up shot"
}

# Función para limpiar y estandarizar los valores en la columna 'shot'
def clean_shot(shot):
    # Encontrar la categoría dentro de la cadena usando regex
    match = re.search(r'["\']shot["\']: ?\[?["\']?([\w\s-]+)["\']?\]?', shot)
    if match:
        # Obtener la categoría encontrada
        category = match.group(1).strip().lower()
        # Mapear la categoría a la estándar
        category = shot_mapping.get(category, 'medium')  # Usar 'medium' por defecto si no se encuentra el match
        # Retornar el formato estandarizado
        return f"{{'shot': ['{category}']}}"
    else:
        # Si no se encuentra la categoría, retornar 'medium' por defecto
        return "{'shot': ['medium']}"

# Aplicar la función a la columna 'shot'
df_merged['shot'] = df_merged['shot'].apply(clean_shot)

# Verificar las categorías únicas después de la limpieza
df_merged['shot'].unique()

array(["{'shot': ['medium']}", "{'shot': ['full shot']}",
       "{'shot': ['close-up shot']}"], dtype=object)

In [239]:
# Definir categorías específicas a las que se deben mapear los valores
specific_categories = {
    'standing': "{'position': ['standing']}",
    'sitting': "{'position': ['sitting']}",
    'lying': "{'position': ['lying']}",
    'crouched': "{'position': ['crouched']}",
    'none': "{'position': ['none']}",
    'walking': "{'position': ['standing']}",  # Agrupar bajo 'standing'
    'bending': "{'position': ['standing']}",  # Agrupar bajo 'standing'
    'no person present': "{'position': ['none']}",  # Agrupar como 'none'
    'kneeling': "{'position': ['standing']}",  # Agrupar bajo 'standing'
    'bending over': "{'position': ['standing']}",  # Agrupar bajo 'standing'
    'kissing': "{'position': ['standing']}",  # Agrupar bajo 'standing'
    'shaking hands': "{'position': ['standing']}",  # Agrupar bajo 'standing'
    'running': "{'position': ['standing']}",  # Agrupar bajo 'standing'
    'praying': "{'position': ['standing']}",  # Agrupar bajo 'standing'
    'squatting': "{'position': ['sitting']}",  # Agrupar bajo 'sitting'
    'crouching': "{'position': ['crouched']}",  # Agrupar bajo 'crouched'
    'kicking': "{'position': ['standing']}",  # Agrupar bajo 'standing'
    'climbing': "{'position': ['standing']}",  # Agrupar bajo 'standing'
    'brushing teeth': "{'position': ['standing']}"  # Agrupar bajo 'standing'
}

# Función para limpiar y mapear categorías
def map_to_specific_category(position_str):
    try:
        # Intentar convertir el valor a un diccionario Python
        position_dict = ast.literal_eval(position_str)
        # Extraer el valor de la clave 'position'
        position_value = position_dict.get('position', '')
        # Si 'position' es una lista, tomar el primer elemento
        if isinstance(position_value, list):
            simplified_value = specific_categories.get(position_value[0], position_value[0])
            return simplified_value
        return specific_categories.get(position_value, f"{{'position': ['{position_value}']}}")
    except (SyntaxError, ValueError) as e:
        # Manejar casos donde el formato sea incorrecto
        print(f"Error al procesar el valor: {position_str} - {e}")
        return "{'position': ['none']}"

# Aplicar la función a la columna 'position_short'
df_merged['position_short'] = df_merged['position_short'].apply(map_to_specific_category)

# Verificar el resultado
print(df_merged['position_short'].unique())

Error al procesar el valor: {"position": "standing" - '{' was never closed (<unknown>, line 1)
Error al procesar el valor: {"position": "standing" - '{' was never closed (<unknown>, line 1)
Error al procesar el valor: {"position": "standing" - '{' was never closed (<unknown>, line 1)
Error al procesar el valor: {"position": "standing" - '{' was never closed (<unknown>, line 1)
Error al procesar el valor: {"position": "standing" - '{' was never closed (<unknown>, line 1)
Error al procesar el valor: {"position": "standing" - '{' was never closed (<unknown>, line 1)
Error al procesar el valor: {"position": "standing" - '{' was never closed (<unknown>, line 1)
Error al procesar el valor: {"position": "standing" - '{' was never closed (<unknown>, line 1)
Error al procesar el valor: {"position": "lying" - '{' was never closed (<unknown>, line 1)
Error al procesar el valor: {"position": "standing" - '{' was never closed (<unknown>, line 1)
Error al procesar el valor: {"position": "standing" -

In [242]:
#GENDER
df_merged['gender'].unique()
mapping = {
    '{"gender": "male"}': "{'gender': ['male']}",
    '{"gender": "female"}': "{'gender': ['female']}",
    '{"gender": "none"}': "{'gender': ['none']}",

    '{"gender": "male"': "{'gender': ['male']}",
    '{"gender": "male"}.': "{'gender': ['male']}",
    '{"gender": "female"': "{'gender': ['female']}",
    '{"gender": "none"': "{'gender': ['none']}",
}
df_merged['gender'] = df_merged['gender'].map(mapping).fillna(df_merged['gender'])
df_merged['gender'].unique()

#RACE
df_merged['race'].unique()
mapping = {
    '{"race": "white"}': "{'race': ['white']}",
    "{'race': ['[hispanic]']}": "{'race': ['hispanic']}",
    "{'race': ['[white]']}": "{'race': ['white']}",
}
df_merged['race'] = df_merged['race'].map(mapping).fillna(df_merged['race'])
df_merged['race'].unique()

#LOCATION
#df_merged['location'].unique()
# mapping = {
#     #'{"location": ["indoor"]}': '{"location": ["indoors"]}',
#     "{'location': ['indoor']}": "{'location': ['indoors']}",
#     #'{"location": "indoors"}': '{"location": ["indoors"]}',
#     #'{"location": "indoor"}': '{"location": ["indoors"]}',
#     "{'location': ['indors']}": "{'location': ['indoors']}",
#     "{'location': ['inddoors']}": "{'location': ['indoors']}",  
#     "{'location': ['individual']}": "{'location': ['indoors']}",  
#     "{'location': ['indicators']}": "{'location': ['indoors']}",  
# }
# df_merged['location'] = df_merged['location'].map(mapping).fillna(df_merged['location'])
# df_merged['location'].unique()

#EMTOIONS SHORT
df_merged['emotions_short'].unique()
mapping = {
    '{"emotion": "neutral"}': '{"emotion": ["neutral"]}',
    '{"emotion": "positive"}': '{"emotion": ["positive"]}',
    '{"emotion": "negative"}': '{"emotion": ["negative"]}',
    '{"emotion": "neutral"': "{'emotion': ['neutral']}",
    '{"emotion": "positive"': "{'emotion': ['positive']}",
    '{"emotion": "negative"': "{'emotion': ['negative']}",
}

df_merged['emotions_short'] = df_merged['emotions_short'].map(mapping).fillna(df_merged['emotions_short'])
df_merged['emotions_short'].unique()

#PERSONALITY SHORT
#df_merged['personality_short'].unique()

# #SHOT
# #df_merged['shot'].unique()
# mapping = {
#     #'{"shot": "close-up shot"}': '{"shot": ["close-up shot"]}',
#     #'{"shot": "full shot"}': '{"shot": ["full shot"]}',
#     "{'shot': ['medium']}": "{'shot': ['medium shot']}",
#     #'{"shot": ["medium"]}': '{"shot": ["medium shot"]}',
#     #'{"shot": "medium shot"}': '{"shot": ["medium shot"]}',
#     "{'shot': ['close-up']}": "{'shot': ['close-up shot']}",
# }
# df_merged['shot'] = df_merged['shot'].map(mapping).fillna(df_merged['shot'])
# df_merged['shot'].unique()

# #POSITION SHORT

#PERSON COUNT
def normalize_person_count(value):
    # Remover cualquier punto al final
    value = value.strip(".")
    
    # Corregir comillas desbalanceadas
    value = re.sub(r'["]', "'", value)  # Reemplazar comillas dobles por simples
    value = re.sub(r"''", "'", value)   # Reemplazar comillas simples dobles por una
    
    # Extraer números dentro de las comillas
    numbers = re.findall(r"'(\d+)'", value)
    if not numbers:  # Si no se encontraron números entre comillas, intentar encontrarlos sin comillas
        numbers = re.findall(r"(\d+)", value)
    
    if numbers:
        numbers = list(map(int, numbers))
        if any(num >= 3 for num in numbers):
            return "{'person_count': ['+3']}"
        else:
            return "{'person_count': ['" + "', '".join(map(str, numbers)) + "']}"
    else:
        return value

# Aplicar la normalización a la columna 'person_count'
df_merged['person_count'] = df_merged['person_count'].apply(normalize_person_count)

mapping = {
    "{'person_count': [1]}": "{'person_count': ['1']}",
    "{'person_count': [2]}": "{'person_count': ['2']}",
    "{'person_count': [3]}": "{'person_count': ['3']}",
}
df_merged['person_count'] = df_merged['person_count'].map(mapping).fillna(df_merged['person_count'])
df_merged['person_count'].unique()

array(["{'person_count': ['1']}", "{'person_count': ['0']}",
       "{'person_count': ['2']}", "{'person_count': ['+3']}"],
      dtype=object)

In [203]:
df_merged

Unnamed: 0,filename_png,prompt,codi,age_group,ID,filename_jpg,gender,age_range,race,emotions_short,personality_short,location,objects,objects_assist_devices,objects_digi_devices,person_count,shot,position_short
0,01P_a_person_walking_to_the_bathroom_in_photor...,a person walking to the bathroom in photoreali...,01p,neutral,a_p1_f1,01P_a_person_walking_to_the_bathroom_in_photor...,{'gender': ['male']},{'age': ['25-34']},{'race': ['white']},{'emotion': ['neutral']},{'personality': ['extraversion']},"{""location"": ""indoor""","{'objects': ['bathroom sink', 'glass door', 'l...",{'assistive_devices': ['no assistive devices a...,{'digital_devices': []},{'person_count': ['1']},"{""shot"": ""medium""",{'position': ['standing']}
1,01P_a_person_walking_to_the_bathroom_in_photor...,a person walking to the bathroom in photoreali...,01p,neutral,a_p1_f2,01P_a_person_walking_to_the_bathroom_in_photor...,{'gender': ['male']},{'age': ['18-24']},{'race': ['white']},{'emotion': ['neutral']},{'personality': ['openness']},{'location': ['indoor']},"{'objects': ['person', 'doorway', 'wall tile',...",{'assistive_devices': ['door']},{'digital_devices': []},{'person_count': ['1']},{'shot': ['medium']},{'position': ['standing']}
2,01P_a_person_walking_to_the_bathroom_in_photor...,a person walking to the bathroom in photoreali...,01p,neutral,a_p1_f3,01P_a_person_walking_to_the_bathroom_in_photor...,{'gender': ['male']},{'age': ['25-34']},{'race': ['white']},{'emotion': ['neutral']},{'personality': ['openness']},{'location': ['indoor']},"{'objects': ['person', 'sink', 'plant', 'doorw...",{'assistive_devices': ['sink']},{'digital_devices': []},{'person_count': ['1']},{'shot': ['full shot']},{'position': ['standing']}
3,01P_a_person_walking_to_the_bathroom_in_photor...,a person walking to the bathroom in photoreali...,01p,neutral,a_p1_f4,01P_a_person_walking_to_the_bathroom_in_photor...,{'gender': ['female']},{'age': ['35-44']},{'race': ['other']},{'emotion': ['neutral']},{'personality': ['agreeableness']},{'location': ['indoor']},"{'objects': ['person', 'sink', 'bathroom tiles...",{'assistive_devices': ['sink']},{'digital_devices': []},{'person_count': ['1']},{'shot': ['full shot']},{'position': ['standing']}
4,2P_a_front_photo_of_a_person_walking_to_a_stor...,a front photo of a person walking to a store i...,2p,neutral,a_p2_f1,2P_a_front_photo_of_a_person_walking_to_a_stor...,{'gender': ['male']},{'age': ['25-34']},{'race': ['white']},{'emotion': ['neutral']},{'personality': ['conscientiousness']},{'location': ['outdoors']},"{'objects': ['man walking on sidewalk', 'store...",{'assistive_devices': ['backpack']},{'digital_devices': ['notebook']},{'person_count': ['1']},{'shot': ['medium shot']},{'position': ['standing']}
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
803,139OP_a_front_photo_of_an_older_person_doing_d...,a front photo of an older person doing diet an...,139op,older,o_p139_f4,139OP_a_front_photo_of_an_older_person_doing_d...,{'gender': ['male']},{'age': ['65-79']},{'race': ['white']},{'emotion': ['neutral']},{'personality': ['conscientiousness']},"{""location"": ""indoor""","{'objects': ['tablet', 'pen', 'book', 'wooden ...",{'assistive_devices': ['pen']},{'digital_devices': []},{'person_count': ['1']},{'shot': ['close-up shot']},{'position': ['sitting']}
804,140OP_an_older_person_at_the_hospital_in_photo...,an older person at the hospital in photorealis...,140op,older,o_p140_f1,140OP_an_older_person_at_the_hospital_in_photo...,{'gender': ['male']},{'age': ['80+']},{'race': ['white']},{'emotion': ['negative']},{'personality': ['agreeableness']},{'location': ['indoor']},"{'objects': ['man', 'chair', 'waiting area', '...",{'assistive_devices': ['cane']},{'digital_devices': []},{'person_count': ['1']},{'shot': ['full shot']},{'position': ['sitting']}
805,140OP_an_older_person_at_the_hospital_in_photo...,an older person at the hospital in photorealis...,140op,older,o_p140_f2,140OP_an_older_person_at_the_hospital_in_photo...,{'gender': ['male']},{'age': ['80+']},{'race': ['white']},{'emotion': ['negative']},{'personality': ['agreeableness']},{'location': ['indoors']},"{'objects': ['man', 'hospital bed', 'medical e...",{'assistive_devices': ['cane']},{'digital_devices': []},{'person_count': ['1']},{'shot': ['full shot']},{'position': ['sitting']}
806,140OP_an_older_person_at_the_hospital_in_photo...,an older person at the hospital in photorealis...,140op,older,o_p140_f3,140OP_an_older_person_at_the_hospital_in_photo...,{'gender': ['male']},"{""age"": ""[65-79]""",{'race': ['white']},{'emotion': ['neutral']},{'personality': ['agreeableness']},{'location': ['indoors']},"{'objects': ['hospital bed', 'white pillowcase...",{'assistive_devices': []},{'digital_devices': ['computer monitor']},{'person_count': ['1']},{'shot': ['full shot']},{'position': ['lying']}


In [243]:
df_merged['gender'].unique()
df_merged['race'].unique()
df_merged['location'].unique()
df_merged['emotions_short'].unique()
df_merged['personality_short'].unique()
df_merged['shot'].unique()
df_merged['person_count'].unique()
df_merged['position_short'].unique()

array(["{'position': ['standing']}", "{'position': ['none']}",
       "{'position': ['sitting']}", "{'position': ['lying']}",
       "{'position': ['crouched']}"], dtype=object)

In [244]:
def ensure_list(value):
    # Intenta convertir la cadena a una lista de Python
    if isinstance(value, str):
        try:
            # Evalúa la cadena para convertirla en una lista o dict
            data = ast.literal_eval(value)
            # Si es un diccionario, toma el primer valor, que se asume es una lista
            if isinstance(data, dict):
                return list(data.values())[0] if isinstance(list(data.values())[0], list) else [list(data.values())[0]]
            # Si es una lista, devuélvela tal cual
            elif isinstance(data, list):
                return data
        except:
            return [value]  # Si falla la evaluación, devuelve el valor original como un solo elemento de una lista
    return value if isinstance(value, list) else [value]  # Si ya es una lista, devuélvela; si no, envuélvela en una lista

def is_list(value):
    return isinstance(value, list)

# Lista de columnas que quieres transformar
columns_to_transform = [
    'gender',
    'age_range',
    'race', 
    'emotions_short', 
    'location', 
    'personality_short', 
    'shot', 
    'position_short',
    'objects',
    'objects_assist_devices',
    'objects_digi_devices',
    'person_count',
    'shot'
]

# Aplica la transformación a cada columna
for column in columns_to_transform:
    df_merged[column] = df_merged[column].apply(ensure_list)

# Verificar que ahora todas son listas de Python
for column in columns_to_transform:
    if df_merged[column].apply(is_list).all():
        print(f"Todos los valores en la columna '{column}' son listas de Python.")
    else:
        print(f"Algunos valores en la columna '{column}' NO son listas de Python.")

def convert_assist_devices(value):
    if isinstance(value, list) or isinstance(value, str):
        if isinstance(value, list):
            value = str(value)

        if 'assistive_devices' in value:
            return ['none']
        
        if isinstance(value, str) and not value.strip():
            return ['none']

        try:
            data = ast.literal_eval(value)
            if isinstance(data, dict) and 'assistive_devices' in data:
                return ['none']
        except (ValueError, SyntaxError):
            pass

    if isinstance(value, list) and not value:
        return ['none']
    return value

df_merged['objects_assist_devices'] = df_merged['objects_assist_devices'].apply(convert_assist_devices)

def convert_digi_devices(value):
    if isinstance(value, list) or isinstance(value, str):
        if isinstance(value, list):
            value = str(value)

        if 'digital_devices' in value:
            return ['none']
        
        if isinstance(value, str) and not value.strip():
            return ['none']

        try:
            data = ast.literal_eval(value)
            if isinstance(data, dict) and 'digital_devices' in data:
                return ['none']
        except (ValueError, SyntaxError):
            pass

    if isinstance(value, list) and not value:
        return ['none']
    return value

df_merged['objects_digi_devices'] = df_merged['objects_digi_devices'].apply(convert_digi_devices)

Todos los valores en la columna 'gender' son listas de Python.
Todos los valores en la columna 'age_range' son listas de Python.
Todos los valores en la columna 'race' son listas de Python.
Todos los valores en la columna 'emotions_short' son listas de Python.
Todos los valores en la columna 'location' son listas de Python.
Todos los valores en la columna 'personality_short' son listas de Python.
Todos los valores en la columna 'shot' son listas de Python.
Todos los valores en la columna 'position_short' son listas de Python.
Todos los valores en la columna 'objects' son listas de Python.
Todos los valores en la columna 'objects_assist_devices' son listas de Python.
Todos los valores en la columna 'objects_digi_devices' son listas de Python.
Todos los valores en la columna 'person_count' son listas de Python.
Todos los valores en la columna 'shot' son listas de Python.


In [245]:
def reemplazar_lista_vacia(valor):
    if isinstance(valor, str):
        valor = ast.literal_eval(valor)  # Convertir la cadena a una lista o diccionario
    
    if isinstance(valor, list) and not valor:
        return ['none']
    elif isinstance(valor, dict) and not valor:
        return ['none']
    return valor

df_merged['objects_digi_devices'] = df_merged['objects_digi_devices'].apply(reemplazar_lista_vacia)
df_merged['objects_assist_devices'] = df_merged['objects_assist_devices'].apply(reemplazar_lista_vacia)

# Ahora, las listas vacías en 'objects_digi_devices' y 'objects_assist_devices' 
# se han reemplazado por ['none']

In [67]:
df_merged

Unnamed: 0,filename_png,prompt,codi,age_group,ID,filename_jpg,gender,age_range,race,emotions_short,personality_short,location,objects,objects_assist_devices,objects_digi_devices,person_count,shot,position_short
0,01P_a_person_walking_to_the_bathroom_in_photor...,a person walking to the bathroom in photoreali...,01p,neutral,a_p1_f1,01P_a_person_walking_to_the_bathroom_in_photor...,[male],[25-34],[white],[neutral],[extraversion],[indoor],"[bathroom sink, glass door, large window, towe...",['no assistive devices are visible in the image'],[],[1],[medium],[standing]
1,01P_a_person_walking_to_the_bathroom_in_photor...,a person walking to the bathroom in photoreali...,01p,neutral,a_p1_f2,01P_a_person_walking_to_the_bathroom_in_photor...,[male],[18-24],[white],[neutral],[openness],[indoor],"[person, doorway, wall tile, wood flooring, li...",['door'],[],[1],[medium],[standing]
2,01P_a_person_walking_to_the_bathroom_in_photor...,a person walking to the bathroom in photoreali...,01p,neutral,a_p1_f3,01P_a_person_walking_to_the_bathroom_in_photor...,[male],[25-34],[white],[neutral],[openness],[indoor],"[person, sink, plant, doorway, tile flooring, ...",['sink'],[],[1],[full shot],[standing]
3,01P_a_person_walking_to_the_bathroom_in_photor...,a person walking to the bathroom in photoreali...,01p,neutral,a_p1_f4,01P_a_person_walking_to_the_bathroom_in_photor...,[female],[35-44],[other],[neutral],[agreeableness],[indoor],"[person, sink, bathroom tiles, light fixture, ...",['sink'],[],[1],[full shot],[standing]
4,2P_a_front_photo_of_a_person_walking_to_a_stor...,a front photo of a person walking to a store i...,2p,neutral,a_p2_f1,2P_a_front_photo_of_a_person_walking_to_a_stor...,[male],[25-34],[white],[neutral],[conscientiousness],[outdoors],"[man walking on sidewalk, storefront with awni...",['backpack'],['notebook'],[1],[medium],[standing]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
803,139OP_a_front_photo_of_an_older_person_doing_d...,a front photo of an older person doing diet an...,139op,older,o_p139_f4,139OP_a_front_photo_of_an_older_person_doing_d...,[male],[65-79],[white],[neutral],[conscientiousness],[indoor],"[tablet, pen, book, wooden cutting board with ...",['pen'],[],[1],[close-up shot],[sitting]
804,140OP_an_older_person_at_the_hospital_in_photo...,an older person at the hospital in photorealis...,140op,older,o_p140_f1,140OP_an_older_person_at_the_hospital_in_photo...,[male],[80+],[white],[negative],[agreeableness],[indoor],"[man, chair, waiting area, building entrance, ...",['cane'],[],[1],[full shot],[sitting]
805,140OP_an_older_person_at_the_hospital_in_photo...,an older person at the hospital in photorealis...,140op,older,o_p140_f2,140OP_an_older_person_at_the_hospital_in_photo...,[male],[80+],[white],[negative],[agreeableness],[indoor],"[man, hospital bed, medical equipment, grey ha...",['cane'],[],[1],[full shot],[sitting]
806,140OP_an_older_person_at_the_hospital_in_photo...,an older person at the hospital in photorealis...,140op,older,o_p140_f3,140OP_an_older_person_at_the_hospital_in_photo...,[male],"[{""age"": ""[65-79]""]",[white],[neutral],[agreeableness],[indoor],"[hospital bed, white pillowcase, man lying dow...",[],['computer monitor'],[1],[full shot],[lying]


# PIL BRIGHTNESS

In [68]:
def rgb_to_hex(rgb):
    return '#{:02x}{:02x}{:02x}'.format(int(rgb[0]), int(rgb[1]), int(rgb[2]))

def rgb_to_css(rgb):
    return f'rgb({int(rgb[0])}, {int(rgb[1])}, {int(rgb[2])})'

def simplify_color(rgb):
    color_names = {
        (0, 0, 0): "Negro", (255, 255, 255): "Blanco",
        (255, 0, 0): "Rojo", (0, 255, 0): "Verde", (0, 0, 255): "Azul",
        (255, 255, 0): "Amarillo", (255, 0, 255): "Magenta", (0, 255, 255): "Cian",
        (128, 128, 128): "Gris"
    }
    
    distances = {name: sum((a - b) ** 2 for a, b in zip(rgb, color_rgb))
                 for color_rgb, name in color_names.items()}
    return min(distances, key=distances.get)

def analizar_imagen(ruta_imagen, max_size=1000):
    try:
        with Image.open(ruta_imagen) as imagen:
            if max(imagen.size) > max_size:
                imagen.thumbnail((max_size, max_size))
            
            imagen_array = np.array(imagen)
        
        brillo_promedio = np.mean(imagen_array)
        
        contraste = np.std(imagen_array)
        
        if len(imagen_array.shape) == 3:  # Imagen a color
            pixels = imagen_array.reshape(-1, 3)
            color_dominante, conteo = stats.mode(pixels, axis=0)
            color_dominante = color_dominante.ravel()
            conteo = conteo.ravel()[0]  # Tomar el primer elemento del array de conteo
        else:  # Imagen en escala de grises
            color_dominante = np.array([brillo_promedio] * 3)
            conteo = imagen_array.size
        
        # Calcular el porcentaje de píxeles del color dominante
        total_pixels = imagen_array.size // 3 if len(imagen_array.shape) == 3 else imagen_array.size
        porcentaje_dominante = (conteo / total_pixels) * 100
        
        return {
            'filename_jpg': os.path.basename(ruta_imagen),
            'brillo_promedio': float(brillo_promedio),
            'contraste': float(contraste),
            'color_dominante': {
                'rgb': tuple(map(int, color_dominante)),
                'hex': rgb_to_hex(color_dominante),
                'css': rgb_to_css(color_dominante),
                'simplificado': simplify_color(tuple(map(int, color_dominante)))
            },
            'porcentaje_dominante': float(porcentaje_dominante)
        }
    except Exception as e:
        print(f"Error al procesar {os.path.basename(ruta_imagen)}: {str(e)}")
        return None

def analizar_carpeta(ruta_carpeta, limit=None):
    resultados = []
    archivos = [f for f in os.listdir(ruta_carpeta) if f.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif'))]
    
    if limit:
        archivos = archivos[:limit]
    
    total = len(archivos)
    for i, archivo in enumerate(archivos, 1):
        ruta_completa = os.path.join(ruta_carpeta, archivo)
        resultado = analizar_imagen(ruta_completa)
        if resultado:
            resultados.append(resultado)
        print(f"Procesado {i}/{total}: {archivo}")
    
    return resultados

if __name__ == '__main__':
    resultados_n = analizar_carpeta("C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/ANDREA/06_08_midjourney_activities_parejo/neutral_comp")
    resultados_o = analizar_carpeta("C:/Users/David/Documents/AGEAI/Scripts/OUTPUTS/ANDREA/06_08_midjourney_activities_parejo/older_comp")
    
    for resultado_n in resultados_n:
        print(f"\nImagen: {resultado_n['filename_jpg']}")
        print(f"  Brillo promedio: {resultado_n['brillo_promedio']:.2f}")
        print(f"  Contraste: {resultado_n['contraste']:.2f}")
        print(f"  Color dominante:")
        print(f"    RGB: {resultado_n['color_dominante']['rgb']}")
        print(f"    Hex: {resultado_n['color_dominante']['hex']}")
        print(f"    CSS: {resultado_n['color_dominante']['css']}")
        print(f"    Simplificado: {resultado_n['color_dominante']['simplificado']}")
        print(f"  Porcentaje del color dominante: {resultado_n['porcentaje_dominante']:.2f}%")

    for resultado_o in resultados_o:
        print(f"\nImagen: {resultado_o['filename_jpg']}")
        print(f"  Brillo promedio: {resultado_o['brillo_promedio']:.2f}")
        print(f"  Contraste: {resultado_o['contraste']:.2f}")
        print(f"  Color dominante:")
        print(f"    RGB: {resultado_o['color_dominante']['rgb']}")
        print(f"    Hex: {resultado_o['color_dominante']['hex']}")
        print(f"    CSS: {resultado_o['color_dominante']['css']}")
        print(f"    Simplificado: {resultado_o['color_dominante']['simplificado']}")
        print(f"  Porcentaje del color dominante: {resultado_o['porcentaje_dominante']:.2f}%")

  color_dominante, conteo = stats.mode(pixels, axis=0)


Procesado 1/405: 01P_a_person_walking_to_the_bathroom_in_photorealistic_styl_3c1413d3-141a-4f73-821a-dc0ec97832df_0.jpg
Procesado 2/405: 01P_a_person_walking_to_the_bathroom_in_photorealistic_styl_3c1413d3-141a-4f73-821a-dc0ec97832df_1.jpg
Procesado 3/405: 01P_a_person_walking_to_the_bathroom_in_photorealistic_styl_3c1413d3-141a-4f73-821a-dc0ec97832df_2.jpg
Procesado 4/405: 01P_a_person_walking_to_the_bathroom_in_photorealistic_styl_3c1413d3-141a-4f73-821a-dc0ec97832df_3.jpg
Procesado 5/405: 03P_a_full_body_portrait_of_a_person_in_his_or_her_room_in__a07105f7-5e59-489d-b0f7-c2e3dae66d69_0.jpg
Procesado 6/405: 03P_a_full_body_portrait_of_a_person_in_his_or_her_room_in__a07105f7-5e59-489d-b0f7-c2e3dae66d69_1.jpg
Procesado 7/405: 03P_a_full_body_portrait_of_a_person_in_his_or_her_room_in__a07105f7-5e59-489d-b0f7-c2e3dae66d69_2.jpg
Procesado 8/405: 03P_a_full_body_portrait_of_a_person_in_his_or_her_room_in__a07105f7-5e59-489d-b0f7-c2e3dae66d69_3.jpg
Procesado 9/405: 04P_a_front_view_of_a_p

In [None]:
pil_n = pd.DataFrame(resultados_n)
pil_o = pd.DataFrame(resultados_o)
pil = pd.concat([pil_n, pil_o], axis=0)   

In [246]:
df_merged = pd.merge(df_merged, pil, on='filename_jpg', how='left')

In [247]:
df_merged['color_dominante'] = df_merged['color_dominante'].apply(
    lambda x: x.get('simplificado', '').lower())

In [248]:
df_merged.to_csv(f'{output_dir}/df_merged_mid_29_8.csv', index=False)
#sample = df_merged.sample(30)
#sample.to_csv(f'{output_folder}/sample.csv', index=False)