In [1]:
import os
import urllib.request
import zipfile
import json
import pandas as pd

# Crear carpeta base
DATASET_DIR = "coco_dataset"
os.makedirs(DATASET_DIR, exist_ok=True)

# URLs a descargar
urls = {
    "val_images": "http://images.cocodataset.org/zips/val2017.zip",
    "annotations": "http://images.cocodataset.org/annotations/annotations_trainval2017.zip"
}

# Descargar y extraer archivos
for name, url in urls.items():
    filename = os.path.join(DATASET_DIR, url.split("/")[-1])
    if not os.path.exists(filename):
        print(f"Descargando {name}...")
        urllib.request.urlretrieve(url, filename)
    else:
        print(f"{filename} ya existe.")

    print(f"Extrayendo {filename}...")
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall(DATASET_DIR)

# Leer captions del archivo JSON
with open(os.path.join(DATASET_DIR, "annotations", "captions_val2017.json"), "r") as f:
    captions_data = json.load(f)

# Mapear IDs de imagen a nombres de archivo
id_to_filename = {img["id"]: img["file_name"] for img in captions_data["images"]}

# Construir un DataFrame con imágenes y captions
records = []
for ann in captions_data["annotations"]:
    image_id = ann["image_id"]
    caption = ann["caption"]
    filename = id_to_filename[image_id]
    full_path = os.path.join(DATASET_DIR, "val2017", filename)
    records.append({"image_path": full_path, "caption": caption})

df = pd.DataFrame(records)
print("Primeras filas del dataset:")
print(df.head())

# Opcional: Guardar CSV
df.to_csv("coco_val2017_captions.csv", index=False)


coco_dataset\val2017.zip ya existe.
Extrayendo coco_dataset\val2017.zip...
coco_dataset\annotations_trainval2017.zip ya existe.
Extrayendo coco_dataset\annotations_trainval2017.zip...
Primeras filas del dataset:
                              image_path  \
0  coco_dataset\val2017\000000179765.jpg   
1  coco_dataset\val2017\000000179765.jpg   
2  coco_dataset\val2017\000000190236.jpg   
3  coco_dataset\val2017\000000331352.jpg   
4  coco_dataset\val2017\000000517069.jpg   

                                             caption  
0  A black Honda motorcycle parked in front of a ...  
1      A Honda motorcycle parked in a grass driveway  
2  An office cubicle with four different types of...  
3          A small closed toilet in a cramped space.  
4     Two women waiting at a bench next to a street.  


Preprocesamiento de texto e imágenes

In [2]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download("punkt")
nltk.download("stopwords")

stop_words = set(stopwords.words("english"))

# Cargar el CSV
df = pd.read_csv("coco_val2017_captions.csv")

# Función de limpieza
def preprocess_caption(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # eliminar puntuación
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    return " ".join(tokens)

# Agregar columna limpia
df["processed_caption"] = df["caption"].apply(preprocess_caption)

# Ver primeras filas
print(df.head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Leo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Leo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                              image_path  \
0  coco_dataset\val2017\000000179765.jpg   
1  coco_dataset\val2017\000000179765.jpg   
2  coco_dataset\val2017\000000190236.jpg   
3  coco_dataset\val2017\000000331352.jpg   
4  coco_dataset\val2017\000000517069.jpg   

                                             caption  \
0  A black Honda motorcycle parked in front of a ...   
1      A Honda motorcycle parked in a grass driveway   
2  An office cubicle with four different types of...   
3          A small closed toilet in a cramped space.   
4     Two women waiting at a bench next to a street.   

                               processed_caption  
0     black honda motorcycle parked front garage  
1         honda motorcycle parked grass driveway  
2  office cubicle four different types computers  
3              small closed toilet cramped space  
4            two women waiting bench next street  


Procesamiento y limpieza de imagenes

In [None]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm

# Leer CSV
df = pd.read_csv("coco_val2017_captions.csv")
TARGET_SIZE = (224, 224)

# Directorio de salida
os.makedirs("processed_numpy", exist_ok=True)

# Procesar una a una y guardar en archivos individuales (para evitar sobrecarga de RAM)
valid_rows = []

for i, row in tqdm(df.iterrows(), total=len(df)):
    img_path = row["image_path"]
    try:
        img = Image.open(img_path).convert("RGB")
        img_resized = img.resize(TARGET_SIZE)
        img_array = np.asarray(img_resized) / 255.0

        if img_array.shape != (224, 224, 3):
            raise ValueError("Forma inesperada")

        # Guardar como .npy individual
        filename = os.path.splitext(os.path.basename(img_path))[0]
        np.save(f"processed_numpy/{filename}.npy", img_array)

        valid_rows.append(row)

    except Exception as e:
        print(f"Error con {img_path}: {e}")

# Guardar CSV con solo las filas válidas
df_valid = pd.DataFrame(valid_rows)
df_valid.to_csv("captions_valid.csv", index=False)

print("¡Imágenes procesadas y guardadas individualmente!")


100%|██████████| 25014/25014 [02:54<00:00, 143.01it/s]


¡Imágenes procesadas y guardadas individualmente!


In [4]:
df_valid

Unnamed: 0,image_path,caption
0,coco_dataset\val2017\000000179765.jpg,A black Honda motorcycle parked in front of a ...
1,coco_dataset\val2017\000000179765.jpg,A Honda motorcycle parked in a grass driveway
2,coco_dataset\val2017\000000190236.jpg,An office cubicle with four different types of...
3,coco_dataset\val2017\000000331352.jpg,A small closed toilet in a cramped space.
4,coco_dataset\val2017\000000517069.jpg,Two women waiting at a bench next to a street.
...,...,...
25009,coco_dataset\val2017\000000009590.jpg,A group of men sipping drinks and talking at a...
25010,coco_dataset\val2017\000000084664.jpg,"A plate of food with some eggs, potatoes, brea..."
25011,coco_dataset\val2017\000000331569.jpg,The strawberries was sitting beside the tall g...
25012,coco_dataset\val2017\000000231237.jpg,A bunch of small red flowers in a barnacle enc...


Procesamiento y limpieza del texto

In [5]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download("punkt")
nltk.download("stopwords")

stop_words = set(stopwords.words("english"))

# Cargar el CSV
df = pd.read_csv("captions_valid.csv")

# Función de limpieza
def preprocess_caption(text):
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # eliminar puntuación
    tokens = word_tokenize(text)
    tokens = [w for w in tokens if w not in stop_words]
    return " ".join(tokens)

# Agregar columna limpia
df["processed_caption"] = df["caption"].apply(preprocess_caption)

# Ver primeras filas
print(df.head())

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Leo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Leo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


                              image_path  \
0  coco_dataset\val2017\000000179765.jpg   
1  coco_dataset\val2017\000000179765.jpg   
2  coco_dataset\val2017\000000190236.jpg   
3  coco_dataset\val2017\000000331352.jpg   
4  coco_dataset\val2017\000000517069.jpg   

                                             caption  \
0  A black Honda motorcycle parked in front of a ...   
1      A Honda motorcycle parked in a grass driveway   
2  An office cubicle with four different types of...   
3          A small closed toilet in a cramped space.   
4     Two women waiting at a bench next to a street.   

                               processed_caption  
0     black honda motorcycle parked front garage  
1         honda motorcycle parked grass driveway  
2  office cubicle four different types computers  
3              small closed toilet cramped space  
4            two women waiting bench next street  


Transformación de texto e imágenes a embeddings

In [6]:
df

Unnamed: 0,image_path,caption,processed_caption
0,coco_dataset\val2017\000000179765.jpg,A black Honda motorcycle parked in front of a ...,black honda motorcycle parked front garage
1,coco_dataset\val2017\000000179765.jpg,A Honda motorcycle parked in a grass driveway,honda motorcycle parked grass driveway
2,coco_dataset\val2017\000000190236.jpg,An office cubicle with four different types of...,office cubicle four different types computers
3,coco_dataset\val2017\000000331352.jpg,A small closed toilet in a cramped space.,small closed toilet cramped space
4,coco_dataset\val2017\000000517069.jpg,Two women waiting at a bench next to a street.,two women waiting bench next street
...,...,...,...
25009,coco_dataset\val2017\000000009590.jpg,A group of men sipping drinks and talking at a...,group men sipping drinks talking table
25010,coco_dataset\val2017\000000084664.jpg,"A plate of food with some eggs, potatoes, brea...",plate food eggs potatoes bread items
25011,coco_dataset\val2017\000000331569.jpg,The strawberries was sitting beside the tall g...,strawberries sitting beside tall glass milkshake
25012,coco_dataset\val2017\000000231237.jpg,A bunch of small red flowers in a barnacle enc...,bunch small red flowers barnacle encrusted cla...


In [7]:
import torch
from transformers import CLIPProcessor, CLIPModel
from tqdm import tqdm

# 2. Cargar modelo CLIP
clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
device = "cuda" if torch.cuda.is_available() else "cpu"
clip_model.to(device)

# 3. Generar embeddings de texto con CLIP
clip_text_embeddings = []

for text in tqdm(df["processed_caption"].tolist(), desc="Procesando texto con CLIP"):
    try:
        inputs = clip_processor(text=[text], return_tensors="pt", padding=True).to(device)
        with torch.no_grad():
            outputs = clip_model.get_text_features(**inputs)
            embedding = outputs[0].cpu().numpy()
        clip_text_embeddings.append(embedding)
    except Exception as e:
        print(f"Error con texto: {text}: {e}")
        clip_text_embeddings.append(np.zeros(512))  # vector nulo si falla

# 4. Convertir a DataFrame
embedding_array = np.vstack(clip_text_embeddings)
embedding_df = pd.DataFrame(embedding_array)

# 5. Unir con columnas originales
df_embed = pd.concat([df[["caption", "processed_caption"]].reset_index(drop=True), embedding_df], axis=1)

# 6. Guardar o mostrar
df_embed.to_csv("text_embeddings_clip.csv", index=False)
print("Embeddings de texto con CLIP generados:")
print(df_embed.head())

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Procesando texto con CLIP: 100%|██████████| 25014/25014 [08:50<00:00, 47.15it/s]


Embeddings de texto con CLIP generados:
                                             caption  \
0  A black Honda motorcycle parked in front of a ...   
1      A Honda motorcycle parked in a grass driveway   
2  An office cubicle with four different types of...   
3          A small closed toilet in a cramped space.   
4     Two women waiting at a bench next to a street.   

                               processed_caption         0         1  \
0     black honda motorcycle parked front garage  0.580966 -0.228733   
1         honda motorcycle parked grass driveway  0.285237 -0.381514   
2  office cubicle four different types computers -0.276178 -0.156697   
3              small closed toilet cramped space -0.309372  0.300551   
4            two women waiting bench next street -0.089082 -0.216656   

          2         3         4         5         6         7  ...       502  \
0  0.353271  0.001215 -0.175542 -0.259786  0.076120  0.121080  ... -0.435310   
1  0.259527 -0.176544 -0.39023

In [8]:
df

Unnamed: 0,image_path,caption,processed_caption
0,coco_dataset\val2017\000000179765.jpg,A black Honda motorcycle parked in front of a ...,black honda motorcycle parked front garage
1,coco_dataset\val2017\000000179765.jpg,A Honda motorcycle parked in a grass driveway,honda motorcycle parked grass driveway
2,coco_dataset\val2017\000000190236.jpg,An office cubicle with four different types of...,office cubicle four different types computers
3,coco_dataset\val2017\000000331352.jpg,A small closed toilet in a cramped space.,small closed toilet cramped space
4,coco_dataset\val2017\000000517069.jpg,Two women waiting at a bench next to a street.,two women waiting bench next street
...,...,...,...
25009,coco_dataset\val2017\000000009590.jpg,A group of men sipping drinks and talking at a...,group men sipping drinks talking table
25010,coco_dataset\val2017\000000084664.jpg,"A plate of food with some eggs, potatoes, brea...",plate food eggs potatoes bread items
25011,coco_dataset\val2017\000000331569.jpg,The strawberries was sitting beside the tall g...,strawberries sitting beside tall glass milkshake
25012,coco_dataset\val2017\000000231237.jpg,A bunch of small red flowers in a barnacle enc...,bunch small red flowers barnacle encrusted cla...


Embedings imagenes

In [9]:
import os
import numpy as np
import pandas as pd
from PIL import Image
from tqdm import tqdm
import torch
from transformers import CLIPProcessor, CLIPModel

# 1. Cargar modelo CLIP localmente
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

# 2. Cargar CSV de imágenes válidas
df = pd.read_csv("captions_valid.csv")

# 3. Inicializar listas de salida
image_embeddings = []
valid_paths = []
captions = []

# 4. Procesar TODAS las imágenes
for i, row in tqdm(df.iterrows(), total=len(df)):
    original_path = row["image_path"]
    caption = row["caption"]
    filename = os.path.splitext(os.path.basename(original_path))[0]
    npy_path = os.path.join("processed_numpy", f"{filename}.npy")

    try:
        # Leer la imagen desde el .npy y reconstruir como imagen PIL
        img_array = np.load(npy_path)
        img_array = (img_array * 255).astype(np.uint8)
        image = Image.fromarray(img_array).convert("RGB")

        # Preprocesamiento para CLIP
        inputs = processor(images=image, return_tensors="pt").to(device)

        # Obtener embeddings
        with torch.no_grad():
            outputs = model.get_image_features(**inputs)
            embedding = outputs[0].cpu().numpy()  # Vector de 512 dimensiones

        image_embeddings.append(embedding)
        valid_paths.append(original_path)
        captions.append(caption)

    except Exception as e:
        print(f"Error con {npy_path}: {e}")

# 5. Guardar resultados
embedding_df = pd.DataFrame(image_embeddings)
meta_df = pd.DataFrame({
    "image_path": valid_paths,
    "caption": captions
})
final_df = pd.concat([meta_df, embedding_df], axis=1)

final_df.to_csv("image_embeddings_clip.csv", index=False)
print("¡Embeddings generados y guardados en image_embeddings_clip.csv!")



100%|██████████| 25014/25014 [30:09<00:00, 13.82it/s]


¡Embeddings generados y guardados en image_embeddings_clip.csv!


Indexar con FAISS

Texto e Imagen:

In [10]:
import pandas as pd
import numpy as np
import faiss

# Cargar embeddings de imágenes
df_img = pd.read_csv("image_embeddings_clip.csv")
df_img.iloc[:, 2:] = df_img.iloc[:, 2:].apply(pd.to_numeric, errors="coerce")
image_embeddings = df_img.iloc[:, 2:].values.astype("float32").copy()
faiss.normalize_L2(image_embeddings)

# Crear índice de imágenes
image_index = faiss.IndexFlatIP(image_embeddings.shape[1])
image_index.add(image_embeddings)
faiss.write_index(image_index, "faiss_image.index")

# Cargar embeddings de texto
df_txt = pd.read_csv("text_embeddings_clip.csv")
df_txt.iloc[:, 2:] = df_txt.iloc[:, 2:].apply(pd.to_numeric, errors="coerce")
text_embeddings = df_txt.iloc[:, 2:].values.astype("float32").copy()
faiss.normalize_L2(text_embeddings)

# Crear índice de textos
text_index = faiss.IndexFlatIP(text_embeddings.shape[1])
text_index.add(text_embeddings)
faiss.write_index(text_index, "faiss_text.index")

print("Índices FAISS creados y guardados correctamente.")



Índices FAISS creados y guardados correctamente.


Busqueda Texto

In [11]:
from transformers import CLIPProcessor, CLIPModel
import torch
import faiss

# Cargar modelo CLIP
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model.eval().to("cuda" if torch.cuda.is_available() else "cpu")

def search_text_to_image(query, top_k=5):
    inputs = processor(text=[query], return_tensors="pt").to(model.device)
    with torch.no_grad():
        query_embedding = model.get_text_features(**inputs).cpu().numpy().astype("float32")
    faiss.normalize_L2(query_embedding)
    scores, indices = image_index.search(query_embedding, k=top_k * 10)

    seen_images = set()
    results = []
    for score, idx in zip(scores[0], indices[0]):
        image_path = df_img.iloc[idx]['image_path']
        if image_path not in seen_images:
            seen_images.add(image_path)
            results.append({
                "image_path": image_path,
                "caption": df_img.iloc[idx]['caption'],
                "score": score
            })
        if len(results) == top_k:  # Devuelve solo top_k imágenes únicas
            break
    return results



Busqueda Imagen

In [None]:
from PIL import Image
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def get_image_embedding(image_path):
    image = Image.open(image_path).convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        embedding = model.get_image_features(**inputs)
        embedding = embedding / embedding.norm(p=2, dim=-1, keepdim=True)
    return embedding.cpu().numpy().astype("float32")

def search_image_to_image(query_image_path, top_k=5):
    query_vec = get_image_embedding(query_image_path)
    faiss.normalize_L2(query_vec)

    # Buscar más para tener margen de deduplicación
    D, I = image_index.search(query_vec, k=top_k * 10)

    seen_paths = set()
    results = []

    for score, idx in zip(D[0], I[0]):
        item = df_img.iloc[idx]
        img_path = item["image_path"]

        if img_path not in seen_paths and img_path != query_image_path:
            seen_paths.add(img_path)
            results.append({
                "image_path": img_path,
                "caption": item["caption"],
                "score": score
            })

        if len(results) >= top_kb:
            break

    return results
# =====================
# Obtener Contexto
# =====================
def build_context_from_image_query(query_image_path, top_k=5):
    resultados = search_image_to_image(query_image_path, top_k=top_k)
    context = ""
    for i, item in enumerate(resultados):
        context += f"[Imagen {i+1}]\nRuta: {item['image_path']}\nDescripción: {item['caption']}\n\n"
    return context.strip()

def build_context_from_text_query(query, top_k=5):
    resultados = search_text_to_image(query, top_k=top_k)
    context = ""
    for i, item in enumerate(resultados):
        context += f"[Imagen {i+1}]\nRuta: {item['image_path']}\nDescripción: {item['caption']}\n\n"
    return context.strip()



Ejemplos:

TEXTO

In [13]:
query = "a cat"

resultados = search_text_to_image(query, top_k=5)
context = build_context_from_text_query(query, top_k=5)


for i, item in enumerate(resultados):
    print(f"{i+1}. Imagen: {item['image_path']} - Caption: {item['caption']} - Score: {item['score']:.4f}")
print("\nContexto generado:")
print(context)

1. Imagen: coco_dataset\val2017\000000255965.jpg - Caption: a stripped cat sitting near a brick wall - Score: 0.2787
2. Imagen: coco_dataset\val2017\000000080666.jpg - Caption: A cat sitting on a bench in front of a building. - Score: 0.2776
3. Imagen: coco_dataset\val2017\000000304560.jpg - Caption: A black cat is staring directly into the camera.  - Score: 0.2755
4. Imagen: coco_dataset\val2017\000000014831.jpg - Caption: An up close photo of animals fur while laying on a blanket - Score: 0.2726
5. Imagen: coco_dataset\val2017\000000118515.jpg - Caption: A small kitten sitting on a pallet of wood looking back. - Score: 0.2722

Contexto generado:
[Imagen 1]
Ruta: coco_dataset\val2017\000000255965.jpg
Descripción: a stripped cat sitting near a brick wall

[Imagen 2]
Ruta: coco_dataset\val2017\000000080666.jpg
Descripción: A cat sitting on a bench in front of a building.

[Imagen 3]
Ruta: coco_dataset\val2017\000000304560.jpg
Descripción: A black cat is staring directly into the camera.

IMAGEN

In [14]:
query_img = "coco_dataset/val2017/000000580197.jpg"
resultados = search_image_to_image(query_img, top_k=5)
context = build_context_from_image_query(query_img, top_k=5)

for i, res in enumerate(resultados):
    print(f"{i+1}. Imagen: {res['image_path']} - Caption: {res['caption']} - Score: {res['score']:.4f}")
print("\nContexto generado:")
print(context)

1. Imagen: coco_dataset\val2017\000000580197.jpg - Caption: Older man in tuxedo sitting next to another younger man in tuxedo. - Score: 0.9320
2. Imagen: coco_dataset\val2017\000000032901.jpg - Caption: A group of four older men posing for a photo. - Score: 0.7042
3. Imagen: coco_dataset\val2017\000000171190.jpg - Caption: A ground of people sitting around a table holding wine glasses in their hands. - Score: 0.6626
4. Imagen: coco_dataset\val2017\000000391722.jpg - Caption: A man is being handed a birthday cake with lit candles. - Score: 0.6171
5. Imagen: coco_dataset\val2017\000000271116.jpg - Caption: The men sit at a table with their phones in their hands. - Score: 0.5971

Contexto generado:
[Imagen 1]
Ruta: coco_dataset\val2017\000000580197.jpg
Descripción: Older man in tuxedo sitting next to another younger man in tuxedo.

[Imagen 2]
Ruta: coco_dataset\val2017\000000032901.jpg
Descripción: A group of four older men posing for a photo.

[Imagen 3]
Ruta: coco_dataset\val2017\000000

RAG

In [15]:
import google.generativeai as genai

# Configurar tu clave
genai.configure(api_key="AIzaSyA2DlSe8KtZvWmehFxjvZBuj-eL9IbTC2Y")


In [16]:
def generate_gemini_response(context, prompt="Eres una aplicación de Retrieval Augmented Generation que siempre responde en español. Usa el siguiente contexto para responder la pregunta. Responde con una descripcion general usando todo tu contexto, evita dar exactamente tu contexto e ignora las rutas. Empieza siempre las imagenes que se muestran como resultado son...... Si la respuesta no está en el contexto, di que no sabes."):
    model = genai.GenerativeModel("gemini-2.5-flash")

    full_prompt = f"{prompt}\n\nContexto:\n{context}"

    response = model.generate_content(full_prompt)
    return response.text


In [17]:
respuesta = generate_gemini_response(context)
print(respuesta)


Las imagenes que se muestran como resultado son una colección de fotografías que representan a personas en diversas situaciones sociales y entornos. Se incluyen imágenes de hombres, algunos vistiendo esmoquin, tanto sentados en compañía de otros como posando en grupo. También hay escenas de personas reunidas alrededor de una mesa, en algunos casos con copas de vino o usando sus teléfonos, y una imagen de un hombre en el momento de recibir un pastel de cumpleaños.


In [None]:
# SERVIDOR FASTAPI PARA BÚSQUEDA MULTIMODAL
import threading
import time
import uvicorn
from fastapi import FastAPI, HTTPException, UploadFile, File, Form
from fastapi.middleware.cors import CORSMiddleware
from fastapi.staticfiles import StaticFiles
from PIL import Image
import io
import os
import requests

print("🚀 INICIANDO SERVIDOR FASTAPI MULTIMODAL")

# Instalar dependencias si no están disponibles
try:
    import google.generativeai as genai
except ImportError:
    import subprocess
    subprocess.check_call(["pip", "install", "google-generativeai"])
    import google.generativeai as genai

try:
    import faiss
except ImportError:
    import subprocess
    subprocess.check_call(["pip", "install", "faiss-cpu"])
    import faiss

# Configurar Gemini API
genai.configure(api_key="AIzaSyA2DlSe8KtZvWmehFxjvZBuj-eL9IbTC2Y")

# Crear aplicación FastAPI
app = FastAPI(title="Motor de Búsqueda Multimodal", version="1.0.0")

# Configurar CORS para el frontend
app.add_middleware(
    CORSMiddleware,
    allow_origins=["http://localhost:5173", "http://127.0.0.1:5173", "*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Montar archivos estáticos para servir imágenes
if os.path.exists("coco_dataset/val2017"):
    app.mount("/images", StaticFiles(directory="coco_dataset/val2017"), name="images")

@app.get("/")
async def root():
    return {
        "message": "🚀 Motor de Búsqueda Multimodal v1.0.0",
        "status": "active",
        "endpoints": ["/health", "/search", "/search-by-image"],
        "description": "Sistema de búsqueda multimodal con CLIP + FAISS + RAG"
    }

@app.get("/health")
async def health_check():
    # Verificar que las funciones y variables estén disponibles
    checks = {
        "status": "healthy",
        "search_text_function": 'search_text_to_image' in globals(),
        "search_image_function": 'search_image_to_image' in globals(),
        "image_index": 'image_index' in globals() and image_index is not None,
        "model_loaded": 'model' in globals() and model is not None,
        "processor_loaded": 'processor' in globals() and processor is not None,
        "df_img_loaded": 'df_img' in globals(),
        "context_functions": 'build_context_from_text_query' in globals(),
        "rag_function": 'generate_gemini_response' in globals()
    }
    
    if 'image_index' in globals() and image_index is not None:
        checks["index_size"] = image_index.ntotal
    
    return checks

@app.post("/search")
async def search_text(request: dict):
    """Búsqueda por texto usando las funciones implementadas en el notebook"""
    try:
        query = request.get("query", "").strip()
        if not query:
            raise HTTPException(status_code=400, detail="Query es requerido")
        
        print(f"🔍 Búsqueda por texto: '{query}'")
        
        # Verificar que las funciones estén disponibles
        if 'search_text_to_image' not in globals():
            raise HTTPException(status_code=500, detail="Función search_text_to_image no está disponible. Ejecuta las celdas del notebook primero.")
        
        # Realizar búsqueda usando la función del notebook
        search_results = search_text_to_image(query, top_k=10)
        
        # Convertir resultados al formato esperado por el frontend
        results = []
        for i, result in enumerate(search_results):
            filename = os.path.basename(result['image_path'])
            similarity_percentage = max(0, min(100, float(result['score']) * 100))
            
            results.append({
                "id": f"img_{i}",
                "type": "image",
                "image_path": result['image_path'],
                "image_url": f"http://localhost:8004/images/{filename}",
                "text": result['caption'],
                "score": round(similarity_percentage, 1),
                "caption": result['caption']
            })
        
        # Generar respuesta RAG si las funciones están disponibles
        rag_response = ""
        if 'build_context_from_text_query' in globals() and 'generate_gemini_response' in globals():
            try:
                context = build_context_from_text_query(query, top_k=5)
                rag_response = generate_gemini_response(context)
            except Exception as e:
                print(f"⚠️ Error generando RAG: {e}")
                rag_response = f"Se encontraron {len(results)} imágenes relacionadas con '{query}'."
        
        print(f"✅ Encontrados {len(results)} resultados para '{query}'")
        
        return {
            "query": query,
            "total_results": len(results),
            "rag_response": rag_response,
            "results": results,
            "success": True
        }
        
    except Exception as e:
        print(f"❌ Error en búsqueda por texto: {e}")
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Error en búsqueda: {str(e)}")

@app.post("/search-by-image")
async def search_by_image(image: UploadFile = File(...)):
    """Búsqueda por imagen usando las funciones implementadas en el notebook"""
    try:
        if not image.content_type.startswith('image/'):
            raise HTTPException(status_code=400, detail="El archivo debe ser una imagen")
        
        print(f"🖼️ Búsqueda por imagen: {image.filename}")
        
        # Verificar que las funciones estén disponibles
        if 'search_image_to_image' not in globals():
            raise HTTPException(status_code=500, detail="Función search_image_to_image no está disponible. Ejecuta las celdas del notebook primero.")
        
        # Leer y procesar la imagen subida
        image_bytes = await image.read()
        pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
        
        # Guardar temporalmente la imagen
        temp_path = f"temp_upload_{image.filename}"
        pil_image.save(temp_path)
        
        try:
            # Realizar búsqueda usando la función del notebook
            search_results = search_image_to_image(temp_path, top_k=10)
            
            # Convertir resultados al formato esperado
            results = []
            for i, result in enumerate(search_results):
                filename = os.path.basename(result['image_path'])
                similarity_percentage = max(0, min(100, float(result['score']) * 100))
                
                results.append({
                    "id": f"img_{i}",
                    "type": "image",
                    "image_path": result['image_path'],
                    "image_url": f"http://localhost:8004/images/{filename}",
                    "text": result['caption'],
                    "score": round(similarity_percentage, 1),
                    "caption": result['caption']
                })
            
            # Generar respuesta RAG
            rag_response = ""
            if 'build_context_from_image_query' in globals() and 'generate_gemini_response' in globals():
                try:
                    context = build_context_from_image_query(temp_path, top_k=5)
                    rag_response = generate_gemini_response(context)
                except Exception as e:
                    print(f"⚠️ Error generando RAG: {e}")
                    rag_response = f"Se encontraron {len(results)} imágenes similares a la imagen subida."
            
            print(f"✅ Encontrados {len(results)} resultados similares")
            
            return {
                "filename": image.filename,
                "total_results": len(results),
                "rag_response": rag_response,
                "results": results,
                "success": True
            }
            
        finally:
            # Limpiar archivo temporal
            if os.path.exists(temp_path):
                os.remove(temp_path)
        
    except Exception as e:
        print(f"❌ Error en búsqueda por imagen: {e}")
        import traceback
        traceback.print_exc()
        raise HTTPException(status_code=500, detail=f"Error en búsqueda por imagen: {str(e)}")

# Función para ejecutar el servidor
def run_server():
    uvicorn.run(app, host="127.0.0.1", port=8004, log_level="error")

def start_server():
    print("🚀 Iniciando servidor en puerto 8004...")
    server_thread = threading.Thread(target=run_server, daemon=True)
    server_thread.start()
    time.sleep(3)
    return server_thread

# Iniciar el servidor automáticamente
try:
    server_thread = start_server()
    print("✅ Servidor iniciado correctamente en http://localhost:8004")
    print("📖 Documentación API: http://localhost:8004/docs")
    print("🖼️ Imágenes estáticas: http://localhost:8004/images/")
    print("\n✅ TODAS LAS FUNCIONES ESTÁN DISPONIBLES")
    
    # Verificación básica después de un momento
    time.sleep(2)
    try:
        response = requests.get("http://localhost:8004/health", timeout=5)
        if response.status_code == 200:
            health = response.json()
            print("\n🔍 Estado del sistema:")
            for key, value in health.items():
                status = "✅" if value else "❌"
                print(f"   {status} {key}: {value}")
        else:
            print("⚠️ Error en verificación de salud del servidor")
    except Exception as e:
        print(f"⚠️ Error verificando servidor: {e}")
        
except Exception as e:
    print(f"❌ Error iniciando servidor: {e}")

print("\n" + "="*60)
print("🎯 SERVIDOR FASTAPI LISTO")
print("   🌐 URL: http://localhost:8004")
print("   📱 Frontend: http://localhost:5173")
print("   🔧 Estado: Todas las funciones están disponibles")
print("="*60)

🚀 INICIANDO SERVIDOR FASTAPI MULTIMODAL
🚀 Iniciando servidor en puerto 8004...
✅ Servidor iniciado correctamente en http://localhost:8004
📖 Documentación API: http://localhost:8004/docs
🖼️ Imágenes estáticas: http://localhost:8004/images/

✅ TODAS LAS FUNCIONES ESTÁN DISPONIBLES

🔍 Estado del sistema:
   ✅ status: healthy
   ✅ search_text_function: True
   ✅ search_image_function: True
   ✅ image_index: True
   ✅ model_loaded: True
   ✅ processor_loaded: True
   ✅ df_img_loaded: True
   ✅ context_functions: True
   ✅ rag_function: True
   ✅ index_size: 25014

🎯 SERVIDOR FASTAPI LISTO
   🌐 URL: http://localhost:8004
   📱 Frontend: http://localhost:5173
   🔧 Estado: Todas las funciones están disponibles


🔍 Búsqueda por texto: 'a cat sitting'
✅ Encontrados 10 resultados para 'a cat sitting'
🔍 Búsqueda por texto: 'perros jugando'
✅ Encontrados 10 resultados para 'perros jugando'
🔍 Búsqueda por texto: 'carros'
✅ Encontrados 10 resultados para 'carros'
🔍 Búsqueda por texto: 'paisaje montañoso con lago'
✅ Encontrados 10 resultados para 'paisaje montañoso con lago'
🖼️ Búsqueda por imagen: car.jpg
✅ Encontrados 10 resultados similares
🖼️ Búsqueda por imagen: gatos.webp
✅ Encontrados 10 resultados similares
🖼️ Búsqueda por imagen: paisaje.jpg
✅ Encontrados 10 resultados similares
🔍 Búsqueda por texto: 'pene'
✅ Encontrados 10 resultados para 'pene'
🔍 Búsqueda por texto: 'culos'
✅ Encontrados 10 resultados para 'culos'
🔍 Búsqueda por texto: 'culo'
✅ Encontrados 10 resultados para 'culo'
🔍 Búsqueda por texto: 'trasero'
✅ Encontrados 10 resultados para 'trasero'
🔍 Búsqueda por texto: 'ass'
⚠️ Error generando RAG: Invalid operation: The `response.text` quick accessor requires the response to contai