In [1]:
import os
import glob
from langchain.text_splitter import RecursiveCharacterTextSplitter
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings
from uuid import uuid4
import pandas as pd
import sys

### Funciones de afines a ambeddings y Chroma

In [2]:
# cargamos las novelas procesadas desde los archivos .txt
def cargar_corpus_procesado(directorio_corpus):
    """
    Carga todos los archivos .txt del corpus procesado.
    
    Args:
        directorio_corpus (str): Ruta al directorio con los archivos procesados
        
    Returns:
        dict: Diccionario con nombre_novela: contenido_texto
    """
    print(f"Cargando corpus desde: {directorio_corpus}")
    corpus = {}
    
    # Buscar todos los archivos .txt en el directorio
    archivos_txt = glob.glob(os.path.join(directorio_corpus, "*.txt"))
    
    if not archivos_txt:
        print(f"No se encontraron archivos .txt en {directorio_corpus}")
        return corpus
    
    for archivo in archivos_txt:
        nombre_archivo = os.path.basename(archivo)
        # Extraer el nombre de la novela (remover '_processed.txt')
        nombre_novela = nombre_archivo.replace('_processed.txt', '')
        
        try:
            with open(archivo, 'r', encoding='utf-8') as f:
                contenido = f.read()
                corpus[nombre_novela] = contenido
                print(f"  Cargada: {nombre_novela}")
        except Exception as e:
            print(f"x Error cargando {archivo}: {e}")
    
    print(f"Total de novelas cargadas: {len(corpus)}")
    return corpus

In [3]:
# Dividimos el corpus en chunks más pequeños para generar embeddings
def dividir_en_chunks(corpus, chunk_size=1000, chunk_overlap=200):
    """
    Divide el corpus en chunks más pequeños para generar embeddings.
    
    Args:
        corpus (dict): Diccionario con nombre_novela: contenido_texto
        chunk_size (int): Tamaño de cada chunk
        chunk_overlap (int): Superposición entre chunks
        
    Returns:
        list: Lista de diccionarios con información de cada chunk
    """
    print(f"\nDividiendo corpus en chunks...")
    print(f"Tamaño de chunk: {chunk_size}, Superposición: {chunk_overlap}")
    
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
    )
    
    chunks_info = []
    
    for novela, contenido in corpus.items():
        chunks_texto = text_splitter.split_text(contenido)
        
        for i, chunk in enumerate(chunks_texto):
            chunk_info = {
                'id': str(uuid4()),
                'texto': chunk,
                'novela': novela,
                'chunk_index': i,
                'chunk_size': len(chunk)
            }
            chunks_info.append(chunk_info)
        
        print(f"  - {novela}: {len(chunks_texto)} chunks")
    
    print(f"Total de chunks generados: {len(chunks_info)}")
    return chunks_info


In [4]:
# Inicializamos el modelo de embeddings
def inicializar_modelo_embeddings():
    """
    Inicializa el modelo de embeddings.
    
    Returns:
        SentenceTransformer: Modelo de embeddings
    """
    print("\nInicializando modelo de embeddings...")
    model = SentenceTransformer(
        "jinaai/jina-embeddings-v2-small-en",
        trust_remote_code=True
    )
    print("  Modelo inicializado correctamente")
    return model

In [5]:
# Generamos los embeddings para cada chunk
def generar_embeddings(model, chunks_info):
    """
    Genera embeddings para cada chunk.
    
    Args:
        model: Modelo de embeddings
        chunks_info (list): Lista de información de chunks
        
    Returns:
        list: Lista de embeddings
    """
    print(f"\nGenerando embeddings para {len(chunks_info)} chunks...")
    
    embeddings = []
    textos = [chunk['texto'] for chunk in chunks_info]
    
    # Generar embeddings en lotes para eficiencia
    batch_size = 32
    for i in range(0, len(textos), batch_size):
        batch_end = min(i + batch_size, len(textos))
        batch_textos = textos[i:batch_end]
        batch_embeddings = model.encode(batch_textos, show_progress_bar=True)
        embeddings.extend(batch_embeddings)
        
        if i % (batch_size * 10) == 0:
            print(f"  Procesados {min(i + batch_size, len(textos))}/{len(textos)} chunks")
    
    print(f" {len(embeddings)} embeddings generados")
    return embeddings

In [6]:
# Inicializamos la base de datos Chroma
def inicializar_chroma(nombre_coleccion, directorio_persistencia="./chroma_novelas_db"):
    """
    Inicializa la base de datos Chroma.
    
    Args:
        nombre_coleccion (str): Nombre de la colección
        directorio_persistencia (str): Directorio donde persistir la BD
        
    Returns:
        tuple: (cliente_chroma, coleccion)
    """
    print(f"\nInicializando Chroma...")
    print(f"Directorio de persistencia: {directorio_persistencia}")
    print(f"Nombre de colección: {nombre_coleccion}")
    
    # Crear directorio si no existe
    os.makedirs(directorio_persistencia, exist_ok=True)
    
    # Configurar cliente Chroma con persistencia
    client = chromadb.PersistentClient(
        path=directorio_persistencia,
        settings=Settings(
            anonymized_telemetry=False,
            is_persistent=True
        )
    )
    
    # Crear o obtener colección
    try:
        # Intentar eliminar colección existente para empezar limpio
        try:
            client.delete_collection(name=nombre_coleccion)
            print(f"  Colección existente '{nombre_coleccion}' eliminada")
        except:
            pass
        
        # Crear nueva colección
        collection = client.create_collection(
            name=nombre_coleccion,
            metadata={"hnsw:space": "cosine"}
        )
        print(f"  Colección '{nombre_coleccion}' creada")
        
    except Exception as e:
        print(f"Error creando colección: {e}")
        # Si falla la creación, intentar obtener la existente
        collection = client.get_collection(name=nombre_coleccion)
        print(f"  Usando colección existente '{nombre_coleccion}'")
    
    return client, collection

In [7]:
# Cargamos los embeddings y metadatos en Chroma
def cargar_a_chroma(collection, embeddings, chunks_info):
    """
    Carga los embeddings y metadatos en Chroma.
    
    Args:
        collection: Colección de Chroma
        embeddings (list): Lista de embeddings
        chunks_info (list): Lista de información de chunks
    """
    print(f"\nCargando {len(embeddings)} embeddings en Chroma...")
    
    # Preparar datos para Chroma
    ids = [chunk['id'] for chunk in chunks_info]
    documents = [chunk['texto'] for chunk in chunks_info]
    metadatas = [
        {
            'novela': chunk['novela'],
            'chunk_index': chunk['chunk_index'],
            'chunk_size': chunk['chunk_size']
        }
        for chunk in chunks_info
    ]
    
    # Cargar en lotes
    batch_size = 100
    total_lotes = (len(embeddings) + batch_size - 1) // batch_size
    
    for i in range(0, len(embeddings), batch_size):
        batch_end = min(i + batch_size, len(embeddings))
        lote_actual = (i // batch_size) + 1
        
        try:
            collection.add(
                ids=ids[i:batch_end],
                embeddings=embeddings[i:batch_end],
                documents=documents[i:batch_end],
                metadatas=metadatas[i:batch_end]
            )
            print(f"    Lote {lote_actual}/{total_lotes} cargado")
        except Exception as e:
            print(f"  x Error en lote {lote_actual}: {e}")
    
    # Verificar carga
    count = collection.count()
    print(f"  Total de documentos en Chroma: {count}")


### Funciones de consulta

In [8]:
# Para consultar a Chroma
def consultar_chroma(collection, model, query, n_results=5):
    """
    Realiza una consulta de similitud semántica en Chroma.
    
    Args:
        collection: Colección de Chroma
        model: Modelo de embeddings
        query (str): Consulta de texto
        n_results (int): Número de resultados a devolver
        
    Returns:
        dict: Resultados de la consulta
    """
    print(f"\nRealizando consulta: '{query}'")
    
    # Generar embedding de la consulta
    query_embedding = model.encode(query)
    
    # Realizar búsqueda
    results = collection.query(
        query_embeddings=[query_embedding.tolist()],
        n_results=n_results,
        include=['documents', 'metadatas', 'distances']
    )
    
    print(f"  Encontrados {len(results['documents'][0])} resultados")
    
    # Mostrar resultados
    for i, (doc, metadata, distance) in enumerate(zip(
        results['documents'][0], 
        results['metadatas'][0], 
        results['distances'][0]
    )):
        print(f"\nResultado {i+1} (distancia: {distance:.4f}):")
        print(f"  Novela: {metadata['novela']}")
        print(f"  Chunk: {metadata['chunk_index']}")
        print(f"  Texto: {doc[:200]}...")
    
    return results

In [9]:
# generamos un reporte de la colección
def generar_reporte_coleccion(collection, chunks_info, directorio_stats):
    """
    Genera un reporte estadístico de la colección.
    
    Args:
        collection: Colección de Chroma
        chunks_info (list): Información de chunks
        directorio_stats (str): Directorio para guardar estadísticas
    """
    print(f"\nGenerando reporte de la colección...")
    
    # Crear directorio si no existe
    os.makedirs(directorio_stats, exist_ok=True)
    
    # Estadísticas por novela
    stats_por_novela = {}
    for chunk in chunks_info:
        novela = chunk['novela']
        if novela not in stats_por_novela:
            stats_por_novela[novela] = {
                'total_chunks': 0,
                'total_chars': 0,
                'avg_chunk_size': 0
            }
        stats_por_novela[novela]['total_chunks'] += 1
        stats_por_novela[novela]['total_chars'] += chunk['chunk_size']
    
    # Calcular promedios
    for novela, stats in stats_por_novela.items():
        stats['avg_chunk_size'] = stats['total_chars'] / stats['total_chunks']
    
    # Crear DataFrame y guardar
    df_stats = pd.DataFrame.from_dict(stats_por_novela, orient='index')
    df_stats.index.name = 'novela'
    
    # Guardar estadísticas
    stats_file = os.path.join(directorio_stats, 'embedding_stats.csv')
    df_stats.to_csv(stats_file)
    
    print(f"  Reporte guardado en: {stats_file}")
    print("\nEstadísticas por novela:")
    print(df_stats.to_string())
    
    # Estadísticas generales
    total_chunks = len(chunks_info)
    total_chars = sum(chunk['chunk_size'] for chunk in chunks_info)
    avg_chunk_size = total_chars / total_chunks if total_chunks > 0 else 0
    
    print(f"\nEstadísticas generales:")
    print(f"  Total chunks: {total_chunks}")
    print(f"  Total caracteres: {total_chars:,}")
    print(f"  Tamaño promedio de chunk: {avg_chunk_size:.2f}")
    print(f"  Documentos en Chroma: {collection.count()}")


### Cuerpo principal

In [10]:
# Configuración
directorio_corpus = "corpus_procesado"  
directorio_stats = "estadisticas"
nombre_coleccion = "novelas_corpus"

In [11]:
# Cargamos el corpus procesado
corpus = cargar_corpus_procesado(directorio_corpus)
if not corpus:
    print("No se pudo cargar el corpus. Terminando...")
    sys.exit(1)

Cargando corpus desde: corpus_procesado
  Cargada: La_de_Bringas_314648
  Cargada: El_sombrero_de_tres_picos_pg29506
  Cargada: Tristana_pg66979
  Cargada: La_gaviota_pg23600
  Cargada: Pepita_Jimenez_pg17223
  Cargada: Su_unico_hijo_pg17341
  Cargada: La_desheredada_pg25956
  Cargada: Peñas_arriba_pg24127
  Cargada: Platero_y_yo_pg9980
  Cargada: Los_pazos_de_Ulloa_18005-8_UTF8
Total de novelas cargadas: 10


In [12]:
# Dividimos en chunks
chunks_info = dividir_en_chunks(corpus)


Dividiendo corpus en chunks...
Tamaño de chunk: 1000, Superposición: 200
  - La_de_Bringas_314648: 638 chunks
  - El_sombrero_de_tres_picos_pg29506: 584 chunks
  - Tristana_pg66979: 478 chunks
  - La_gaviota_pg23600: 743 chunks
  - Pepita_Jimenez_pg17223: 472 chunks
  - Su_unico_hijo_pg17341: 757 chunks
  - La_desheredada_pg25956: 1217 chunks
  - Peñas_arriba_pg24127: 1117 chunks
  - Platero_y_yo_pg9980: 295 chunks
  - Los_pazos_de_Ulloa_18005-8_UTF8: 741 chunks
Total de chunks generados: 7042


In [13]:
# Inicializamos el modelo
model = inicializar_modelo_embeddings()


Inicializando modelo de embeddings...
  Modelo inicializado correctamente


In [14]:
# Generamos embeddings
embeddings = generar_embeddings(model, chunks_info)


Generando embeddings para 7042 chunks...


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 32/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 352/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 672/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 992/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 1312/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 1632/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 1952/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 2272/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 2592/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 2912/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 3232/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 3552/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 3872/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 4192/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 4512/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 4832/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 5152/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 5472/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 5792/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 6112/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 6432/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 6752/7042 chunks


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

  Procesados 7042/7042 chunks
 7042 embeddings generados


In [15]:
# Inicializamos Chroma
client, collection = inicializar_chroma(nombre_coleccion)


Inicializando Chroma...
Directorio de persistencia: ./chroma_novelas_db
Nombre de colección: novelas_corpus
  Colección 'novelas_corpus' creada


In [16]:
# Cargams embeddings en Chroma
cargar_a_chroma(collection, embeddings, chunks_info)


Cargando 7042 embeddings en Chroma...
    Lote 1/71 cargado
    Lote 2/71 cargado
    Lote 3/71 cargado
    Lote 4/71 cargado
    Lote 5/71 cargado
    Lote 6/71 cargado
    Lote 7/71 cargado
    Lote 8/71 cargado
    Lote 9/71 cargado
    Lote 10/71 cargado
    Lote 11/71 cargado
    Lote 12/71 cargado
    Lote 13/71 cargado
    Lote 14/71 cargado
    Lote 15/71 cargado
    Lote 16/71 cargado
    Lote 17/71 cargado
    Lote 18/71 cargado
    Lote 19/71 cargado
    Lote 20/71 cargado
    Lote 21/71 cargado
    Lote 22/71 cargado
    Lote 23/71 cargado
    Lote 24/71 cargado
    Lote 25/71 cargado
    Lote 26/71 cargado
    Lote 27/71 cargado
    Lote 28/71 cargado
    Lote 29/71 cargado
    Lote 30/71 cargado
    Lote 31/71 cargado
    Lote 32/71 cargado
    Lote 33/71 cargado
    Lote 34/71 cargado
    Lote 35/71 cargado
    Lote 36/71 cargado
    Lote 37/71 cargado
    Lote 38/71 cargado
    Lote 39/71 cargado
    Lote 40/71 cargado
    Lote 41/71 cargado
    Lote 42/71 cargado
    

In [17]:
# Generamos reporte
generar_reporte_coleccion(collection, chunks_info, directorio_stats)


Generando reporte de la colección...
  Reporte guardado en: estadisticas/embedding_stats.csv

Estadísticas por novela:
                                   total_chunks  total_chars  avg_chunk_size
novela                                                                      
La_de_Bringas_314648                        638       469846      736.435737
El_sombrero_de_tres_picos_pg29506           584       502891      861.114726
Tristana_pg66979                            478       346367      724.617155
La_gaviota_pg23600                          743       598600      805.652759
Pepita_Jimenez_pg17223                      472       357432      757.271186
Su_unico_hijo_pg17341                       757       583038      770.195509
La_desheredada_pg25956                     1217       907676      745.830731
Peñas_arriba_pg24127                       1117       832407      745.216652
Platero_y_yo_pg9980                         295       242984      823.674576
Los_pazos_de_Ulloa_18005-8_UTF8  

### Ejemplo de consulta

In [19]:
consultas_ejemplo = [
    "amor y romance",
    "guerra y conflicto",
    "personajes principales",
    "descripción de paisajes"
]

for consulta in consultas_ejemplo:
    consultar_chroma(collection, model, consulta, n_results=3)
    print("-" * 40)

print(f"\n¡Proceso completado exitosamente!")
print(f"  Embeddings guardados en Chroma")
print(f" Base de datos persistida en: ./chroma_novelas_db")
print(f" Estadísticas guardadas en: {directorio_stats}")


Realizando consulta: 'amor y romance'
  Encontrados 3 resultados

Resultado 1 (distancia: 0.1775):
  Novela: Su_unico_hijo_pg17341
  Chunk: 504
  Texto: No esperaba milagros. No le gustaban siquiera. El milagro era un
absurdo, algo contra la fría razón, y él quería método, orden, una ley
en todo, ley constante, sin excepción. El milagro era romántico,...

Resultado 2 (distancia: 0.1791):
  Novela: La_desheredada_pg25956
  Chunk: 700
  Texto: casa que trabajaba en romances de ciegos y aleluyas. El material de
planchas y grabados era inmenso, y se lo dieron por un pedazo de pan.
Montó también esta especulación en gran escala, y los ciegos p...

Resultado 3 (distancia: 0.1880):
  Novela: Su_unico_hijo_pg17341
  Chunk: 12
  Texto: ilustre Colegio, a sus solas, era romántico también, aunque algo viejo,
y tocaba la flauta con mucho sentimiento, pero jamás en público. Emma,
después de pensarlo, no tuvo inconveniente en que la flau...
----------------------------------------

Realizando consu