In [20]:
# %% [markdown]
# # Integración de Pinecone con modelo Logistic Regression para etiquetado multilabel

# %%
# Instalar librerías necesarias (una vez)
# %pip install pinecone
# %pip install sentence-transformers
# %pip install joblib

# %%
import os
import pandas as pd
import pinecone
import joblib
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from pinecone import Pinecone, ServerlessSpec
import configparser

In [21]:
# Get the pinecone API key from config.cfg
config = configparser.ConfigParser()
config.read('../config.cfg')
PINECONE_API_KEY = config['pinecone']['api_key']
PINECONE_ENV = config['pinecone']['environment']

# %%
# 1. Inicializa Pinecone (usa tu API Key personal desde https://app.pinecone.io/)
try:
    pc = Pinecone(api_key=PINECONE_API_KEY)
    print(f"✅ Connected to Pinecone!")
except Exception as e:
    print("❌ Failed to connect:", e)

index_name = "book-embeddings"
#index_name = "quickstart"

index = pc.Index(index_name)

✅ Connected to Pinecone!


In [22]:
# %%
# 3. Carga tus modelos ya entrenados
model = joblib.load("../model/book_tagging_pipeline_sentence_bert.joblib")
clf = joblib.load("../model/book_tagging_pipeline.joblib")
mlb = joblib.load("../model/book_tagging_pipeline_mlb.joblib")

# %%
# 4. Carga tus libros originales y sube a Pinecone
df = pd.read_csv("../data/processed/books.csv")
df['text'] = df['book_title'].fillna('') + '. ' + df['blurb'].fillna('')

# Ensure 'tags' column does not contain NaN values
df['tags'] = df['tags'].fillna('')

# Generate embeddings (if not already saved)
X_embeddings = model.encode(df['text'].tolist(), show_progress_bar=True)

# Prepare data for Pinecone
pinecone_data = []
for idx, (vec, tags) in enumerate(zip(X_embeddings, df['tags'])):
    # Ensure tags are strings and handle NaN or invalid values
    if isinstance(tags, list):
        tags = ','.join(tags)  # Convert list to comma-separated string
    elif pd.isna(tags) or not isinstance(tags, str):
        tags = ''  # Replace NaN or invalid values with an empty string

    pinecone_data.append((
        str(idx),
        vec.tolist(),
        {"tags": tags}
    ))

# Delete existing data in Pinecone (if any)
try:
    index.delete(delete_all=True)
except pinecone.exceptions.NotFoundException:
    print("⚠️ Namespace not found. Skipping deletion.")

# Upload embeddings to Pinecone in batches of 1000
batch_size = 1000
for i in range(0, len(pinecone_data), batch_size):
    batch = pinecone_data[i:i + batch_size]
    index.upsert(vectors=batch)

print("✅ Embeddings uploaded to Pinecone.")


# %% 
# 5. Función de predicción combinada (ensemble)
def predict_with_ensemble(title, blurb, top_k=5, threshold=0.3):
    text = title + ". " + blurb
    embedding = model.encode([text])[0]
    
    # A. Predict con Logistic Regression
    probs = np.array([estimator.predict_proba(embedding.reshape(1, -1))[0][1]
                      for estimator in clf.estimators_])
    pred_lr = (probs >= threshold).astype(int)  # Ensure this remains a NumPy array
    pred_lr = np.array([pred_lr])  # Ensure pred_lr is 2D for mlb.inverse_transform
    tags_lr = mlb.inverse_transform(pred_lr)[0]
    
    # B. Predict con Pinecone (top-K más cercanos)
    pinecone_result = index.query(vector=embedding.tolist(), top_k=top_k, include_metadata=True)
    tag_counter = {}
    for match in pinecone_result.matches:
        if 'tags' in match.metadata and match.metadata['tags']:
            for tag in match.metadata['tags'].split(','):
                tag_counter[tag.strip()] = tag_counter.get(tag.strip(), 0) + 1
    
    tags_pinecone = [tag for tag, count in tag_counter.items() if count >= 1]

    # C. Fusionar (por unión o intersección, aquí hacemos unión)
    final_tags = sorted(set(tags_lr) | set(tags_pinecone))

    return {
        "tags_logistic": sorted(tags_lr),
        "tags_pinecone": sorted(tags_pinecone),
        "tags_fusion": final_tags
    }

# %%
# 6. Prueba con libro desconocido para ti
title = "Messi: The Inside Story"
blurb = "A biography detailing the life, career, and legacy of football legend Lionel Messi."
result = predict_with_ensemble(title, blurb)

print("Tags por Logistic Regression:", result["tags_logistic"])
print("Tags por Pinecone:", result["tags_pinecone"])
print("Tags combinados (fusión):", result["tags_fusion"])

Batches:   0%|          | 0/37 [00:00<?, ?it/s]

✅ Embeddings uploaded to Pinecone.
Tags por Logistic Regression: ['biography', 'history', 'memoir']
Tags por Pinecone: []
Tags combinados (fusión): ['biography', 'history', 'memoir']
