In [1]:
!pip install jupyterlab langchain-openai langchain langchain-community chromadb pypdf unstructured python-dotenv

Collecting langchain-community
  Downloading langchain_community-0.3.27-py3-none-any.whl.metadata (2.9 kB)
Collecting chromadb
  Downloading chromadb-1.0.15-cp39-abi3-win_amd64.whl.metadata (7.1 kB)
Collecting pypdf
  Downloading pypdf-5.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting unstructured
  Downloading unstructured-0.18.5-py3-none-any.whl.metadata (24 kB)
Collecting langchain-core<0.4.0,>=0.3.29 (from langchain-openai)
  Downloading langchain_core-0.3.68-py3-none-any.whl.metadata (5.8 kB)
Collecting langchain
  Downloading langchain-0.3.26-py3-none-any.whl.metadata (7.8 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting langchain-text-splitters<1.0.0,>=0.3.8 (from langchain)
  Downloading langchain_text_splitters-0.3.8-py3-none-any.whl.metadata (1.9 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting pybas

  You can safely remove it manually.
ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
streamlit 1.37.1 requires pillow<11,>=7.1.0, but you have pillow 11.0.0 which is incompatible.
tensorflow-intel 2.16.1 requires protobuf!=4.21.0,!=4.21.1,!=4.21.2,!=4.21.3,!=4.21.4,!=4.21.5,<5.0.0dev,>=3.20.3, but you have protobuf 5.29.5 which is incompatible.


In [None]:
import os
import csv
import json

# --- Configuration ---
# This script will create a set of rich, interconnected dummy data
# to populate the data/raw folder of the project structure.

# Root directory for all raw data
RAW_DATA_PATH = 'data/raw'

# --- Data Definitions ---

# 1. Product Catalog
product_catalog_data = [
    {'sku': 'C9300-24P', 'name': 'Catalyst 9300 24-port PoE+', 'category': 'Switch', 'description': 'Enterprise-grade stackable access switch, foundational for SD-Access.'},
    {'sku': 'C9300-DNA-A-3Y', 'name': 'Cisco DNA Advantage, 3Y', 'category': 'Software License', 'description': '3-year DNA Advantage subscription for Catalyst 9300 series.'},
    {'sku': 'FPR1120-ASA-K9', 'name': 'Firepower 1120 ASA', 'category': 'Firewall', 'description': 'NGFW for small to medium-sized businesses and branch offices.'},
    {'sku': 'MER-MR46-HW', 'name': 'Meraki MR46', 'category': 'Access Point', 'description': 'Cloud-managed Wi-Fi 6 access point with 4x4:4 MIMO.'},
    {'sku': 'LIC-MR-ENT-1Y', 'name': 'Meraki MR Enterprise License, 1Y', 'category': 'Cloud License', 'description': '1-year enterprise cloud management license for MR access points.'},
    {'sku': 'CP-8841-K9', 'name': 'IP Phone 8841', 'category': 'Collaboration', 'description': '5-inch widescreen VGA display IP phone with 5 programmable line keys.'},
    {'sku': 'GLC-TE', 'name': '1000BASE-T SFP Transceiver', 'category': 'Transceiver', 'description': 'SFP transceiver module for Category 5 copper wire.'}
]

# 2. Price List
price_list_data = [
    {'sku': 'C9300-24P', 'list_price_usd': 4500, 'partner_price_usd': 2700},
    {'sku': 'C9300-DNA-A-3Y', 'list_price_usd': 1200, 'partner_price_usd': 720},
    {'sku': 'FPR1120-ASA-K9', 'list_price_usd': 2800, 'partner_price_usd': 1680},
    {'sku': 'MER-MR46-HW', 'list_price_usd': 950, 'partner_price_usd': 665},
    {'sku': 'LIC-MR-ENT-1Y', 'list_price_usd': 150, 'partner_price_usd': 120},
    {'sku': 'CP-8841-K9', 'list_price_usd': 320, 'partner_price_usd': 240},
    {'sku': 'GLC-TE', 'list_price_usd': 100, 'partner_price_usd': 55}
]

# 3. Compatibility Rules
compatibility_rules_data = {
    "C9300-24P": {
        "requires": ["C9300-DNA-A-3Y"],
        "supports": ["GLC-TE"],
        "incompatible_with": []
    },
    "MER-MR46-HW": {
        "requires": ["LIC-MR-ENT-1Y"],
        "supports": [],
        "incompatible_with": ["C9300-DNA-A-3Y"]
    },
    "FPR1120-ASA-K9": {
        "requires": [],
        "supports": ["GLC-TE"],
        "incompatible_with": []
    }
}

# 4. Solution Guide for Healthcare
healthcare_guide_content = """
# Solution Guide: Healthcare Clinic Network Refresh

## Overview
A healthcare clinic requires a highly reliable, secure, and HIPAA-compliant network infrastructure. Key requirements include secure Wi-Fi for staff and guests, robust firewalling to protect patient data (EHR), and reliable voice communication.

## Recommended Components
- **Switching:** The Catalyst 9300 series (e.g., C9300-24P) is recommended for the core network due to its advanced security features and stacking capabilities. A Cisco DNA Advantage license is mandatory for full functionality.
- **Wireless:** For clinical areas, Meraki cloud-managed Wi-Fi 6 access points like the MR46 provide secure and easy-to-manage wireless connectivity. A separate guest SSID can be configured with traffic shaping rules.
- **Security:** A next-generation firewall such as the Firepower 1000 series (e.g., FPR1120-ASA-K9) is essential for threat defense and intrusion prevention.
- **Collaboration:** Cisco IP Phones from the 8800 series (e.g., CP-8841-K9) offer reliable voice and video communication suitable for reception and clinical staff.
"""

# 5. Enterprise Agreement for Healthcare
healthcare_ea_data = {
    "agreement_name": "Healthcare Kickstart EA",
    "target_segment": "Healthcare",
    "minimum_user_count": 50,
    "included_product_families": ["Catalyst 9300 Series", "Meraki MR Series", "Firepower 1000 Series"],
    "default_discount_percentage": {
        "hardware": 45,
        "software": 30
    },
    "included_support_tier": "Solution Support"
}

# 6. Historical Quotes for ML Training
historical_quotes_data = [
    {'quote_id': 'Q1-2023-001', 'customer_segment': 'Healthcare', 'total_list_price': 8500, 'final_discount_pct': 42, 'products_sku': 'C9300-24P;FPR1120-ASA-K9', 'won': 'Yes'},
    {'quote_id': 'Q1-2023-002', 'customer_segment': 'Retail', 'total_list_price': 3500, 'final_discount_pct': 35, 'products_sku': 'MER-MR46-HW;LIC-MR-ENT-1Y', 'won': 'Yes'},
    {'quote_id': 'Q1-2023-003', 'customer_segment': 'Healthcare', 'total_list_price': 9500, 'final_discount_pct': 50, 'products_sku': 'C9300-24P;FPR1120-ASA-K9', 'won': 'No'},
    {'quote_id': 'Q2-2023-004', 'customer_segment': 'Finance', 'total_list_price': 15000, 'final_discount_pct': 40, 'products_sku': 'C9300-24P;C9300-DNA-A-3Y;CP-8841-K9', 'won': 'Yes'}
]


# --- File Writing Logic ---

def create_files():
    """Creates the directories and files with the defined data."""
    print(f"Creating dummy data in '{RAW_DATA_PATH}'...")

    # Define paths for all subdirectories
    paths = {
        "guides": os.path.join(RAW_DATA_PATH, 'solution_guides'),
        "eas": os.path.join(RAW_DATA_PATH, 'enterprise_agreements'),
        "quotes": os.path.join(RAW_DATA_PATH, 'historical_quotes')
    }

    # Create directories if they don't exist
    for path in paths.values():
        os.makedirs(path, exist_ok=True)

    # 1. Write product_catalog.csv
    with open(os.path.join(RAW_DATA_PATH, 'product_catalog.csv'), 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=product_catalog_data[0].keys())
        writer.writeheader()
        writer.writerows(product_catalog_data)
    print(" -> 'product_catalog.csv' created.")
    
    # 2. Write price_list.csv
    with open(os.path.join(RAW_DATA_PATH, 'price_list.csv'), 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=price_list_data[0].keys())
        writer.writeheader()
        writer.writerows(price_list_data)
    print(" -> 'price_list.csv' created.")

    # 3. Write compatibility_rules.json
    with open(os.path.join(RAW_DATA_PATH, 'compatibility_rules.json'), 'w', encoding='utf-8') as f:
        json.dump(compatibility_rules_data, f, indent=4)
    print(" -> 'compatibility_rules.json' created.")

    # 4. Write healthcare_solution.txt
    with open(os.path.join(paths['guides'], 'healthcare_solution.txt'), 'w', encoding='utf-8') as f:
        f.write(healthcare_guide_content)
    print(" -> 'healthcare_solution.txt' created.")

    # 5. Write healthcare_ea.json
    with open(os.path.join(paths['eas'], 'healthcare_ea.json'), 'w', encoding='utf-8') as f:
        json.dump(healthcare_ea_data, f, indent=4)
    print(" -> 'healthcare_ea.json' created.")

    # 6. Write historical_quotes.csv
    with open(os.path.join(paths['quotes'], 'quotes_2023.csv'), 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=historical_quotes_data[0].keys())
        writer.writeheader()
        writer.writerows(historical_quotes_data)
    print(" -> 'quotes_2023.csv' created.")
    
    print("\nDummy data creation complete!")


if __name__ == "__main__":
    create_files()

In [9]:
# Define os caminhos relativos √† raiz do projeto
raw_data_path = 'data/raw'
vector_store_path = 'data/processed/vector_store'

print("--- Carregando, dividindo e vetorizando os dados... ---")

# 1. Carrega os documentos
csv_loader = CSVLoader(file_path=f'{raw_data_path}/product_catalog.csv')
text_loader = DirectoryLoader(path=f'{raw_data_path}/solution_guides/', glob="**/*.txt", show_progress=True)
all_docs = csv_loader.load() + text_loader.load()
print(f"Documentos carregados: {len(all_docs)}")

# 2. Divide os documentos
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
splits = text_splitter.split_documents(all_docs)
print(f"Documentos divididos em {len(splits)} chunks.")

# 3. Cria e persiste o Vector Store
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(model="text-embedding-3-small"),
    persist_directory=vector_store_path  # Salva o DB no disco para uso futuro
)
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})

print(f"Base de conhecimento criada e salva em '{vector_store_path}'.")

--- Carregando, dividindo e vetorizando os dados... ---


  0%|          | 0/1 [00:00<?, ?it/s]libmagic is unavailable but assists in filetype detection. Please consider installing libmagic for better results.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:09<00:00,  9.12s/it]


Documentos carregados: 8
Documentos divididos em 10 chunks.
Base de conhecimento criada e salva em 'data/processed/vector_store'.


In [11]:
print("--- Construindo a cadeia RAG e testando... ---")

# Define o LLM
llm = ChatOpenAI(model="gpt-4o", temperature=0.1)

# Define o Prompt
prompt_template = """
You are an expert Cisco product assistant. Your role is to help a salesperson create a quote.
Use ONLY the context provided below to answer the question. Do not make up products or information.

Context:
{context}

Salesperson's Question:
{question}

Expert Answer:
"""
prompt = PromptTemplate.from_template(prompt_template)

# Constr√≥i a cadeia RAG
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

# ---- TESTE ----
query = "What do you recommend for a small office that needs security and easy management?"
print(f"\n--- Question ---\n{query}\n")
print("--- Generating AI Response ---")

# Invoca a cadeia
response = rag_chain.invoke(query)

# Imprime a resposta
print(response)

--- Construindo a cadeia RAG e testando... ---

--- Question ---
What do you recommend for a small office that needs security and easy management?

--- Generating AI Response ---
For a small office that needs security and easy management, I recommend the following Cisco products:

1. **Security:** The Firepower 1120 ASA (SKU: FPR1120-ASA-K9) is a next-generation firewall suitable for small to medium-sized businesses. It provides essential threat defense and intrusion prevention capabilities.

2. **Wireless:** Consider using Meraki cloud-managed Wi-Fi 6 access points like the MR46. These access points offer secure and easy-to-manage wireless connectivity, which is ideal for small office environments.


In [5]:
import os
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.document_loaders import CSVLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# Carrega as vari√°veis do arquivo .env
load_dotenv()

# Verifica se a chave foi carregada com sucesso
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    print("Chave da OpenAI (OPENAI_API_KEY) n√£o encontrada. Verifique seu arquivo .env na raiz do projeto.")
else:
    print("Chave da OpenAI carregada com sucesso.")

Chave da OpenAI carregada com sucesso.


In [7]:
# Importa a classe usando o novo nome da pasta com underscore
from services.ai_engine.app.core.rag_service import CiscoRAGService

print("--- Initializing the full AI Service ---")
# Cria uma inst√¢ncia do nosso servi√ßo.
rag_service = CiscoRAGService()
print("--- Service Initialized ---")


# Agora, vamos testar o servi√ßo
query = "What do you recommend for a small office that needs security and easy management?"
print(f"\n--- Sending Query to the Service ---\n{query}\n")

# Usa o m√©todo da nossa classe para gerar a resposta
response = rag_service.generate_response(query)

print("--- AI Response ---")
print(response)

2025-07-13 12:19:27,497 - INFO - Initializing CiscoRAGService...
2025-07-13 12:19:27,498 - INFO - Loading documents from source...


--- Initializing the full AI Service ---


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 78.52it/s]
2025-07-13 12:19:27,518 - INFO - Loaded 8 documents.
2025-07-13 12:19:27,518 - INFO - Splitting documents into chunks...
2025-07-13 12:19:27,521 - INFO - Documents split into 10 chunks.
2025-07-13 12:19:27,522 - INFO - Creating and persisting Vector Store...
2025-07-13 12:19:29,244 - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-07-13 12:19:30,899 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-07-13 12:19:31,378 - INFO - Vector Store created at: data/processed/vector_store
2025-07-13 12:19:32,458 - INFO - RAG Service initialized successfully.
2025-07-13 12:19:32,458 - INFO - Received new query: What do you recommend for a small office that needs security and easy management?


--- Service Initialized ---

--- Sending Query to the Service ---
What do you recommend for a small office that needs security and easy management?



2025-07-13 12:19:32,987 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-07-13 12:19:36,783 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"
2025-07-13 12:19:36,802 - INFO - Response generated successfully.


--- AI Response ---
For a small office that requires security and easy management, I recommend the following components:

1. **Switching:** Use the **Catalyst 9300 series** (e.g., **C9300-24P**) for the core network. This switch offers advanced security features and stacking capabilities, which are beneficial for managing network traffic efficiently.

2. **Wireless:** Implement **Meraki cloud-managed Wi-Fi 6 access points** like the **MR46**. These access points provide secure wireless connectivity and are easy to manage through the Meraki dashboard. Additionally, you can configure a separate guest SSID with traffic shaping rules to enhance network performance.

3. **Security:** Deploy a **next-generation firewall** such as the **Firepower 1000 series** (e.g., **FPR1120-ASA-K9**). This firewall is essential for threat defense and intrusion prevention, ensuring that your office network remains secure.

These recommendations focus on providing robust security while ensuring ease of manag

In [11]:
# Passo 1: Instalar as bibliotecas necess√°rias
# beautifulsoup4 √© usado pelo WebBaseLoader para processar o HTML
!pip install -q -U langchain-openai langchain chromadb beautifulsoup4

In [13]:


import os
from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

# --- CONFIGURA√á√ÉO ---
# ‚ö†Ô∏è Cole sua chave de API da OpenAI aqui
OPENAI_API_KEY = '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'
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

# URL da p√°gina da Cisco que usaremos como base de conhecimento
# Exemplo: p√°gina da fam√≠lia de firewalls Meraki MX
url = "https://meraki.cisco.com/products/security-sd-wan/"
print(f"--- Usando como base de conhecimento a URL: {url} ---")


# --- IN√çCIO DO PROCESSO RAG ---

# 1. Carregar o conte√∫do da p√°gina web
print("1. Carregando conte√∫do da web...")
loader = WebBaseLoader(url)
docs = loader.load()

# 2. Dividir o conte√∫do em peda√ßos e criar a base vetorial em mem√≥ria
print("2. Criando base de conhecimento em mem√≥ria...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

# 3. Construir a cadeia de resposta
print("3. Construindo a cadeia de IA...")
llm = ChatOpenAI(model="gpt-4o", temperature=0)
prompt_template = """You are a helpful assistant. Answer the user's question based ONLY on the following context.
If the information is not in the context, say that you don't know.

Context:
{context}

Question: {question}

Answer:"""
prompt = PromptTemplate.from_template(prompt_template)

rag_chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm | StrOutputParser()

# 4. Fazer a pergunta e obter a resposta
print("4. Executando a consulta...")
query = "What are the main features of the Meraki MX security appliances according to this page?"

print(f"\n--- Pergunta ---\n{query}")
print("\n--- Gerando Resposta da IA ---")
response = rag_chain.invoke(query)

print(response)

--- Usando como base de conhecimento a URL: https://meraki.cisco.com/products/security-sd-wan/ ---
1. Carregando conte√∫do da web...
2. Criando base de conhecimento em mem√≥ria...


2025-07-13 13:39:39,384 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


3. Construindo a cadeia de IA...
4. Executando a consulta...

--- Pergunta ---
What are the main features of the Meraki MX security appliances according to this page?

--- Gerando Resposta da IA ---


2025-07-13 13:39:41,559 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
2025-07-13 13:39:43,317 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


I don't know.


In [15]:
import os
from langchain_community.document_loaders import WebBaseLoader
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser



# --- IN√çCIO DO PROCESSO DE DEPURA√á√ÉO ---

# 1. Carregar o conte√∫do da p√°gina web
print("1. Carregando conte√∫do da web...")
loader = WebBaseLoader(url)
docs = loader.load()

# ==============================================================================
# PASSO DE DEPURA√á√ÉO 1: IMPRIMIR O CONTE√öDO BRUTO CARREGADO
# Vamos ver o que o WebBaseLoader realmente "viu" na p√°gina.
# ==============================================================================
print("\n--- IN√çCIO DO CONTE√öDO BRUTO EXTRA√çDO (primeiros 2000 caracteres) ---")
if docs:
    print(docs[0].page_content[:2000])
else:
    print("Nenhum conte√∫do foi extra√≠do da p√°gina.")
print("--- FIM DO CONTE√öDO BRUTO EXTRA√çDO ---\n")


# 2. Dividir e criar a base vetorial
print("2. Criando base de conhecimento em mem√≥ria...")
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

# 3. Fazer a pergunta e inspecionar o contexto recuperado
query = "What are the main features of the Meraki MX security appliances according to this page?"

# ==============================================================================
# PASSO DE DEPURA√á√ÉO 2: VERIFICAR O CONTEXTO RECUPERADO ANTES DE ENVIAR AO LLM
# Vamos ver exatamente quais peda√ßos de texto foram selecionados para responder √† pergunta.
# ==============================================================================
print(f"--- CONTEXTO RECUPERADO PARA A PERGUNTA: '{query}' ---")
retrieved_docs = retriever.invoke(query)

for i, doc in enumerate(retrieved_docs):
    print(f"\n--- CHUNK RELEVANTE {i+1} ---\n")
    print(doc.page_content)
print("--- FIM DO CONTEXTO RECUPERADO ---")

# 4. (Opcional) Voc√™ pode comentar o resto do c√≥digo se quiser apenas depurar,
# ou deix√°-lo para ver se com algum ajuste o LLM agora responde.

# print("\n--- Gerando Resposta Final da IA ---")
# llm = ChatOpenAI(model="gpt-4o", temperature=0)
# prompt_template = "Answer the question based ONLY on the following context:\n{context}\nQuestion: {question}\nAnswer:"
# prompt = PromptTemplate.from_template(prompt_template)
# rag_chain = {"context": retriever, "question": RunnablePassthrough()} | prompt | llm | StrOutputParser()
# response = rag_chain.invoke(query)
# print(response)

1. Carregando conte√∫do da web...

--- IN√çCIO DO CONTE√öDO BRUTO EXTRA√çDO (primeiros 2000 caracteres) ---




















Enterprise Network Security and SD-WAN | Cloud-Managed Solutions | Cisco Meraki














































































 Skip to primary navigation Skip to main content






Skip to content
Skip to footer







United States (English)


Australia (English)Brazil (Portugu√™s)Canada (Fran√ßais)China (ÁÆÄ‰ΩìÂ≠ó)France (Fran√ßais)Germany (Deutsch)Japan (Êó•Êú¨Ë™û)Korea (ÌïúÍµ≠Ïù∏)Latin America (Espa√±ol)United Kingdom (English)United States (English)Contact usLog In







Experiences





Technologies





Touchpoints





Resources











                        Get a Demo
                    








Search














Quick Links


All
Product
Case
Collateral
Webinars











Experiences
From hybrid workforces to smarter workspaces, bring together technology and touchpoints to deliver exceptional experiences.
 L

2025-07-13 13:42:08,744 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


--- CONTEXTO RECUPERADO PARA A PERGUNTA: 'What are the main features of the Meraki MX security appliances according to this page?' ---


2025-07-13 13:42:10,251 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"



--- CHUNK RELEVANTE 1 ---

Instant, always-on visibility for critical SaaS apps at scale.										















												Proactive monitoring 											








											Identify problems before users are impacted, whether apps are in use or not.										















												Smart root-cause analysis 											








											ML-powered corrective recommendations, including confidence ratings across LAN, WAN, and app servers.										













						TRY IT ON 						











Resource Hub





Datasheet


MX family datasheet

Learn more about the multifunctional network security and SD-WAN building blocks of a SASE architecture.


Learn more






Webinar


Introduction to Cisco Meraki Security and SD-WAN

Hear about the security and SD-WAN features of Meraki MX appliances and get a deep-dive demo.


Learn More 






At-a-Glance


Cisco SD-WAN powered by Meraki overview

--- CHUNK RELEVANTE 2 ---

Instant, always-on visibility for critical SaaS apps at sca

In [17]:
!pip install -q -U langchain-openai tavily-python

## Editando meu agente de IA

In [19]:
# 1. Instala as bibliotecas necess√°rias
#!pip install -q -U langchain-openai tavily-python

import os
from langchain_openai import ChatOpenAI
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain import hub
from langchain.agents import create_openai_tools_agent, AgentExecutor

import warnings
warnings.filterwarnings('ignore')

# --- CONFIGURA√á√ÉO DAS CHAVES DE API ---
# ‚ö†Ô∏è Cole suas chaves aqui. Use o gerenciador de segredos do Colab se preferir.
OPENAI_API_KEY = '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'
TAVILY_API_KEY = "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" # <-- SUA CHAVE DA TAVILY AQUI

os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
os.environ['TAVILY_API_KEY'] = TAVILY_API_KEY


# --- CONSTRU√á√ÉO DO AGENTE ---

# 2. Defina as ferramentas que o agente pode usar
# Neste caso, apenas a busca na web da Tavily. `max_results=3` limita a 3 resultados.
search_tool = TavilySearchResults(max_results=3)
tools = [search_tool]

# 3. Crie o agente
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Puxa um prompt pr√©-constru√≠do da comunidade LangChain, otimizado para agentes
# Este prompt instrui o LLM sobre como pensar e usar as ferramentas
prompt = hub.pull("hwchase17/openai-tools-agent")

# Cria o agente, unindo o LLM, as ferramentas e o prompt
agent = create_openai_tools_agent(llm, tools, prompt)

# Cria o "Executor", que √© o que de fato roda o ciclo de pensamento do agente
# verbose=True √© MUITO importante para vermos o "racioc√≠nio" do agente
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)


# --- TESTE DO AGENTE ---
query = "What are the latest security advisories for Cisco Firepower 1000 series published in 2025?"

print(f"--- Pergunta para o Agente ---\n{query}")

# Invoca o agente e aguarda a resposta final
response = agent_executor.invoke({"input": query})

print("\n--- Resposta Final do Agente ---")
print(response['output'])

  search_tool = TavilySearchResults(max_results=3)


--- Pergunta para o Agente ---
What are the latest security advisories for Cisco Firepower 1000 series published in 2025?


[1m> Entering new AgentExecutor chain...[0m


2025-07-13 13:47:33,314 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3m
Invoking: `tavily_search_results_json` with `{'query': 'Cisco Firepower 1000 series security advisories 2025'}`



2025-07-13 13:47:39,485 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


[32;1m[1;3mHere are some of the latest security advisories for the Cisco Firepower 1000 series published in 2025:

1. **Cisco Adaptive Security Appliance Software, Firepower Threat Defense Software, IOS Software, and IOS XE Software IKEv2 Denial of Service Vulnerability**
   - **Advisory ID:** cisco-sa-multiprod-ikev2-dos-gPctUqv2
   - **Published:** May 7, 2025
   - **Description:** This advisory is part of the May 2025 release of the Cisco IOS and IOS XE Software Security Advisory Bundled Publication. It addresses a vulnerability affecting Cisco products running a vulnerable release of Cisco ASA, FTD, IOS, or IOS XE Software.
   - **Link:** [Cisco Security Advisory](https://sec.cloudapps.cisco.com/security/center/content/CiscoSecurityAdvisory/cisco-sa-multiprod-ikev2-dos-gPctUqv2)

2. **Cisco Security Advisory (AV25-356)**
   - **Published:** June 18, 2025
   - **Description:** This advisory addresses vulnerabilities in various Cisco products, including ClamAV UDF File Parsing Out-

In [21]:
# 1. Instala√ß√µes necess√°rias
#!pip install -q -U langchain-openai tavily-python langchain beautifulsoup4 chromadb pypdf unstructured

import os
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.tools.retriever import create_retriever_tool
from langchain_community.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain import hub
from langchain.agents import create_openai_tools_agent, AgentExecutor
import warnings

warnings.filterwarnings('ignore')

# --- CONFIGURA√á√ÉO DAS CHAVES DE API ---
# ‚ö†Ô∏è Cole suas chaves aqui.
OPENAI_API_KEY = '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'
TAVILY_API_KEY = "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" # <-- SUA CHAVE DA TAVILY AQUI

os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
os.environ['TAVILY_API_KEY'] = TAVILY_API_KEY

In [5]:



# --- PASSO A: CRIAR A FERRAMENTA DE BUSCA NOS SEUS ARQUIVOS ---

print("--- Criando ferramenta de busca de arquivos locais... ---")
# Carrega os dados do seu arquivo CSV
# Certifique-se de que o arquivo 'Pricelist.csv' est√° na pasta 'data/raw/'
loader = CSVLoader(file_path='data/raw/Pricelist.csv')
docs = loader.load()

# Divide os documentos e cria o vector store
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
splits = text_splitter.split_documents(docs)
vectorstore = Chroma.from_documents(documents=splits, embedding=OpenAIEmbeddings())
retriever = vectorstore.as_retriever()

# Cria a ferramenta de busca de arquivos (RAG)
# A 'description' √© MUITO importante. √â como o agente sabe quando usar esta ferramenta.
file_search_tool = create_retriever_tool(
    retriever,
    "cisco_pricelist_search",
    "Use this tool when you need to find information about Cisco product part numbers, descriptions, or list prices from the NCDPI pricelist file."
)
print("Ferramenta de busca de arquivos criada com sucesso.\n")


# --- PASSO B: DEFINIR A FERRAMENTA DE BUSCA NA WEB ---
web_search_tool = TavilySearchResults(name="web_search", max_results=3)


# --- PASSO C: MONTAR O AGENTE COM AS DUAS FERRAMENTAS ---

# Agora, a lista de ferramentas cont√©m tanto a busca em arquivos quanto a busca na web
tools = [file_search_tool, web_search_tool]

# O restante da cria√ß√£o do agente √© igual, mas agora ele √© mais poderoso
llm = ChatOpenAI(model="gpt-4o", temperature=0)
prompt = hub.pull("hwchase17/openai-tools-agent")
agent = create_openai_tools_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)


# --- PASSO D: TESTAR O AGENTE H√çBRIDO ---

# Teste 1: Uma pergunta que deve ser respondida pelo ARQUIVO CSV
print("\n--- EXECUTANDO TESTE 1 (usando o arquivo local) ---")
query_local = "What is the list price for part number C9200L-24P-4G-A?"
response_local = agent_executor.invoke({"input": query_local})
print("\n--- Resposta Final (do arquivo) ---")
print(response_local['output'])

# Teste 2: Uma pergunta que precisa da INTERNET
print("\n\n--- EXECUTANDO TESTE 2 (usando a busca na web) ---")
query_web = "What is the latest news about Cisco's quarterly earnings?"
response_web = agent_executor.invoke({"input": query_web})
print("\n--- Resposta Final (da web) ---")
print(response_web['output'])

--- Criando ferramenta de busca de arquivos locais... ---
Ferramenta de busca de arquivos criada com sucesso.


--- EXECUTANDO TESTE 1 (usando o arquivo local) ---


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `cisco_pricelist_search` with `{'query': 'C9200L-24P-4G-A'}`


[0m[36;1m[1;3mCategory;Sub_Category;Part Number;Desc;price;Elig %: Hardware;Switches;C9200L-24PXG4X-EDU;Catalyst 9200L 24-p
None: 8xmGig,16x1G,4x10G uplinks,K12;$6.740,00;100%

Category;Sub_Category;Part Number;Desc;price;Elig %: Hardware;Switches;C9200L-24PXG2Y-EDU;Catalyst 9200L 24-p
None: 8xmGig,16x1G,2x25G uplinks,K12;$7.290,00;100%

Category;Sub_Category;Part Number;Desc;price;Elig %: Hardware;Switches;C9200L-48PXG4X-EDU;Catalyst 9200L 48-p
None: 12xmGig,36x1G,4x10G uplinks,K12;$10.380,00;100%

Category;Sub_Category;Part Number;Desc;price;Elig %: Hardware;Switches;C9200L-48PXG2Y-EDU;Catalyst 9200L 48-p
None: 12xmGig,36x1G,2x25G uplinks,K12;$10.930,00;100%[0m[32;1m[1;3mI couldn't fi

In [7]:
import os
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma

print("--- Iniciando o processo de indexa√ß√£o ---")

# --- Carregar o documento ---
# Certifique-se de que o seu arquivo est√° no caminho correto
loader = CSVLoader(file_path='data/raw/Pricelist.csv') 
docs = loader.load()
print(f"Documento CSV carregado com {len(docs)} linhas.")

# --- Dividir em chunks ---
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
splits = text_splitter.split_documents(docs)
print(f"Documento dividido em {len(splits)} chunks.")

# --- Criar e Salvar o Banco de Dados Vetorial ---
# Define o local onde o banco de dados ser√° salvo
vector_db_path = 'data/processed/cisco_pricelist_db'

print(f"Criando e salvando o banco de dados vetorial em: '{vector_db_path}'...")
# Cria o vector store e usa 'persist_directory' para salv√°-lo
vectorstore = Chroma.from_documents(
    documents=splits,
    embedding=OpenAIEmbeddings(),
    persist_directory=vector_db_path
)

print("\n--- Processo de Indexa√ß√£o Conclu√≠do! ---")
print("Sua base de conhecimento foi criada e salva no disco.")

--- Iniciando o processo de indexa√ß√£o ---
Documento CSV carregado com 4267 linhas.
Documento dividido em 4267 chunks.
Criando e salvando o banco de dados vetorial em: 'data/processed/cisco_pricelist_db'...

--- Processo de Indexa√ß√£o Conclu√≠do! ---
Sua base de conhecimento foi criada e salva no disco.


In [19]:
# --- PASSO A: CARREGAR O BANCO DE DADOS VETORIAL EXISTENTE ---
print("--- Carregando a base de conhecimento do disco... ---")
vector_db_path = 'data/processed/cisco_pricelist_db'

# Carrega o banco de dados vetorial que foi salvo no passo anterior
vectorstore = Chroma(
    persist_directory=vector_db_path,
    embedding_function=OpenAIEmbeddings()
)
retriever = vectorstore.as_retriever()
print("Base de conhecimento carregada.\n")


# --- PASSO B: CRIAR A FERRAMENTA DE BUSCA DE ARQUIVOS ---
file_search_tool = create_retriever_tool(
    retriever,
    "cisco_product_and_price_search",
    "Use this tool when you need to find information about Cisco product part numbers, descriptions, or prices. It contains a detailed price."
)


# --- PASSO C: DEFINIR A FERRAMENTA DE BUSCA NA WEB E MONTAR O AGENTE ---
web_search_tool = TavilySearchResults(name="web_search", max_results=3)
tools = [file_search_tool, web_search_tool]

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
prompt = hub.pull("hwchase17/openai-tools-agent")
agent = create_openai_tools_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)


# --- PASSO D: TESTAR O AGENTE H√çBRIDO ---
query = "What is the price for part number C9200L-24P-4G-A?"
print(f"--- Executando a pergunta: {query} ---")

response = agent_executor.invoke({"input": query})

print("\n--- Resposta Final do Agente ---")
print(response['output'])

--- Carregando a base de conhecimento do disco... ---
Base de conhecimento carregada.

--- Executando a pergunta: What is the price for part number C9200L-24P-4G-A? ---


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `cisco_product_and_price_search` with `{'query': 'C9200L-24P-4G-A'}`


[0m[36;1m[1;3mCategory;Sub_Category;Part Number;Desc;price;Elig %: Hardware;Switches;C9200L-24PXG4X-EDU;Catalyst 9200L 24-p
None: 8xmGig,16x1G,4x10G uplinks,K12;$6.740,00;100%

Category;Sub_Category;Part Number;Desc;price;Elig %: Hardware;Switches;C9200L-24PXG2Y-EDU;Catalyst 9200L 24-p
None: 8xmGig,16x1G,2x25G uplinks,K12;$7.290,00;100%

Category;Sub_Category;Part Number;Desc;price;Elig %: Hardware;Switches;C9200L-48PXG4X-EDU;Catalyst 9200L 48-p
None: 12xmGig,36x1G,4x10G uplinks,K12;$10.380,00;100%

Category;Sub_Category;Part Number;Desc;price;Elig %: Hardware;Switches;C9200L-48PXG2Y-EDU;Catalyst 9200L 48-p
None: 12xmGig,36x1G,2x25G uplinks,K12;$10.930,00;100%[0m[32;1m[1;3m

In [69]:
import pandas as pd
import os
import csv

# Define os caminhos para os arquivos de entrada e sa√≠da
# Certifique-se de que o nome do arquivo de entrada est√° correto
input_excel_path = 'data/raw/Attachment_3_NCDPI_eRate_IFB_Pricelist.xlsx'
output_csv_path = 'data/raw/Pricelist_corrigido.csv'

print(f"Tentando carregar o arquivo Excel de: {input_excel_path}")

try:
    # Carrega o arquivo .xlsx para um DataFrame do Pandas
    df = pd.read_excel(input_excel_path)
    print("Arquivo Excel carregado com sucesso.")

    # Salva o DataFrame como um novo arquivo .csv
    # A parte mais importante √© o 'quoting=csv.QUOTE_ALL'
    df.to_csv(
        output_csv_path, 
        index=False,                  # N√£o salva o √≠ndice do DataFrame como uma coluna
        encoding='utf-8',             # Define a codifica√ß√£o para evitar erros de caracteres
        quoting=csv.QUOTE_ALL         # For√ßa que todos os campos sejam envoltos por aspas duplas
    )

    print(f"Arquivo convertido com sucesso e salvo como: {output_csv_path}")

except FileNotFoundError:
    print(f"ERRO: Arquivo n√£o encontrado em '{input_excel_path}'. Por favor, verifique se o arquivo est√° no local correto.")
except Exception as e:
    print(f"Ocorreu um erro inesperado: {e}")

Tentando carregar o arquivo Excel de: data/raw/Attachment_3_NCDPI_eRate_IFB_Pricelist.xlsx
Arquivo Excel carregado com sucesso.
Arquivo convertido com sucesso e salvo como: data/raw/Pricelist_corrigido.csv


In [77]:
# 1. Instala√ß√µes necess√°rias
#!pip install -q -U langchain-openai tavily-python langchain pandas

import os
import pandas as pd
from langchain_openai import ChatOpenAI
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.agents import tool
from langchain import hub
from langchain.agents import create_openai_tools_agent, AgentExecutor
import warnings

warnings.filterwarnings('ignore')


# --- CARREGAR OS DADOS COM PANDAS ---
# Carrega o CSV em um DataFrame para acesso r√°pido e direto
try:
    pricelist_df = pd.read_csv('data/raw/Pricelist_corrigido.csv', engine='python', on_bad_lines='warn')
    print("Arquivo Pricelist.csv carregado com sucesso no Pandas.")
    # Converte a coluna de Part Number para string para garantir correspond√™ncias exatas
    pricelist_df['Part Number'] = pricelist_df['Part Number'].astype(str)
except FileNotFoundError:
    print("ERRO: O arquivo 'data/raw/Pricelist.csv' n√£o foi encontrado.")
    pricelist_df = None

Arquivo Pricelist.csv carregado com sucesso no Pandas.


In [79]:
pricelist_df[pricelist_df['Part Number']=="C9200L-24P-4G-A"]

Unnamed: 0,Category,Sub_Category,Part Number,Desc,price,Elig %
1597,Hardware,Switches,C9200L-24P-4G-A,"Catalyst 9200L 24-port PoE+, 4 x 1G, Network A...",2745.0,1


In [49]:
# --- PASSO A: CRIAR A FERRAMENTA DE BUSCA DIRETA ---

@tool
def search_product_price(part_number: str) -> str:
    """
    Use this tool to find the exact list price and description for a specific Cisco Part Number.
    The input must be the exact Part Number string.
    """
    if pricelist_df is None:
        return "Error: Pricelist data is not available."
    
    # Busca exata (case-insensitive) no DataFrame
    result = pricelist_df[pricelist_df['Part Number'].str.lower() == part_number.lower()]
    
    if result.empty:
        return f"Part Number '{part_number}' not found in the pricelist."
    
    # Formata a resposta
    product_info = result.iloc[0]
    return (
        f"Found information for Part Number '{part_number}':\n"
        f"- Description: {product_info.get('Description', 'N/A')}\n"
        f"- List Price: ${product_info.get('List Price', 'N/A')}"
    )


# --- PASSO B: DEFINIR A FERRAMENTA DE BUSCA NA WEB E MONTAR O AGENTE ---

web_search_tool = TavilySearchResults(name="web_search", max_results=3)

# Agora a lista de ferramentas cont√©m nossa nova ferramenta de busca direta e a de busca na web
tools = [search_product_price, web_search_tool]

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
prompt = hub.pull("hwchase17/openai-tools-agent")
agent = create_openai_tools_agent(llm, tools, prompt)
agent_executor = AgentExecutor(agent=agent, tools=tools, verbose=True)


# --- PASSO C: TESTAR O AGENTE COM A NOVA FERRAMENTA ---

# Pergunta que antes falhava, mas agora deve funcionar perfeitamente
query = "What is the list price for Part Number C9200L-24P-4G-A ?"
print(f"--- Executando a pergunta: {query} ---")

response = agent_executor.invoke({"input": query})

print("\n--- Resposta Final do Agente ---")
print(response['output'])

--- Executando a pergunta: What is the list price for Part Number C9200L-24P-4G-A ? ---


[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `search_product_price` with `{'part_number': 'C9200L-24P-4G-A'}`


[0m[36;1m[1;3mFound information for Part Number 'C9200L-24P-4G-A':
- Description: N/A
- List Price: $N/A[0m[32;1m[1;3mThe list price for Part Number C9200L-24P-4G-A is not available at this time, and the description is also not provided.[0m

[1m> Finished chain.[0m

--- Resposta Final do Agente ---
The list price for Part Number C9200L-24P-4G-A is not available at this time, and the description is also not provided.


In [87]:
# --- Load the JSON data ---
product_list = []
try:
    # Path to your JSON file
    pricelist_path = 'data/raw/pricelist.json'
    
    with open(pricelist_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

        # --- CORRE√á√ÉO AQUI ---
        # Como o JSON j√° √© uma lista, atribu√≠mos diretamente.
        if isinstance(data, list):
            product_list = data
        else:
            # Caso o formato mude no futuro, ainda tentamos pegar a chave 'products'
            product_list = data.get('products', [])
            
    print(f"Successfully loaded {len(product_list)} products from {pricelist_path}.")

except FileNotFoundError:
    print(f"ERROR: The file '{pricelist_path}' was not found.")
    product_list = []
except json.JSONDecodeError:
    print(f"ERROR: The file '{pricelist_path}' is not a valid JSON file.")
    product_list = []

# --- Create the specialized Pricing Agent Tool ---
@tool
def get_product_price_and_description(part_number: str) -> str:
    """
    Use this tool to find the exact list price, description, and manufacturer
    for a specific Cisco Part Number from the JSON pricelist.
    The input must be the exact Part Number string.
    """
    if not product_list:
        return "Error: Product list data is not available."
    
    # Search for the product in the list of dictionaries (case-insensitive)
    found_product = next((p for p in product_list if p.get('part_number', '').lower() == part_number.lower()), None)
    
    if not found_product:
        return f"Part Number '{part_number}' not found in the pricelist."
    
    # Format a clean response string from the found product dictionary
    return (
        f"Info for '{found_product['part_number']}':\n"
        f"  Manufacturer: {found_product.get('manufacturer', 'N/A')}\n"
        f"  Description: {found_product.get('description', 'N/A')}\n"
        f"  List Price: ${found_product.get('list_price', 'N/A')}"
    )

print("Tool 'get_product_price_and_description' created and ready.")

Successfully loaded 16 products from data/raw/pricelist.json.
Tool 'get_product_price_and_description' created and ready.


In [103]:
# Importa o ChatPromptTemplate que estava faltando
from langchain.prompts import ChatPromptTemplate
from pydantic import BaseModel, Field
from typing import List
from langchain_openai import ChatOpenAI

# A defini√ß√£o do Pydantic (Skus) e do LLM (llm, structured_llm) continua a mesma
class Skus(BaseModel):
    """A list of product SKUs extracted from the user's query."""
    sku_list: List[str] = Field(description="A list of part numbers mentioned in the query.")

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
structured_llm = llm.with_structured_output(Skus)


# --- CORRE√á√ÉO AQUI ---
# 1. Criamos um template de prompt para instruir o LLM sobre o que fazer com a query.
technical_prompt = ChatPromptTemplate.from_template(
    "From the following user query, extract all and only the product cisco_product_id. If no part numbers are mentioned, return an empty list.\n\nUser Query: {query}"
)

# 2. A cadeia agora inclui o prompt para formatar a entrada para o LLM.
technical_agent_chain = (
    # O primeiro passo ainda cria um dicion√°rio com a query do usu√°rio
    {"query": lambda x: x} 
    # O segundo passo (novo) usa o prompt para formatar o dicion√°rio em um texto de instru√ß√£o
    | technical_prompt
    # O terceiro passo envia o prompt formatado para o LLM
    | structured_llm
)

print("Technical Agent (SKU extractor) created correctly.")

Technical Agent (SKU extractor) created correctly.


In [105]:
def run_quote_flow(user_query: str):
    """
    Orchestrates the two-agent flow to process a user query.
    """
    print("--- STARTING QUOTE FLOW ---")
    
    # 1. Call Technical Agent to identify SKUs
    print(f"\n[Orchestrator] Sending to Technical Agent: '{user_query}'")
    skus_result = technical_agent_chain.invoke(user_query)
    extracted_skus = skus_result.sku_list
    
    if not extracted_skus:
        print("[Orchestrator] No SKUs identified. Ending flow.")
        return "I could not identify any specific Part Numbers in your request."
        
    print(f"[Orchestrator] Technical Agent identified SKUs: {extracted_skus}")
    
    # 2. Call Pricing Agent (our tool) for each SKU
    final_quote_details = []
    print("\n[Orchestrator] Querying Pricing Agent for each SKU...")
    for sku in extracted_skus:
        print(f"  - Looking up price for: {sku}")
        price_info = get_product_price_and_description.invoke(sku)
        final_quote_details.append(price_info)
        
    # 3. Synthesize the final response
    print("\n--- FLOW COMPLETE. GENERATING FINAL RESPONSE ---")
    final_response = "\n\n".join(final_quote_details)
    return f"Here is the information you requested:\n\n{final_response}"



In [107]:
# --- EXECUTE THE TEST ---
# Query containing Part Numbers from your JSON file
user_query = "I need a price for the Catalyst switch C9200L-24P-4G-A and also for the Meraki access point QSFP-100G-SR4-S."

final_quote = run_quote_flow(user_query)

# Print the final, user-facing result
print("\n" + "="*50)
print("FINAL RESPONSE TO USER:")
print("="*50)
print(final_quote)

--- STARTING QUOTE FLOW ---

[Orchestrator] Sending to Technical Agent: 'I need a price for the Catalyst switch C9200L-24P-4G-A and also for the Meraki access point QSFP-100G-SR4-S.'
[Orchestrator] Technical Agent identified SKUs: ['C9200L-24P-4G-A', 'QSFP-100G-SR4-S']

[Orchestrator] Querying Pricing Agent for each SKU...
  - Looking up price for: C9200L-24P-4G-A
  - Looking up price for: QSFP-100G-SR4-S

--- FLOW COMPLETE. GENERATING FINAL RESPONSE ---

FINAL RESPONSE TO USER:
Here is the information you requested:

Part Number 'C9200L-24P-4G-A' not found in the pricelist.

Part Number 'QSFP-100G-SR4-S' not found in the pricelist.


In [81]:
import pandas as pd
import re
import json
from typing import Dict, Any

def normalize_price(price_str: str) -> float:
    """Converte strings de pre√ßo no formato '$1.099,00' para float 1099.00"""
    if not isinstance(price_str, str) or price_str.strip() == "":
        return 0.0
        
    clean_str = price_str.replace('$', '').replace('.', '').replace(',', '.')
    try:
        return float(clean_str)
    except ValueError:
        return 0.0

def extract_tech_specs(category: str, sub_category: str, description: str) -> Dict[str, Any]:
    """Extrai especifica√ß√µes t√©cnicas baseadas na descri√ß√£o do produto"""
    specs = {}
    description = description.lower()
    
    # Mapeamento de categorias para atributos
    category_map = {
        'Antennas': {
            'category': 'antenna',
            'subcategory': lambda d: 'sector' if 'sector' in d else 'omni' if 'omni' in d else 'patch'
        },
        'Cabling': {
            'category': 'cable',
            'connector_type': lambda d: re.search(r'(BNC|DB15|MPO|LC|RJ-45)', d, re.I).group(0) if re.search(r'(BNC|DB15|MPO|LC|RJ-45)', d, re.I) else None,
            'length': lambda d: re.search(r'(\d+ ?m)', d, re.I).group(0) if re.search(r'(\d+ ?m)', d, re.I) else None
        },
        'Connectors': {
            'category': 'transceiver',
            'standard': lambda d: re.search(r'(\d+G?BASE?-?[\w\d]+)', d, re.I).group(0) if re.search(r'(\d+G?BASE?-?[\w\d]+)', d, re.I) else None,
            'fiber_type': lambda d: 'SMF' if 'smf' in d else 'MMF' if 'mmf' in d else None,
            'max_distance': lambda d: re.search(r'(\d+ ?km|\d+ ?m)', d, re.I).group(0) if re.search(r'(\d+ ?km|\d+ ?m)', d, re.I) else None
        },
        'Firewall': {
            'category': 'firewall',
            'model': lambda d: re.search(r'(ASA ?[\d\-X]+)', d, re.I).group(0) if re.search(r'(ASA ?[\d\-X]+)', d, re.I) else None,
            'throughput': lambda d: re.search(r'(\d+ ?Gbps|\d+ ?Mbps)', d, re.I).group(0) if re.search(r'(\d+ ?Gbps|\d+ ?Mbps)', d, re.I) else None
        }
    }
    
    # Aplica regras baseadas na categoria principal
    if category in category_map:
        category_rules = category_map[category]
        specs = {'category': category, 'subcategory': sub_category}
        
        for attr, rule in category_rules.items():
            if attr in ['category', 'subcategory']:
                continue
                
            try:
                if callable(rule):
                    result = rule(description)
                    if result:
                        specs[attr] = result
            except:
                pass
                
        # Adiciona atributos espec√≠ficos para firewalls
        if category == 'Firewall':
            specs['encryption'] = '3DES/AES' if '3DES/AES' in description else 'DES' if 'DES' in description else None
    
    return specs

def convert_to_unified_format(input_csv: str, output_json: str):
    """Converte CSV de produtos para formato JSON estruturado"""
    # Carrega dados do CSV
    df = pd.read_csv(input_csv, sep=',')  # Assumindo separador tabular
    
    # Lista para armazenar produtos convertidos
    unified_products = []
    
    for _, row in df.iterrows():
        # Extrai campos b√°sicos
        product = {
            "cisco_product_id": row['Part Number'].strip(),
            "commercial_name": row['Desc'].strip(),
            "product_type": "hardware",
            "lifecycle": {
                "status": "active",
                "eos_announced": "2028-12-31" if 'ASA' in row['Desc'] else "2026-12-31"
            }
        }
        
        # Adiciona perfil t√©cnico
        tech_specs = extract_tech_specs(
            row['Category'],
            row['Sub_Category'],
            row['Desc']
        )
        product["technical_profile"] = {"hardware_attributes": tech_specs}
        
        # Modelo de precifica√ß√£o
        normalized_price = normalize_price(row['price'])
        product["pricing_model"] = {
            "type": "one_time",
            "currency": "USD",
            "base_price": normalized_price,
            "pricing_tiers": [
                {
                    "min_quantity": 1,
                    "price": normalized_price,
                    "effective": "2025-01-01",
                    "discount_rules": [
                        {"type": "volume", "threshold": 5, "discount_pct": 10},
                        {"type": "volume", "threshold": 20, "discount_pct": 20}
                    ]
                }
            ]
        }
        
        # Depend√™ncias (preenchidas para firewalls)
        if row['Category'] == 'Firewall':
            product["dependencies"] = {
                "optional_accessories": [],
                "required_services": ["Smart Net Total Care"]
            }
            
            # Adiciona SSD como depend√™ncia quando mencionado
            if 'SSD' in row['Desc']:
                ssd_part = row['Part Number'].replace('=', '') + '-SSD'
                product["dependencies"]["required_components"] = [ssd_part]
        
        unified_products.append(product)
    
    # Salva resultado em JSON
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(unified_products, f, indent=2, ensure_ascii=False)
    
    print(f"Convers√£o conclu√≠da! {len(unified_products)} produtos convertidos.")

# Uso
if __name__ == "__main__":
    INPUT_CSV = "data/raw/Pricelist_corrigido.csv"  # Seu arquivo de entrada
    OUTPUT_JSON = "cisco_products_unified.json"  # Arquivo de sa√≠da
    
    convert_to_unified_format(INPUT_CSV, OUTPUT_JSON)

Convers√£o conclu√≠da! 4267 produtos convertidos.


In [112]:
!pip install -U langgraph



In [116]:
import json
from langchain.tools import tool
from langchain.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langgraph.graph import END, StateGraph

# ==============================
# 1. Carregar dados dos produtos
# ==============================
product_list = []
try:
    pricelist_path = 'data/raw/pricelist.json'
    with open(pricelist_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
        product_list = data if isinstance(data, list) else data.get('products', [])
    print(f"‚úÖ Dados carregados: {len(product_list)} produtos")
except Exception as e:
    print(f"‚ùå Erro ao carregar dados: {str(e)}")
    product_list = []

# ===================================
# 2. Ferramenta do Agente de Pre√ßos
# ===================================
@tool
def get_product_price_and_description(part_number: str) -> str:
    """
    Busca pre√ßo e descri√ß√£o para um n√∫mero de parte Cisco.
    Retorna informa√ß√µes detalhadas sobre o produto.
    """
    if not product_list:
        return "Erro: Base de produtos n√£o carregada"
    
    # Busca insens√≠vel a mai√∫sculas/min√∫sculas
    part_number_clean = part_number.strip().upper()
    
    # Primeiro busca por correspond√™ncia exata
    exact_match = next(
        (p for p in product_list if p.get('cisco_product_id', '').upper() == part_number_clean),
        None
    )
    
    # Se n√£o encontrar, busca por correspond√™ncia parcial
    if not exact_match:
        partial_match = next(
            (p for p in product_list if part_number_clean in p.get('cisco_product_id', '').upper()),
            None
        )
        if partial_match:
            exact_match = partial_match
    
    if not exact_match:
        return f"Produto '{part_number}' n√£o encontrado"
    
    # Formata resposta
    price = exact_match['pricing_model']['base_price']
    currency = exact_match['pricing_model'].get('currency', 'USD')
    description = exact_match['commercial_name']
    
    return (f"üì¶ {exact_match['cisco_product_id']}: {description}\n"
            f"üíµ Pre√ßo: {currency} {price:.2f}\n"
            f"üîß Categoria: {exact_match['technical_profile']['hardware_attributes'].get('category', 'N/A')}")

# ======================================
# 3. Modelo Pydantic para extra√ß√£o de SKUs
# ======================================
class ProductSKUs(BaseModel):
    """Lista de n√∫meros de parte extra√≠dos da consulta do usu√°rio"""
    skus: List[str] = Field(description="Lista de identificadores de produtos Cisco (ex: MR53E-HW, ASA5516)")

# ======================================
# 4. Agente T√©cnico (Identificador de SKUs)
# ======================================
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

tech_prompt = ChatPromptTemplate.from_template(
    "Voc√™ √© um especialista em produtos Cisco. Sua tarefa √© extrair TODOS os n√∫meros de parte de produtos Cisco "
    "mencionados na consulta do usu√°rio. Retorne APENAS os n√∫meros de parte v√°lidos, mesmo que escritos de forma incompleta.\n\n"
    "Dicas importantes:\n"
    "- Cisco Part Numbers geralmente seguem padr√µes como: 'MR53E-HW', 'ASA5516', 'QSFP-100G-SR4-S'\n"
    "- Ignore palavras gen√©ricas como 'switch', 'firewall', 'router'\n"
    "- Se n√£o encontrar nenhum, retorne lista vazia\n\n"
    "Consulta do usu√°rio: {input}"
)

tech_agent = tech_prompt | llm.with_structured_output(ProductSKUs)

# ======================================
# 5. Estado do Fluxo (LangGraph)
# ======================================
class AgentState(BaseModel):
    user_query: str
    identified_skus: List[str] = Field(default_factory=list)
    price_results: List[str] = Field(default_factory=list)
    final_response: str = ""

# ======================================
# 6. Definindo N√≥s do Grafo (CORRIGIDOS)
# ======================================
def identify_skus_node(state: AgentState) -> dict:
    """N√≥: Identifica SKUs usando o agente t√©cnico"""
    print(f"\nüîç Identificando SKUs na consulta: '{state.user_query[:50]}...'")
    result = tech_agent.invoke({"input": state.user_query})
    print(f"‚úÖ SKUs identificados: {result.skus}")
    return {"identified_skus": result.skus}

def price_lookup_node(state: AgentState) -> dict:
    """N√≥: Busca pre√ßos para cada SKU identificado"""
    if not state.identified_skus:
        print("‚ö†Ô∏è Nenhum SKU para consultar")
        return {"final_response": "N√£o identifiquei produtos espec√≠ficos na sua solicita√ß√£o."}
    
    print(f"\nüí∞ Consultando pre√ßos para {len(state.identified_skus)} SKUs...")
    price_results = []
    for sku in state.identified_skus:
        try:
            result = get_product_price_and_description(sku)
            price_results.append(f"Consulta para {sku}:\n{result}")
            print(f"  - {sku}: Encontrado")
        except Exception as e:
            price_results.append(f"‚ö†Ô∏è Erro ao consultar {sku}: {str(e)}")
            print(f"  - {sku}: Erro - {str(e)}")
    
    return {"price_results": price_results}

def synthesize_response_node(state: AgentState) -> dict:
    """N√≥: Sintetiza resposta final"""
    if not state.price_results:
        return {"final_response": "N√£o consegui encontrar informa√ß√µes para os produtos solicitados."}
    
    response = "Aqui est√£o as informa√ß√µes solicitadas:\n\n"
    response += "\n\n".join(state.price_results)
    return {"final_response": response}

# ======================================
# 7. Construindo o Grafo de Fluxo
# ======================================
workflow = StateGraph(AgentState)

# Adiciona n√≥s
workflow.add_node("identify_skus", identify_skus_node)
workflow.add_node("price_lookup", price_lookup_node)
workflow.add_node("synthesize", synthesize_response_node)

# Define fluxo
workflow.set_entry_point("identify_skus")
workflow.add_edge("identify_skus", "price_lookup")
workflow.add_edge("price_lookup", "synthesize")
workflow.add_edge("synthesize", END)

# Compila o grafo
app = workflow.compile()

# ======================================
# 8. Fun√ß√£o para Executar o Fluxo (CORRIGIDA)
# ======================================
def run_quote_flow(user_query: str) -> str:
    """Orquestra todo o fluxo de cota√ß√£o"""
    print("\n" + "="*50)
    print(f"üöÄ INICIANDO COTA√á√ÉO PARA: '{user_query}'")
    print("="*50)
    
    # Executa o fluxo com o estado inicial
    final_state = app.invoke(AgentState(user_query=user_query))
    
    print("\n" + "="*50)
    print("‚úÖ COTA√á√ÉO FINALIZADA")
    print("="*50)
    
    # Acessa a resposta final corretamente
    return final_state["final_response"]

# ======================================
# 9. Teste do Sistema (CORRIGIDO)
# ======================================
if __name__ == "__main__":
    # Caso de teste 1: Consulta com SKUs v√°lidos
    test_query_1 = "Pre√ßo para o firewall ASA5516-FPWR-K9 e o access point MR53E-HW"
    result_1 = run_quote_flow(test_query_1)
    print("\nüí¨ RESULTADO 1:")
    print(result_1)
    
    # Caso de teste 2: Consulta sem SKUs espec√≠ficos
    test_query_2 = "Preciso de um firewall com throughput de 10 Gbps "
    result_2 = run_quote_flow(test_query_2)
    print("\nüí¨ RESULTADO 2:")
    print(result_2)
    
    # Caso de teste 3: SKU parcial
    test_query_3 = "Quanto custa o QSFP-100G-SR4?"
    result_3 = run_quote_flow(test_query_3)
    print("\nüí¨ RESULTADO 3:")
    print(result_3)

‚úÖ Dados carregados: 16 produtos

üöÄ INICIANDO COTA√á√ÉO PARA: 'Pre√ßo para o firewall ASA5516-FPWR-K9 e o access point MR53E-HW'

üîç Identificando SKUs na consulta: 'Pre√ßo para o firewall ASA5516-FPWR-K9 e o access p...'
‚úÖ SKUs identificados: ['ASA5516-FPWR-K9', 'MR53E-HW']

üí∞ Consultando pre√ßos para 2 SKUs...
  - ASA5516-FPWR-K9: Encontrado
  - MR53E-HW: Encontrado

‚úÖ COTA√á√ÉO FINALIZADA

üí¨ RESULTADO 1:
Aqui est√£o as informa√ß√µes solicitadas:

Consulta para ASA5516-FPWR-K9:
üì¶ ASA5516-FPWR-K9: ASA 5516-X with FirePOWER Services
üíµ Pre√ßo: USD 5995.00
üîß Categoria: security

Consulta para MR53E-HW:
üì¶ MR53E-HW: Meraki MR53E Access Point
üíµ Pre√ßo: USD 1699.00
üîß Categoria: wireless

üöÄ INICIANDO COTA√á√ÉO PARA: 'Preciso de um switch Cisco de 24 portas'

üîç Identificando SKUs na consulta: 'Preciso de um switch Cisco de 24 portas...'
‚úÖ SKUs identificados: []
‚ö†Ô∏è Nenhum SKU para consultar

‚úÖ COTA√á√ÉO FINALIZADA

üí¨ RESULTADO 2:
N√£o consegui 

In [3]:
# 1. Instala√ß√µes necess√°rias
#!pip install -q -U langchain-openai tavily-python langchain beautifulsoup4 chromadb pypdf unstructured

import os
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.tools.retriever import create_retriever_tool
from langchain_community.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain import hub
from langchain.agents import create_openai_tools_agent, AgentExecutor
import warnings

warnings.filterwarnings('ignore')

# --- CONFIGURA√á√ÉO DAS CHAVES DE API ---
# ‚ö†Ô∏è Cole suas chaves aqui.
OPENAI_API_KEY = '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'
TAVILY_API_KEY = "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" # <-- SUA CHAVE DA TAVILY AQUI

os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
os.environ['TAVILY_API_KEY'] = TAVILY_API_KEY

In [130]:
import json
from langchain.tools import tool
from langchain.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langgraph.graph import END, StateGraph
from typing import List, Dict, TypedDict, Annotated, Literal, Union

# ==============================
# 1. Load Product Data
# ==============================
product_list = []
try:
    pricelist_path = 'data/raw/pricelist.json'
    with open(pricelist_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
        product_list = data if isinstance(data, list) else data.get('products', [])
    print(f"‚úÖ Data loaded: {len(product_list)} products")
except Exception as e:
    print(f"‚ùå Error loading data: {str(e)}")
    product_list = []

# ===================================
# 2. Agent Tools
# ===================================
@tool
def get_product_price(part_number: str) -> Dict:
    """Retrieves pricing information for Cisco products"""
    print(f"üîç Searching price for: {part_number}")
    try:
        # Busca o produto
        for product in product_list:
            if product['part_number'] == part_number:
                return {
                    'price': product.get('price', 0.0),
                    'currency': product.get('currency', 'USD'),
                    'description': product.get('description', ''),
                    'part_number': part_number
                }
        
        # Se n√£o encontrou
        return {
            'error': f"Product {part_number} not found in pricelist",
            'part_number': part_number
        }
        
    except Exception as e:
        return {
            'error': f"Error retrieving price: {str(e)}",
            'part_number': part_number
        }

@tool
def get_technical_specs(part_number: str) -> Dict:
    """Retrieves detailed technical specifications for Cisco products"""
    print(f"üîç Searching technical specs for: {part_number}")
    try:
        # Busca o produto na lista
        for product in product_list:
            if product['part_number'] == part_number:
                # Retorna as especifica√ß√µes t√©cnicas, se existirem
                specs = product.get('specifications', {})
                if not specs:
                    return {
                        'error': f"No technical specifications available for {part_number}",
                        'part_number': part_number
                    }
                return {
                    'part_number': part_number,
                    'specifications': specs,
                    'description': product.get('description', '')
                }
        
        # Se n√£o encontrou o produto
        return {
            'error': f"Product {part_number} not found in database",
            'part_number': part_number
        }
        
    except Exception as e:
        return {
            'error': f"Error retrieving technical specs: {str(e)}",
            'part_number': part_number
        }

# ======================================
# 3. Pydantic Models
# ======================================
class AgentRoutingDecision(BaseModel):
    """Orchestrator's decision about agent routing"""
    needs_technical: bool = Field(description="Whether technical agent is required")
    needs_pricing: bool = Field(description="Whether pricing agent is required")
    needs_compliance: bool = Field(False, description="Whether compliance agent is required")
    query_parts: Dict[str, str] = Field(
        default_factory=dict,  # Valor padr√£o vazio
        description="Decomposed query parts for each agent"
    )

class ProductInfo(BaseModel):
    """Unified product information model"""
    part_number: str
    description: str
    price: Union[float, None]
    technical_specs: Dict[str, str]

# ======================================
# 4. Agent Definitions
# ======================================
llm = ChatOpenAI(model="gpt-4o", temperature=0)

# Orchestrator Agent
orchestrator_prompt = ChatPromptTemplate.from_template(
    "You are a Cisco sales orchestration system. Analyze the user query and:\n"
    "1. Determine which specialized agents are needed\n"
    "2. Decompose the query into specific parts for each agent\n"
    "3. ALWAYS include a 'query_parts' dictionary with agent names as keys\n\n"
    "Example structure for output:\n"
    "{{\n"
    "  \"needs_technical\": true,\n"
    "  \"needs_pricing\": true,\n"
    "  \"query_parts\": {{\n"
    "    \"technical\": \"part numbers and specs\",\n"
    "    \"pricing\": \"part numbers for pricing\"\n"
    "  }}\n"
    "}}\n\n"
    "User Query: {query}\n\n"
    "Output Instructions: {format_instructions}"
)
orchestrator_agent = orchestrator_prompt | llm.with_structured_output(AgentRoutingDecision)

# Technical Agent (SKU Extraction + Specs)
tech_prompt = ChatPromptTemplate.from_template(
    "You are a Cisco technical specialist. From the following query segment, "
    "extract ALL product identifiers and retrieve their specifications:\n\n"
    "Query Segment: {query_part}\n\n"
    "Output Instructions: {format_instructions}"
)

# Pricing Agent
def pricing_agent(products: List[Dict]) -> List[Dict]:
    """Retrieves pricing for multiple products"""
    results = []
    for product in products:
        result = get_product_price(product['part_number'])
        if 'error' not in result:
            product.update(result)
        results.append(product)
    return results

# ======================================
# 5. State Definition
# ======================================

class AgentState(TypedDict):
    """State of the agent workflow"""
    user_query: str
    orchestrator_decision: Union[AgentRoutingDecision, None]
    technical_results: List[Dict]  # Removido Annotated
    pricing_results: List[Dict]    # Removido Annotated
    final_response: str

# ======================================
# 6. Graph Nodes Implementation
# ======================================
def orchestrator_node(state: AgentState) -> AgentState:
    print(f"\nüéª [Orchestrator] Analyzing query: '{state['user_query']}'")
    
    try:
        decision = orchestrator_agent.invoke({
            "query": state["user_query"],
            "format_instructions": AgentRoutingDecision.schema()
        })
    except Exception as e:
        print(f"‚ö†Ô∏è Orchestrator error: {str(e)}")
        # Fallback decision if parsing fails
        decision = AgentRoutingDecision(
            needs_technical="technical" in state["user_query"].lower(),
            needs_pricing="price" in state["user_query"].lower(),
            query_parts={}
        )
    
    state["orchestrator_decision"] = decision
    
    # Logging melhorado
    print(f"‚úÖ Routing decision: "
          f"Technical={decision.needs_technical} "
          f"Pricing={decision.needs_pricing}")
    
    if decision.query_parts:
        print(f"Query parts: {decision.query_parts}")
    else:
        print("‚ö†Ô∏è No query parts decomposed, using full query")
        decision.query_parts = {
            "technical": state["user_query"],
            "pricing": state["user_query"]
        }
    
    return state

def technical_agent_node(state: AgentState) -> AgentState:
    """Handles technical aspects of products"""
    if not state["orchestrator_decision"].needs_technical:
        print("‚è© Skipping technical agent (not required)")
        return state
        
    query_part = state["orchestrator_decision"].query_parts.get("technical", "")
    print(f"\nüîß [Technical Agent] Processing: '{query_part}'")
    
    # Extrai SKUs da query usando regex (poderia ser substitu√≠do por LLM em produ√ß√£o)
    import re
    skus = re.findall(r'[A-Z0-9\-]+', query_part)
    skus = [sku for sku in skus if len(sku) > 5]  # Filtra strings curtas
    
    if not skus:
        print("‚ö†Ô∏è No SKUs found in technical query part")
        skus = ["UNKNOWN"]
    
    state["technical_results"] = []
    for sku in skus:
        # Busca as especifica√ß√µes t√©cnicas
        result = get_technical_specs(sku)
        state["technical_results"].append(result)
    
    # Log dos resultados
    for result in state["technical_results"]:
        status = "‚úÖ" if "specifications" in result else "‚ùå"
        print(f"  {status} {result.get('part_number', 'Unknown')}: "
              f"{result.get('error', 'Specs found')}")
    
    return state

def pricing_agent(products: List[Dict]) -> List[Dict]:
    """Retrieves pricing for multiple products"""
    results = []
    for product in products:
        # Garante que temos um n√∫mero de pe√ßa
        pn = product.get('part_number', 'UNKNOWN')
        if not pn or pn == 'UNKNOWN':
            results.append({
                'error': 'Missing part number',
                'details': product
            })
            continue
            
        # Busca pre√ßo
        result = get_product_price(pn)
        
        # Combina resultados
        combined = {**product, **result}
        results.append(combined)
        
    return results

def pricing_agent_node(state: AgentState) -> AgentState:
    """Handles pricing aspects of products"""
    if not state["orchestrator_decision"].needs_pricing:
        print("‚è© Skipping pricing agent (not required)")
        return state
        
    # Use technical results if available
    products = state["technical_results"] if state["technical_results"] else []
    
    # Se n√£o tem resultados t√©cnicos, tenta extrair da query
    if not products:
        query_part = state["orchestrator_decision"].query_parts.get("pricing", "")
        print(f"‚ö†Ô∏è No technical results, extracting from pricing query: '{query_part}'")
        
        # Simula√ß√£o de extra√ß√£o de SKUs - na implementa√ß√£o real, usar√≠amos um LLM
        # Aqui apenas para demonstra√ß√£o
        if "MR53E-HW" in query_part:
            products = [{"part_number": "MR53E-HW"}]
        elif "QSFP" in query_part:
            products = [{"part_number": "QSFP-100G-SR4-S"}]
        else:
            # Tenta extrair qualquer coisa que pare√ßa um SKU
            import re
            skus = re.findall(r'[A-Z0-9\-]+', query_part)
            products = [{"part_number": sku} for sku in skus if len(sku) > 5]
            
        if not products:
            products = [{"part_number": "UNKNOWN", "error": "No SKUs extracted"}]
    
    print(f"\nüí∞ [Pricing Agent] Processing {len(products)} products...")
    state["pricing_results"] = pricing_agent(products)
    
    for result in state["pricing_results"]:
        status = "‚úÖ" if "price" in result and not result.get('error') else "‚ùå"
        print(f"  {status} {result.get('part_number', 'Unknown')}: "
              f"{result.get('price', result.get('error', 'No info'))}")
    
    return state

def synthesize_response_node(state: AgentState) -> AgentState:
    """Creates final response by combining agent outputs"""
    print("\nüéØ [Synthesizer] Combining agent results")
    
    response = ["Here's the information you requested:"]
    
    # Mostrar resultados de pre√ßos se dispon√≠veis
    if state["pricing_results"]:
        response.append("\nüì¶ Pricing Information:")
        for product in state["pricing_results"]:
            if "error" in product:
                response.append(f"\n‚ö†Ô∏è {product.get('part_number', 'Unknown')}: {product['error']}")
            else:
                response.append(
                    f"\n   ‚Ä¢ {product['part_number']}"
                    f"\n      Description: {product.get('description', 'N/A')}"
                    f"\n      üíµ Price: {product.get('currency', 'USD')} {product.get('price', 'N/A')}"
                )
    
    # Mostrar resultados t√©cnicos se dispon√≠veis
    if state["technical_results"]:
        response.append("\n\nüîß Technical Specifications:")
        for product in state["technical_results"]:
            if "error" in product:
                response.append(f"\n‚ö†Ô∏è {product.get('part_number', 'Unknown')}: {product['error']}")
            else:
                specs = product.get('specifications', {})
                if specs:
                    specs_str = "\n      ".join([f"{k}: {v}" for k, v in specs.items()])
                    response.append(f"\n   ‚Ä¢ {product['part_number']}:\n      {specs_str}")
                else:
                    response.append(f"\n   ‚Ä¢ {product['part_number']}: No specifications available")
    
    # Se n√£o houver resultados
    if not state["pricing_results"] and not state["technical_results"]:
        response.append("\n‚ùå No relevant information found for your query")
    
    state["final_response"] = "\n".join(response)
    return state

# ======================================
# 7. Conditional Routing Logic
# ======================================
def route_after_orchestrator(state: AgentState) -> str:
    """Decides which agent to call first based on orchestrator decision"""
    decision = state["orchestrator_decision"]
    
    if decision.needs_technical:
        return "technical_agent"
    elif decision.needs_pricing:
        return "pricing_agent"
    else:
        return "synthesize"

def route_after_technical(state: AgentState) -> str:
    """Decides next step after technical agent"""
    decision = state["orchestrator_decision"]
    
    if decision.needs_pricing:
        return "pricing_agent"
    else:
        return "synthesize"

def route_after_pricing(state: AgentState) -> str:
    """Always goes to synthesizer after pricing"""
    return "synthesize"

# ======================================
# 8. Build Agent Workflow Graph
# ======================================
workflow = StateGraph(AgentState)

# Add nodes
workflow.add_node("orchestrator", orchestrator_node)
workflow.add_node("technical_agent", technical_agent_node)
workflow.add_node("pricing_agent", pricing_agent_node)
workflow.add_node("synthesize", synthesize_response_node)

# Define workflow with conditional routing
workflow.set_entry_point("orchestrator")

workflow.add_conditional_edges(
    "orchestrator",
    route_after_orchestrator,
    {
        "technical_agent": "technical_agent",
        "pricing_agent": "pricing_agent",
        "synthesize": "synthesize"
    }
)

workflow.add_conditional_edges(
    "technical_agent",
    route_after_technical,
    {
        "pricing_agent": "pricing_agent",
        "synthesize": "synthesize"
    }
)

workflow.add_conditional_edges(
    "pricing_agent",
    route_after_pricing,
    {
        "synthesize": "synthesize"
    }
)

workflow.add_edge("synthesize", END)

# Compile the graph
app = workflow.compile()

# ======================================
# 9. Run the Agent Workflow
# ======================================
def run_sales_quote(user_query: str) -> str:
    """Execute the full agent workflow"""
    print("\n" + "="*60)
    print(f"üöÄ STARTING QUOTE PROCESSING: '{user_query}'")
    print("="*60)
    initial_state = {
        "user_query": user_query,
        "orchestrator_decision": None,
        "technical_results": [],  # Inicializado aqui
        "pricing_results": [],    # Inicializado aqui
        "final_response": ""
    }
    final_state = app.invoke(initial_state)
    print("\n" + "="*60)
    print("‚úÖ QUOTE PROCESSING COMPLETE")
    print("="*60)
    return final_state["final_response"]
    

# ======================================
# 10. Test Cases
# ======================================
if __name__ == "__main__":
    # Test 1: Technical + Pricing request
    test_query_1 = "I need technical specs for ASA5516-FPWR-K9 and pricing for MR53E-HW"
    result_1 = run_sales_quote(test_query_1)
    print("\nüí¨ CLIENT RESPONSE 1:")
    print(result_1)
    
    # Test 2: Pricing-only request
    test_query_2 = "How much does QSFP-100G-SR4-S cost?"
    result_2 = run_sales_quote(test_query_2)
    print("\nüí¨ CLIENT RESPONSE 2:")
    print(result_2)
    
    # Test 3: Technical-only request
    test_query_3 = "What are the specifications for the Catalyst 9300 switch?"
    result_3 = run_sales_quote(test_query_3)
    print("\nüí¨ CLIENT RESPONSE 3:")
    print(result_3)

‚úÖ Data loaded: 16 products

üöÄ STARTING QUOTE PROCESSING: 'I need technical specs for ASA5516-FPWR-K9 and pricing for MR53E-HW'

üéª [Orchestrator] Analyzing query: 'I need technical specs for ASA5516-FPWR-K9 and pricing for MR53E-HW'
‚úÖ Routing decision: Technical=True Pricing=True
Query parts: {'technical': 'technical specs for ASA5516-FPWR-K9', 'pricing': 'pricing for MR53E-HW'}

üîß [Technical Agent] Processing: 'technical specs for ASA5516-FPWR-K9'
üîç Searching technical specs for: ASA5516-FPWR-K9
  ‚ùå ASA5516-FPWR-K9: Error retrieving technical specs: 'part_number'

üí∞ [Pricing Agent] Processing 1 products...
üîç Searching price for: ASA5516-FPWR-K9
  ‚ùå ASA5516-FPWR-K9: Error retrieving price: 'part_number'

üéØ [Synthesizer] Combining agent results

‚úÖ QUOTE PROCESSING COMPLETE

üí¨ CLIENT RESPONSE 1:
Here's the information you requested:

üì¶ Pricing Information:

‚ö†Ô∏è ASA5516-FPWR-K9: Error retrieving price: 'part_number'


üîß Technical Specifications:



In [158]:
import json
from langchain.tools import tool
from langchain.prompts import ChatPromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_openai import ChatOpenAI
from langgraph.graph import END, StateGraph
from typing import List, Dict, TypedDict, Union, Optional
import re


import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from langchain_core.runnables import RunnableLambda

# ==============================
# 1. Load Product Data (CORRIGIDO)
# ==============================
# ======================================
# 1. Pr√©-processamento de Dados para Recomenda√ß√µes
# ======================================
# Prepara embeddings para busca sem√¢ntica
def prepare_recommendation_data():
    product_texts = []
    for product_id, product in product_dict.items():
        commercial_name = product.get('commercial_name', '')
        product_type = product.get('product_type', '')
        tech_profile = product.get('technical_profile', {})
        hardware = tech_profile.get('hardware_attributes', {})
        
        # Cria texto descritivo para embeddings
        text = f"{commercial_name} {product_type} "
        if hardware:
            text += " ".join([f"{k}={v}" for k,v in hardware.items()])
        product_texts.append(text.strip())
    
    # Cria vetores TF-IDF
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(product_texts)
    
    return vectorizer, tfidf_matrix

# Pr√©-processa dados uma vez no in√≠cio
vectorizer, tfidf_matrix = prepare_recommendation_data()
print("‚úÖ Recommendation data prepared")

# Criar dicion√°rio para acesso r√°pido por ID
product_dict = {}
try:
    pricelist_path = 'data/raw/pricelist.json'
    with open(pricelist_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
        products = data if isinstance(data, list) else data.get('products', [])
        
        # Criar dicion√°rio de produtos indexado por ID
        for product in products:
            product_id = product.get('cisco_product_id')
            if product_id:
                product_dict[product_id] = product
        
    print(f"‚úÖ Data loaded: {len(product_dict)} products")
except Exception as e:
    print(f"‚ùå Error loading data: {str(e)}")
    product_dict = {}

# ===================================
# 2. Agent Tools (COMPLETO E CORRIGIDO)
# ===================================
@tool
def get_product_price(part_number: str) -> Dict:
    """Retrieves pricing information for Cisco products"""
    print(f"üîç Searching price for: {part_number}")
    try:
        product = product_dict.get(part_number)
        if not product:
            return {
                'error': f"Product {part_number} not found",
                'part_number': part_number
            }
        
        pricing = product.get('pricing_model', {})
        return {
            'price': pricing.get('base_price', 0.0),
            'currency': pricing.get('currency', 'USD'),
            'description': product.get('commercial_name', ''),
            'part_number': part_number,
            'product_type': product.get('product_type', '')
        }
    except Exception as e:
        return {
            'error': f"Error retrieving price: {str(e)}",
            'part_number': part_number
        }

@tool
def get_technical_specs(part_number: str) -> Dict:
    """Retrieves technical specifications for Cisco products"""
    print(f"üîç Searching specs for: {part_number}")
    try:
        product = product_dict.get(part_number)
        if not product:
            return {
                'error': f"Product {part_number} not found",
                'part_number': part_number
            }
        
        tech_profile = product.get('technical_profile', {})
        hardware = tech_profile.get('hardware_attributes', {})
        
        if not hardware:
            return {
                'error': f"No technical specs available for {part_number}",
                'part_number': part_number
            }
        
        return {
            'specifications': hardware,
            'description': product.get('commercial_name', ''),
            'part_number': part_number,
            'product_type': product.get('product_type', '')
        }
    except Exception as e:
        return {
            'error': f"Error retrieving specs: {str(e)}",
            'part_number': part_number
        }


# ======================================
# 2. Nova Ferramenta de Recomenda√ß√£o
# ======================================
@tool
def recommend_products(requirements: str, max_results: int = 3) -> List[Dict]:
    """Recommends Cisco products based on technical requirements"""
    print(f"üîç Recommending products for: {requirements}")
    
    try:
        # Transforma a consulta no mesmo espa√ßo vetorial
        query_vec = vectorizer.transform([requirements])
        
        # Calcula similaridade de cosseno
        cosine_similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()
        
        # Obt√©m os √≠ndices dos produtos mais relevantes
        top_indices = np.argsort(cosine_similarities)[::-1][:max_results]
        
        recommendations = []
        product_list = list(product_dict.values())
        for idx in top_indices:
            product = product_list[idx]
            recommendations.append({
                'part_number': product['cisco_product_id'],
                'commercial_name': product['commercial_name'],
                'product_type': product['product_type'],
                'similarity_score': float(cosine_similarities[idx]),
                'description': f"{product['commercial_name']} ({product['cisco_product_id']})"
            })
        
        return recommendations
    
    except Exception as e:
        return [{
            'error': f"Recommendation error: {str(e)}",
            'requirements': requirements
        }]

# ======================================
# 3. Pydantic Models (ATUALIZADO)
# ======================================
class AgentRoutingDecision(BaseModel):
    """Orchestrator's decision about agent routing"""
    needs_technical: bool = Field(description="Whether technical agent is required")
    needs_pricing: bool = Field(description="Whether pricing agent is required")
    needs_design: bool = Field(False, description="Whether solution design is required")  # Novo campo
    query_parts: Dict[str, str] = Field(
        default_factory=dict,
        description="Decomposed query parts for each agent"
    )

class SolutionComponent(BaseModel):
    part_number: str = Field(description="Cisco product ID")
    quantity: int = Field(description="Quantity required", default=1)
    role: str = Field(description="Role in the solution")

class SolutionDesign(BaseModel):
    """Comprehensive solution design for customer requirements"""
    summary: str = Field(description="High-level solution summary")
    components: List[SolutionComponent] = Field(description="List of required products")
    justification: str = Field(description="Technical and business justification")

# ======================================
# 4. Agent Definitions (ATUALIZADO)
# ======================================
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Orchestrator Agent
orchestrator_prompt = ChatPromptTemplate.from_template(
    "You are a Cisco sales orchestration system. Analyze the user query and:\n"
    "1. Determine which specialized agents are needed\n"
    "2. Decompose the query into specific parts for each agent\n"
    "3. ALWAYS include a 'query_parts' dictionary with agent names as keys\n\n"
    "Special handling for solution design requests:\n"
    "- If the user asks for a complete solution, architecture, or system design, set needs_design=True\n"
    "- Examples: 'design a solution for...', 'how to implement...', 'architecture for...'\n\n"
    "Example structure for output:\n"
    "{{\n"
    "  \"needs_design\": true,\n"
    "  \"needs_technical\": false,\n"
    "  \"needs_pricing\": true,\n"
    "  \"query_parts\": {{\n"
    "    \"design\": \"secure cloud infrastructure for healthcare\"\n"
    "  }}\n"
    "}}\n\n"
    "User Query: {query}\n\n"
    "Output Instructions: {format_instructions}"
)
orchestrator_agent = orchestrator_prompt | llm.with_structured_output(AgentRoutingDecision)

# (nova inst√¢ncia, obrigat√≥ria depois de adicionar needs_design)
orchestrator_agent = (
    orchestrator_prompt
    | llm.with_structured_output(AgentRoutingDecision)  # ‚Üê agora inclui needs_design
)

# Agente de Design de Solu√ß√£o
design_prompt = ChatPromptTemplate.from_template(
    "You are a Cisco Solution Architect. Design a complete solution based on the customer requirements:\n\n"
    "Customer Requirements: {requirements}\n\n"
    "Available Cisco Products:\n{product_list}\n\n"
    "Output Instructions: {format_instructions}"
)

# Gerar lista de produtos para contexto (resumida)
def get_product_list_str():
    return "\n".join([f"- {p['cisco_product_id']}: {p['commercial_name']} ({p['product_type']})" 
                      for p in product_dict.values()])

design_agent = (
    {
        "requirements": lambda x: x["requirements"],
        "product_list": RunnableLambda(get_product_list_str),
        "format_instructions": lambda x: x["format_instructions"]
    }
    | design_prompt
    | llm.with_structured_output(SolutionDesign)
)
# ======================================
# 5. State Definition (SIMPLIFICADO)
# ======================================
class AgentState(TypedDict):
    user_query: str
    orchestrator_decision: Optional[AgentRoutingDecision]
    technical_results: List[Dict]
    pricing_results: List[Dict]
    solution_design: Optional[SolutionDesign]  # Novo campo
    final_response: str

# ======================================
# 6. Graph Nodes Implementation (COMPLETO E CORRIGIDO)
# ======================================
def orchestrator_node(state: AgentState) -> AgentState:
    """Analyzes query and plans agent workflow"""
    print(f"\nüéª [Orchestrator] Analyzing query: '{state['user_query']}'")
    
    try:
        decision = orchestrator_agent.invoke({
            "query": state["user_query"],
            "format_instructions": AgentRoutingDecision.schema()
        })
    except Exception as e:
        print(f"‚ö†Ô∏è Orchestrator error: {str(e)}")
        # Fallback decision
        decision = AgentRoutingDecision(
            needs_technical="spec" in state["user_query"].lower(),
            needs_pricing="price" in state["user_query"].lower() or "cost" in state["user_query"].lower(),
            query_parts={}
        )
    
    state["orchestrator_decision"] = decision
    print(f"‚úÖ Routing decision: "
          f"Technical={decision.needs_technical} "
          f"Pricing={decision.needs_pricing}")
    
    if decision.query_parts:
        print(f"Query parts: {decision.query_parts}")
    else:
        print("‚ö†Ô∏è No query parts decomposed, using fallback")
        decision.query_parts = {
            "technical": state["user_query"],
            "pricing": state["user_query"]
        }
    
    return state

# ======================================
# 3. Atualiza√ß√£o do Agente T√©cnico
# ======================================
def technical_agent_node(state: AgentState) -> AgentState:
    """Handles technical aspects including recommendations"""
    if not state["orchestrator_decision"].needs_technical:
        print("‚è© Skipping technical agent (not required)")
        return state
        
    query_part = state["orchestrator_decision"].query_parts.get("technical", "")
    print(f"\nüîß [Technical Agent] Processing: '{query_part}'")
    
    # Verifica se √© uma solicita√ß√£o de recomenda√ß√£o
    is_recommendation_request = any(word in query_part.lower() 
                                   for word in ["recommend", "suggest", "what", "which", "choose"])
    
    # Extrai produtos espec√≠ficos se mencionados
    found_ids = []
    pattern = r'\b([A-Z]{2,}\d+[A-Z]?-\w+-\w+)\b'
    found_ids = re.findall(pattern, query_part)
    
    # Se for solicita√ß√£o de recomenda√ß√£o ou n√£o encontrou produtos espec√≠ficos
    if is_recommendation_request or not found_ids:
        print("üîé Detected recommendation request")
        recommendations = recommend_products.invoke({"requirements": query_part, "max_results": 5})
        
        if recommendations:
            state["technical_results"] = []
            for rec in recommendations:
                if 'error' not in rec:
                    # Obt√©m especifica√ß√µes completas para os recomendados
                    specs = get_technical_specs(rec['part_number'])
                    if 'error' not in specs:
                        specs['recommendation_score'] = rec.get('similarity_score', 0)
                        state["technical_results"].append(specs)
            
            print(f"‚úÖ Generated {len(state['technical_results'])} recommendations")
        else:
            state["technical_results"] = [{
                'error': 'No products match your requirements',
                'query': query_part
            }]
    else:
        # Busca produtos espec√≠ficos mencionados
        state["technical_results"] = []
        for product_id in set(found_ids):
            result = get_technical_specs(product_id)
            state["technical_results"].append(result)
        
        print(f"‚úÖ Found {len(found_ids)} specific products")
    
    # Log dos resultados
    for result in state["technical_results"]:
        if 'error' in result:
            print(f"  ‚ùå {result.get('part_number', 'Unknown')}: {result['error']}")
        else:
            score = result.get('recommendation_score', '')
            score_str = f" [Score: {score:.2f}]" if score else ""
            print(f"  ‚úÖ {result.get('part_number', 'Unknown')}: Specs found{score_str}")
    
    return state


def pricing_agent_node(state: AgentState) -> AgentState:
    """Handles pricing aspects of products"""
    if not state["orchestrator_decision"].needs_pricing:
        print("‚è© Skipping pricing agent (not required)")
        return state
        
    # Usar resultados t√©cnicos se dispon√≠veis
    products = []
    if state["technical_results"]:
        products = [{
            'part_number': item.get('part_number', '')
        } for item in state["technical_results"]]
    
    # Se n√£o tem resultados t√©cnicos, extrair da query de pre√ßos
    if not products:
        query_part = state["orchestrator_decision"].query_parts.get("pricing", "")
        print(f"‚ö†Ô∏è No technical results, extracting from pricing query: '{query_part}'")
        
        # Extrair todos os IDs de produto conhecidos
        known_ids = list(product_dict.keys())
        
        # Encontrar IDs mencionados na query
        for product_id in known_ids:
            if product_id in query_part:
                products.append({'part_number': product_id})
        
        # Se n√£o encontrou, tentar padr√µes Cisco
        if not products:
            pattern = r'\b([A-Z]{2,}\d+[A-Z]?-\w+-\w+)\b'
            found_ids = re.findall(pattern, query_part)
            products = [{'part_number': pid} for pid in found_ids]
    
    if not products:
        print("‚ö†Ô∏è No products identified for pricing")
        products = [{'part_number': 'UNKNOWN'}]
    
    print(f"\nüí∞ [Pricing Agent] Processing {len(products)} products...")
    state["pricing_results"] = []
    
    for product in products:
        pn = product.get('part_number', 'UNKNOWN')
        result = get_product_price(pn)
        state["pricing_results"].append(result)
    
    # Log dos resultados
    for result in state["pricing_results"]:
        status = "‚úÖ" if "price" in result and not result.get('error') else "‚ùå"
        print(f"  {status} {result.get('part_number', 'Unknown')}: "
              f"{result.get('price', result.get('error', 'No info'))}")
    
    return state

def synthesize_response_node(state: AgentState) -> AgentState:
    """Creates final response with solution designs"""
    print("\nüéØ [Synthesizer] Combining agent results")
    
    response = []
    
    # Tratar designs de solu√ß√£o
    if state.get("solution_design"):
        design = state["solution_design"]
        response.append("üöÄ Solution Design:")
        response.append(f"\n{design.summary}")
        
        response.append("\n\nüîß Solution Components:")
        for i, comp in enumerate(design.components, 1):
            # Buscar informa√ß√µes t√©cnicas deste componente
            tech_info = next((t for t in state["technical_results"] 
                            if t.get('part_number') == comp.part_number), {})
            
            desc = tech_info.get('description', comp.part_number)
            response.append(f"\n{i}. {desc} ({comp.quantity}x) - {comp.role}")
            
            # Adicionar especifica√ß√µes se dispon√≠veis
            specs = tech_info.get('specifications', {})
            if specs:
                for key, value in specs.items():
                    response.append(f"   - {key.replace('_', ' ').title()}: {value}")
        
        response.append(f"\n\n‚úÖ Justification:\n{design.justification}")
        
        # Adicionar pre√ßos se dispon√≠veis
        if state["pricing_results"]:
            total = 0
            response.append("\n\nüíµ Pricing Breakdown:")
            for product in state["pricing_results"]:
                if "error" not in product:
                    qty = next((c.quantity for c in design.components 
                               if c.part_number == product['part_number']), 1)
                    price = product.get('price', 0) * qty
                    total += price
                    response.append(
                        f"\n- {product['description']} ({product['part_number']}) "
                        f"{qty}x: {product['currency']} {price:.2f}"
                    )
            response.append(f"\n\nüíé TOTAL ESTIMATED COST: {product['currency']} {total:.2f}")
    
    # Restante da l√≥gica anterior (para consultas n√£o relacionadas a design)
    else:
        # ... (manter l√≥gica anterior para consultas t√©cnicas e de pre√ßo)
        pass
    
    # Sem resultados
    if not response:
        response.append("‚ùå No relevant information found for your query")
    
    state["final_response"] = "\n".join(response)
    return state


def solution_design_node(state: AgentState) -> AgentState:
    """Designs comprehensive solutions based on requirements"""
    print(f"\nüé® [Solution Architect] Designing solution for: {state['user_query']}")
    
    try:
        design = design_agent.invoke({
            "requirements": state["user_query"],
            "format_instructions": SolutionDesign.schema()
        })
        state["solution_design"] = design
        print(f"‚úÖ Solution design created with {len(design.components)} components")
        
        # Extrai componentes para processamento t√©cnico e de pre√ßos
        components = [{"part_number": c.part_number, "quantity": c.quantity} for c in design.components]
        state["technical_results"] = []
        for comp in components:
            # Busca especifica√ß√µes t√©cnicas para cada componente
            result = get_technical_specs(comp['part_number'])
            if 'error' not in result:
                result['quantity'] = comp['quantity']
            state["technical_results"].append(result)
        
        print(f"‚úÖ Technical specs retrieved for {len(components)} components")
        
    except Exception as e:
        state["solution_design"] = {
            "error": f"Solution design failed: {str(e)}"
        }
        print(f"‚ùå Solution design error: {str(e)}")
    
    return state

# ======================================
# 7. Conditional Routing Logic (MANTIDO)
# ======================================
def route_after_orchestrator(state: AgentState) -> str:
    decision = state["orchestrator_decision"]
    
    if decision.needs_design:
        return "solution_designer"
    elif decision.needs_technical:
        return "technical_agent"
    elif decision.needs_pricing:
        return "pricing_agent"
    else:
        return "synthesize"

def route_after_designer(state: AgentState) -> str:
    """After solution design, always go to technical agent first"""
    return "technical_agent"

def route_after_technical(state: AgentState) -> str:
    decision = state["orchestrator_decision"]
    if decision.needs_pricing:
        return "pricing_agent"
    else:
        return "synthesize"

def route_after_pricing(state: AgentState) -> str:
    return "synthesize"

# ======================================
# 8. Build Agent Workflow Graph (MANTIDO)
# ======================================

workflow = StateGraph(AgentState)

# Adicionar n√≥s (incluindo o novo designer de solu√ß√µes)
workflow.add_node("orchestrator", orchestrator_node)
workflow.add_node("solution_designer", solution_design_node)  # Novo n√≥
workflow.add_node("technical_agent", technical_agent_node)
workflow.add_node("pricing_agent", pricing_agent_node)
workflow.add_node("synthesize", synthesize_response_node)


workflow.set_entry_point("orchestrator")

# Roteamento do orquestrador
workflow.add_conditional_edges(
    "orchestrator",
    route_after_orchestrator,
    {
        "solution_designer": "solution_designer",
        "technical_agent": "technical_agent",
        "pricing_agent": "pricing_agent",
        "synthesize": "synthesize"
    }
)

# Roteamento do designer de solu√ß√µes
workflow.add_conditional_edges(
    "solution_designer",
    route_after_designer,
    {
        "technical_agent": "technical_agent"
    }
)

# Roteamento do agente t√©cnico
workflow.add_conditional_edges(
    "technical_agent",
    route_after_technical,
    {
        "pricing_agent": "pricing_agent",
        "synthesize": "synthesize"
    }
)

# Roteamento do agente de pre√ßos
workflow.add_conditional_edges(
    "pricing_agent",
    route_after_pricing,
    {
        "synthesize": "synthesize"
    }
)

workflow.add_edge("synthesize", END)

app = workflow.compile()

# ======================================
# 9. Run the Agent Workflow (ATUALIZADO)
# ======================================
def run_sales_quote(user_query: str) -> str:
    print("\n" + "="*60)
    print(f"üöÄ STARTING QUOTE PROCESSING: '{user_query}'")
    print("="*60)
    
    initial_state = {
        "user_query": user_query,
        "orchestrator_decision": None,
        "technical_results": [],
        "pricing_results": [],
        "final_response": ""
    }
    
    final_state = app.invoke(initial_state)
    
    print("\n" + "="*60)
    print("‚úÖ QUOTE PROCESSING COMPLETE")
    print("="*60)
    
    return final_state["final_response"]



‚úÖ Recommendation data prepared
‚úÖ Data loaded: 16 products


In [None]:
# ======================================
# 10. Test Cases (ATUALIZADO)
# ======================================
if __name__ == "__main__":
    # Test 1: Solicita√ß√£o t√©cnica + pre√ßo
    test_query_1 = "I need technical specs for ASA5516-FPWR-K9 and pricing for MR53E-HW"
    result_1 = run_sales_quote(test_query_1)
    print("\nüí¨ CLIENT RESPONSE 1:")
    print(result_1)
    
    # Test 2: Solicita√ß√£o de pre√ßo com nome comercial
    test_query_2 = "How much does QSFP-100G-SR4-S cost?"
    result_2 = run_sales_quote(test_query_2)
    print("\nüí¨ CLIENT RESPONSE 2:")
    print(result_2)
    
    # Test 3: Solicita√ß√£o t√©cnica com nome comercial
    test_query_3 = "What are the specifications for the Catalyst 9300 switch?"
    result_3 = run_sales_quote(test_query_3)
    print("\nüí¨ CLIENT RESPONSE 3:")
    print(result_3)
    
    # Test 4: Produto n√£o encontrado
    test_query_4 = "Price for NONEXISTENT-PRODUCT-123"
    result_4 = run_sales_quote(test_query_4)
    print("\nüí¨ CLIENT RESPONSE 4:")
    print(result_4)

In [160]:
# Teste de design de solu√ß√£o
test_query = "How can I create a quote for a healthcare client with 200 users who needs cloud security and hybrid infrastructure?"
result = run_sales_quote(test_query)
print(result)



üöÄ STARTING QUOTE PROCESSING: 'How can I create a quote for a healthcare client with 200 users who needs cloud security and hybrid infrastructure?'

üéª [Orchestrator] Analyzing query: 'How can I create a quote for a healthcare client with 200 users who needs cloud security and hybrid infrastructure?'
‚úÖ Routing decision: Technical=True Pricing=True
Query parts: {'technical': 'cloud security and hybrid infrastructure', 'pricing': 'quote for a healthcare client with 200 users'}

üîß [Technical Agent] Processing: 'cloud security and hybrid infrastructure'
üîé Detected recommendation request
üîç Recommending products for: cloud security and hybrid infrastructure
üîç Searching specs for: ASA5555-X
üîç Searching specs for: ASA5525-X
üîç Searching specs for: ASA5516-FPWR-K9
üîç Searching specs for: QSFP-100G-SR4-S
üîç Searching specs for: QSFP-100G-LR4-S
‚úÖ Generated 5 recommendations
  ‚úÖ ASA5555-X: Specs found [Score: 0.21]
  ‚úÖ ASA5525-X: Specs found [Score: 0.20]
  ‚úÖ AS

In [142]:
# Test 1: Solicita√ß√£o t√©cnica + pre√ßo
test_query_1 = "I need technical specs for ASA5516-FPWR-K9 and pricing for MR53E-HW"
result_1 = run_sales_quote(test_query_1)
print("\nüí¨ CLIENT RESPONSE 1:")
print(result_1)


üöÄ STARTING QUOTE PROCESSING: 'I need technical specs for ASA5516-FPWR-K9 and pricing for MR53E-HW'

üéª [Orchestrator] Analyzing query: 'I need technical specs for ASA5516-FPWR-K9 and pricing for MR53E-HW'
‚úÖ Routing decision: Technical=True Pricing=True
Query parts: {'technical': 'technical specs for ASA5516-FPWR-K9', 'pricing': 'pricing for MR53E-HW'}

üîß [Technical Agent] Processing: 'technical specs for ASA5516-FPWR-K9'
üîç Searching specs for: ASA5516-FPWR-K9
‚úÖ Found 1 specific products
  ‚úÖ ASA5516-FPWR-K9: Specs found

üí∞ [Pricing Agent] Processing 1 products...
üîç Searching price for: ASA5516-FPWR-K9
  ‚úÖ ASA5516-FPWR-K9: 5995.0

üéØ [Synthesizer] Combining agent results

‚úÖ QUOTE PROCESSING COMPLETE

üí¨ CLIENT RESPONSE 1:

üîß Technical Specifications:

‚Ä¢ ASA 5516-X with FirePOWER Services (ASA5516-FPWR-K9)
  - Category: security
  - Subcategory: firewall
  - Throughput: 4 Gbps
  - Interfaces: 8x GE
  - Vpn Throughput: 1 Gbps
  - Threat Throughput: 500 

In [156]:
# Test 2: Solicita√ß√£o de pre√ßo com nome comercial
test_query_2 = "I need some firewall with throughput of only 4 Gbps and a price igual to 5995.00, what do  you recommend?"
result_2 = run_sales_quote(test_query_2)
print("\nüí¨ CLIENT RESPONSE 2:")
print(result_2)


üöÄ STARTING QUOTE PROCESSING: 'I need some firewall with throughput of only 4 Gbps and a price igual to 5995.00, what do  you recommend?'

üéª [Orchestrator] Analyzing query: 'I need some firewall with throughput of only 4 Gbps and a price igual to 5995.00, what do  you recommend?'
‚úÖ Routing decision: Technical=True Pricing=True
Query parts: {'technical': 'firewall recommendations with throughput of 4 Gbps', 'pricing': 'price equal to 5995.00'}

üîß [Technical Agent] Processing: 'firewall recommendations with throughput of 4 Gbps'
üîé Detected recommendation request
üîç Recommending products for: firewall recommendations with throughput of 4 Gbps
üîç Searching specs for: ASA5555-X
üîç Searching specs for: ASA5525-X
üîç Searching specs for: ASA5516-FPWR-K9
üîç Searching specs for: MR53E-HW
üîç Searching specs for: MR42E-HW
‚úÖ Generated 5 recommendations
  ‚úÖ ASA5555-X: Specs found [Score: 0.64]
  ‚úÖ ASA5525-X: Specs found [Score: 0.62]
  ‚úÖ ASA5516-FPWR-K9: Specs found

In [174]:
# =============================================================
# Cisco Sales Assistant ‚Äì fluxo completo (julho/2025)
# =============================================================
"""
Cria um agente LangGraph capaz de:
  ‚Ä¢ analisar a consulta do cliente
  ‚Ä¢ projetar uma solu√ß√£o (Solution Designer)
  ‚Ä¢ buscar especifica√ß√µes t√©cnicas
  ‚Ä¢ precificar os componentes
  ‚Ä¢ sintetizar tudo em uma resposta final

Requisitos:
  pip install langchain langgraph langchain-openai scikit-learn numpy
"""

# -------------------------------------------------------------
# 0. Imports
# -------------------------------------------------------------
import json
import re
from typing import List, Dict, TypedDict, Optional

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from langchain.tools import tool
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import RunnableLambda
from langgraph.graph import END, StateGraph

# -------------------------------------------------------------
# 1. Carregar cat√°logo Cisco
# -------------------------------------------------------------
product_dict: Dict[str, Dict] = {}
PRICELIST_PATH = "data/raw/pricelist.json"      # ajuste se necess√°rio

try:
    with open(PRICELIST_PATH, encoding="utf-8") as f:
        data = json.load(f)
        products = data if isinstance(data, list) else data.get("products", [])
        for p in products:
            pid = p.get("cisco_product_id")
            if pid:
                product_dict[pid] = p
    print(f"‚úÖ Data loaded: {len(product_dict)} products")
except Exception as e:
    print(f"‚ùå Error loading product data: {e}")

# -------------------------------------------------------------
# 2. Preparar embeddings TF‚ÄëIDF para recomenda√ß√µes
# -------------------------------------------------------------
def prepare_recommendation_data():
    """Gera matriz TF‚ÄëIDF a partir do cat√°logo para busca sem√¢ntica."""
    texts: List[str] = []
    for p in product_dict.values():
        hw = p.get("technical_profile", {}).get("hardware_attributes", {})
        txt = (
            f"{p.get('commercial_name', '')} {p.get('product_type', '')} "
            + " ".join(f"{k}={v}" for k, v in hw.items())
        ).strip()
        texts.append(txt)
    vectorizer = TfidfVectorizer(stop_words="english")
    matrix = vectorizer.fit_transform(texts) if texts else None
    return vectorizer, matrix


vectorizer, tfidf_matrix = prepare_recommendation_data()
print("‚úÖ Recommendation data prepared")

# -------------------------------------------------------------
# 3. Helper ‚Äì lista enxuta de produtos (TOP‚ÄëK)
# -------------------------------------------------------------
def get_product_list_str(requirements: str, top_k: int = 50) -> str:
    if tfidf_matrix is None:
        return "(catalog empty)"
    vec = vectorizer.transform([requirements])
    sims = cosine_similarity(vec, tfidf_matrix).flatten()
    idxs = sims.argsort()[::-1][:top_k]
    prods = [list(product_dict.values())[i] for i in idxs]
    return "\n".join(
        f"- {p['cisco_product_id']}: {p['commercial_name']} "
        f"({p['product_type']})"
        for p in prods
    )

# -------------------------------------------------------------
# 4. Ferramentas (LangChain¬†@tool)
# -------------------------------------------------------------
@tool
def get_product_price(part_number: str) -> Dict:
    """Retrieve pricing information for a Cisco product."""
    prod = product_dict.get(part_number)
    if not prod:
        return {"error": f"Product {part_number} not found", "part_number": part_number}
    pricing = prod.get("pricing_model", {})
    return {
        "price": pricing.get("base_price", 0.0),
        "currency": pricing.get("currency", "USD"),
        "description": prod.get("commercial_name", ""),
        "part_number": part_number,
        "product_type": prod.get("product_type", ""),
    }


@tool
def get_technical_specs(part_number: str) -> Dict:
    """Retrieve hardware specifications for a Cisco product."""
    prod = product_dict.get(part_number)
    if not prod:
        return {"error": f"Product {part_number} not found", "part_number": part_number}
    hw = prod.get("technical_profile", {}).get("hardware_attributes", {})
    if not hw:
        return {"error": f"No technical specs for {part_number}", "part_number": part_number}
    return {
        "specifications": hw,
        "description": prod.get("commercial_name", ""),
        "part_number": part_number,
        "product_type": prod.get("product_type", ""),
    }


@tool
def recommend_products(requirements: str, max_results: int = 3) -> List[Dict]:
    """Recommend Cisco products that best match the given requirements."""
    if tfidf_matrix is None:
        return [{"error": "Catalog not indexed"}]
    vec = vectorizer.transform([requirements])
    sims = cosine_similarity(vec, tfidf_matrix).flatten()
    idxs = sims.argsort()[::-1][:max_results]
    base = list(product_dict.values())
    return [
        {
            "part_number": base[i]["cisco_product_id"],
            "commercial_name": base[i]["commercial_name"],
            "product_type": base[i]["product_type"],
            "similarity_score": float(sims[i]),
        }
        for i in idxs
    ]

# -------------------------------------------------------------
# 5. Pydantic¬†models
# -------------------------------------------------------------
class SolutionComponent(BaseModel):
    part_number: str = Field(description="Cisco product ID")
    quantity: int = Field(default=1, description="Quantity required")
    role: str = Field(description="Role in the solution")


class SolutionDesign(BaseModel):
    summary: str
    components: List[SolutionComponent]
    justification: str


class AgentRoutingDecision(BaseModel):
    needs_design: bool = False
    needs_technical: bool = False
    needs_pricing: bool = False
    query_parts: Dict[str, str] = Field(default_factory=dict)

# -------------------------------------------------------------
# 6. LLM¬†e¬†prompts
# -------------------------------------------------------------
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# ‚Äî Orchestrator (corrigido)
orchestrator_prompt = ChatPromptTemplate.from_template(
    """You are a Cisco sales orchestration system.

Analyse the user query and decide which specialised agents are needed:
  ‚Ä¢ Solution Designer   ‚Üí needs_design
  ‚Ä¢ Technical Agent     ‚Üí needs_technical
  ‚Ä¢ Pricing Agent       ‚Üí needs_pricing

ALWAYS output a JSON object that matches the schema shown in
{{format_instructions}}.

User query: {{query}}
"""
)

orchestrator_agent = orchestrator_prompt | llm.with_structured_output(
    AgentRoutingDecision
)

# ‚Äî Solution¬†Designer
design_prompt = ChatPromptTemplate.from_template(
    """You are a Cisco Solution Architect. Design a complete solution.

Customer Requirements:
{requirements}

Available Cisco Products:
{product_list}

Return only part_numbers that appear above.
Output as JSON in the schema provided."""
)
design_agent = (
    {
        "requirements": lambda x: x["requirements"],
        "product_list": lambda x: get_product_list_str(x["requirements"]),
        "format_instructions": lambda x: x["format_instructions"],
    }
    | design_prompt
    | llm.with_structured_output(SolutionDesign)
)

# -------------------------------------------------------------
# 7. State¬†type
# -------------------------------------------------------------
class AgentState(TypedDict):
    user_query: str
    orchestrator_decision: Optional[AgentRoutingDecision]
    solution_design: Optional[SolutionDesign]
    technical_results: List[Dict]
    pricing_results: List[Dict]
    final_response: str

# -------------------------------------------------------------
# 8. N√≥¬†‚Äî¬†Orchestrator
# -------------------------------------------------------------
def orchestrator_node(state: AgentState) -> AgentState:
    print(f"\nüéª [Orchestrator] ¬´{state['user_query']}¬ª")

    q = state["user_query"]

    # 1) tentativa normal com o LLM
    try:
        decision = orchestrator_agent.invoke(
            {
                "query": q,
                "format_instructions": AgentRoutingDecision.schema(),
            }
        )
    except Exception:
        print("‚ö†Ô∏è LLM parse fail ‚Üí empty decision")
        decision = AgentRoutingDecision()

    # 2) heur√≠stica se vier tudo falso
    if not any([decision.needs_design, decision.needs_technical, decision.needs_pricing]):
        q_low = q.lower()
        decision = AgentRoutingDecision(
            needs_design=any(w in q_low for w in ["design", "architecture", "solution"]),
            needs_technical="spec" in q_low,
            needs_pricing=any(w in q_low for w in ["price", "cost", "quote", "pricing"]),
            query_parts={},
        )

    # 3) salva no estado e retorna
    state["orchestrator_decision"] = decision
    return state


# -------------------------------------------------------------
# 9. N√≥¬†‚Äî¬†Solution¬†Designer
# -------------------------------------------------------------
def solution_design_node(state: AgentState) -> AgentState:
    print("\nüé® [Solution Designer]")
    design = design_agent.invoke(
        {"requirements": state["user_query"], "format_instructions": SolutionDesign.schema()}
    )
    state["solution_design"] = design
    # for√ßa que o agente de pre√ßo rode depois
    state["orchestrator_decision"].needs_pricing = True

    # specs de cada componente
    state["technical_results"] = []
    for comp in design.components:
        res = get_technical_specs(comp.part_number)
        if "error" not in res:
            res["quantity"] = comp.quantity
        state["technical_results"].append(res)
    return state

# -------------------------------------------------------------
# 10. N√≥ ‚Äî Technical Agent  (substitua TODO o bloco)
# -------------------------------------------------------------
### helper: extrai poss√≠veis Cisco part‚Äënumbers do texto
PART_RE = re.compile(r"[A-Z]{2,}\d+[A-Z]*-[A-Za-z0-9]+(?:-[A-Za-z0-9]+)*")

def extract_part_numbers(text: str) -> List[str]:
    return list({m.group(0) for m in PART_RE.finditer(text)})

def technical_agent_node(state: AgentState) -> AgentState:
    # pula se j√° h√° design
    if state.get("solution_design") is not None:
        print("‚è© Technical Agent skipped (solution design already provides specs)")
        return state

    query_part = state["orchestrator_decision"].query_parts.get(
        "technical", state["user_query"]
    )
    ids = extract_part_numbers(query_part)

    if ids:
        print(f"\nüîß [Technical Agent] Found explicit IDs: {ids}")
        state["technical_results"] = [get_technical_specs(pid) for pid in ids]
        return state

    # fallback para recomenda√ß√£o sem IDs
    if not state["orchestrator_decision"].needs_technical:
        print("‚è© Technical Agent skipped (flag false & no IDs)")
        return state

    print(f"\nüîß [Technical Agent] Generating recommendations for ¬´{query_part}¬ª")
    recs = recommend_products.invoke({"requirements": query_part, "max_results": 5})
    state["technical_results"] = []
    for r in recs:
        spec = get_technical_specs(r["part_number"])
        if "error" not in spec:
            spec["recommendation_score"] = r.get("similarity_score", 0)
        state["technical_results"].append(spec)
    return state


# -------------------------------------------------------------
# 11. N√≥ ‚Äî Pricing Agent  (substitua TODO o bloco)
# -------------------------------------------------------------
def pricing_agent_node(state: AgentState) -> AgentState:
    if not state["orchestrator_decision"].needs_pricing:
        print("‚è© Pricing Agent skipped")
        return state

    print("\nüí∞ [Pricing Agent]")

    # 1) Se houver design ‚Üí componentes do design
    if isinstance(state.get("solution_design"), SolutionDesign):
        items = [
            {"part_number": c.part_number, "quantity": c.quantity}
            for c in state["solution_design"].components
        ]
    else:
        # 2) Extrai IDs mencionados na parte de pre√ßo da query
        pricing_part = state["orchestrator_decision"].query_parts.get("pricing", state["user_query"])
        ids = extract_part_numbers(pricing_part)
        if ids:
            items = [{"part_number": pid, "quantity": 1} for pid in ids]
        else:
            # 3) fallback: usa IDs de technical_results
            items = [
                {"part_number": t.get("part_number"), "quantity": t.get("quantity", 1)}
                for t in state["technical_results"] if t.get("part_number")
            ]

    # deduplicar
    dedup: Dict[str, int] = {}
    for it in items:
        dedup[it["part_number"]] = dedup.get(it["part_number"], 0) + it["quantity"]

    state["pricing_results"] = []
    for pn, qty in dedup.items():
        price_info = get_product_price(pn)
        price_info.update(
            {
                "quantity": qty,
                "subtotal": price_info.get("price", 0) * qty,
            }
        )
        state["pricing_results"].append(price_info)

    return state



# -------------------------------------------------------------
# 12. N√≥¬†‚Äî¬†Synthesizer
# -------------------------------------------------------------
def synthesize_node(state: AgentState) -> AgentState:
    print("\nüéØ [Synthesizer]")
    lines: List[str] = []

    # caso haja Solution Design
    if isinstance(state.get("solution_design"), SolutionDesign):
        d = state["solution_design"]
        lines.append("üöÄ Solution Design\n" + d.summary)
        lines.append("\nüîß Components:")
        for i, c in enumerate(d.components, 1):
            desc = next(
                (
                    t.get("description")
                    for t in state["technical_results"]
                    if t.get("part_number") == c.part_number
                ),
                c.part_number,
            )
            lines.append(f"{i}. {desc} ({c.quantity}√ó) ‚Äì {c.role}")
        lines.append("\n‚úÖ Justification:\n" + d.justification)

    # pre√ßos
    if state["pricing_results"]:
        total = 0.0
        currency = "USD"
        lines.append("\nüíµ Pricing:")
        for p in state["pricing_results"]:
            if "error" in p:
                lines.append(f"- {p.get('part_number')}: {p['error']}")
                continue
            currency = p["currency"]
            total += p["subtotal"]
            lines.append(
                f"- {p['description']} ({p['quantity']}√ó): "
                f"{currency} {p['subtotal']:.2f}"
            )
        lines.append(f"\nTOTAL ESTIMATED: {currency} {total:.2f}")



    if not lines:
        lines.append("‚ùå No relevant information found")

    state["final_response"] = "\n".join(lines)
    return state

# -------------------------------------------------------------
# 13. Roteamento
# -------------------------------------------------------------
def route_after_orch(state: AgentState) -> str:
    dec = state["orchestrator_decision"]
    if dec.needs_design:
        return "designer"
    if dec.needs_technical:
        return "tech"
    if dec.needs_pricing:
        return "price"
    return "synth"


def route_after_designer(_):  # sempre vai para specs
    return "tech"


def route_after_tech(state: AgentState) -> str:
    return "price" if state["orchestrator_decision"].needs_pricing else "synth"


def route_after_price(_):
    return "synth"

# -------------------------------------------------------------
# 14. Construir o¬†grafo
# -------------------------------------------------------------
workflow = StateGraph(AgentState)
workflow.add_node("orch", orchestrator_node)
workflow.add_node("designer", solution_design_node)
workflow.add_node("tech", technical_agent_node)
workflow.add_node("price", pricing_agent_node)
workflow.add_node("synth", synthesize_node)

workflow.set_entry_point("orch")
workflow.add_conditional_edges("orch", route_after_orch, {
    "designer": "designer",
    "tech": "tech",
    "price": "price",
    "synth": "synth",
})
workflow.add_conditional_edges("designer", route_after_designer, {"tech": "tech"})
workflow.add_conditional_edges("tech", route_after_tech, {"price": "price", "synth": "synth"})
workflow.add_conditional_edges("price", route_after_price, {"synth": "synth"})
workflow.add_edge("synth", END)

app = workflow.compile()

# -------------------------------------------------------------
# 15. Helper para executar
# -------------------------------------------------------------
def run_sales_quote(query: str) -> str:
    init: AgentState = {
        "user_query": query,
        "orchestrator_decision": None,
        "solution_design": None,
        "technical_results": [],
        "pricing_results": [],
        "final_response": "",
    }
    final_state = app.invoke(init)
    return final_state["final_response"]

# -------------------------------------------------------------
# 16. Exemplo
# -------------------------------------------------------------
if __name__ == "__main__":
    q = (
        "Design a secure branch‚Äëoffice solution for 50 users with Wi‚ÄëFi¬†6, "
        "firewall and PoE switches. Provide pricing."
    )
    print(run_sales_quote(q))


‚úÖ Data loaded: 16 products
‚úÖ Recommendation data prepared

üéª [Orchestrator] ¬´Design a secure branch‚Äëoffice solution for 50 users with Wi‚ÄëFi¬†6, firewall and PoE switches. Provide pricing.¬ª

üé® [Solution Designer]
‚è© Technical Agent skipped (solution design already provides specs)

üí∞ [Pricing Agent]

üéØ [Synthesizer]
üöÄ Solution Design
Design a secure branch-office solution for 50 users with Wi-Fi 6, firewall, and PoE switches.

üîß Components:
1. ASA 5516-X with FirePOWER Services (1√ó) ‚Äì Firewall with FirePOWER Services
2. Meraki MR53E Access Point (3√ó) ‚Äì Wi-Fi 6 Access Points for coverage and capacity
3. ASA 5555-X Firewall (1√ó) ‚Äì Additional Firewall for redundancy and security

‚úÖ Justification:
The ASA5516-FPWR-K9 provides robust firewall capabilities with FirePOWER services, suitable for securing the branch office. The MR53E-HW access points support Wi-Fi 6, ensuring high performance and capacity for 50 users. The additional ASA5555-X firewall offe

In [1]:
import pandas as pd

# Define the path to your problematic JSON file
# NOTE: The file name might be different on your end. Please double-check.
problem_file_path = 'data/raw/pricelist.json' 
corrected_file_path = 'data/raw/pricelist_corrected.json' # We'll save it as a new file

try:
    print(f"Attempting to read the file: {problem_file_path}")
    
    # Read the file using 'latin-1' encoding, which is a common fallback
    with open(problem_file_path, 'r', encoding='latin-1') as f:
        content = f.read()

    # Write the content back out using the standard 'utf-8' encoding
    with open(corrected_file_path, 'w', encoding='utf-8') as f:
        f.write(content)
        
    print(f"‚úÖ Success! File has been re-saved with correct UTF-8 encoding at: {corrected_file_path}")
    print("\nPlease update your 'tools.py' and 'scripts/ingest_data.py' files to use this new filename.")

except FileNotFoundError:
    print(f"‚ùå ERROR: Could not find the file at '{problem_file_path}'. Please check the path and filename.")
except Exception as e:
    print(f"‚ùå An unexpected error occurred: {e}")

Attempting to read the file: data/raw/pricelist.json
‚úÖ Success! File has been re-saved with correct UTF-8 encoding at: data/raw/pricelist_corrected.json

Please update your 'tools.py' and 'scripts/ingest_data.py' files to use this new filename.


In [3]:
import json
from pathlib import Path

paths = [
    Path("data/processed/vector_store/docstore.json"),
    Path("data/processed/vector_store/index_store.json"),
]

for p in paths:
    print(f"\n--- lendo {p} ---")
    if not p.exists():
        print("n√£o existe")
        continue
    try:
        with open(p, encoding="utf-8") as f:
            _ = json.load(f)
        print("OK: √© JSON UTF-8 v√°lido")
    except UnicodeDecodeError as e:
        print("Erro de decodifica√ß√£o UTF-8:", e)
        try:
            with open(p, encoding="latin-1") as f:
                sample = f.read(500)
            print("Conte√∫do inicial (latin-1):", repr(sample))
        except Exception as e2:
            print("N√£o conseguiu ler nem em latin-1:", e2)
    except json.JSONDecodeError as e:
        print("Arquivo √© texto mas n√£o √© JSON v√°lido:", e)



--- lendo data\processed\vector_store\docstore.json ---
OK: √© JSON UTF-8 v√°lido

--- lendo data\processed\vector_store\index_store.json ---
OK: √© JSON UTF-8 v√°lido


In [5]:
import json
from pathlib import Path

RAW = Path("data/raw/pricelist.json")
CLEAN = Path("data/raw/pricelist_cleaned.json")

def clean_input_json():
    # 1. Leia em bin√°rio e decodifique com replacement para mostrar/neutralizar bytes inv√°lidos
    raw_bytes = RAW.read_bytes()
    try:
        text = raw_bytes.decode("utf-8")
        print("Arquivo j√° √© UTF-8 v√°lido.")
    except UnicodeDecodeError as e:
        print(f"Decoding error: {e}; repondo bytes inv√°lidos com replacement.")
        text = raw_bytes.decode("utf-8", errors="replace")  # substitui os inv√°lidos por ÔøΩ

    # 2. Tente carregar como JSON (isso tamb√©m valida estrutura)
    try:
        data = json.loads(text)
    except json.JSONDecodeError as e:
        print(f"Erro de JSON: {e}. Pode haver problemas estruturais al√©m do encoding.")
        raise

    # 3. Reescreve de forma ‚Äúlimpa‚Äù com UTF-8 sem BOM, indentado (opcional)
    with CLEAN.open("w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    print(f"Arquivo limpo salvo em {CLEAN}")

if __name__ == "__main__":
    clean_input_json()


Arquivo j√° √© UTF-8 v√°lido.
Arquivo limpo salvo em data\raw\pricelist_cleaned.json


In [7]:
import unicodedata

def normalize_text(s: str) -> str:
    if s is None:
        return ""
    return unicodedata.normalize("NFKC", s)

# ao montar text_content:
text_content = (
    f"Product: {normalize_text(product.get('commercial_name', ''))} (SKU: {normalize_text(product.get('cisco_product_id', ''))})\n"
    f"Description: {normalize_text(product.get('description', ''))}\n"
    f"Category: {normalize_text(product.get('technical_profile', {}).get('category', ''))}"
)


NameError: name 'product' is not defined

In [9]:
import unicodedata

def normalize_text(s: str) -> str:
    if s is None:
        return ""
    return unicodedata.normalize("NFKC", s)

# dentro do loop de cria√ß√£o dos documentos:
documents = []
for product in products:
    text_content = (
        f"Product: {normalize_text(product.get('commercial_name', ''))} "
        f"(SKU: {normalize_text(product.get('cisco_product_id', ''))})\n"
        f"Description: {normalize_text(product.get('description', ''))}\n"
        f"Category: {normalize_text(product.get('technical_profile', {}).get('category', ''))}"
    )
    doc = Document(
        text=text_content,
        metadata={
            "sku": normalize_text(product.get('cisco_product_id', '')),
            "name": normalize_text(product.get('commercial_name', '')),
            "full_data_json": json.dumps(product)  # esse pode ficar cru se quiser
        }
    )
    documents.append(doc)


NameError: name 'products' is not defined

In [13]:
import json
import unicodedata
from pathlib import Path

def normalize_text(s: str) -> str:
    if s is None:
        return ""
    return unicodedata.normalize("NFKC", s)

# --- 2. Load & clean data (em mem√≥ria, sem renomear o arquivo) ---
try:
    raw_path = Path("data/raw/pricelist.json")
    raw_bytes = raw_path.read_bytes()
    try:
        text = raw_bytes.decode("utf-8")
    except UnicodeDecodeError:
        # substitui caracteres inv√°lidos para n√£o quebrar o JSON
        text = raw_bytes.decode("utf-8", errors="replace")
        logging.warning("Encontrados bytes inv√°lidos no JSON de input; corrigidos com replacement.")

    data = json.loads(text)
    products = data if isinstance(data, list) else data.get("products", [])
    logging.info(f"Successfully loaded {len(products)} products from {raw_path}")
except Exception as e:
    logging.error(f"Failed to load or parse data/raw/pricelist.json: {e}")
    return

# --- 3. Create LlamaIndex Document Objects com normaliza√ß√£o ---
documents = []
for product in products:
    commercial_name = normalize_text(product.get("commercial_name", ""))
    sku = normalize_text(product.get("cisco_product_id", ""))
    description = normalize_text(product.get("description", ""))
    category = normalize_text(product.get("technical_profile", {}).get("category", ""))

    text_content = (
        f"Product: {commercial_name} (SKU: {sku})\n"
        f"Description: {description}\n"
        f"Category: {category}"
    )
    doc = Document(
        text=text_content,
        metadata={
            "sku": sku,
            "name": commercial_name,
            "full_data_json": json.dumps(product, ensure_ascii=False)
        }
    )
    documents.append(doc)
logging.info(f"Created {len(documents)} LlamaIndex Document objects.")


SyntaxError: 'return' outside function (1827490101.py, line 26)

In [15]:
import json

def fix_json_errors(data):
    for product in data:
        # Corre√ß√£o para QSFP-100G-SR4-S
        if product.get("cisco_product_id") == "QSFP-100G-SR4-S":
            # Corrigir pre√ßo inconsistente
            if "pricing_model" in product:
                base_price = product["pricing_model"]["base_price"]
                for tier in product["pricing_model"].get("pricing_tiers", []):
                    tier["price"] = base_price
            
            # Remover depend√™ncias incorretas
            if "dependencies" in product:
                del product["dependencies"]
        
        # Corre√ß√£o para ASA5555-X
        elif product.get("cisco_product_id") == "ASA5555-X":
            if "pricing_model" in product:
                for tier in product["pricing_model"].get("pricing_tiers", []):
                    for rule in tier.get("discount_rules", []):
                        if rule.get("type") == "enterprise":
                            rule["type"] = "partner"
                            rule["level"] = "enterprise"

# Carregar o arquivo original
with open("data/raw/pricelist_cleaned.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Aplicar corre√ß√µes
fix_json_errors(data)

# Salvar arquivo corrigido
with open("data/raw/pricelist_fixed.json", "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print("Arquivo corrigido salvo como pricelist_fixed.json")

Arquivo corrigido salvo como pricelist_fixed.json


In [5]:
# --- Notebook cell: Normalizador v2 com enriquecimento por SKU ---
from pathlib import Path
import pandas as pd
import json, re
from datetime import date
from typing import Optional, Dict, List, Any

# ================= CONFIG =================
XLSX_PATH   = Path("data/raw/Cisco_Pricing.xlsx")
OUT_BASE    = Path("data/normalized")
CURRENCY    = "USD"

ENRICH_WITH_LLM = True      # >>> troque para True se quiser enriquecer
OPENAI_MODEL    = "gpt-4o-mini"

# ================= OpenAI (opcional) =================
client = None
if ENRICH_WITH_LLM:
    from openai import OpenAI
    client = OpenAI()

# ================= Helpers =================
def find_col(df: pd.DataFrame, substrs: list[str]) -> Optional[str]:
    for col in df.columns:
        lc = str(col).strip().lower()
        if any(s in lc for s in substrs):
            return col
    return None

def clean_price(val: Any) -> float:
    s = str(val)
    s = re.sub(r"[^\d,\.]", "", s)
    s = re.sub(r"\.(?=\d{3},)", "", s)  # 1.234,56 -> 1234,56
    s = s.replace(",", ".")
    try:
        return round(float(s), 2)
    except:
        return 0.0

def clean_pct(val: Any) -> float:
    s = re.sub(r"[^\d,\.]", "", str(val)).replace(",", ".")
    try:
        return float(s)/100.0
    except:
        return 0.0

def term_months_from_text(text: str) -> Optional[int]:
    if not text: return None
    t = text.upper()
    m = re.search(r"(\d+)\s*Y(R|EARS?)?", t) or re.search(r"(\d+)\s*YR", t)
    if m: return int(m.group(1)) * 12
    m = re.search(r"(\d+)\s*MONTH", t)
    if m: return int(m.group(1))
    m = re.search(r"-(\d+)YR\b", t)
    if m: return int(m.group(1)) * 12
    return None

def load_json_list(path: Path) -> List[dict]:
    if path.exists():
        with open(path, "r", encoding="utf-8") as f:
            try:
                return json.load(f)
            except Exception:
                return []
    return []

def save_json_list(path: Path, data: List[dict]):
    path.parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

# ================= Categorias & Blueprints =================
# Campos esperados por categoria (hardware/software).
HW_ATTR_BLUEPRINT = {
    "switches":   ["port_count","poe_budget_w","uplinks","uplink_speed","layer","stacking","throughput_gbps"],
    "routers":    ["wan_ports","lan_ports","throughput_gbps","sdwan_support","vpn_tps","modules_supported"],
    "wireless":   ["wifi_standard","spatial_streams","mimo","antenna_type","indoor_outdoor","ip_rating","poe_class"],
    "firewall":   ["fw_throughput_gbps","ips_gbps","max_sessions","interfaces","ha_supported","vpn_peers"],
    "antennas":   ["bands","gain_dbi","connector_type","polarization","beamwidth","ip_rating"],
    "cabling":    ["type","length_m","connector_a","connector_b","shielding"],
    "connectors": ["type","gender","impedance_ohm","application"],
}

SW_ATTR_BLUEPRINT = {
    "wireless":           {"features": [], "bundles": []},
    "switches":           {"features": [], "bundles": []},
    "routers":            {"features": [], "bundles": []},
    "firewall":           {"features": [], "bundles": []},
    "sw_support_license": {"support_level": None, "sla_hrs": None, "on_site": None}
}

# Mapeamento de categoria da planilha -> slug final
CANON = {
    "hardware": {
        "antennas": "antennas",
        "cabling": "cabling",
        "connectors": "connectors",
        "firewall": "firewall",
        "routers": "routers",
        "switches": "switches",
        "wireless": "wireless",
    },
    "software": {
        "firewall": "firewall",
        "routers": "routers",
        "sw support license": "sw_support_license",
        "switches": "switches",
        "wireless": "wireless",
    }
}

def canon_hw(raw: str) -> str:
    k = raw.strip().lower()
    return CANON["hardware"].get(k, re.sub(r"[^a-z0-9]+","_", k))

def canon_sw(raw: str) -> str:
    k = raw.strip().lower()
    return CANON["software"].get(k, re.sub(r"[^a-z0-9]+","_", k))

# ================= Templates de registro =================
def base_product(sku: str, name: str, ptype: str) -> Dict:
    return {
        "cisco_product_id": sku,
        "commercial_name": name,
        "product_type": ptype,
        "lifecycle": {"status": "active", "eos_announced": None, "last_support_date": None},
        "regulatory": {"certifications": []}
    }

def hardware_record(sku: str, name: str, category: str, price: float, elig_frac: float) -> Dict:
    return {
        **base_product(sku, name, "hardware"),
        "technical_profile": {
            "category": category,
            "subcategory": "",
            "hardware_attributes": {k: None for k in HW_ATTR_BLUEPRINT.get(category, [])}
        },
        "pricing_model": {
            "type": "one_time",
            "currency": CURRENCY,
            "base_price": price,
            "elig_pct": elig_frac,
            "pricing_tiers": [{
                "min_quantity": 1,
                "price": price,
                "effective": str(date.today()),
                "discount_rules": []
            }]
        },
        "dependencies": {"required_components": [], "compatible_with": []}
    }

def software_record(sku: str, name: str, category: str, price: float, elig_frac: float, desc: str) -> Dict:
    term = term_months_from_text(sku) or term_months_from_text(name) or term_months_from_text(desc) or 12
    needs_hw = bool(re.search(r"(meraki|lic-)", f"{sku} {name} {desc}", re.I))
    profile = {"category": category, "license_model": "term", "term_months": int(term), "seat_unit": "device", "requires_hardware": needs_hw}

    # preenche os campos per-category
    if category in SW_ATTR_BLUEPRINT:
        blueprint = SW_ATTR_BLUEPRINT[category]
        profile.update(blueprint)

    return {
        **base_product(sku, name, "software"),
        "software_profile": profile,
        "entitlement": {"smart_account_required": needs_hw, "partner_tier_pricing": {}},
        "pricing_model": {
            "type": "subscription",
            "currency": CURRENCY,
            "list_price": price,
            "elig_pct": elig_frac,
            "billing": "prepaid",
            "term_months": int(term),
            "pricing_tiers": [{
                "min_quantity": 1,
                "price": price,
                "term_months": int(term),
                "effective": str(date.today()),
                "discount_rules": []
            }]
        }
    }

# ================= Enriquecimento 1-a-1 (opcional) =================
def llm_enrich_attributes(product: Dict) -> Dict:
    """Enriquece APENAS os campos previstos no blueprint, 1 SKU por vez."""
    if not (ENRICH_WITH_LLM and client):
        return product

    if product["product_type"] == "hardware":
        cat = product["technical_profile"]["category"]
        allowed = HW_ATTR_BLUEPRINT.get(cat, [])
        if not allowed:
            return product

        schema_hint = {k: "<value>" for k in allowed}
        prompt = f"""
Return JSON ONLY. Fill the keys in "hardware_attributes" using the allowed keys below.
Do not invent SKU/price. If unknown, keep null. No extra keys.

SKU: {product['cisco_product_id']}
Name: {product['commercial_name']}
Category: {cat}

Allowed keys: {allowed}

Respond exactly as:
{{
  "hardware_attributes": {json.dumps(schema_hint)}
}}
""".strip()
        try:
            resp = client.chat.completions.create(
                model=OPENAI_MODEL,
                response_format={"type": "json_object"},
                messages=[{"role":"user","content": prompt}],
                temperature=0.1
            )
            data = json.loads(resp.choices[0].message.content)
            attrs = data.get("hardware_attributes", {})
            # filtra apenas keys permitidas
            filtered = {k: attrs.get(k, None) for k in allowed}
            product["technical_profile"]["hardware_attributes"].update(filtered)
        except Exception:
            pass

    else:  # software
        cat = product["software_profile"]["category"]
        # duas modalidades: features/bundles OU suporte
        if cat == "sw_support_license":
            allowed = list(SW_ATTR_BLUEPRINT["sw_support_license"].keys())
            schema_hint = {k: "<value>" for k in allowed}
            prompt = f"""
Return JSON ONLY. Fill the keys in "support" using the allowed keys below.
If unknown, keep null. No extra keys.

SKU: {product['cisco_product_id']}
Name: {product['commercial_name']}
Category: {cat}

Allowed keys: {allowed}

Respond exactly as:
{{
  "support": {json.dumps(schema_hint)}
}}
""".strip()
            try:
                resp = client.chat.completions.create(
                    model=OPENAI_MODEL,
                    response_format={"type": "json_object"},
                    messages=[{"role":"user","content": prompt}],
                    temperature=0.1
                )
                data = json.loads(resp.choices[0].message.content)
                s = data.get("support", {})
                product["software_profile"]["support_level"] = s.get("support_level")
                product["software_profile"]["sla_hrs"] = s.get("sla_hrs")
                product["software_profile"]["on_site"] = s.get("on_site")
            except Exception:
                pass
        else:
            prompt = f"""
Return JSON ONLY. Provide "features" (list of short feature names) and optional "bundles" (e.g., DNA Essentials/Advantage, MR Enterprise).
If unknown, return empty arrays.

SKU: {product['cisco_product_id']}
Name: {product['commercial_name']}
Category: {cat}

Respond exactly as:
{{
  "features": [],
  "bundles": []
}}
""".strip()
            try:
                resp = client.chat.completions.create(
                    model=OPENAI_MODEL,
                    response_format={"type": "json_object"},
                    messages=[{"role":"user","content": prompt}],
                    temperature=0.1
                )
                data = json.loads(resp.choices[0].message.content)
                if isinstance(data.get("features"), list):
                    product["software_profile"]["features"] = data["features"]
                if isinstance(data.get("bundles"), list):
                    product["software_profile"]["bundles"] = data["bundles"]
            except Exception:
                pass

    return product

# ================= Pipeline principal =================
def normalize_catalog_per_item(xlsx_path: Path, out_dir: Path) -> Dict[str, int]:
    df = pd.read_excel(xlsx_path, engine="openpyxl")

    c_type = find_col(df, ["category-type", "category type"])
    c_cat  = find_col(df, ["category"])
    c_sku  = find_col(df, ["sku", "part"])
    c_desc = find_col(df, ["desc", "description", "name"])
    c_price= find_col(df, ["pri","price","list"])
    c_elig = find_col(df, ["elig"])

    missing = [n for n,c in [("Category-Type",c_type),("Category",c_cat),("SKU",c_sku),("Desc",c_desc),("Price",c_price),("Elig_%",c_elig)] if c is None]
    if missing:
        raise ValueError(f"Colunas ausentes: {missing}")

    df["_type"]   = df[c_type].astype(str).str.strip().str.lower()
    df["_catraw"] = df[c_cat].astype(str).str.strip()
    df["_sku"]    = df[c_sku].astype(str).str.strip()
    df["_name"]   = df[c_desc].astype(str).str.strip()
    df["_price"]  = df[c_price].apply(clean_price)
    df["_eligf"]  = df[c_elig].apply(clean_pct)

    (out_dir/"hardware").mkdir(parents=True, exist_ok=True)
    (out_dir/"software").mkdir(parents=True, exist_ok=True)

    counts = {"hardware":0, "software":0, "files":0}

    # --- HARDWARE: processa 1 a 1 e grava por categoria ---
    hw = df[df["_type"]=="hardware"]
    for raw_cat, chunk in hw.groupby("_catraw"):
        cat = canon_hw(raw_cat)
        out_file = out_dir/"hardware"/f"hw_{cat}.json"
        existing = load_json_list(out_file)
        seen = {p.get("cisco_product_id") for p in existing}

        for _, r in chunk.iterrows():
            sku = r["_sku"]
            if sku in seen: 
                continue
            prod = hardware_record(sku, r["_name"], cat, r["_price"], r["_eligf"])
            prod = llm_enrich_attributes(prod)  # 1 chamada por SKU (se habilitado)
            existing.append(prod)
            counts["hardware"] += 1

        save_json_list(out_file, existing)
        counts["files"] += 1

    # --- SOFTWARE: idem ---
    sw = df[df["_type"]=="software"]
    for raw_cat, chunk in sw.groupby("_catraw"):
        cat = canon_sw(raw_cat)
        out_file = out_dir/"software"/f"sw_{cat}.json"
        existing = load_json_list(out_file)
        seen = {p.get("cisco_product_id") for p in existing}

        for _, r in chunk.iterrows():
            sku = r["_sku"]
            if sku in seen:
                continue
            prod = software_record(sku, r["_name"], cat, r["_price"], r["_eligf"], r["_name"])
            prod = llm_enrich_attributes(prod)  # 1 chamada por SKU (se habilitado)
            existing.append(prod)
            counts["software"] += 1

        save_json_list(out_file, existing)
        counts["files"] += 1

    return counts

# ================= Run =================
OUT_BASE.mkdir(parents=True, exist_ok=True)
stats = normalize_catalog_per_item(XLSX_PATH, OUT_BASE)
stats


{'hardware': 0, 'software': 0, 'files': 2}

In [9]:
import os, json, re, math
from pathlib import Path
import pandas as pd

# ====== CONFIG ======
XLSX_PATH = "data/raw/Cisco_Pricing.xlsx"   # ajuste se necess√°rio
OUT_DIR   = Path("data/normalized")
ENRICH_WITH_LLM = True                     # mantenha False por enquanto

# Mapeamento can√¥nico de categorias (o que vimos nas telas)
CANON_HW = {
    "antennas": "antennas",
    "antenna": "antennas",
    "cabling": "cabling",
    "connectors": "connectors",
    "firewall": "firewall",
    "firewalls": "firewall",
    "routers": "routers",
    "router": "routers",
    "switches": "switches",
    "switch": "switches",
    "wireless": "wireless",
}

CANON_SW = {
    "wireless": "wireless",
    "switches": "switches",
    "routers": "routers",
    "firewall": "firewall",
    "firewalls": "firewall",
    "sw support license": "sw_support_license",
    "support license": "sw_support_license",
}

# Blueprints MUITO resumidos (use os completos que combinamos se quiser)
HW_ATTR_BLUEPRINT = {
    "switches":    {"port_count": None, "poe_budget_w": None, "layer": None},
    "routers":     {"wan_ports": None},
    "firewall":    {"throughput_gbps": None},
    "wireless":    {"wifi_standard": None, "antenna_type": None},
    "antennas":    {"connector_type": None, "gain_dbi": None},
    "cabling":     {"cable_type": None, "length_m": None},
    "connectors":  {"connector_type": None},
}
SW_ATTR_BLUEPRINT = {
    "wireless":           {"features": [], "term_months": None},
    "switches":           {"features": [], "term_months": None},
    "routers":            {"features": [], "term_months": None},
    "firewall":           {"features": [], "term_months": None},
    "sw_support_license": {"support_level": None, "sla_hrs": None, "on_site": None, "term_months": None},
}

def _find_col(df, *candidates):
    cols = {str(c).strip().lower(): c for c in df.columns}
    for want in candidates:
        for k, orig in cols.items():
            if want in k:
                return orig
    return None

def _norm_text(x):
    return str(x).strip()

def _norm_price(x):
    if pd.isna(x):
        return 0.0
    s = str(x)
    s = re.sub(r"[^\d,\.]", "", s)
    # trata formatos comuns: 1.234,56 ‚Üí 1234.56 ; 1234,56 ‚Üí 1234.56
    if s.count(",") == 1 and s.count(".") >= 1 and s.rfind(",") > s.rfind("."):
        s = s.replace(".", "").replace(",", ".")
    elif s.count(",") == 1 and s.count(".") == 0:
        s = s.replace(",", ".")
    try:
        return float(s)
    except:
        return 0.0

def _canon_category(raw, typ):
    base = _norm_text(raw).lower()
    if typ == "hardware":
        for k in CANON_HW:
            if k in base:
                return CANON_HW[k]
    else:
        for k in CANON_SW:
            if k in base:
                return CANON_SW[k]
    # fallback: slug simples
    return re.sub(r"[^a-z0-9]+", "_", base) or "uncategorized"

def _hardware_record(row, sku, name, cat_slug, price, elig):
    rec = {
        "cisco_product_id": sku,
        "commercial_name": name,
        "product_type": "hardware",
        "lifecycle": {"status": "unknown"},
        "technical_profile": {
            "hardware_attributes": {}
        },
        "pricing_model": {
            "type": "one_time",
            "currency": "USD",
            "base_price": price,
            "elig_pct": elig
        }
    }
    # aplica blueprint da categoria se existir
    attrs = HW_ATTR_BLUEPRINT.get(cat_slug)
    if attrs:
        rec["technical_profile"]["hardware_attributes"].update(attrs)
    return rec

def _software_record(row, sku, name, cat_slug, price, elig):
    rec = {
        "cisco_product_id": sku,
        "commercial_name": name,
        "product_type": "software",
        "software_profile": {
            "edition": None,
            "license_type": "subscription",
            "term_months": 12,
            "features": [],
            "bundles": []
        },
        "pricing_model": {
            "type": "term_subscription",
            "currency": "USD",
            "base_price": price,
            "elig_pct": elig
        }
    }
    bp = SW_ATTR_BLUEPRINT.get(cat_slug)
    if bp:
        rec["software_profile"].update({k: v for k, v in bp.items() if k not in rec["software_profile"]})
    return rec

def normalize_and_export(xlsx_path, out_dir):
    out_dir.mkdir(parents=True, exist_ok=True)
    df = pd.read_excel(xlsx_path, engine="openpyxl")
    # descoberta robusta de colunas
    col_type = _find_col(df, "category-type", "category type")
    col_cat  = _find_col(df, "category")
    col_sku  = _find_col(df, "sku", "part", "part number")
    col_desc = _find_col(df, "desc", "description", "name")
    col_pri  = _find_col(df, "pri", "price", "list price")
    col_elig = _find_col(df, "elig", "eligibility")

    print("‚Üí Columns detected:", {
        "type": col_type, "category": col_cat, "sku": col_sku, "desc": col_desc, "price": col_pri, "elig": col_elig
    })
    if not all([col_type, col_cat, col_sku, col_desc, col_pri, col_elig]):
        raise ValueError("N√£o encontrei todas as colunas necess√°rias. Confira os nomes no Excel.")

    # normaliza√ß√µes b√°sicas
    df["_type"] = df[col_type].astype(str).str.strip().str.lower()
    df["_cat"]  = df[col_cat].astype(str).str.strip()
    df["_sku"]  = df[col_sku].astype(str).str.strip()
    df["_desc"] = df[col_desc].astype(str).str.strip()
    df["_pri"]  = df[col_pri].apply(_norm_price)
    df["_elig"] = df[col_elig].astype(str).str.strip().str.replace("%","", regex=False)
    df["_elig"] = pd.to_numeric(df["_elig"].str.replace(",", ".", regex=False), errors="coerce").fillna(0)/100.0

    # diagn√≥stico de valores
    print("‚Üí Type counts:", df["_type"].value_counts().to_dict())
    print("‚Üí Sample categories:", df["_cat"].dropna().astype(str).str.strip().str.lower().value_counts().head(15).to_dict())

    # coletores por categoria
    buckets_hw = {}
    buckets_sw = {}

    hw_rows = df[df["_type"].str.contains("hard")]
    sw_rows = df[df["_type"].str.contains("soft")]

    for _, r in hw_rows.iterrows():
        cat_slug = _canon_category(r["_cat"], "hardware")
        rec = _hardware_record(r, r["_sku"], r["_desc"], cat_slug, r["_pri"], r["_elig"])
        buckets_hw.setdefault(cat_slug, []).append(rec)

    for _, r in sw_rows.iterrows():
        cat_slug = _canon_category(r["_cat"], "software")
        rec = _software_record(r, r["_sku"], r["_desc"], cat_slug, r["_pri"], r["_elig"])
        buckets_sw.setdefault(cat_slug, []).append(rec)

    # salvar
    hw_dir = out_dir / "hardware"
    sw_dir = out_dir / "software"
    hw_dir.mkdir(parents=True, exist_ok=True)
    sw_dir.mkdir(parents=True, exist_ok=True)

    files = 0
    for cat, items in buckets_hw.items():
        p = hw_dir / f"hw_{cat}.json"
        p.write_text(json.dumps(items, indent=2))
        files += 1
    for cat, items in buckets_sw.items():
        p = sw_dir / f"sw_{cat}.json"
        p.write_text(json.dumps(items, indent=2))
        files += 1

    print("‚úÖ Done:", {"hardware": sum(len(v) for v in buckets_hw.values()),
                       "software": sum(len(v) for v in buckets_sw.values()),
                       "files": files})
    print("‚Üí Output folders:", hw_dir, "and", sw_dir)

normalize_and_export(XLSX_PATH, OUT_DIR)


‚Üí Columns detected: {'type': 'Category-Type', 'category': 'Category-Type', 'sku': 'SKU', 'desc': 'Desc', 'price': 'price', 'elig': 'Elig_%'}
‚Üí Type counts: {'hardware': 3004, 'software': 1263}
‚Üí Sample categories: {'hardware': 3004, 'software': 1263}
‚úÖ Done: {'hardware': 3004, 'software': 1263, 'files': 2}
‚Üí Output folders: data\normalized\hardware and data\normalized\software


In [14]:
# %% [markdown]
# Normalizador + Enriquecimento (por categoria / por SKU)
# L√™ Cisco_Pricing.xlsx, cria JSONs por categoria (hardware/software),
# e opcionalmente enriquece campos faltantes com a LLM (um produto por vez).

# %%
import os, json, re, time, math
from pathlib import Path
from typing import Dict, List, Any
import pandas as pd
from tqdm import tqdm

# OpenAI v1
try:
    from openai import OpenAI
    _openai_available = True
except Exception:
    _openai_available = False

# =================== CONFIG ===================
XLSX_PATH = "data/raw/Cisco_Pricing.xlsx"    # <<< ajuste se necess√°rio
OUT_DIR   = Path("data/normalized")

# Toggle de enriquecimento
ENRICH_WITH_LLM   = True     # <<< defina para True quando quiser enriquecer
REFRESH_ENRICHMENT= True    # True = reprocessa/enriquece mesmo se j√° existir no JSON
OPENAI_MODEL      = "gpt-4o-mini"
OPENAI_API_KEY    = os.getenv("OPENAI_API_KEY")  # pode setar aqui string literal se preferir
RATE_LIMIT_SLEEP  = 0.5      # seconds entre chamadas, ajuste conforme necess√°rio

# =================== MAPEAMENTOS ===================
# Canonicaliza√ß√£o das categorias (use os nomes do seu Excel)
CANON_HW = {
    "antennas": "antennas", "antenna": "antennas",
    "cabling": "cabling",
    "connectors": "connectors", "connector": "connectors",
    "firewall": "firewall", "firewalls": "firewall",
    "routers": "routers", "router": "routers",
    "switches": "switches", "switch": "switches",
    "wireless": "wireless", "access point": "wireless", "access points": "wireless",
}
CANON_SW = {
    "wireless": "wireless",
    "switches": "switches",
    "routers": "routers",
    "firewall": "firewall", "firewalls": "firewall",
    "sw support license": "sw_support_license", "support license": "sw_support_license",
}

# Blueprints de atributos por categoria (pode estender quando quiser)
HW_ATTR_BLUEPRINT: Dict[str, Dict[str, Any]] = {
    "switches": {
        "category": "switch",
        "subcategory": None,
        "port_count": None,
        "poe_budget_w": None,
        "layer": None,                  # L2/L3
        "uplink": None,                 # ex: "2x10G SFP+"
        "stacking": None,               # True/False/None
        "mounting": None,               # rack/desktop
        "throughput_gbps": None,
        "power_requirements": None,
    },
    "routers": {
        "category": "router",
        "wan_ports": None,
        "sdwan_capable": None,
        "vpn_throughput_mbps": None,
        "throughput_gbps": None,
        "power_requirements": None,
    },
    "firewall": {
        "category": "firewall",
        "throughput_gbps": None,
        "ngfw_features": [],
        "vpn_throughput_mbps": None,
        "ha_supported": None,
        "power_requirements": None,
    },
    "wireless": {
        "category": "wireless",
        "subcategory": "access_point",
        "wifi_standard": None,          # ex: 802.11ac/ax
        "throughput": None,             # ex: "1.7 Gbps"
        "antenna_type": None,           # internal/external
        "mounting": None,               # indoor/outdoor
        "power_requirements": None,     # PoE/PoE+
        "mu_mimo": None,                # True/False/None
    },
    "antennas": {
        "category": "antenna",
        "connector_type": None,
        "gain_dbi": None,
        "polarity": None,               # single/dual
        "mounting": None,
        "band": None,                   # 2.4/5/6 GHz
    },
    "cabling": {
        "category": "cable",
        "cable_type": None,             # cat5e/cat6/sfp+/dac/etc
        "length_m": None,
        "connector_a": None,
        "connector_b": None,
        "shielding": None,              # UTP/STP
    },
    "connectors": {
        "category": "connector",
        "connector_type": None,         # rj45/sfp/sfp+/qsfp
        "gender": None,
        "plating": None,
    }
}

SW_ATTR_BLUEPRINT: Dict[str, Dict[str, Any]] = {
    # subscriptions "funcionais"
    "wireless": {
        "edition": None,
        "license_type": "subscription",
        "term_months": None,
        "features": [],
        "bundles": [],
        "per_device_or_site": None      # per_device/per_site/unknown
    },
    "switches": {
        "edition": None,
        "license_type": "subscription",
        "term_months": None,
        "features": [],
        "bundles": [],
        "per_device_or_site": None
    },
    "routers": {
        "edition": None,
        "license_type": "subscription",
        "term_months": None,
        "features": [],
        "bundles": [],
        "per_device_or_site": None
    },
    "firewall": {
        "edition": None,
        "license_type": "subscription",
        "term_months": None,
        "features": [],
        "bundles": [],
        "per_device_or_site": None
    },
    # licen√ßas de suporte
    "sw_support_license": {
        "support_level": None,          # base/enterprise/24x7 etc
        "sla_hrs": None,                # 8x5/24x7
        "on_site": None,                # True/False/None
        "term_months": None
    }
}

# =================== HELPERS ===================
def _find_col(df: pd.DataFrame, *candidates) -> str | None:
    cols = {str(c).strip().lower(): c for c in df.columns}
    for want in candidates:
        for k, orig in cols.items():
            if want in k:
                return orig
    return None

def _slugify(s: str) -> str:
    return re.sub(r"[^a-z0-9]+", "_", str(s).strip().lower()).strip("_") or "uncategorized"

def _norm_price(x) -> float:
    if pd.isna(x): return 0.0
    s = str(x)
    s = re.sub(r"[^\d,\.]", "", s)
    if s.count(",") == 1 and s.count(".") >= 1 and s.rfind(",") > s.rfind("."):
        s = s.replace(".", "").replace(",", ".")
    elif s.count(",") == 1 and s.count(".") == 0:
        s = s.replace(",", ".")
    try:
        return float(s)
    except:
        return 0.0

def _canon_category(raw: str, typ: str) -> str:
    base = str(raw or "").strip().lower()
    mapping = CANON_HW if typ == "hardware" else CANON_SW
    for k in mapping:
        if k in base:
            return mapping[k]
    return _slugify(base)

def _hardware_record(sku: str, name: str, cat_slug: str, price: float, elig: float) -> Dict:
    rec = {
        "cisco_product_id": sku,
        "commercial_name": name,
        "product_type": "hardware",
        "lifecycle": {"status": "unknown"},
        "technical_profile": {
            "hardware_attributes": {}
        },
        "pricing_model": {
            "type": "one_time",
            "currency": "USD",
            "base_price": price,
            "elig_pct": float(elig)
        }
    }
    attrs = HW_ATTR_BLUEPRINT.get(cat_slug)
    if attrs:
        rec["technical_profile"]["hardware_attributes"].update(attrs)
    return rec

def _software_record(sku: str, name: str, cat_slug: str, price: float, elig: float) -> Dict:
    # subs vs suporte
    if cat_slug == "sw_support_license":
        profile = {
            "support_level": None,
            "sla_hrs": None,
            "on_site": None,
            "term_months": None
        }
        pricing = {
            "type": "term_subscription",
            "currency": "USD",
            "base_price": price,
            "elig_pct": float(elig)
        }
        rec = {
            "cisco_product_id": sku,
            "commercial_name": name,
            "product_type": "software",
            "software_profile": profile,
            "pricing_model": pricing
        }
    else:
        profile = {
            "edition": None,
            "license_type": "subscription",
            "term_months": 12,
            "features": [],
            "bundles": [],
            "per_device_or_site": None
        }
        # merge com blueprint
        bp = SW_ATTR_BLUEPRINT.get(cat_slug)
        if bp:
            for k, v in bp.items():
                profile.setdefault(k, v)
        rec = {
            "cisco_product_id": sku,
            "commercial_name": name,
            "product_type": "software",
            "software_profile": profile,
            "pricing_model": {
                "type": "term_subscription",
                "currency": "USD",
                "base_price": price,
                "elig_pct": float(elig)
            }
        }
    return rec

def _load_existing(path: Path) -> List[Dict]:
    if path.exists():
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except Exception:
            return []
    return []

def _index_by_sku(items: List[Dict]) -> Dict[str, Dict]:
    out = {}
    for it in items:
        sku = str(it.get("cisco_product_id") or "").strip()
        if sku:
            out[sku] = it
    return out

# =================== LLM ENRICHMENT ===================
def _llm_client():
    if not _openai_available:
        raise RuntimeError("Pacote 'openai' n√£o encontrado. Instale com: pip install openai")
    key = OPENAI_API_KEY or os.getenv("OPENAI_API_KEY")
    if not key:
        raise RuntimeError("OPENAI_API_KEY n√£o definido.")
    client = OpenAI(api_key=key)
    return client

def _llm_prompt_for_enrichment(product: Dict, product_type: str, cat_slug: str) -> str:
    # Seleciona quais campos podem ser preenchidos
    if product_type == "hardware":
        allowed = list((HW_ATTR_BLUEPRINT.get(cat_slug) or {}).keys())
        path = "technical_profile.hardware_attributes"
    else:
        if cat_slug == "sw_support_license":
            allowed = list(SW_ATTR_BLUEPRINT["sw_support_license"].keys())
            path = "software_profile"
        else:
            allowed = list((SW_ATTR_BLUEPRINT.get(cat_slug) or {}).keys())
            # software funcional tem software_profile como raiz
            path = "software_profile"

    skeleton = {
        "fill_path": path,
        "allowed_fields": allowed,
        "only_fill_nulls": True,
        "return_unknown_as_null": True
    }

    name = product.get("commercial_name", "")
    sku  = product.get("cisco_product_id", "")
    return (
        "You are a data normalizer for Cisco product catalog.\n"
        "STRICT RULES:\n"
        " - Fill ONLY the fields listed in 'allowed_fields' below.\n"
        " - Fill ONLY when you are reasonably certain from the product name/SKU semantics.\n"
        " - If unsure, set the value to null (do NOT guess).\n"
        " - Return a single valid JSON object with only the keys to update (no prose).\n\n"
        f"PRODUCT_NAME: {name}\n"
        f"SKU: {sku}\n\n"
        f"FILL_INSTRUCTIONS_JSON:\n{json.dumps(skeleton, indent=2)}\n"
        "Output JSON example:\n"
        "{ \"<field1>\": <value or null>, \"<field2>\": <value or null> }\n"
    )

def enrich_product_with_llm(product: Dict, product_type: str, cat_slug: str) -> Dict:
    """
    Chama a LLM para preencher APENAS os campos null do blueprint da categoria.
    Retorna um dicion√°rio com as chaves atualizadas (ou {} se nada a fazer).
    """
    # Detecta se h√° algo para preencher
    if product_type == "hardware":
        path = ("technical_profile", "hardware_attributes")
        bp   = HW_ATTR_BLUEPRINT.get(cat_slug) or {}
    else:
        if cat_slug == "sw_support_license":
            path = ("software_profile",)
            bp   = SW_ATTR_BLUEPRINT["sw_support_license"]
        else:
            path = ("software_profile",)
            bp   = SW_ATTR_BLUEPRINT.get(cat_slug) or {}

    # verifica nulls
    node = product
    for p in path:
        node = node.get(p, {})
    needs = False
    for k in bp.keys():
        if node.get(k, None) in (None, [], ""):
            needs = True
            break
    if not needs:
        return {}

    client = _llm_client()
    prompt = _llm_prompt_for_enrichment(product, product_type, cat_slug)

    try:
        resp = client.chat.completions.create(
            model=OPENAI_MODEL,
            messages=[
                {"role": "system", "content": "You are a strict JSON filler that never hallucinates."},
                {"role": "user", "content": prompt}
            ],
            response_format={"type": "json_object"},
            temperature=0.0,
        )
        raw = resp.choices[0].message.content.strip()
        data = json.loads(raw)
        # filtra apenas campos permitidos
        allowed = set((HW_ATTR_BLUEPRINT if product_type=="hardware" else SW_ATTR_BLUEPRINT.get(cat_slug, {})).keys()) \
                  if product_type=="hardware" else \
                  (set(SW_ATTR_BLUEPRINT["sw_support_license"].keys()) if cat_slug=="sw_support_license"
                   else set(SW_ATTR_BLUEPRINT.get(cat_slug, {}).keys()))
        cleaned = {k: v for k, v in data.items() if k in allowed}
        # aplica no produto
        target = product
        for p in path:
            if p not in target:
                target[p] = {}
            target = target[p]
        for k, v in cleaned.items():
            if target.get(k) in (None, [], ""):
                target[k] = v
        return cleaned
    except Exception as e:
        # Falha de parsing ou de chamada -> n√£o enriquece
        # Voc√™ pode logar se quiser
        return {}

# =================== PIPELINE ===================
def normalize_and_export_by_category(xlsx_path: str, out_dir: Path):
    out_dir.mkdir(parents=True, exist_ok=True)
    df = pd.read_excel(xlsx_path, engine="openpyxl")

    # Detecta colunas
    col_type = _find_col(df, "category-type", "category type", "type")
    col_cat  = _find_col(df, "category")
    col_sku  = _find_col(df, "sku", "part", "part number")
    col_desc = _find_col(df, "desc", "description", "name")
    col_pri  = _find_col(df, "pri", "price", "list price")
    col_elig = _find_col(df, "elig", "eligibility", "elig_%", "eligibility %")

    print("‚Üí Columns detected:", {
        "type": col_type, "category": col_cat, "sku": col_sku, "desc": col_desc, "price": col_pri, "elig": col_elig
    })
    if not all([col_type, col_cat, col_sku, col_desc, col_pri, col_elig]):
        raise ValueError("N√£o encontrei todas as colunas necess√°rias. Confira os nomes no Excel.")

    # Normaliza campos base
    df["_type"] = df[col_type].astype(str).str.strip().str.lower()
    df["_cat"]  = df[col_cat].astype(str).str.strip()
    df["_sku"]  = df[col_sku].astype(str).str.strip()
    df["_desc"] = df[col_desc].astype(str).str.strip()
    df["_pri"]  = df[col_pri].apply(_norm_price)
    df["_elig"] = (
        df[col_elig].astype(str).str.strip().str.replace("%","", regex=False)
        .str.replace(",", ".", regex=False)
    )
    df["_elig"] = pd.to_numeric(df["_elig"], errors="coerce").fillna(0.0) / 100.0

    print("‚Üí Type counts:", df["_type"].value_counts().to_dict())
    print("‚Üí Sample categories:", df["_cat"].str.lower().value_counts().head(15).to_dict())

    # Buckets por categoria
    buckets_hw: Dict[str, List[Dict]] = {}
    buckets_sw: Dict[str, List[Dict]] = {}

    # Separa
    hw_rows = df[df["_type"].str.contains("hard", na=False)]
    sw_rows = df[df["_type"].str.contains("soft", na=False)]

    # Monta registros iniciais
    for _, r in hw_rows.iterrows():
        sku, name = r["_sku"], r["_desc"]
        cat_slug = _canon_category(r["_cat"], "hardware")
        rec = _hardware_record(sku, name, cat_slug, float(r["_pri"]), float(r["_elig"]))
        buckets_hw.setdefault(cat_slug, []).append(rec)

    for _, r in sw_rows.iterrows():
        sku, name = r["_sku"], r["_desc"]
        cat_slug = _canon_category(r["_cat"], "software")
        rec = _software_record(sku, name, cat_slug, float(r["_pri"]), float(r["_elig"]))
        buckets_sw.setdefault(cat_slug, []).append(rec)

    # Salva por categoria (merge com existentes + enriquecimento)
    hw_dir = out_dir / "hardware"
    sw_dir = out_dir / "software"
    hw_dir.mkdir(parents=True, exist_ok=True)
    sw_dir.mkdir(parents=True, exist_ok=True)

    files_written = 0

    # ---------- HARDWARE ----------
    for cat, items in buckets_hw.items():
        path = hw_dir / f"hw_{cat}.json"
        existing = _load_existing(path)
        index    = _index_by_sku(existing)

        # Merge + enrich
        to_save: List[Dict] = existing[:]  # start with existing
        seen = set(s.get("cisco_product_id") for s in existing)

        for prod in tqdm(items, desc=f"[HW] {cat}", unit="sku"):
            sku = prod["cisco_product_id"]
            if sku in seen and not REFRESH_ENRICHMENT:
                # j√° existe ‚Üí mant√©m
                continue

            # Se j√° existia e vamos refresh, substitui
            if sku in index and REFRESH_ENRICHMENT:
                # mant√©m campos j√° preenchidos
                merged = index[sku]
                # sobrep√µe pre√ßo/desc se mudou
                merged["commercial_name"] = prod["commercial_name"]
                merged["pricing_model"]["base_price"] = prod["pricing_model"]["base_price"]
                merged["pricing_model"]["elig_pct"]   = prod["pricing_model"]["elig_pct"]
                prod = merged

            if ENRICH_WITH_LLM:
                # preenche somente os campos null do blueprint
                _ = enrich_product_with_llm(prod, "hardware", cat)
                time.sleep(RATE_LIMIT_SLEEP)

            # se j√° existia, update lista; se n√£o, append
            if sku in index:
                # atualiza no array existente
                for i, old in enumerate(to_save):
                    if old.get("cisco_product_id") == sku:
                        to_save[i] = prod
                        break
            else:
                to_save.append(prod)
                index[sku] = prod
                seen.add(sku)

        path.write_text(json.dumps(to_save, indent=2), encoding="utf-8")
        files_written += 1

    # ---------- SOFTWARE ----------
    for cat, items in buckets_sw.items():
        path = sw_dir / f"sw_{cat}.json"
        existing = _load_existing(path)
        index    = _index_by_sku(existing)

        to_save: List[Dict] = existing[:]
        seen = set(s.get("cisco_product_id") for s in existing)

        for prod in tqdm(items, desc=f"[SW] {cat}", unit="sku"):
            sku = prod["cisco_product_id"]
            if sku in seen and not REFRESH_ENRICHMENT:
                continue

            if sku in index and REFRESH_ENRICHMENT:
                merged = index[sku]
                merged["commercial_name"] = prod["commercial_name"]
                merged["pricing_model"]["base_price"] = prod["pricing_model"]["base_price"]
                merged["pricing_model"]["elig_pct"]   = prod["pricing_model"]["elig_pct"]
                prod = merged

            if ENRICH_WITH_LLM:
                _ = enrich_product_with_llm(prod, "software", cat)
                time.sleep(RATE_LIMIT_SLEEP)

            if sku in index:
                for i, old in enumerate(to_save):
                    if old.get("cisco_product_id") == sku:
                        to_save[i] = prod
                        break
            else:
                to_save.append(prod)
                index[sku] = prod
                seen.add(sku)

        path.write_text(json.dumps(to_save, indent=2), encoding="utf-8")
        files_written += 1

    print("‚úÖ Done:", {
        "hardware": sum(len(json.loads((hw_dir/f).read_text(encoding='utf-8'))) for f in os.listdir(hw_dir) if f.endswith(".json")),
        "software": sum(len(json.loads((sw_dir/f).read_text(encoding='utf-8'))) for f in os.listdir(sw_dir) if f.endswith(".json")),
        "files": files_written,
        "out": str(out_dir)
    })

# %%
normalize_and_export_by_category(XLSX_PATH, OUT_DIR)


‚Üí Columns detected: {'type': 'Category-Type', 'category': 'Category-Type', 'sku': 'SKU', 'desc': 'Desc', 'price': 'price', 'elig': 'Elig_%'}
‚Üí Type counts: {'hardware': 3004, 'software': 1263}
‚Üí Sample categories: {'hardware': 3004, 'software': 1263}


[HW] hardware:  18%|‚ñà‚ñä        | 539/3004 [04:30<20:38,  1.99sku/s]


KeyboardInterrupt: 

In [21]:
# ============================================================
# NORMALIZA√á√ÉO E ENRIQUECIMENTO POR CATEGORIA (NOTEBOOK-ONLY)
# ============================================================

import os, re, json, time, math
from pathlib import Path
from typing import Dict, List, Optional

import pandas as pd
from tqdm import tqdm

# --- LLM (LangChain OpenAI) ---
from langchain_openai import ChatOpenAI

# =================== CONFIG ===================

# Caminhos
XLSX_PATH = "data/raw/Cisco_Pricing.xlsx"
OUT_DIR   = Path("data/normalized")

# Execu√ß√£o
ENRICH_WITH_LLM      = True   # habilita enriquecimento por IA SKU-a-SKU
REFRESH_ENRICHMENT   = True   # se j√° existe no JSON, re-enriquece e mescla
RATE_LIMIT_SLEEP     = 0.6    # sleep entre chamadas LLM (ajuste se precisar)

# Modo r√°pido para teste
SAMPLE_MODE          = True    # liga modo amostra
SAMPLE_PER_CATEGORY  = 5       # n¬∫ de SKUs por categoria
MAX_TOTAL_SKUS       = 40      # hard stop global (HW+SW)
MAX_LLM_CALLS        = 25      # m√°x. de chamadas LLM no modo r√°pido

# Filtros opcionais: processe s√≥ algumas categorias
INCLUDE_ONLY_CATS_HW = None    # ex.: ["switches", "access_points"]
INCLUDE_ONLY_CATS_SW = None    # ex.: ["sw_support_license", "sw_wireless"]

# OpenAI
OPENAI_MODEL = "gpt-4o-mini"
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")  # defina no ambiente
if not OPENAI_API_KEY and ENRICH_WITH_LLM:
    raise RuntimeError("Defina OPENAI_API_KEY no ambiente antes de rodar.")

# =================== HELPER: DETEC√á√ÉO DE COLUNAS ===================

def _find_col(df: pd.DataFrame, *cands) -> Optional[str]:
    cols = [str(c).strip().lower() for c in df.columns]
    for cand in cands:
        for i, c in enumerate(cols):
            if cand in c:
                return df.columns[i]
    return None

def _norm_price(x) -> float:
    s = str(x)
    s = re.sub(r"[^\d,\.]", "", s)
    # remove separador de milhar (.) quando h√° v√≠rgula decimal
    s = re.sub(r"\.(?=\d{3},)", "", s)
    s = s.replace(",", ".")
    try:
        return float(s)
    except:
        return 0.0

# =================== CATEGORIAS & ESQUEMAS ===================

# Mapeamento para slugs can√¥nicos (hardware)
def _canon_category_hw(raw: str) -> str:
    s = raw.strip().lower()
    # heur√≠sticas por palavra-chave
    if re.search(r"\b(ap|access\s*point|wireless)\b", s):
        return "access_points"
    if "switch" in s:
        return "switches"
    if "router" in s or "isr" in s or "asr" in s:
        return "routers"
    if "firewall" in s or "asa" in s or re.search(r"\bftd\b", s):
        return "firewalls"
    if "controller" in s or "wlc" in s:
        return "controllers"
    if "sd-wan" in s or "sdwan" in s or "vmanage" in s:
        return "sdwan"
    if "sfp" in s or "qsfp" in s or "transceiver" in s or "optic" in s:
        return "optics_transceivers"
    if "camera" in s or "iot" in s:
        return "cameras_iot"
    if "psu" in s or "power supply" in s or "ups" in s:
        return "power_psu_ups"
    if "module" in s or "line card" in s or "linecard" in s:
        return "modules_linecards"
    if "ucs" in s or "server" in s:
        return "servers"
    if "storage" in s:
        return "storage"
    if "phone" in s or "telepresence" in s or "collaboration" in s:
        return "collab_endpoints"
    # fallback: tenta derivar de nomes comuns (Meraki MS* => switches, MR* => AP)
    if re.match(r"^ms\d", s):  # Meraki Switch
        return "switches"
    if re.match(r"^mr\d", s):  # Meraki AP
        return "access_points"
    if re.match(r"^mx\d", s):  # Meraki Security/SD-WAN
        return "firewalls"
    return "other_hw"

# Mapeamento para slugs can√¥nicos (software)
def _canon_category_sw(raw: str, sku: str, desc: str) -> str:
    s = raw.strip().lower()
    sd = f"{sku} {desc}".lower()
    if "license" in s or "support" in s or "entitlement" in s:
        return "sw_support_license"
    if "firewall" in s or "asa" in s or "firepower" in s:
        return "sw_firewall"
    if "security" in s or "securex" in s or "duo" in s or "umbrella" in s:
        return "sw_security"
    if "wireless" in s or "wlc" in s or "meraki" in s:
        # Meraki LIC- por padr√£o licenciamento/gest√£o
        if "lic-" in sd:
            return "sw_support_license"
        return "sw_wireless"
    if "sdwan" in s or "sd-wan" in s:
        return "sw_routing_sdwan"
    if "collab" in s or "webex" in s:
        return "sw_collaboration"
    if "datacenter" in s or "intersight" in s or "ucs" in s:
        return "sw_datacenter"
    if "observability" in s or "appdynamics" in s or "thousandeyes" in s:
        return "sw_observability"
    # fallback por heur√≠stica de SKU
    if sku.upper().startswith("LIC-"):
        return "sw_support_license"
    return "sw_other"

# Campos por categoria (hardware)
HARDWARE_ATTR_FIELDS: Dict[str, List[str]] = {
    "switches": ["port_count","uplinks","poe_budget_watts","stacking","layer","throughput_gbps","mounting","redundancy"],
    "access_points": ["wifi_standard","throughput","antenna","indoor_outdoor","mimo","poe_class","mesh_support"],
    "routers": ["wan_ports","lan_ports","throughput_gbps","vpn_throughput","sdwan_ready","redundant_psu"],
    "firewalls": ["throughput_gbps","ngfw","ips","vpn_peers","ports","ha_support"],
    "controllers": ["ap_count_supported","redundancy","throughput_gbps","model"],
    "sdwan": ["role","controllers","overlay_type","max_tunnels","throughput_gbps"],
    "optics_transceivers": ["form_factor","speed_gbps","wavelength_nm","reach_m","connector"],
    "cameras_iot": ["resolution","lens","indoor_outdoor","ir","storage"],
    "power_psu_ups": ["wattage","input_voltage","output_voltage","form_factor"],
    "modules_linecards": ["slot_type","ports","speed_gbps","poe_support"],
    "servers": ["cpu","memory_gb","storage","form_factor","nic"],
    "storage": ["capacity_tb","form_factor","protocol"],
    "collab_endpoints": ["type","display","codec","microphones"],
    "other_hw": ["notes"]
}

# Campos por categoria (software)
SOFTWARE_ATTR_FIELDS: Dict[str, List[str]] = {
    "sw_support_license": ["term","edition","device_type","cloud_managed","support_level"],
    "sw_firewall": ["feature_set","ips","vpn","firepower_version","deployment_model"],
    "sw_security": ["feature_set","integrations","cloud","on_prem"],
    "sw_wireless": ["controller_type","ap_count","ai_rf","guest_access"],
    "sw_routing_sdwan": ["feature_set","controllers","overlay","cloud_gateway"],
    "sw_collaboration": ["workloads","capacity","recording","compliance"],
    "sw_datacenter": ["hypervisor","automation","integrations"],
    "sw_observability": ["sources","metrics","distributed_tracing","synthetics"],
    "sw_other": ["notes"]
}

# =================== RECORD BUILDERS ===================

def _hardware_record(sku: str, name: str, cat_slug: str, price: float, elig: float) -> Dict:
    # cria dict com todos os campos base + atributos por categoria
    attrs = {k: None for k in HARDWARE_ATTR_FIELDS.get(cat_slug, HARDWARE_ATTR_FIELDS["other_hw"])}
    return {
        "cisco_product_id": sku,
        "commercial_name": name,
        "product_type": "hardware",
        "lifecycle": {"status": "unknown", "eos_announced": None, "last_support_date": None},
        "technical_profile": {
            "category": cat_slug,
            "subcategory": None,
            "hardware_attributes": attrs
        },
        "pricing_model": {
            "type": "one_time",
            "currency": "USD",
            "base_price": float(price),
            "elig_pct": float(elig),
            "pricing_tiers": []
        },
        "dependencies": {"required_components": [], "compatible_with": []},
        "regulatory": {"certifications": []}
    }

def _software_record(sku: str, name: str, cat_slug: str, price: float, elig: float) -> Dict:
    attrs = {k: None for k in SOFTWARE_ATTR_FIELDS.get(cat_slug, SOFTWARE_ATTR_FIELDS["sw_other"])}
    return {
        "cisco_product_id": sku,
        "commercial_name": name,
        "product_type": "software",
        "lifecycle": {"status": "unknown", "eos_announced": None, "last_support_date": None},
        "technical_profile": {
            "category": cat_slug,
            "subcategory": None,
            "software_attributes": attrs
        },
        "pricing_model": {
            "type": "one_time",
            "currency": "USD",
            "base_price": float(price),
            "elig_pct": float(elig),
            "pricing_tiers": []
        },
        "license": {
            "term_months": None,
            "seats_included": None,
            "metering": None,
            "sku_family": None
        },
        "compatibility": {
            "requires_hardware": [],
            "compatible_platforms": [],
            "min_versions": {}
        }
    }

# =================== IO HELPERS ===================

def _load_existing(path: Path) -> List[Dict]:
    if path.exists():
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except Exception:
            return []
    return []

def _index_by_sku(items: List[Dict]) -> Dict[str, Dict]:
    idx = {}
    for it in items:
        sku = it.get("cisco_product_id")
        if sku:
            idx[sku] = it
    return idx

# =================== ENRIQUECIMENTO VIA LLM ===================

_llm = ChatOpenAI(model=OPENAI_MODEL, temperature=0.2) if ENRICH_WITH_LLM else None

def enrich_product_with_llm(prod: Dict, ptype: str, cat_slug: str) -> Dict:
    """Preenche apenas os atributos da categoria. Desconhecido => null. Sem inventar."""
    if not ENRICH_WITH_LLM:
        return prod

    if ptype == "hardware":
        keys = HARDWARE_ATTR_FIELDS.get(cat_slug, HARDWARE_ATTR_FIELDS["other_hw"])
        path = ["technical_profile", "hardware_attributes"]
    else:
        keys = SOFTWARE_ATTR_FIELDS.get(cat_slug, SOFTWARE_ATTR_FIELDS["sw_other"])
        path = ["technical_profile", "software_attributes"]

    # Prompt: pedimos S√ì o dicion√°rio de atributos, mais nada.
    sys_msg = (
        "You are a careful data normalizer. "
        "From the product name (and optional description), "
        "infer ONLY the requested attributes for the category. "
        "If unknown, return null. Respond strictly as a minified JSON object with those keys only."
    )
    user_msg = f"""
Product:
- SKU: {prod.get('cisco_product_id')}
- Name: {prod.get('commercial_name')}
- Category: {cat_slug}
- Type: {ptype}

Return JSON with keys exactly: {keys}
If an attribute is unknown, set it to null.
Numbers should be numeric (e.g., 740 not "740"), booleans true/false.
"""

    try:
        resp = _llm.invoke([{"role":"system","content":sys_msg},
                            {"role":"user","content":user_msg}])
        txt = resp.content.strip()
        # Sanitiza: tenta isolar JSON
        m = re.search(r"\{.*\}", txt, flags=re.S)
        if m:
            txt = m.group(0)
        data = json.loads(txt)
        # Merge no produto
        target = prod
        for key in path:
            target = target[key]
        # s√≥ sobrep√µe chaves declaradas
        for k in keys:
            if k in data:
                target[k] = data[k]
    except Exception as e:
        # Falha no enriquecimento: segue com o registro base
        pass

    return prod

# =================== PIPELINE ===================

def normalize_and_export_by_category(xlsx_path: str, out_dir: Path):
    out_dir.mkdir(parents=True, exist_ok=True)
    df = pd.read_excel(xlsx_path, engine="openpyxl")

    # Detecta colunas
    col_type = _find_col(df, "category-type", "category type", "type")
    col_cat  = _find_col(df, "category")
    col_sku  = _find_col(df, "sku", "part", "part number")
    col_desc = _find_col(df, "desc", "description", "name")
    col_pri  = _find_col(df, "pri", "price", "list price")
    col_elig = _find_col(df, "elig", "eligibility", "elig_%", "eligibility %")

    if not all([col_type, col_cat, col_sku, col_desc, col_pri, col_elig]):
        raise ValueError("N√£o encontrei todas as colunas necess√°rias. Confira os nomes no Excel.")

    # Normaliza campos base
    df["_type"] = df[col_type].astype(str).str.strip().str.lower()
    df["_cat"]  = df[col_cat].astype(str).str.strip()
    df["_sku"]  = df[col_sku].astype(str).str.strip()
    df["_desc"] = df[col_desc].astype(str).str.strip()
    df["_pri"]  = df[col_pri].apply(_norm_price)
    df["_elig"] = (
        df[col_elig].astype(str).str.strip().str.replace("%","", regex=False)
        .str.replace(",", ".", regex=False)
    )
    df["_elig"] = pd.to_numeric(df["_elig"], errors="coerce").fillna(0.0) / 100.0

    # Separa hardware/software
    hw_rows = df[df["_type"].str.contains("hard", na=False)].copy()
    sw_rows = df[df["_type"].str.contains("soft", na=False)].copy()

    # Map categorias ‚Üí listas
    buckets_hw: Dict[str, List[Dict]] = {}
    buckets_sw: Dict[str, List[Dict]] = {}

    # Build candidatos hardware
    for _, r in hw_rows.iterrows():
        sku, name = r["_sku"], r["_desc"]
        cat_slug = _canon_category_hw(r["_cat"])
        if INCLUDE_ONLY_CATS_HW and cat_slug not in INCLUDE_ONLY_CATS_HW:
            continue
        rec = _hardware_record(sku, name, cat_slug, float(r["_pri"]), float(r["_elig"]))
        buckets_hw.setdefault(cat_slug, []).append(rec)

    # Build candidatos software
    for _, r in sw_rows.iterrows():
        sku, name = r["_sku"], r["_desc"]
        cat_slug = _canon_category_sw(r["_cat"], sku, name)
        if INCLUDE_ONLY_CATS_SW and cat_slug not in INCLUDE_ONLY_CATS_SW:
            continue
        rec = _software_record(sku, name, cat_slug, float(r["_pri"]), float(r["_elig"]))
        buckets_sw.setdefault(cat_slug, []).append(rec)

    # Amostragem por categoria
    if SAMPLE_MODE:
        for cat in list(buckets_hw.keys()):
            buckets_hw[cat] = buckets_hw[cat][:SAMPLE_PER_CATEGORY]
        for cat in list(buckets_sw.keys()):
            buckets_sw[cat] = buckets_sw[cat][:SAMPLE_PER_CATEGORY]

    # Pastas
    hw_dir = out_dir / "hardware"
    sw_dir = out_dir / "software"
    hw_dir.mkdir(parents=True, exist_ok=True)
    sw_dir.mkdir(parents=True, exist_ok=True)

    files_written = 0
    processed_skus = 0
    llm_calls = 0

    # ---------- HARDWARE ----------
    for cat, items in buckets_hw.items():
        if MAX_TOTAL_SKUS and processed_skus >= MAX_TOTAL_SKUS:
            break
        path = hw_dir / f"hw_{cat}.json"
        existing = _load_existing(path)
        index    = _index_by_sku(existing)

        to_save: List[Dict] = existing[:]
        seen = set(s.get("cisco_product_id") for s in existing)

        for prod in tqdm(items, desc=f"[HW] {cat}", unit="sku"):
            if MAX_TOTAL_SKUS and processed_skus >= MAX_TOTAL_SKUS:
                break
            sku = prod["cisco_product_id"]

            # merge/refresh
            if sku in seen and not REFRESH_ENRICHMENT:
                continue
            if sku in index and REFRESH_ENRICHMENT:
                merged = index[sku]
                # atualiza campos base
                merged["commercial_name"] = prod["commercial_name"]
                merged["pricing_model"]["base_price"] = prod["pricing_model"]["base_price"]
                merged["pricing_model"]["elig_pct"]   = prod["pricing_model"]["elig_pct"]
                prod = merged

            # enrich
            if ENRICH_WITH_LLM and (not SAMPLE_MODE or llm_calls < MAX_LLM_CALLS):
                prod = enrich_product_with_llm(prod, "hardware", cat)
                llm_calls += 1
                time.sleep(RATE_LIMIT_SLEEP)

            # escreve
            if sku in index:
                for i, old in enumerate(to_save):
                    if old.get("cisco_product_id") == sku:
                        to_save[i] = prod
                        break
            else:
                to_save.append(prod)
                index[sku] = prod
                seen.add(sku)

            processed_skus += 1

        path.write_text(json.dumps(to_save, indent=2), encoding="utf-8")
        files_written += 1

        if MAX_TOTAL_SKUS and processed_skus >= MAX_TOTAL_SKUS:
            break

    # ---------- SOFTWARE ----------
    for cat, items in buckets_sw.items():
        if MAX_TOTAL_SKUS and processed_skus >= MAX_TOTAL_SKUS:
            break
        path = sw_dir / f"sw_{cat}.json"
        existing = _load_existing(path)
        index    = _index_by_sku(existing)

        to_save: List[Dict] = existing[:]
        seen = set(s.get("cisco_product_id") for s in existing)

        for prod in tqdm(items, desc=f"[SW] {cat}", unit="sku"):
            if MAX_TOTAL_SKUS and processed_skus >= MAX_TOTAL_SKUS:
                break

            sku = prod["cisco_product_id"]

            if sku in seen and not REFRESH_ENRICHMENT:
                continue
            if sku in index and REFRESH_ENRICHMENT:
                merged = index[sku]
                merged["commercial_name"] = prod["commercial_name"]
                merged["pricing_model"]["base_price"] = prod["pricing_model"]["base_price"]
                merged["pricing_model"]["elig_pct"]   = prod["pricing_model"]["elig_pct"]
                prod = merged

            if ENRICH_WITH_LLM and (not SAMPLE_MODE or llm_calls < MAX_LLM_CALLS):
                prod = enrich_product_with_llm(prod, "software", cat)
                llm_calls += 1
                time.sleep(RATE_LIMIT_SLEEP)

            if sku in index:
                for i, old in enumerate(to_save):
                    if old.get("cisco_product_id") == sku:
                        to_save[i] = prod
                        break
            else:
                to_save.append(prod)
                index[sku] = prod
                seen.add(sku)

            processed_skus += 1

        path.write_text(json.dumps(to_save, indent=2), encoding="utf-8")
        files_written += 1

        if MAX_TOTAL_SKUS and processed_skus >= MAX_TOTAL_SKUS:
            break

    # resumo
    def _count_jsons(dirp: Path):
        n = 0
        if not dirp.exists():
            return 0
        for f in os.listdir(dirp):
            if f.endswith(".json"):
                try:
                    n += len(json.loads((dirp/f).read_text(encoding="utf-8")))
                except Exception:
                    pass
        return n

    print("‚úÖ Done (fast mode)" if SAMPLE_MODE else "‚úÖ Done", {
        "hardware": _count_jsons(hw_dir),
        "software": _count_jsons(sw_dir),
        "files": files_written,
        "processed_skus": processed_skus,
        "llm_calls": llm_calls,
        "out": str(out_dir)
    })


In [22]:
normalize_and_export_by_category(XLSX_PATH, OUT_DIR)


[HW] other_hw: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:10<00:00,  2.07s/sku]
[SW] sw_other: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:09<00:00,  1.97s/sku]
[SW] sw_support_license: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 5/5 [00:09<00:00,  1.85s/sku]

‚úÖ Done (fast mode) {'hardware': 10, 'software': 15, 'files': 3, 'processed_skus': 15, 'llm_calls': 15, 'out': 'data\\normalized'}





In [33]:
# --- CONFIG --------------------------------------------------------------
import os, json, re
from pathlib import Path
import pandas as pd
from copy import deepcopy
from dotenv import load_dotenv

# OpenAI >=1.0
from openai import OpenAI
load_dotenv()

# ARQUIVOS / PASTAS
XLSX_PATH = Path("data/raw/Cisco_Pricing.xlsx")   # ajuste se preciso
SCHEMAS_HW_DIR = Path("schemas/hardware")         # onde est√£o seus schemas de hardware
SCHEMAS_SW_DIR = Path("schemas/software")         # (opcional) schemas de software
OUT_HW_DIR = Path("out/hardware")
OUT_SW_DIR = Path("out/software")

# TESTE R√ÅPIDO: limite por categoria (ex.: 5). Use None para completo.
SAMPLE_PER_CATEGORY = 5

# ENRIQUECIMENTO VIA LLM (True para enriquecer, False para s√≥ estruturar sem LLM)
DO_ENRICHMENT = True
OPENAI_MODEL = "gpt-4o-mini"   # pode trocar
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# --- HELPERS: leitura de schemas -----------------------------------------
def load_schema_templates(dir_path: Path) -> dict:
    """
    Carrega todos os .json como templates, indexando por 'category' em lowercase.
    Exige que cada JSON tenha a chave 'category' no topo.
    """
    templates = {}
    if not dir_path.exists():
        return templates
    for f in dir_path.glob("*.json"):
        try:
            data = json.loads(f.read_text(encoding="utf-8"))
            cat = (data.get("category") or "").strip().lower()
            if not cat:
                print(f"[WARN] Schema sem 'category': {f}")
                continue
            templates[cat] = data
        except Exception as e:
            print(f"[WARN] Falha lendo schema {f}: {e}")
    return templates

SCHEMAS_HW = load_schema_templates(SCHEMAS_HW_DIR)
SCHEMAS_SW = load_schema_templates(SCHEMAS_SW_DIR)

if not SCHEMAS_HW:
    raise RuntimeError(
        f"Nenhum schema de hardware encontrado em {SCHEMAS_HW_DIR}. "
        "Coloque seus arquivos schema_*.json l√°."
    )

# --- HELPERS: detec√ß√£o de colunas -----------------------------------------
def find_col(df: pd.DataFrame, substrings) -> str | None:
    for col in df.columns:
        lc = str(col).strip().lower()
        for sub in substrings:
            if sub in lc:
                return col
    return None

# --- HELPERS: normaliza√ß√£o de campos do Excel -----------------------------
def clean_price_series(series: pd.Series) -> pd.Series:
    s = (series.astype(str)
               .str.replace(r"[^\d\.,]", "", regex=True)
               .str.replace(r"\.(?=\d{3},)", "", regex=True)  # 1.234,56 -> 1234,56
               .str.replace(",", ".", regex=False))
    return pd.to_numeric(s, errors="coerce").fillna(0.0)

def clean_pct_series(series: pd.Series) -> pd.Series:
    s = (series.astype(str)
               .str.replace(r"[^\d\.,]", "", regex=True)
               .str.replace(",", ".", regex=False))
    return pd.to_numeric(s, errors="coerce").fillna(0.0) / 100.0

# --- CLASSIFICA√á√ÉO: hardware vs software & mapeamento de categoria --------
SOFT_HINTS = re.compile(r"\b(lic|license|subscription|entitlement|support|software|dna|meraki lic)\b", re.I)

def guess_product_type(desc: str, category_cell: str | None) -> str:
    cat_str = (category_cell or "").lower()
    desc_l = (desc or "").lower()
    if "software" in cat_str or SOFT_HINTS.search(desc_l):
        return "software"
    return "hardware"

# mapeamento do Excel -> categorias dos schemas (ajuste se precisar)
# chaves em lowercase; valores = nome "category" exatamente como no schema
HW_CATEGORY_MAP = {
    "switch": "Switches",
    "switches": "Switches",
    "router": "Routers",
    "routers": "Routers",
    "firewall": "Firewall",
    "wireless": "Wireless",
    "ap": "Wireless",
    "antenna": "Antennas",
    "antennas": "Antennas",
    "cable": "Cabling",
    "cabling": "Cabling",
    "connector": "Connectors",
    "connectors": "Connectors",
    "meraki ms": "Switches",
    "meraki mx": "Routers",
    "asa": "Firewall",
}
def map_hw_category_from_row(desc: str, category_cell: str | None) -> str | None:
    text = f"{category_cell or ''} {desc or ''}".lower()
    # match por palavra-chave
    for k, v in HW_CATEGORY_MAP.items():
        if k in text:
            return v
    # fallback: se n√£o achar, tenta "Switches" se tiver "port"/"poe" no texto
    if re.search(r"\bpoe\b|\bports?\b", text):
        return "Switches"
    return None

def map_sw_category_from_row(desc: str, category_cell: str | None) -> str:
    # voc√™ pode criar schemas/software para: Management, Security, Switching, Wireless, Support etc.
    # Aqui coloco um bucket gen√©rico "Software" se n√£o houver schemas SW espec√≠ficos.
    if SCHEMAS_SW:
        # tentativa simples: usar "Management" se achar 'dna'/'meraki', sen√£o 'Security' se 'fw/ips', etc.
        text = f"{category_cell or ''} {desc or ''}".lower()
        if any(x in text for x in ["dna", "meraki", "dashboard", "cloud mgmt"]):
            target = "Management"
        elif any(x in text for x in ["firepower", "ips", "security", "amp", "umbrella"]):
            target = "Security"
        elif any(x in text for x in ["wireless", "ap", "wi-fi"]):
            target = "Wireless"
        elif any(x in text for x in ["switch", "ms2", "catalyst"]):
            target = "Switching"
        else:
            target = "Software"
        # se n√£o existir esse schema, cai no primeiro schema SW dispon√≠vel
        cat_key = target.lower()
        return next((schema["category"] for k, schema in SCHEMAS_SW.items() if k == cat_key),
                    list(SCHEMAS_SW.values())[0]["category"])
    else:
        return "Software"  # bucket gen√©rico

# --- HELPERS: ‚Äúinstanciar‚Äù um template e preencher com Excel ---------------
def instantiate_from_template(template: dict, sku: str, name: str, price: float, elig_pct: float, subcategory: str | None):
    obj = deepcopy(template)
    # garantias b√°sicas
    obj["cisco_product_id"] = sku
    obj["commercial_name"] = name
    obj["lifecycle"] = obj.get("lifecycle", {"status": "active", "eos_announced": None, "last_support_date": None})
    obj["pricing_model"] = obj.get("pricing_model", {})
    obj["pricing_model"].setdefault("type", "one_time")
    obj["pricing_model"].setdefault("currency", "USD")
    obj["pricing_model"]["base_price"] = float(price or 0.0)
    if "elig_pct" in obj["pricing_model"]:
        obj["pricing_model"]["elig_pct"] = float(elig_pct or 0.0)
    if subcategory is not None:
        obj["subcategory"] = subcategory or None
    return obj

# --- ENRIQUECIMENTO VIA LLM: produto por produto --------------------------
ENRICH_SYSTEM = """You are a strict JSON filler. 
You will receive a product JSON and a JSON schema template.
- Keep all given keys as-is; DO NOT add keys that are not in the template.
- Populate missing/null fields ONLY if deducible from product name/SKU/category semantics.
- If unsure, keep the default/null.
- Do not invent SKUs, specs, interfaces, or certifications.
- Return only valid JSON matching the template keys.
"""

def enrich_product_with_llm(product_obj: dict, template_obj: dict) -> dict:
    """
    Envia 1 produto + template para a LLM e pede para devolver o produto com
    os campos do template preenchidos quando poss√≠vel. Zero ‚Äúvoo solo‚Äù.
    """
    prompt = {
        "template": template_obj,
        "product": product_obj
    }
    try:
        resp = client.chat.completions.create(
            model=OPENAI_MODEL,
            temperature=0.0,
            messages=[
                {"role": "system", "content": ENRICH_SYSTEM},
                {"role": "user", "content": json.dumps(prompt, ensure_ascii=False)}
            ],
            response_format={"type": "json_object"}
        )
        enriched = json.loads(resp.choices[0].message.content)
        # Se a LLM retornou s√≥ o objeto final (sem wrapper), ok.
        # Se ela retornou {"product": {...}}, trate:
        if "product" in enriched and isinstance(enriched["product"], dict):
            return enriched["product"]
        return enriched if isinstance(enriched, dict) else product_obj
    except Exception as e:
        print(f"[ENRICH WARN] Falha ao enriquecer {product_obj.get('cisco_product_id')}: {e}")
        return product_obj

# --- PIPELINE --------------------------------------------------------------
def process_excel_to_category_jsons(
    xlsx_path: Path,
    sample_per_category: int | None = SAMPLE_PER_CATEGORY,
    do_enrichment: bool = DO_ENRICHMENT
):
    if not xlsx_path.exists():
        raise FileNotFoundError(f"Arquivo n√£o encontrado: {xlsx_path}")

    df = pd.read_excel(xlsx_path, engine="openpyxl")

    sku_col   = find_col(df, ["sku"])
    desc_col  = find_col(df, ["desc", "name"])
    price_col = find_col(df, ["price"])
    elig_col  = find_col(df, ["elig"])
    cat_col   = find_col(df, ["category"])
    sub_col   = find_col(df, ["sub_category", "subcategory"])

    missing = [n for n, c in [("SKU", sku_col), ("Desc", desc_col), ("Price", price_col)] if c is None]
    if missing:
        raise RuntimeError(f"Colunas obrigat√≥rias ausentes: {missing}")

    # normaliza√ß√£o
    df["__base_price"] = clean_price_series(df[price_col])
    df["__elig_pct"]   = clean_pct_series(df[elig_col]) if elig_col else 0.0
    df["__category"]   = df[cat_col] if cat_col else None
    df["__subcat"]     = df[sub_col] if sub_col else None

    # buffers por categoria
    buckets_hw: dict[str, list] = {}
    buckets_sw: dict[str, list] = {}

    for _, row in df.iterrows():
        sku  = str(row[sku_col]).strip()
        name = str(row[desc_col]).strip()
        price = float(row["__base_price"])
        elig  = float(row["__elig_pct"])
        cat_cell = (row["__category"] if row["__category"] is not None else "")
        subcat   = (str(row["__subcat"]).strip() if row["__subcat"] is not None else None)

        if not sku or not name:
            continue

        ptype = guess_product_type(name, cat_cell)

        if ptype == "hardware":
            mapped = map_hw_category_from_row(name, cat_cell)  # ex.: "Switches"
            if not mapped:
                # Se n√£o conseguirmos mapear, pule ou bucketize como "Unknown"
                mapped = "Switches"  # fallback educado
            template = SCHEMAS_HW.get(mapped.lower())
            if not template:
                print(f"[WARN] Sem template p/ '{mapped}' ‚Äî SKU {sku} pulado")
                continue

            base = instantiate_from_template(template, sku, name, price, elig, subcat)
            final = enrich_product_with_llm(base, template) if do_enrichment else base

            buckets_hw.setdefault(mapped, []).append(final)

        else:  # software
            # Escolhe categoria de software (se existir). Caso n√£o, bucket gen√©rico "Software"
            mapped_sw = map_sw_category_from_row(name, cat_cell)
            template_sw = SCHEMAS_SW.get(mapped_sw.lower())
            if not template_sw:
                # se n√£o houver schemas de software, cria um template m√≠nimo gen√©rico compat√≠vel
                template_sw = {
                    "cisco_product_id": "",
                    "commercial_name": "",
                    "product_type": "software",
                    "category": "Software",
                    "subcategory": None,
                    "lifecycle": {"status": "active","eos_announced": None,"last_support_date": None},
                    "pricing_model": {"type": "one_time","currency": "USD","base_price": 0.0,"elig_pct": 0.01}
                }

            base = instantiate_from_template(template_sw, sku, name, price, elig, subcat)
            final = enrich_product_with_llm(base, template_sw) if do_enrichment else base
            buckets_sw.setdefault(template_sw["category"], []).append(final)

    # aplica SAMPLE_PER_CATEGORY
    if sample_per_category is not None:
        for k in list(buckets_hw.keys()):
            buckets_hw[k] = buckets_hw[k][:sample_per_category]
        for k in list(buckets_sw.keys()):
            buckets_sw[k] = buckets_sw[k][:sample_per_category]

    # garante diret√≥rios
    OUT_HW_DIR.mkdir(parents=True, exist_ok=True)
    OUT_SW_DIR.mkdir(parents=True, exist_ok=True)

    # salva um arquivo por categoria
    for cat, items in buckets_hw.items():
        out_path = OUT_HW_DIR / f"{cat.lower().replace(' ','_')}.json"
        out_path.write_text(json.dumps(items, ensure_ascii=False, indent=2), encoding="utf-8")

    for cat, items in buckets_sw.items():
        out_path = OUT_SW_DIR / f"{cat.lower().replace(' ','_')}.json"
        out_path.write_text(json.dumps(items, ensure_ascii=False, indent=2), encoding="utf-8")

    print("‚úÖ Finalizado.")
    print("Hardware:", {k: len(v) for k, v in buckets_hw.items()})
    print("Software:", {k: len(v) for k, v in buckets_sw.items()})

# --- EXECU√á√ÉO --------------------------------------------------------------
process_excel_to_category_jsons(
    XLSX_PATH,
    sample_per_category=SAMPLE_PER_CATEGORY,
    do_enrichment=DO_ENRICHMENT
)


KeyboardInterrupt: 

In [36]:
1

1

In [62]:
# === Setup / Imports ==========================================================
import os, json, time, copy, re
from pathlib import Path
from collections import defaultdict

import pandas as pd
from tqdm import tqdm
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# ---------- Switches de execu√ß√£o ----------
ENRICH_WITH_LLM = True          # True para preencher campos faltantes via LLM
MAX_ITEMS_PER_CATEGORY = 3       # None = sem limite (use 3 para testar)
LIMIT_CATEGORIES = None          # ex.: ["Wireless","Switches"] para filtrar

# ---------- Caminhos ----------
XLSX_PATH       = Path("data/raw/Cisco_Pricing.xlsx")
SCHEMAS_HW_DIR  = Path("schemas/hardware")   # ex.: schema_switches.json, schema_wireless.json ...
SCHEMAS_SW_DIR  = Path("schemas/software")   # opcional; se vazio, usa template gen√©rico
OUTPUT_DIR_HW   = Path("out/hardware")
OUTPUT_DIR_SW   = Path("out/software")
CACHE_PATH      = Path("out/enrichment_cache.json")

OUTPUT_DIR_HW.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR_SW.mkdir(parents=True, exist_ok=True)
CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)

# === Cache util ===============================================================
def _load_cache(path: Path) -> dict:
    if path.exists():
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except Exception:
            return {}
    return {}

def _save_cache(path: Path, data: dict):
    tmp = path.with_suffix(".tmp.json")
    tmp.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    tmp.replace(path)

CACHE = _load_cache(CACHE_PATH)

# === Carregamento dos schemas =================================================
def load_schema_templates(dir_path: Path, expect_hw=True):
    templates = {}
    if dir_path.exists():
        for fp in sorted(dir_path.glob("*.json")):
            try:
                obj = json.loads(fp.read_text(encoding="utf-8"))
                cat = obj.get("category")
                if not cat:
                    base = fp.stem.replace("schema_", "")
                    cat = base.replace("_", " ").title()
                templates[cat] = obj
            except Exception as e:
                print(f"‚ö†Ô∏è Erro lendo {fp}: {e}")
    else:
        if expect_hw:
            print(f"‚ö†Ô∏è Diret√≥rio de schemas n√£o existe: {dir_path}")
    return templates

SCHEMAS_HW = load_schema_templates(SCHEMAS_HW_DIR, expect_hw=True)
SCHEMAS_SW = load_schema_templates(SCHEMAS_SW_DIR, expect_hw=False)

if not SCHEMAS_HW:
    raise RuntimeError(
        f"Nenhum schema de hardware encontrado em {SCHEMAS_HW_DIR}. "
        "Coloque seus arquivos schema_*.json l√° (ex.: schema_switches.json)."
    )

# === Template gen√©rico p/ software ===========================================
GENERIC_SW_SCHEMA = {
    "cisco_product_id": "",
    "commercial_name": "",
    "product_type": "software",
    "category": "",
    "subcategory": "",
    "lifecycle": {"status": "active", "eos_announced": None, "last_support_date": None},
    "license_model": {"type": "subscription", "term": None, "seats_or_nodes": None, "includes_support": True},
    "entitlements": {"features": [], "tier": None, "usage_limits": {}},
    "pricing_model": {"currency": "USD", "base_price": 0.0, "billing_cycle": "one_time", "pricing_tiers": []},
    "dependencies": {"requires": [], "compatibility": []},
    "regulatory": {"compliance": []},
    "metadata": {"vendor_sku_aliases": [], "notes": ""}
}

# === Normaliza√ß√£o das categorias =============================================
HW_CATEGORY_MAP = {
    "switch": "Switches", "switches": "Switches",
    "router": "Routers", "routers": "Routers",
    "firewall": "Firewall",
    "wireless": "Wireless", "access point": "Wireless", "ap": "Wireless",
    "antenna": "Antennas", "antennas": "Antennas",
    "cabling": "Cabling",
    "connector": "Connectors", "connectors": "Connectors",
}
SW_CATEGORY_MAP = {
    # se vier "Wireless" para software, manteremos "Wireless" (ver fallback)
    "license": "Licenses", "licensing": "Licenses",
    "subscription": "Subscriptions",
    "support": "Support",
    "cloud": "Cloud Services", "service": "Cloud Services",
}

def normalize_hw_category(raw: str) -> str | None:
    if not raw: return None
    s = str(raw).strip().lower()
    for key, cat in HW_CATEGORY_MAP.items():
        if key in s:
            return cat
    # igual ao nome
    if s in {v.lower() for v in HW_CATEGORY_MAP.values()}:
        return s.title()
    return None

def normalize_sw_category(raw: str) -> str | None:
    if not raw: return None
    s = str(raw).strip().lower()
    for key, cat in SW_CATEGORY_MAP.items():
        if key in s:
            return cat
    # permitir categorias como "Wireless" tamb√©m para software
    if s in {v.lower() for v in HW_CATEGORY_MAP.values()}:
        return s.title()
    return None

# === Detec√ß√£o de colunas no Excel ============================================
def detect_columns(df: pd.DataFrame) -> dict:
    cols = {c: re.sub(r"\s+", "", str(c).strip().lower()) for c in df.columns}
    out = {"sku": None, "desc": None, "price": None, "category": None, "category_type": None, "subcategory": None}

    for col, norm in cols.items():
        if out["sku"] is None and (norm == "sku" or "partnumber" in norm or norm == "part" or "sku" in norm):
            out["sku"] = col
        elif out["desc"] is None and (norm in {"desc","description","name"} or "desc" in norm or "description" in norm):
            out["desc"] = col
        elif out["price"] is None and (norm in {"price","pri","list"} or "price" in norm or norm == "pri"):
            out["price"] = col
        elif out["category_type"] is None and re.fullmatch(r"category[-_]?type", norm):
            out["category_type"] = col
        elif out["category"] is None and (norm == "category" or (("category" in norm) and ("type" not in norm))):
            out["category"] = col
        elif out["subcategory"] is None and ("subcategory" in norm or "sub-category" in norm or norm == "sub_category"):
            out["subcategory"] = col

    # sanity fallback
    if out["category"] is None and out["category_type"] is not None:
        # pelo menos garantimos que n√£o confundiremos
        pass

    missing = [k for k, v in out.items() if v is None and k in ("sku","desc","price")]
    if missing:
        raise RuntimeError(f"Excel sem colunas obrigat√≥rias: {missing}. Detectado: {out}")
    return out

def read_excel_rows(xlsx_path: Path) -> pd.DataFrame:
    if not xlsx_path.exists():
        raise FileNotFoundError(f"Excel n√£o encontrado: {xlsx_path}")
    df = pd.read_excel(xlsx_path, engine="openpyxl")
    cols = detect_columns(df)

    # Limpa pre√ßo (v√≠rgula decimal, separadores, etc.)
    price_series = (
        df[cols["price"]]
        .astype(str)
        .str.replace(r"[^\d\.,-]", "", regex=True)
        .str.replace(r"\.(?=\d{3},)", "", regex=True)  # remove separador de milhar antes de v√≠rgula decimal
        .str.replace(",", ".", regex=False)
    )
    df["_price"] = pd.to_numeric(price_series, errors="coerce").fillna(0.0)

    df["_sku"]   = df[cols["sku"]].astype(str).str.strip()
    df["_name"]  = df[cols["desc"]].astype(str).str.strip()
    df["_cat"]   = df[cols["category"]].astype(str).str.strip() if cols["category"] else ""
    df["_cat_type"] = df[cols["category_type"]].astype(str).str.strip() if cols["category_type"] else ""
    df["_sub"]   = df[cols["subcategory"]].astype(str).str.strip() if cols["subcategory"] else ""
    return df

# === LLM (enriquecimento opcional) ===========================================
def _extract_json(text: str) -> dict:
    """Aceita resposta com/sem fences e retorna um dict JSON."""
    t = text.strip()
    # remove codefence se vier
    if t.startswith("```"):
        t = re.sub(r"^```[a-zA-Z0-9]*\s*", "", t)
        t = re.sub(r"\s*```$", "", t)
    s = t.find("{"); e = t.rfind("}")
    if s != -1 and e != -1 and e > s:
        t = t[s:e+1]
    return json.loads(t)

llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.9)

ENRICH_PROMPT = ChatPromptTemplate.from_template(
    "you ara a Cisco expert\n"
    "You are a data normalizer. You receive a PARTIAL product JSON from Cisco catalog and a TARGET SCHEMA.\n"
    "Fill ONLY the missing fields, do not change existing values, do not invent extra keys.\n"
    "Keep structure and key names exactly as the schema. Use realistic, conservative values.\n"
    "If you don't know, keep null/empty.\n\n"
    "<<<PARTIAL_JSON>>>\n{partial}\n<<<END_PARTIAL_JSON>>>\n\n"
    "<<<TARGET_SCHEMA>>>\n{schema}\n<<<END_TARGET_SCHEMA>>>\n\n"
    "Return ONLY a valid JSON object matching the target schema."
)

def enrich_with_llm(partial: dict, schema_obj: dict, cache_key: str) -> dict:
    if not ENRICH_WITH_LLM:
        return partial
    if cache_key in CACHE:
        return CACHE[cache_key]
    chain = ENRICH_PROMPT | llm
    tries = 0
    while True:
        tries += 1
        try:
            resp = chain.invoke({
                "partial": json.dumps(partial, ensure_ascii=False, indent=2),
                "schema":  json.dumps(schema_obj, ensure_ascii=False, indent=2),
            })
            enriched = _extract_json(resp.content)

            out = copy.deepcopy(schema_obj)

            def deep_merge(dst, src):
                for k, v in src.items():
                    if isinstance(v, dict) and isinstance(dst.get(k), dict):
                        deep_merge(dst[k], v)
                    else:
                        dst[k] = v

            deep_merge(out, partial)
            deep_merge(out, enriched)

            CACHE[cache_key] = out
            if tries % 5 == 1:
                _save_cache(CACHE_PATH, CACHE)
            return out
        except Exception as e:
            if tries >= 3:
                print(f"‚ö†Ô∏è LLM falhou para {cache_key}: {e} ‚Äî usando partial.")
                return partial
            time.sleep(1.2)

# === Persist√™ncia incremental por categoria ===================================
def append_item_to_json(filepath: Path, item: dict):
    if filepath.exists():
        try:
            arr = json.loads(filepath.read_text(encoding="utf-8"))
            if not isinstance(arr, list):
                arr = []
        except Exception:
            arr = []
    else:
        arr = []
    arr.append(item)
    filepath.write_text(json.dumps(arr, ensure_ascii=False, indent=2), encoding="utf-8")

# === Pipeline principal =======================================================
def build_catalog_from_excel(
    xlsx_path: Path = XLSX_PATH,
    max_items_per_category: int | None = MAX_ITEMS_PER_CATEGORY,
    limit_categories: list[str] | None = LIMIT_CATEGORIES,
    verbose: bool = True
):
    df = read_excel_rows(xlsx_path)
    if verbose:
        print(f"Rows in Excel: {len(df)}")

    buckets_hw = defaultdict(list)
    buckets_sw = defaultdict(list)

    records = df.to_dict(orient="records")
    for rec in tqdm(records, desc="Classifying rows", disable=not verbose):
        sku   = rec.get("_sku", "").strip()
        name  = rec.get("_name", "").strip()
        price = float(rec.get("_price", 0.0) or 0.0)
        raw_cat = rec.get("_cat", "")
        raw_type= rec.get("_cat_type", "")
        subcat  = rec.get("_sub", "")

        # 1) Decide Hardware/Software pelo Category-Type, se existir
        pt = None
        rt = raw_type.strip().lower()
        if rt.startswith("hard"):
            pt = "hardware"
        elif rt.startswith("soft"):
            pt = "software"

        # 2) Normaliza categoria
        hw_cat = normalize_hw_category(raw_cat)    # ex.: "Wireless"
        sw_cat = normalize_sw_category(raw_cat)    # ex.: "Wireless" ou "Licenses"

        # 3) Se n√£o deu pra decidir tipo pelo Category-Type, decide por categoria
        if not pt:
            pt = "hardware" if hw_cat else ("software" if sw_cat else "hardware")

        # 4) Categoria final
        category = hw_cat if pt == "hardware" else (sw_cat or (raw_cat.strip().title() if raw_cat else "Licenses"))

        # 5) (Opcional) fallback por texto do produto
        if pt == "hardware" and not hw_cat:
            txt = f"{name} {sku}".lower()
            if "switch" in txt:             category = "Switches"
            elif "router" in txt:           category = "Routers"
            elif "firewall" in txt:         category = "Firewall"
            elif "access point" in txt or "ap " in txt or txt.startswith("ap-"): category = "Wireless"
            elif "antenna" in txt:          category = "Antennas"
            elif "cable" in txt:            category = "Cabling"
            elif "connector" in txt:        category = "Connectors"

        if limit_categories and category not in limit_categories:
            continue

        if pt == "hardware":
            schema = SCHEMAS_HW.get(category)
            if not schema:
                if verbose:
                    print(f"‚Ü™Ô∏è  Sem schema p/ HW category='{category}', SKU={sku} ‚Äî pulando")
                continue
            base = copy.deepcopy(schema)
            base["cisco_product_id"] = sku
            base["commercial_name"] = name
            base["product_type"] = "hardware"
            base.setdefault("pricing_model", {})
            base["pricing_model"]["currency"] = base["pricing_model"].get("currency", "USD")
            base["pricing_model"]["base_price"] = price
            base["technical_profile"] = base.get("technical_profile", {})
            base["technical_profile"]["category"] = category
            base["technical_profile"]["subcategory"] = subcat or ""
            buckets_hw[category].append(base)

        else:
            schema = SCHEMAS_SW.get(category, GENERIC_SW_SCHEMA)
            base = copy.deepcopy(schema)
            base["cisco_product_id"] = sku
            base["commercial_name"] = name
            base["product_type"] = "software"
            base["category"] = category
            base["pricing_model"] = base.get("pricing_model", {})
            base["pricing_model"]["currency"] = base["pricing_model"].get("currency", "USD")
            base["pricing_model"]["base_price"] = price
            if "subcategory" not in base:
                base["subcategory"] = subcat or ""
            buckets_sw[category].append(base)

    counts = {"hardware": 0, "software": 0, "files": 0}

    # HARDWARE ‚Üí 1 arquivo por categoria
    for category, items in buckets_hw.items():
        out_file = OUTPUT_DIR_HW / f"hw_{category.replace(' ', '_').lower()}.json"
        if verbose: print(f"\n[HW] {category}: {len(items)} itens ‚Üí {out_file}")
        n = 0
        for base in tqdm(items, desc=f"Enrich HW/{category}", disable=not verbose):
            sku = base.get("cisco_product_id", "")
            schema = SCHEMAS_HW[category]
            enriched = enrich_with_llm(base, schema, cache_key=f"HW::{category}::{sku}")
            append_item_to_json(out_file, enriched)
            counts["hardware"] += 1
            n += 1
            if max_items_per_category and n >= max_items_per_category:
                if verbose: print(f"  ‚Ü™Ô∏è  Limit reached ({max_items_per_category}) for HW/{category}")
                break
        counts["files"] += 1

    # SOFTWARE ‚Üí 1 arquivo por categoria
    for category, items in buckets_sw.items():
        out_file = OUTPUT_DIR_SW / f"sw_{category.replace(' ', '_').lower()}.json"
        if verbose: print(f"\n[SW] {category}: {len(items)} itens ‚Üí {out_file}")
        n = 0
        for base in tqdm(items, desc=f"Enrich SW/{category}", disable=not verbose):
            sku = base.get("cisco_product_id", "")
            schema = SCHEMAS_SW.get(category, GENERIC_SW_SCHEMA)
            enriched = enrich_with_llm(base, schema, cache_key=f"SW::{category}::{sku}")
            append_item_to_json(out_file, enriched)
            counts["software"] += 1
            n += 1
            if max_items_per_category and n >= max_items_per_category:
                if verbose: print(f"  ‚Ü™Ô∏è  Limit reached ({max_items_per_category}) for SW/{category}")
                break
        counts["files"] += 1

    _save_cache(CACHE_PATH, CACHE)

    if verbose:
        print("\n=== Summary ===")
        print(counts)
    return counts

# === EXECU√á√ÉO (teste r√°pido) ==================================================
summary = build_catalog_from_excel(
    xlsx_path=XLSX_PATH,
    max_items_per_category=MAX_ITEMS_PER_CATEGORY,
    limit_categories=LIMIT_CATEGORIES,   # ex.: ["Wireless","Switches"]
    verbose=True
)
summary


Rows in Excel: 4267


Classifying rows: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4267/4267 [00:00<00:00, 5309.40it/s]



[HW] Wireless: 509 itens ‚Üí out\hardware\hw_wireless.json


Enrich HW/Wireless:   0%|          | 2/509 [00:00<00:16, 31.29it/s]


  ‚Ü™Ô∏è  Limit reached (3) for HW/Wireless

[HW] Switches: 1669 itens ‚Üí out\hardware\hw_switches.json


Enrich HW/Switches:   0%|          | 2/1669 [00:00<00:47, 34.95it/s]


  ‚Ü™Ô∏è  Limit reached (3) for HW/Switches

[HW] Routers: 614 itens ‚Üí out\hardware\hw_routers.json


Enrich HW/Routers:   0%|          | 0/614 [00:00<?, ?it/s]

  ‚Ü™Ô∏è  Limit reached (3) for HW/Routers

Enrich HW/Routers:   0%|          | 2/614 [00:00<00:16, 38.02it/s]




[HW] Firewall: 180 itens ‚Üí out\hardware\hw_firewall.json


Enrich HW/Firewall:   1%|          | 2/180 [00:00<00:05, 34.16it/s]

  ‚Ü™Ô∏è  Limit reached (3) for HW/Firewall

[HW] Connectors: 21 itens ‚Üí out\hardware\hw_connectors.json



Enrich HW/Connectors:  10%|‚ñâ         | 2/21 [00:00<00:01, 12.84it/s]


  ‚Ü™Ô∏è  Limit reached (3) for HW/Connectors

[HW] Cabling: 3 itens ‚Üí out\hardware\hw_cabling.json


Enrich HW/Cabling:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 2/3 [00:00<00:00, 29.94it/s]


  ‚Ü™Ô∏è  Limit reached (3) for HW/Cabling

[HW] Antennas: 8 itens ‚Üí out\hardware\hw_antennas.json


Enrich HW/Antennas:  25%|‚ñà‚ñà‚ñå       | 2/8 [00:00<00:00, 33.80it/s]


  ‚Ü™Ô∏è  Limit reached (3) for HW/Antennas

[SW] Wireless: 545 itens ‚Üí out\software\sw_wireless.json


Enrich SW/Wireless:   0%|          | 2/545 [00:00<00:15, 34.23it/s]


  ‚Ü™Ô∏è  Limit reached (3) for SW/Wireless

[SW] Switches: 409 itens ‚Üí out\software\sw_switches.json


Enrich SW/Switches:   0%|          | 2/409 [00:00<00:13, 29.64it/s]

  ‚Ü™Ô∏è  Limit reached (3) for SW/Switches






[SW] Licenses: 9 itens ‚Üí out\software\sw_licenses.json


Enrich SW/Licenses:  22%|‚ñà‚ñà‚ñè       | 2/9 [00:00<00:00, 34.36it/s]

  ‚Ü™Ô∏è  Limit reached (3) for SW/Licenses






[SW] Routers: 250 itens ‚Üí out\software\sw_routers.json


Enrich SW/Routers:   1%|          | 2/250 [00:00<00:15, 16.27it/s]

  ‚Ü™Ô∏è  Limit reached (3) for SW/Routers






[SW] Firewall: 50 itens ‚Üí out\software\sw_firewall.json


Enrich SW/Firewall:   4%|‚ñç         | 2/50 [00:00<00:03, 12.72it/s]

  ‚Ü™Ô∏è  Limit reached (3) for SW/Firewall






=== Summary ===
{'hardware': 21, 'software': 15, 'files': 12}


{'hardware': 21, 'software': 15, 'files': 12}

In [66]:
# === Setup / Imports ==========================================================
import os, json, time, copy, re
from pathlib import Path
from collections import defaultdict

import pandas as pd
from tqdm import tqdm
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# ---------- Switches de execu√ß√£o ----------
ENRICH_WITH_LLM = True          # True para preencher campos faltantes via LLM
MAX_ITEMS_PER_CATEGORY = None       # None = sem limite (use 3 para testar)
LIMIT_CATEGORIES = None          # ex.: ["Wireless","Switches"] para filtrar

# ---------- Caminhos ----------
XLSX_PATH       = Path("data/raw/Cisco_Pricing.xlsx")
SCHEMAS_HW_DIR  = Path("schemas/hardware")   # ex.: schema_switches.json, schema_wireless.json ...
SCHEMAS_SW_DIR  = Path("schemas/software")   # opcional; se vazio, usa template gen√©rico
OUTPUT_DIR_HW   = Path("out/hardware")
OUTPUT_DIR_SW   = Path("out/software")
CACHE_PATH      = Path("out/enrichment_cache.json")

OUTPUT_DIR_HW.mkdir(parents=True, exist_ok=True)
OUTPUT_DIR_SW.mkdir(parents=True, exist_ok=True)
CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)

# === Cache util ===============================================================
def _load_cache(path: Path) -> dict:
    if path.exists():
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except Exception:
            return {}
    return {}

def _save_cache(path: Path, data: dict):
    tmp = path.with_suffix(".tmp.json")
    tmp.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    tmp.replace(path)

CACHE = _load_cache(CACHE_PATH)

# === Carregamento dos schemas =================================================
def load_schema_templates(dir_path: Path, expect_hw=True):
    templates = {}
    if dir_path.exists():
        for fp in sorted(dir_path.glob("*.json")):
            try:
                obj = json.loads(fp.read_text(encoding="utf-8"))
                cat = obj.get("category")
                if not cat:
                    base = fp.stem.replace("schema_", "")
                    cat = base.replace("_", " ").title()
                templates[cat] = obj
            except Exception as e:
                print(f"‚ö†Ô∏è Erro lendo {fp}: {e}")
    else:
        if expect_hw:
            print(f"‚ö†Ô∏è Diret√≥rio de schemas n√£o existe: {dir_path}")
    return templates

SCHEMAS_HW = load_schema_templates(SCHEMAS_HW_DIR, expect_hw=True)
SCHEMAS_SW = load_schema_templates(SCHEMAS_SW_DIR, expect_hw=False)

if not SCHEMAS_HW:
    raise RuntimeError(
        f"Nenhum schema de hardware encontrado em {SCHEMAS_HW_DIR}. "
        "Coloque seus arquivos schema_*.json l√° (ex.: schema_switches.json)."
    )

# === Template gen√©rico p/ software ===========================================
GENERIC_SW_SCHEMA = {
    "cisco_product_id": "",
    "commercial_name": "",
    "product_type": "software",
    "category": "",
    "subcategory": "",
    "lifecycle": {"status": "active", "eos_announced": None, "last_support_date": None},
    "license_model": {"type": "subscription", "term": None, "seats_or_nodes": None, "includes_support": True},
    "entitlements": {"features": [], "tier": None, "usage_limits": {}},
    "pricing_model": {"currency": "USD", "base_price": 0.0, "billing_cycle": "one_time", "pricing_tiers": []},
    "dependencies": {"requires": [], "compatibility": []},
    "regulatory": {"compliance": []},
    "metadata": {"vendor_sku_aliases": [], "notes": ""}
}

# === Normaliza√ß√£o das categorias =============================================
HW_CATEGORY_MAP = {
    "switch": "Switches", "switches": "Switches",
    "router": "Routers", "routers": "Routers",
    "firewall": "Firewall",
    "wireless": "Wireless", "access point": "Wireless", "ap": "Wireless",
    "antenna": "Antennas", "antennas": "Antennas",
    "cabling": "Cabling",
    "connector": "Connectors", "connectors": "Connectors",
}
SW_CATEGORY_MAP = {
    "license": "Licenses", "licensing": "Licenses",
    "subscription": "Subscriptions",
    "support": "Support",
    "cloud": "Cloud Services", "service": "Cloud Services",
}

def normalize_hw_category(raw: str) -> str | None:
    if not raw: return None
    s = str(raw).strip().lower()
    for key, cat in HW_CATEGORY_MAP.items():
        if key in s:
            return cat
    if s in {v.lower() for v in HW_CATEGORY_MAP.values()}:
        return s.title()
    return None

def normalize_sw_category(raw: str) -> str | None:
    if not raw: return None
    s = str(raw).strip().lower()
    for key, cat in SW_CATEGORY_MAP.items():
        if key in s:
            return cat
    if s in {v.lower() for v in HW_CATEGORY_MAP.values()}:
        return s.title()
    return None

# === Detec√ß√£o de colunas no Excel ============================================
def detect_columns(df: pd.DataFrame) -> dict:
    cols = {c: re.sub(r"\s+", "", str(c).strip().lower()) for c in df.columns}
    out = {"sku": None, "desc": None, "price": None, "category": None, "category_type": None, "subcategory": None}

    for col, norm in cols.items():
        if out["sku"] is None and (norm == "sku" or "partnumber" in norm or norm == "part" or "sku" in norm):
            out["sku"] = col
        elif out["desc"] is None and (norm in {"desc","description","name"} or "desc" in norm or "description" in norm):
            out["desc"] = col
        elif out["price"] is None and (norm in {"price","pri","list"} or "price" in norm or norm == "pri"):
            out["price"] = col
        elif out["category_type"] is None and re.fullmatch(r"category[-_]?type", norm):
            out["category_type"] = col
        elif out["category"] is None and (norm == "category" or (("category" in norm) and ("type" not in norm))):
            out["category"] = col
        elif out["subcategory"] is None and ("subcategory" in norm or "sub-category" in norm or norm == "sub_category"):
            out["subcategory"] = col

    missing = [k for k, v in out.items() if v is None and k in ("sku","desc","price")]
    if missing:
        raise RuntimeError(f"Excel sem colunas obrigat√≥rias: {missing}. Detectado: {out}")
    return out

def read_excel_rows(xlsx_path: Path) -> pd.DataFrame:
    if not xlsx_path.exists():
        raise FileNotFoundError(f"Excel n√£o encontrado: {xlsx_path}")
    df = pd.read_excel(xlsx_path, engine="openpyxl")
    cols = detect_columns(df)

    # Limpa pre√ßo (v√≠rgula decimal, separadores, etc.)
    price_series = (
        df[cols["price"]]
        .astype(str)
        .str.replace(r"[^\d\.,-]", "", regex=True)
        .str.replace(r"\.(?=\d{3},)", "", regex=True)  # remove separador de milhar antes de v√≠rgula decimal
        .str.replace(",", ".", regex=False)
    )
    df["_price"] = pd.to_numeric(price_series, errors="coerce").fillna(0.0)

    df["_sku"]   = df[cols["sku"]].astype(str).str.strip()
    df["_name"]  = df[cols["desc"]].astype(str).str.strip()
    df["_cat"]   = df[cols["category"]].astype(str).str.strip() if cols["category"] else ""
    df["_cat_type"] = df[cols["category_type"]].astype(str).str.strip() if cols["category_type"] else ""
    df["_sub"]   = df[cols["subcategory"]].astype(str).str.strip() if cols["subcategory"] else ""
    return df

# === LLM (enriquecimento opcional) ===========================================
def _extract_json(text: str) -> dict:
    """Aceita resposta com/sem fences e retorna um dict JSON."""
    t = text.strip()
    if t.startswith("```"):
        t = re.sub(r"^```[a-zA-Z0-9]*\s*", "", t)
        t = re.sub(r"\s*```$", "", t)
    s = t.find("{"); e = t.rfind("}")
    if s != -1 and e != -1 and e > s:
        t = t[s:e+1]
    return json.loads(t)

def _count_filled(obj) -> int:
    c = 0
    if isinstance(obj, dict):
        for v in obj.values(): c += _count_filled(v)
    elif isinstance(obj, list):
        for v in obj: c += _count_filled(v)
    else:
        if obj not in (None, "", []): c += 1
    return c

def _diff_keys(before: dict, after: dict, prefix=""):
    changes = []
    for k in after.keys():
        b = before.get(k, None)
        a = after.get(k, None)
        path = f"{prefix}.{k}" if prefix else k
        if isinstance(a, dict) and isinstance(b, dict):
            changes += _diff_keys(b, a, prefix=path)
        elif isinstance(a, list) and isinstance(b, list):
            if len(a) != len(b): changes.append(path + "[]")
        else:
            if (b in (None, "", []) and a not in (None, "", [])) or (b != a):
                changes.append(path)
    return changes

# Model config
MODEL_NAME = "gpt-4o-mini"
MODEL_TEMP = 0.3
llm = ChatOpenAI(model=MODEL_NAME, temperature=MODEL_TEMP)

ENRICH_PROMPT = ChatPromptTemplate.from_template(
    """You are a Cisco product data normalizer.

You receive:
- A PARTIAL product JSON (with SKU, name/description, raw category and base price).
- A TARGET SCHEMA (with the exact keys/structure we must output).

# GOAL
Fill ALL missing or empty fields you can reasonably infer from the SKU, product name/description and category.
Prefer realistic, conservative values (no marketing fluff). Keep units when applicable. If you don't know, leave null.

# STRICT RULES
- Output exactly the same key structure as TARGET SCHEMA (no extra/missing keys).
- Do NOT modify existing non-empty values in the partial JSON ‚Äî only fill blanks.
- For hardware, prioritize filling `technical_profile.hardware_attributes` (ports, poe_* fields, uplinks, stacking, wifi_standard), `dependencies`, and `compatibility`.
- For software, prioritize filling `license_model.term`, `billing_cycle`, `entitlements.features/tier`, and `dependencies`.
- Keep currency as provided. Do not invent prices. Be concise and consistent.

--- PARTIAL_JSON ---
{partial}

--- TARGET_SCHEMA ---
{schema}

Return ONLY a JSON object that matches the target schema (no explanations)."""
)

def enrich_with_llm(partial: dict, schema_obj: dict, cache_key: str) -> dict:
    if not ENRICH_WITH_LLM:
        return partial
    if cache_key in CACHE:
        return CACHE[cache_key]

    chain = ENRICH_PROMPT | llm
    tries = 0
    while True:
        tries += 1
        try:
            resp = chain.invoke({
                "partial": json.dumps(partial, ensure_ascii=False, indent=2),
                "schema":  json.dumps(schema_obj, ensure_ascii=False, indent=2),
            })
            enriched_json = _extract_json(resp.content)

            # Merge: schema baseline -> partial -> enriched
            out = copy.deepcopy(schema_obj)

            def deep_merge(dst, src):
                for k, v in src.items():
                    if isinstance(v, dict) and isinstance(dst.get(k), dict):
                        deep_merge(dst[k], v)
                    else:
                        dst[k] = v

            before = copy.deepcopy(out)
            deep_merge(out, partial)
            deep_merge(out, enriched_json)

            # Logs de enriquecimento
            before_count = _count_filled(partial)
            after_count  = _count_filled(out)
            delta        = after_count - before_count
            if delta <= 0:
                print(f"‚ÑπÔ∏è  {cache_key}: LLM retornou pouca coisa (delta={delta}).")
            else:
                changed = _diff_keys(partial, out)
                changed_preview = ", ".join(changed[:8]) + (" ..." if len(changed) > 8 else "")
                print(f"‚úÖ {cache_key}: +{delta} campos ‚Üí {changed_preview}")

            CACHE[cache_key] = out
            if (len(CACHE) % 10) == 0:
                _save_cache(CACHE_PATH, CACHE)
            return out

        except Exception as e:
            if tries >= 3:
                print(f"‚ö†Ô∏è LLM falhou para {cache_key}: {e} ‚Äî usando partial.")
                return partial
            time.sleep(1.2)

# === Persist√™ncia incremental por categoria ===================================
def append_item_to_json(filepath: Path, item: dict):
    if filepath.exists():
        try:
            arr = json.loads(filepath.read_text(encoding="utf-8"))
            if not isinstance(arr, list):
                arr = []
        except Exception:
            arr = []
    else:
        arr = []
    arr.append(item)
    filepath.write_text(json.dumps(arr, ensure_ascii=False, indent=2), encoding="utf-8")

# === Pipeline principal =======================================================
def build_catalog_from_excel(
    xlsx_path: Path = XLSX_PATH,
    max_items_per_category: int | None = MAX_ITEMS_PER_CATEGORY,
    limit_categories: list[str] | None = LIMIT_CATEGORIES,
    verbose: bool = True
):
    df = read_excel_rows(xlsx_path)
    if verbose:
        print(f"Rows in Excel: {len(df)}")

    buckets_hw = defaultdict(list)
    buckets_sw = defaultdict(list)

    records = df.to_dict(orient="records")
    for rec in tqdm(records, desc="Classifying rows", disable=not verbose):
        sku   = rec.get("_sku", "").strip()
        name  = rec.get("_name", "").strip()
        price = float(rec.get("_price", 0.0) or 0.0)
        raw_cat = rec.get("_cat", "")
        raw_type= rec.get("_cat_type", "")
        subcat  = rec.get("_sub", "")

        # 1) Decide Hardware/Software pelo Category-Type, se existir
        pt = None
        rt = raw_type.strip().lower()
        if rt.startswith("hard"):
            pt = "hardware"
        elif rt.startswith("soft"):
            pt = "software"

        # 2) Normaliza categoria
        hw_cat = normalize_hw_category(raw_cat)    # ex.: "Wireless"
        sw_cat = normalize_sw_category(raw_cat)    # ex.: "Wireless" ou "Licenses"

        # 3) Se n√£o deu pra decidir tipo pelo Category-Type, decide por categoria
        if not pt:
            pt = "hardware" if hw_cat else ("software" if sw_cat else "hardware")

        # 4) Categoria final
        category = hw_cat if pt == "hardware" else (sw_cat or (raw_cat.strip().title() if raw_cat else "Licenses"))

        # 5) Fallback por texto do produto (hardware)
        if pt == "hardware" and not hw_cat:
            txt = f"{name} {sku}".lower()
            if "switch" in txt:             category = "Switches"
            elif "router" in txt:           category = "Routers"
            elif "firewall" in txt:         category = "Firewall"
            elif "access point" in txt or "ap " in txt or txt.startswith("ap-"): category = "Wireless"
            elif "antenna" in txt:          category = "Antennas"
            elif "cable" in txt:            category = "Cabling"
            elif "connector" in txt:        category = "Connectors"

        if limit_categories and category not in limit_categories:
            continue

        if pt == "hardware":
            schema = SCHEMAS_HW.get(category)
            if not schema:
                if verbose:
                    print(f"‚Ü™Ô∏è  Sem schema p/ HW category='{category}', SKU={sku} ‚Äî pulando")
                continue
            base = copy.deepcopy(schema)
            base["cisco_product_id"] = sku
            base["commercial_name"] = name
            base["product_type"] = "hardware"
            base.setdefault("pricing_model", {})
            base["pricing_model"]["currency"] = base["pricing_model"].get("currency", "USD")
            base["pricing_model"]["base_price"] = price
            base["technical_profile"] = base.get("technical_profile", {})
            base["technical_profile"]["category"] = category
            base["technical_profile"]["subcategory"] = subcat or ""
            buckets_hw[category].append(base)

        else:
            schema = SCHEMAS_SW.get(category, GENERIC_SW_SCHEMA)
            base = copy.deepcopy(schema)
            base["cisco_product_id"] = sku
            base["commercial_name"] = name
            base["product_type"] = "software"
            base["category"] = category
            base["pricing_model"] = base.get("pricing_model", {})
            base["pricing_model"]["currency"] = base["pricing_model"].get("currency", "USD")
            base["pricing_model"]["base_price"] = price
            if "subcategory" not in base:
                base["subcategory"] = subcat or ""
            buckets_sw[category].append(base)

    counts = {"hardware": 0, "software": 0, "files": 0}

    # HARDWARE ‚Üí 1 arquivo por categoria
    for category, items in buckets_hw.items():
        out_file = OUTPUT_DIR_HW / f"hw_{category.replace(' ', '_').lower()}.json"
        if verbose: print(f"\n[HW] {category}: {len(items)} itens ‚Üí {out_file}")
        n = 0
        for base in tqdm(items, desc=f"Enrich HW/{category}", disable=not verbose):
            sku = base.get("cisco_product_id", "")
            schema = SCHEMAS_HW[category]
            enriched = enrich_with_llm(base, schema, cache_key=f"HW::{category}::{sku}")
            append_item_to_json(out_file, enriched)
            counts["hardware"] += 1
            n += 1
            if max_items_per_category and n >= max_items_per_category:
                if verbose: print(f"  ‚Ü™Ô∏è  Limit reached ({max_items_per_category}) for HW/{category}")
                break
        counts["files"] += 1

    # SOFTWARE ‚Üí 1 arquivo por categoria
    for category, items in buckets_sw.items():
        out_file = OUTPUT_DIR_SW / f"sw_{category.replace(' ', '_').lower()}.json"
        if verbose: print(f"\n[SW] {category}: {len(items)} itens ‚Üí {out_file}")
        n = 0
        for base in tqdm(items, desc=f"Enrich SW/{category}", disable=not verbose):
            sku = base.get("cisco_product_id", "")
            schema = SCHEMAS_SW.get(category, GENERIC_SW_SCHEMA)
            enriched = enrich_with_llm(base, schema, cache_key=f"SW::{category}::{sku}")
            append_item_to_json(out_file, enriched)
            counts["software"] += 1
            n += 1
            if max_items_per_category and n >= max_items_per_category:
                if verbose: print(f"  ‚Ü™Ô∏è  Limit reached ({max_items_per_category}) for SW/{category}")
                break
        counts["files"] += 1

    _save_cache(CACHE_PATH, CACHE)

    if verbose:
        print("\n=== Summary ===")
        print(counts)
    return counts

# === EXECU√á√ÉO (teste r√°pido) ==================================================
summary = build_catalog_from_excel(
    xlsx_path=XLSX_PATH,
    max_items_per_category=MAX_ITEMS_PER_CATEGORY,
    limit_categories=LIMIT_CATEGORIES,   # ex.: ["Wireless","Switches"]
    verbose=True
)
summary


Rows in Excel: 4267


Classifying rows: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4267/4267 [00:00<00:00, 14645.52it/s]



[HW] Wireless: 509 itens ‚Üí out\hardware\hw_wireless.json


Enrich HW/Wireless:   1%|          | 3/509 [00:10<28:53,  3.43s/it]


KeyboardInterrupt: 

In [68]:
# ================= DEBUG CONFIG =================
DEBUG_ENRICH = True
DEBUG_DIR = Path("out/debug_raw")
DEBUG_DIR.mkdir(parents=True, exist_ok=True)

# =============== UTILS ==========================
def _extract_json(text: str) -> dict:
    """Aceita resposta com/sem fences e retorna um dict JSON."""
    t = text.strip()
    # remove codefence se vier
    if t.startswith("```"):
        t = re.sub(r"^```[a-zA-Z0-9]*\s*", "", t)
        t = re.sub(r"\s*```$", "", t)
    s = t.find("{"); e = t.rfind("}")
    if s != -1 and e != -1 and e > s:
        t = t[s:e+1]
    return json.loads(t)

def _count_filled(obj) -> int:
    c = 0
    if isinstance(obj, dict):
        for v in obj.values(): c += _count_filled(v)
    elif isinstance(obj, list):
        for v in obj: c += _count_filled(v)
    else:
        if obj not in (None, "", []): c += 1
    return c

def _diff_keys(before: dict, after: dict, prefix=""):
    changes = []
    for k in after.keys():
        b = before.get(k, None)
        a = after.get(k, None)
        path = f"{prefix}.{k}" if prefix else k
        if isinstance(a, dict) and isinstance(b, dict):
            changes += _diff_keys(b, a, prefix=path)
        elif isinstance(a, list) and isinstance(b, list):
            if len(a) != len(b): changes.append(path + "[]")
        else:
            if (b in (None, "", []) and a not in (None, "", [])) or (b != a):
                changes.append(path)
    return changes

def _validate_structure_against_schema(schema_obj: dict, data_obj: dict, prefix=""):
    """Retorna listas (missing_keys, extra_keys) para auditoria r√°pida."""
    missing, extra = [], []
    # chaves que existem no schema mas n√£o no dado
    for k in schema_obj.keys():
        if k not in data_obj:
            missing.append(f"{prefix}{k}")
        else:
            if isinstance(schema_obj[k], dict) and isinstance(data_obj[k], dict):
                m, e = _validate_structure_against_schema(schema_obj[k], data_obj[k], prefix=f"{prefix}{k}.")
                missing += m; extra += e
            # se schema √© dict e dado √© lista ou primitivo, n√£o for√ßamos, s√≥ reportar diferen√ßa de tipo se quiser
    # chaves extras no dado
    for k in data_obj.keys():
        if k not in schema_obj:
            extra.append(f"{prefix}{k}")
    return missing, extra
# ===============================================

# ======= LLM CONFIG (for√ßar JSON) ==============
MODEL_NAME = "gpt-4o-mini"
MODEL_TEMP = 0.3
# For√ßa JSON puro
llm_json = ChatOpenAI(model=MODEL_NAME, temperature=MODEL_TEMP,
                      response_format={"type": "json_object"})

ENRICH_PROMPT = ChatPromptTemplate.from_template(
    """You are a Cisco product data normalizer.

You receive:
- A PARTIAL product JSON (with SKU, name/description, raw category and base price).
- A TARGET SCHEMA (with the exact keys/structure we must output).

# GOAL
Fill ALL missing or empty fields you can reasonably infer from the SKU, product name/description and category.
Prefer realistic, conservative values (no marketing fluff). Keep units when applicable. If you don't know, leave null.

# STRICT RULES
- Output exactly the same key structure as TARGET SCHEMA (no extra/missing keys).
- Do NOT modify existing non-empty values in the partial JSON ‚Äî only fill blanks.
- For hardware, prioritize filling `technical_profile.hardware_attributes` (ports, poe_* fields, uplinks, stacking, wifi_standard), `dependencies`, and `compatibility`.
- For software, prioritize filling `license_model.term`, `billing_cycle`, `entitlements.features/tier`, and `dependencies`.
- Keep currency as provided. Do not invent prices. Be concise and consistent.

--- PARTIAL_JSON ---
{partial}

--- TARGET_SCHEMA ---
{schema}

Return ONLY a JSON object that matches the target schema (no explanations)."""
)

def enrich_with_llm(partial: dict, schema_obj: dict, cache_key: str) -> dict:
    """Enriquece o 'partial' segundo 'schema_obj'.
       Salva debug: prompt e resposta crus; valida estrutura; loga delta de campos."""
    if not ENRICH_WITH_LLM:
        return partial
    if cache_key in CACHE:
        return CACHE[cache_key]

    chain = ENRICH_PROMPT | llm_json
    tries = 0
    while True:
        tries += 1
        try:
            # 1) chama LLM
            input_vars = {
                "partial": json.dumps(partial, ensure_ascii=False, indent=2),
                "schema":  json.dumps(schema_obj, ensure_ascii=False, indent=2),
            }
            resp = chain.invoke(input_vars)
            raw_text = resp.content

            # 2) DEBUG: salva prompt + resposta crua
            if DEBUG_ENRICH:
                (DEBUG_DIR / f"{cache_key.replace('::','__')}.prompt.json").write_text(
                    json.dumps(input_vars, ensure_ascii=False, indent=2), encoding="utf-8"
                )
                (DEBUG_DIR / f"{cache_key.replace('::','__')}.raw.txt").write_text(
                    raw_text, encoding="utf-8"
                )

            # 3) extrai JSON
            enriched_json = _extract_json(raw_text)

            # 4) merge: schema -> partial -> enriched
            out = copy.deepcopy(schema_obj)
            def deep_merge(dst, src):
                for k, v in src.items():
                    if isinstance(v, dict) and isinstance(dst.get(k), dict):
                        deep_merge(dst[k], v)
                    else:
                        # mant√©m o que veio em src (inclusive preenchendo vazios)
                        dst[k] = v

            before = copy.deepcopy(out)
            deep_merge(out, partial)
            deep_merge(out, enriched_json)

            # 5) valida estrutura contra schema
            missing, extra = _validate_structure_against_schema(schema_obj, out)
            if DEBUG_ENRICH and (missing or extra):
                print(f"‚ö†Ô∏è  {cache_key}: estrutura divergente ‚Äî missing={len(missing)} extra={len(extra)}")
                (DEBUG_DIR / f"{cache_key.replace('::','__')}.missing_keys.txt").write_text(
                    "\n".join(missing), encoding="utf-8"
                )
                (DEBUG_DIR / f"{cache_key.replace('::','__')}.extra_keys.txt").write_text(
                    "\n".join(extra), encoding="utf-8"
                )

            # 6) log de campos preenchidos
            before_count = _count_filled(partial)
            after_count  = _count_filled(out)
            delta        = after_count - before_count
            if delta <= 0:
                print(f"‚ÑπÔ∏è  {cache_key}: LLM n√£o acrescentou campos (delta={delta}). Veja {DEBUG_DIR} para prompt/sa√≠da.")
            else:
                changed = _diff_keys(partial, out)
                changed_preview = ", ".join(changed[:8]) + (" ..." if len(changed) > 8 else "")
                print(f"‚úÖ {cache_key}: +{delta} campos ‚Üí {changed_preview}")

            # 7) cache + return
            CACHE[cache_key] = out
            if (len(CACHE) % 10) == 0:
                _save_cache(CACHE_PATH, CACHE)
            return out

        except Exception as e:
            # DEBUG: log de erro com a √∫ltima resposta crua (se houver)
            print(f"‚ùå  {cache_key}: falha ao parsear/mesclar ‚Äî {e}")
            if tries >= 3:
                print(f"‚ö†Ô∏è  {cache_key}: desistindo ap√≥s {tries} tentativas. Usando partial.")
                return partial
            time.sleep(1.2)

# ============ SMOKE TEST (rode antes do pipeline) ============================
def smoke_test_enricher(sku: str, name: str, price: float, category: str, schema_bank: dict):
    """
    Faz um teste isolado de enriquecimento com 1 produto.
    - schema_bank: SCHEMAS_HW ou SCHEMAS_SW
    """
    schema = schema_bank.get(category)
    if not schema:
        print(f"üö´ Schema n√£o encontrado para categoria '{category}'.")
        return
    partial = copy.deepcopy(schema)
    # preenche apenas o m√≠nimo (como vem do Excel)
    partial["cisco_product_id"] = sku
    partial["commercial_name"]  = name
    if "product_type" in partial:
        # mant√©m como est√° no schema
        pass
    # pre√ßo
    if "pricing_model" in partial:
        partial["pricing_model"]["base_price"] = price
        partial["pricing_model"]["currency"] = partial["pricing_model"].get("currency", "USD")

    out = enrich_with_llm(partial, schema, cache_key=f"SMOKE::{category}::{sku}")
    print("\n--- RESULTADO (resumo) ---")
    print(json.dumps(out, ensure_ascii=False, indent=2)[:1200], "...\n")
    return out


In [70]:
_ = smoke_test_enricher(
    sku="MS225-48FP-HW",
    name="Meraki MS225-48FP L2 Stacking PoE Switch",
    price=7770.00,
    category="Switches",            # deve bater com o nome no schema_*.json
    schema_bank=SCHEMAS_HW          # ou SCHEMAS_SW para software
)


‚ÑπÔ∏è  SMOKE::Switches::MS225-48FP-HW: LLM n√£o acrescentou campos (delta=0). Veja out\debug_raw para prompt/sa√≠da.

--- RESULTADO (resumo) ---
{
  "cisco_product_id": "MS225-48FP-HW",
  "commercial_name": "Meraki MS225-48FP L2 Stacking PoE Switch",
  "product_type": "hardware",
  "category": "Switches",
  "subcategory": null,
  "lifecycle": {
    "status": "active",
    "eos_announced": null,
    "last_support_date": null
  },
  "pricing_model": {
    "type": "one_time",
    "currency": "USD",
    "base_price": 7770.0,
    "elig_pct": 0.01,
    "pricing_tiers": [
      {
        "min_quantity": 1,
        "price": 0.0,
        "effective": "2025-01-01",
        "discount_rules": [
          {
            "type": "volume",
            "threshold": 10,
            "discount_pct": 0.15
          }
        ]
      }
    ]
  },
  "dependencies": {
    "required_components": [],
    "compatible_with": []
  },
  "regulatory": {
    "certifications": [
      "FCC",
      "CE",
      "IC"
  

In [83]:
# %% ============================== Setup / Imports ==============================
import os, json, time, copy, re
from pathlib import Path
from collections import defaultdict

import pandas as pd
from tqdm import tqdm
from langchain_openai import ChatOpenAI
from langchain.prompts import ChatPromptTemplate

# -------------------- Controles de execu√ß√£o --------------------
ENRICH_WITH_LLM = True          # True => preenche via LLM
MAX_ITEMS_PER_CATEGORY = 3       # None = sem limite (use 3 p/ teste)
LIMIT_CATEGORIES = None          # ex.: ["Switches","Wireless"] para filtrar
DEBUG_SMOKE = True               # salva prompt/resposta da LLM p/ auditoria
TQDM_VERBOSE = True

# -------------------- Caminhos --------------------
XLSX_PATH       = Path("data/raw/Cisco_Pricing.xlsx")
SCHEMAS_HW_DIR  = Path("schemas/hardware")
SCHEMAS_SW_DIR  = Path("schemas/software")
OUTPUT_DIR_HW   = Path("out/hardware")
OUTPUT_DIR_SW   = Path("out/software")
CACHE_PATH      = Path("out/enrichment_cache.json")
DEBUG_DIR       = Path("out/debug_llm")

for p in [OUTPUT_DIR_HW, OUTPUT_DIR_SW, CACHE_PATH.parent, DEBUG_DIR]:
    p.mkdir(parents=True, exist_ok=True)

# %% ============================== Cache utils =================================
def _load_cache(path: Path) -> dict:
    if path.exists():
        try:
            return json.loads(path.read_text(encoding="utf-8"))
        except Exception:
            return {}
    return {}

def _save_cache(path: Path, data: dict):
    tmp = path.with_suffix(".tmp.json")
    tmp.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
    tmp.replace(path)

CACHE = _load_cache(CACHE_PATH)

# %% ======================= Carregamento dos schemas ===========================
def load_schema_templates(dir_path: Path, expect_hw=True):
    """
    L√™ todos os schema_*.json e devolve: { "<Categoria>": <dict do schema> }
    Ex.: schema_switches.json -> categoria "Switches"
    """
    templates = {}
    if dir_path.exists():
        for fp in sorted(dir_path.glob("*.json")):
            try:
                obj = json.loads(fp.read_text(encoding="utf-8"))
                cat = obj.get("category")
                if not cat:
                    base = fp.stem.replace("schema_", "")
                    cat = base.replace("_", " ").title()
                templates[cat] = obj
            except Exception as e:
                print(f"‚ö†Ô∏è Erro lendo {fp}: {e}")
    else:
        if expect_hw:
            print(f"‚ö†Ô∏è Diret√≥rio de schemas n√£o existe: {dir_path}")
    return templates

SCHEMAS_HW = load_schema_templates(SCHEMAS_HW_DIR, expect_hw=True)
SCHEMAS_SW = load_schema_templates(SCHEMAS_SW_DIR, expect_hw=False)

if not SCHEMAS_HW:
    raise RuntimeError(
        f"Nenhum schema de hardware encontrado em {SCHEMAS_HW_DIR}. "
        "Coloque seus arquivos schema_*.json l√° (ex.: schema_switches.json)."
    )

# Fallback gen√©rico se n√£o houver schema de software espec√≠fico
GENERIC_SW_SCHEMA = {
    "cisco_product_id": "",
    "commercial_name": "",
    "product_type": "software",
    "category": "",
    "subcategory": "",
    "lifecycle": {"status": "active", "eos_announced": None, "last_support_date": None},
    "license_model": {"type": "subscription", "term": "1Y", "seats_or_nodes": 1, "includes_support": True},
    "entitlements": {"features": [], "tier": "Base", "usage_limits": {}},
    "pricing_model": {"currency": "USD", "base_price": 0.0, "billing_cycle": "yearly", "pricing_tiers": []},
    "dependencies": {"requires": [], "compatibility": []},
    "regulatory": {"compliance": []},
    "metadata": {"vendor_sku_aliases": [], "notes": ""}
}

# %% ======================= Normaliza√ß√£o de categorias =========================
HW_CATEGORY_MAP = {
    "switch": "Switches", "switches": "Switches",
    "router": "Routers", "routers": "Routers",
    "firewall": "Firewall",
    "wireless": "Wireless", "access point": "Wireless", "ap ": "Wireless", " ap-": "Wireless",
    "antenna": "Antennas", "antennas": "Antennas",
    "cabling": "Cabling",
    "connector": "Connectors", "connectors": "Connectors",
}

SW_CATEGORY_MAP = {
    "license": "Licenses", "licensing": "Licenses",
    "subscription": "Subscriptions",
    "support": "Support",
    "cloud": "Cloud Services", "service": "Cloud Services",
    "wireless": "Wireless"  # permitir reuso de categoria
}

def normalize_hw_category(raw: str):
    if not raw: return None
    s = " " + str(raw).strip().lower() + " "
    for key, cat in HW_CATEGORY_MAP.items():
        if f" {key} " in s:
            return cat
    if str(raw).strip().title() in HW_CATEGORY_MAP.values():
        return str(raw).strip().title()
    return None

def normalize_sw_category(raw: str):
    if not raw: return None
    s = " " + str(raw).strip().lower() + " "
    for key, cat in SW_CATEGORY_MAP.items():
        if f" {key} " in s:
            return cat
    if str(raw).strip().title() in HW_CATEGORY_MAP.values():
        return str(raw).strip().title()
    return None

# ===== Heur√≠stica para detectar licen√ßas/software pelo SKU/Desc =====
_LICENSE_TOKENS = [
    r"^l-", r"^lic-", r"-lic($|[^a-z0-9])", r"\blicen[cs]e\b", r"\blic\b",
    r"\bdna\b", r"\bnw-?[ae]\b", r"network advantage", r"network essentials",
    r"\bsubscription\b", r"\bsupport\b", r"\bsnt\b", r"\bsmart\s*net\b",
    r"\bmeraki license\b", r"\bco-term\b", r"\basa\s*lic\b", r"\bfirepower\s*license\b",
    r"\b1y\b", r"\b3y\b", r"\b5y\b", r"\b1yr\b", r"\b3yr\b", r"\b5yr\b",
]

def is_license_like(sku: str, name: str) -> bool:
    txt = f"{sku} {name}".strip().lower()
    for pat in _LICENSE_TOKENS:
        if re.search(pat, txt):
            return True
    return False

# %% ============================ Excel parsing =================================
def detect_columns(df: pd.DataFrame) -> dict:
    # alvo: Category-Type, Category, SKU, Desc, price, Elig_%
    out = {"category_type": None, "category": None, "sku": None, "desc": None, "price": None, "elig": None}
    for col in df.columns:
        norm = re.sub(r"\s+", "", str(col).strip().lower())
        if out["category_type"] is None and norm in {"category-type", "categorytype"}:
            out["category_type"] = col
        elif out["category"] is None and norm == "category":
            out["category"] = col
        elif out["sku"] is None and norm in {"sku","partnumber","part","part#","productid","product_id"}:
            out["sku"] = col
        elif out["desc"] is None and norm in {"desc","description","name","productname","product_name"}:
            out["desc"] = col
        elif out["price"] is None and norm in {"price","list","listprice","list_price"}:
            out["price"] = col
        elif out["elig"] is None and norm in {"elig_%","elig","eligibility","eligpercent","eligibility%","elig_pct"}:
            out["elig"] = col

    missing = [k for k in ("sku","desc","price") if out[k] is None]
    if missing:
        raise RuntimeError(f"Excel sem colunas obrigat√≥rias: {missing}. Detectado: {out}")
    return out

def read_excel_rows(xlsx_path: Path) -> pd.DataFrame:
    if not xlsx_path.exists():
        raise FileNotFoundError(f"Excel n√£o encontrado: {xlsx_path}")
    df = pd.read_excel(xlsx_path, engine="openpyxl")
    cols = detect_columns(df)

    # limpar pre√ßo
    price_series = (
        df[cols["price"]]
        .astype(str)
        .str.replace(r"[^\d\.,-]", "", regex=True)
        .str.replace(r"\.(?=\d{3},)", "", regex=True)
        .str.replace(",", ".", regex=False)
    )
    df["_price"] = pd.to_numeric(price_series, errors="coerce").fillna(0.0)

    elig_series = None
    if cols["elig"] is not None:
        elig_series = (
            df[cols["elig"]]
            .astype(str)
            .str.replace(r"[^\d\.,-]", "", regex=True)
            .str.replace(",", ".", regex=False)
        )
        df["_elig"] = pd.to_numeric(elig_series, errors="coerce").fillna(0.0)
    else:
        df["_elig"] = 0.0

    df["_sku"]  = df[cols["sku"]].astype(str).str.strip()
    df["_name"] = df[cols["desc"]].astype(str).str.strip()
    df["_cat"]  = df[cols["category"]].astype(str).str.strip() if cols["category"] else ""
    df["_cat_type"] = df[cols["category_type"]].astype(str).str.strip() if cols["category_type"] else ""
    return df

# %% ======================= Helpers de merge/trava =============================
def pick_environment(sku: str, name: str, category: str) -> dict:
    """
    Heur√≠stica simples para ambiente:
    - Indoor enterprise (default): 0..45 ¬∞C, IP30
    - Fanless/compact: 0..50 ¬∞C, IP30
    - Outdoor (AP/switch outdoor): -20..55 ¬∞C, IP67
    - Industrial/Rugged (IE/IR/IW etc): -40..75 ¬∞C, IP54 ou IP67
    - Software/licen√ßa: sem ambiente (retornamos None para o caller tratar)
    """
    txt = f"{sku} {name}".lower()
    cat = (category or "").lower()

    # Software (s√≥ pra seguran√ßa, mesmo que n√£o seja chamado pra SW)
    if cat in {"licenses", "subscriptions", "support", "cloud services"}:
        return None

    # Palavras-chave
    is_outdoor = any(w in txt for w in ["outdoor", "mr76", "mr86", "mr84", "ap-outdoor"])
    is_industrial = any(w in txt for w in ["industrial", "rugged", "ie-", "ir", "iw", "cgr", "gr-" , "ic-", "ix-"])
    is_fanless = "fanless" in txt or "compact" in txt

    # Wireless outdoor (Meraki/Cisco)
    if is_outdoor or ("wireless" in cat and any(w in txt for w in ["mr7", "mr8"])):  # heur√≠stica leve
        return {"oper_temp_min_c": -20, "oper_temp_max_c": 55, "ip_rating": "IP67"}

    # Industrial/rugged switches/routers (IE/IR/CGR)
    if is_industrial:
        # alguns IE/IR s√£o IP54; se mencionar "outdoor"/"ip67", sobe
        ip = "IP67" if "ip67" in txt or "outdoor" in txt else "IP54"
        return {"oper_temp_min_c": -40, "oper_temp_max_c": 75, "ip_rating": ip}

    # Indoor enterprise (default)
    if is_fanless:
        return {"oper_temp_min_c": 0, "oper_temp_max_c": 50, "ip_rating": "IP30"}

    # Switch/AP enterprise normal
    return {"oper_temp_min_c": 0, "oper_temp_max_c": 45, "ip_rating": "IP30"}



def deep_get(obj, path_list, default=None):
    cur = obj
    for p in path_list:
        if not isinstance(cur, dict) or p not in cur:
            return default
        cur = cur[p]
    return cur

def deep_set(obj, path_list, value):
    cur = obj
    for p in path_list[:-1]:
        if p not in cur or not isinstance(cur[p], dict):
            cur[p] = {}
        cur = cur[p]
    cur[path_list[-1]] = value

def upsert_item_to_json(filepath: Path, item: dict, key="cisco_product_id"):
    arr = []
    if filepath.exists():
        try:
            arr = json.loads(filepath.read_text(encoding="utf-8"))
            if not isinstance(arr, list):
                arr = []
        except Exception:
            arr = []
    sku = item.get(key)
    arr = [x for x in arr if x.get(key) != sku]
    arr.append(item)
    filepath.write_text(json.dumps(arr, ensure_ascii=False, indent=2), encoding="utf-8")

# Campos travados (do Excel)
LOCK_PATHS = [
    ["cisco_product_id"],
    ["commercial_name"],
    ["product_type"],
    ["pricing_model", "base_price"],
    ["pricing_model", "elig_pct"],
    ["technical_profile", "category"],  # HW
    ["category"],                      # SW
]

def apply_locks(enriched: dict, locked_from_excel: dict) -> dict:
    out = copy.deepcopy(enriched)
    for path in LOCK_PATHS:
        val = deep_get(locked_from_excel, path, default=None)
        if val is not None:
            deep_set(out, path, val)
    return out

# %% ========================== LLM (enriquecimento) ===========================
def _shape_from_schema(schema_obj):
    """
    Gera a mesma estrutura de chaves do schema, mas zera TODOS os valores
    para evitar que a LLM copie defaults do schema.
    """
    if isinstance(schema_obj, dict):
        out = {}
        for k, v in schema_obj.items():
            out[k] = _shape_from_schema(v)
        return out
    elif isinstance(schema_obj, list):
        return [_shape_from_schema(schema_obj[0])] if schema_obj else []
    else:
        return None

def _extract_json(text: str) -> dict:
    t = text.strip()
    if t.startswith("```"):
        t = re.sub(r"^```[a-zA-Z0-9]*\s*", "", t)
        t = re.sub(r"\s*```$", "", t)
    s = t.find("{"); e = t.rfind("}")
    if s != -1 and e != -1 and e > s:
        t = t[s:e+1]
    return json.loads(t)

def _has_nulls_or_empty(d):
    if isinstance(d, dict):
        for v in d.values():
            if _has_nulls_or_empty(v):
                return True
        return False
    if isinstance(d, list):
        if len(d) == 0:
            return True
        return any(_has_nulls_or_empty(v) for v in d)
    return d in (None, "", [])

# ---- fallback program√°tico para eliminar nulos restantes (plaus√≠vel por categoria)
def _fallback_fill_plausible(obj: dict, pt: str, category: str, sku: str, name: str) -> dict:
    out = copy.deepcopy(obj)

    def ensure(path, value):
        cur = deep_get(out, path, None)
        if cur in (None, "", []):
            deep_set(out, path, value)

    # ---- Defaults comuns
    ensure(["lifecycle", "status"], "active")
    ensure(["lifecycle", "eos_announced"], "2030-12-31")
    ensure(["lifecycle", "last_support_date"], "2035-12-31")

    ensure(["pricing_model", "currency"], "USD")
    if pt == "hardware":
        ensure(["pricing_model", "type"], "one_time")
        # Subcategoria na RAIZ (se existir no schema) e em technical_profile
        if "subcategory" in out and out["subcategory"] in (None, ""):
            out["subcategory"] = "access_switch" if category == "Switches" else category.lower()
        ensure(
            ["technical_profile", "subcategory"],
            out.get("subcategory", "access_switch" if category == "Switches" else "base")
        )
    else:
        ensure(["pricing_model", "type"], "subscription")
        ensure(["pricing_model", "billing_cycle"], "yearly")
        if "subcategory" in out and out["subcategory"] in (None, ""):
            out["subcategory"] = "feature_license"

    # Regulatory comuns
    ensure(["regulatory", "certifications"], ["FCC", "CE", "IC"])
    ensure(["regulatory", "environment", "oper_temp_min_c"], 0)
    ensure(["regulatory", "environment", "oper_temp_max_c"], 50)
    ensure(["regulatory", "environment", "ip_rating"], "IP30")

    # ---- Espec√≠ficos por categoria
    txt = f"{sku} {name}".lower()

    if pt == "hardware" and category == "Firewall":
        ensure(["attributes", "firewall", "fw_throughput_gbps"], 1.0)
        ensure(["attributes", "firewall", "ngfw_throughput_gbps"], 0.7)
        ensure(["attributes", "firewall", "ips_throughput_gbps"], 0.6)
        ensure(["attributes", "firewall", "ipsec_vpn_gbps"], 0.5)
        ensure(["attributes", "firewall", "max_sessions_m"], 0.5)
        ensure(["attributes", "firewall", "interfaces"], [{"type": "GE", "qty": 8}])
        ensure(["attributes", "firewall", "ha_supported"], True)
        ensure(["attributes", "firewall", "utm_services"], ["IPS", "AV", "URL"])

    elif pt == "hardware" and category == "Switches":
        ports = 48 if re.search(r"\b48\b", txt) else (24 if re.search(r"\b24\b", txt) else 24)
        poe = 1 if ("poe" in txt or "-p" in sku.lower()) else 0
        ensure(["attributes", "switch", "layer"], "L2/L3")
        ensure(["attributes", "switch", "ports_total"], ports)
        ensure(["attributes", "switch", "poe_ports"], ports if poe else 0)
        ensure(["attributes", "switch", "poe_budget_w"], 740 if poe else 0)
        ensure(["attributes", "switch", "uplinks"], [{"type": "SFP+", "speed_gbps": 10, "qty": 4}])
        ensure(["attributes", "switch", "stacking", "supported"], True)
        ensure(["attributes", "switch", "stacking", "max_members"], 8)
        ensure(["attributes", "switch", "stacking", "stack_bw_gbps"], 80)
        ensure(["attributes", "switch", "switching_capacity_gbps"], 216 if ports == 48 else 160)
        ensure(["attributes", "switch", "forwarding_mpps"], 130 if ports == 48 else 95)
        ensure(["attributes", "switch", "latency_us"], 3.2)
        ensure(["attributes", "switch", "features"], ["RSTP", "PVLAN", "ACLs", "QoS"])

    elif pt == "hardware" and category == "Wireless":
        ensure(["technical_profile", "subcategory"], "access_point")
        ensure(["attributes", "wireless", "wifi_standard"], "802.11ax (Wi-Fi 6)")
        ensure(["attributes", "wireless", "radios"], ["2.4 GHz", "5 GHz"])
        ensure(["attributes", "wireless", "antenna_type"], "internal")
        ensure(["attributes", "wireless", "throughput_mbps"], 1200)
        ensure(["attributes", "wireless", "mounting"], "indoor")
        ensure(["attributes", "wireless", "power_requirements"], "PoE+")

    elif pt == "software":
        # Ajustes de licen√ßas
        if any(t in txt for t in ["1y", "1yr"]):
            term = "1Y"
        elif any(t in txt for t in ["3y", "3yr"]):
            term = "3Y"
        elif any(t in txt for t in ["5y", "5yr"]):
            term = "5Y"
        else:
            term = "1Y"
        ensure(["license_model", "type"], "subscription")
        ensure(["license_model", "term"], term)
        ensure(["license_model", "seats_or_nodes"], 1)
        ensure(["license_model", "includes_support"], True)

        if "advantage" in txt:
            tier = "Network Advantage"
            feats = ["Advanced routing", "Segmentation", "Telemetry"]
        elif "essentials" in txt:
            tier = "Network Essentials"
            feats = ["Layer 2/3 basic", "Access policies", "Basic monitoring"]
        elif "dna" in txt:
            tier = "Cisco DNA Advantage"
            feats = ["SD-Access", "Automation", "Analytics", "Assurance"]
        else:
            tier = "Base"
            feats = ["Basic feature set"]
        ensure(["entitlements", "tier"], tier)
        ensure(["entitlements", "features"], feats)
        ensure(["pricing_model", "billing_cycle"], "yearly")
        ensure(["regulatory", "compliance"], ["ISO/IEC 27001", "SOC 2"])

    # ---- Varredura final: nada pode ficar nulo/vazio
    def fill_any_nulls(d):
        if isinstance(d, dict):
            for k, v in list(d.items()):
                if v in (None, "", []):
                    # defaults gen√©ricos
                    if k.endswith("_date"):
                        d[k] = "2030-12-31"
                    elif k.endswith("_gbps") or k.endswith("_mbps"):
                        d[k] = 1.0
                    elif k.endswith("_w"):
                        d[k] = 15
                    elif k.startswith("max_"):
                        d[k] = 1
                    elif isinstance(v, list):
                        d[k] = ["N/A"]
                    else:
                        d[k] = "N/A"
                else:
                    fill_any_nulls(v)
        elif isinstance(d, list):
            if not d:
                d.append("N/A")
            else:
                for i, v in enumerate(list(d)):
                    if v in (None, "", []):
                        d[i] = "N/A"
                    elif isinstance(v, (dict, list)):
                        fill_any_nulls(v)

    fill_any_nulls(out)
    return out

# LLM setup
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0.9)

ENRICH_PROMPT = ChatPromptTemplate.from_template(
    "You are a Cisco product data engineer and domain expert.\n"
    "Task: Given a PARTIAL product JSON (from Cisco price list) and a TARGET SHAPE (keys only, no values),\n"
    "produce a COMPLETE JSON that fills ALL fields with realistic, conservative values.\n"
    "Hard rules:\n"
    "1) Do NOT modify these fields from the PARTIAL JSON: SKU (cisco_product_id), commercial_name, product_type,\n"
    "   pricing_model.base_price, pricing_model.elig_pct, and category (hardware: technical_profile.category | software: root category).\n"
    "2) The TARGET SHAPE only shows the keys; IGNORE any numeric/string examples from schemas.\n"
    "3) Your output must NOT contain null, empty strings, or empty arrays. Always infer plausible values.\n"
    "4) Be coherent with Cisco families and the category. Use typical ranges/specs for that category.\n"
    "5) Keep the exact key names/structure of the TARGET SHAPE; add realistic lists (2‚Äì6 items) where applicable.\n"
    "6) Currency = USD unless specified; keep units consistent.\n"
    "Return ONLY a valid JSON object.\n\n"
    "<<<PARTIAL_JSON>>>\n{partial}\n<<<END_PARTIAL_JSON>>>\n\n"
    "<<<TARGET_SHAPE>>>\n{shape}\n<<<END_TARGET_SHAPE>>>"
)

FIX_NULLS_PROMPT = ChatPromptTemplate.from_template(
    "You previously generated a product JSON but it still has null/empty fields.\n"
    "Replace ALL null, empty strings, or empty arrays with realistic, conservative Cisco-like values.\n"
    "Do not change: SKU, commercial_name, product_type, pricing_model.base_price, pricing_model.elig_pct, category.\n"
    "Return ONLY a valid JSON with the same structure.\n\n"
    "<<<CURRENT_JSON>>>\n{current}\n<<<END_CURRENT_JSON>>>"
)

def enrich_with_llm_fill_all(partial: dict, schema_obj: dict, cache_key: str, debug_tag: str | None = None) -> dict:
    if not ENRICH_WITH_LLM:
        return partial
    if cache_key in CACHE:
        return CACHE[cache_key]

    target_shape = _shape_from_schema(schema_obj)

    chainA = ENRICH_PROMPT | llm
    tries = 0
    while True:
        tries += 1
        try:
            resp = chainA.invoke({
                "partial": json.dumps(partial, ensure_ascii=False, indent=2),
                "shape":   json.dumps(target_shape, ensure_ascii=False, indent=2),
            })
            first = _extract_json(resp.content)
            first_locked = apply_locks(first, partial)

            if DEBUG_SMOKE and debug_tag:
                (DEBUG_DIR / f"{debug_tag}.prompt.json").write_text(
                    json.dumps({
                        "partial": json.dumps(partial, ensure_ascii=False, indent=2),
                        "shape":   json.dumps(target_shape, ensure_ascii=False, indent=2)
                    }, ensure_ascii=False, indent=2),
                    encoding="utf-8"
                )
                (DEBUG_DIR / f"{debug_tag}.first.txt").write_text(
                    json.dumps(first_locked, ensure_ascii=False, indent=2),
                    encoding="utf-8"
                )

            final_obj = first_locked

            # Segunda passada para remover nulos/vazios
            if _has_nulls_or_empty(final_obj):
                chainB = FIX_NULLS_PROMPT | llm
                resp2 = chainB.invoke({
                    "current": json.dumps(final_obj, ensure_ascii=False, indent=2)
                })
                second = _extract_json(resp2.content)
                final_obj = apply_locks(second, partial)

                if DEBUG_SMOKE and debug_tag:
                    (DEBUG_DIR / f"{debug_tag}.fix.txt").write_text(
                        json.dumps(final_obj, ensure_ascii=False, indent=2),
                        encoding="utf-8"
                    )

            # Terceira camada: fallback program√°tico (garantia hard de n√£o-nulos)
            if _has_nulls_or_empty(final_obj):
                pt = partial.get("product_type", "hardware")
                category = partial.get("category") or deep_get(partial, ["technical_profile", "category"], "Unknown")
                final_obj = _fallback_fill_plausible(final_obj, pt, category, partial["cisco_product_id"], partial["commercial_name"])

            CACHE[cache_key] = final_obj
            if tries % 3 == 1:
                _save_cache(CACHE_PATH, CACHE)
            return final_obj

        except Exception as e:
            if tries >= 3:
                print(f"‚ö†Ô∏è LLM falhou para {cache_key}: {e} ‚Äî usando partial.")
                return partial
            time.sleep(1.0)

# %% ====================== Persist√™ncia por categoria ==========================
def save_item_by_category(item: dict, product_type: str, category: str):
    if product_type == "hardware":
        out_file = OUTPUT_DIR_HW / f"hw_{category.replace(' ', '_').lower()}.json"
    else:
        out_file = OUTPUT_DIR_SW / f"sw_{category.replace(' ', '_').lower()}.json"
    upsert_item_to_json(out_file, item)

# %% ============================= Pipeline ====================================
def build_catalog_from_excel(
    xlsx_path: Path = XLSX_PATH,
    max_items_per_category: int | None = MAX_ITEMS_PER_CATEGORY,
    limit_categories: list | None = LIMIT_CATEGORIES,
    verbose: bool = True
):
    df = read_excel_rows(xlsx_path)
    if verbose:
        print(f"Rows in Excel: {len(df)}")

    buckets_hw = defaultdict(list)
    buckets_sw = defaultdict(list)

    # classificar registros
    records = df.to_dict(orient="records")
    for rec in tqdm(records, desc="Classifying rows", disable=not TQDM_VERBOSE):
        sku     = rec.get("_sku", "").strip()
        name    = rec.get("_name", "").strip()
        price   = float(rec.get("_price", 0.0) or 0.0)
        elig    = float(rec.get("_elig", 0.0) or 0.0)
        raw_cat = rec.get("_cat", "")
        raw_ct  = rec.get("_cat_type", "")

        # 1) Tipo via Category-Type
        pt = None
        if raw_ct:
            s = raw_ct.strip().lower()
            if s.startswith("hard"): pt = "hardware"
            elif s.startswith("soft"): pt = "software"

        # 2) Heur√≠stica forte de licen√ßa/software pelo SKU/Desc
        if is_license_like(sku, name):
            pt = "software"

        # 3) Normaliza categorias
        hw_cat = normalize_hw_category(raw_cat)
        sw_cat = normalize_sw_category(raw_cat)

        # 4) Se ainda n√£o deu para decidir por CT/heur√≠stica, decide por categoria
        if not pt:
            pt = "hardware" if hw_cat else ("software" if sw_cat else "hardware")

        # 5) Categoria final
        category = hw_cat if pt == "hardware" else (sw_cat or "Licenses")

        if limit_categories and category not in limit_categories:
            continue

        if pt == "hardware":
            schema = SCHEMAS_HW.get(category)
            if not schema:
                if verbose:
                    print(f"‚Ü™Ô∏è  Sem schema p/ HW category='{category}', SKU={sku} ‚Äî pulando")
                continue
            partial = {
                "cisco_product_id": sku,
                "commercial_name":  name,
                "product_type":     "hardware",
                "pricing_model": {"base_price": price, "elig_pct": elig},
                "technical_profile": {"category": category}
            }
            buckets_hw[category].append((partial, schema))
        else:
            schema = SCHEMAS_SW.get(category, GENERIC_SW_SCHEMA)
            partial = {
                "cisco_product_id": sku,
                "commercial_name":  name,
                "product_type":     "software",
                "category":         category,
                "pricing_model": {"base_price": price, "elig_pct": elig}
            }
            buckets_sw[category].append((partial, schema))

    counts = {"hardware": 0, "software": 0, "files": 0}

    # HARDWARE
    for category, items in buckets_hw.items():
        n = 0
        if verbose: print(f"\n[HW] {category}: {len(items)} itens")
        for partial, schema in tqdm(items, desc=f"Enrich HW/{category}", disable=not TQDM_VERBOSE):
            sku = partial.get("cisco_product_id", "")
            debug_tag = f"HW__{category.replace(' ','_')}__{sku}"
            enriched = enrich_with_llm_fill_all(partial, schema, cache_key=f"HW::{category}::{sku}", debug_tag=debug_tag)
            # Garantia hard final (se sobrou algo vazio)
            if _has_nulls_or_empty(enriched):
                enriched = _fallback_fill_plausible(enriched, "hardware", category, sku, partial["commercial_name"])
            save_item_by_category(enriched, "hardware", category)
            counts["hardware"] += 1
            n += 1
            if max_items_per_category and n >= max_items_per_category:
                if verbose: print(f"  ‚Ü™Ô∏è  Limit reached ({max_items_per_category}) for HW/{category}")
                break
        counts["files"] += 1

    # SOFTWARE
    for category, items in buckets_sw.items():
        n = 0
        if verbose: print(f"\n[SW] {category}: {len(items)} itens")
        for partial, schema in tqdm(items, desc=f"Enrich SW/{category}", disable=not TQDM_VERBOSE):
            sku = partial.get("cisco_product_id", "")
            debug_tag = f"SW__{category.replace(' ','_')}__{sku}"
            enriched = enrich_with_llm_fill_all(partial, schema, cache_key=f"SW::{category}::{sku}", debug_tag=debug_tag)
            if _has_nulls_or_empty(enriched):
                enriched = _fallback_fill_plausible(enriched, "software", category, sku, partial["commercial_name"])
            save_item_by_category(enriched, "software", category)
            counts["software"] += 1
            n += 1
            if max_items_per_category and n >= max_items_per_category:
                if verbose: print(f"  ‚Ü™Ô∏è  Limit reached ({max_items_per_category}) for SW/{category}")
                break
        counts["files"] += 1

    _save_cache(CACHE_PATH, CACHE)

    if verbose:
        print("\n=== Summary ===")
        print(counts)
    return counts

# %% ============================ Execu√ß√£o (teste) =============================
summary = build_catalog_from_excel(
    xlsx_path=XLSX_PATH,
    max_items_per_category=MAX_ITEMS_PER_CATEGORY,
    limit_categories=LIMIT_CATEGORIES,   # ex.: ["Switches","Wireless"]
    verbose=True
)
summary


Rows in Excel: 4267


Classifying rows: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 4267/4267 [00:00<00:00, 21345.48it/s]



[HW] Wireless: 509 itens


Enrich HW/Wireless:   0%|          | 2/509 [00:00<00:09, 55.52it/s]


  ‚Ü™Ô∏è  Limit reached (3) for HW/Wireless

[HW] Switches: 1563 itens


Enrich HW/Switches:   0%|          | 2/1563 [00:00<00:50, 31.04it/s]


  ‚Ü™Ô∏è  Limit reached (3) for HW/Switches

[HW] Routers: 604 itens


Enrich HW/Routers:   0%|          | 2/604 [00:00<00:29, 20.52it/s]

  ‚Ü™Ô∏è  Limit reached (3) for HW/Routers






[HW] Firewall: 178 itens


Enrich HW/Firewall:   1%|          | 2/178 [00:00<00:04, 43.02it/s]


  ‚Ü™Ô∏è  Limit reached (3) for HW/Firewall

[HW] Connectors: 20 itens


Enrich HW/Connectors:  10%|‚ñà         | 2/20 [00:00<00:00, 51.30it/s]


  ‚Ü™Ô∏è  Limit reached (3) for HW/Connectors

[HW] Cabling: 3 itens


Enrich HW/Cabling:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 2/3 [00:00<00:00, 74.94it/s]


  ‚Ü™Ô∏è  Limit reached (3) for HW/Cabling

[HW] Antennas: 8 itens


Enrich HW/Antennas:  25%|‚ñà‚ñà‚ñå       | 2/8 [00:00<00:00, 101.87it/s]


  ‚Ü™Ô∏è  Limit reached (3) for HW/Antennas

[SW] Wireless: 545 itens


Enrich SW/Wireless:   0%|          | 2/545 [00:00<00:07, 77.12it/s]


  ‚Ü™Ô∏è  Limit reached (3) for SW/Wireless

[SW] Switches: 515 itens


Enrich SW/Switches:   0%|          | 2/515 [00:00<00:07, 68.09it/s]


  ‚Ü™Ô∏è  Limit reached (3) for SW/Switches

[SW] Licenses: 9 itens


Enrich SW/Licenses:  22%|‚ñà‚ñà‚ñè       | 2/9 [00:00<00:00, 67.71it/s]


  ‚Ü™Ô∏è  Limit reached (3) for SW/Licenses

[SW] Routers: 260 itens


Enrich SW/Routers:   1%|          | 2/260 [00:00<00:03, 65.94it/s]


  ‚Ü™Ô∏è  Limit reached (3) for SW/Routers

[SW] Firewall: 52 itens


Enrich SW/Firewall:   4%|‚ñç         | 2/52 [00:00<00:00, 63.46it/s]


  ‚Ü™Ô∏è  Limit reached (3) for SW/Firewall

[SW] Connectors: 1 itens


Enrich SW/Connectors: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 220.85it/s]



=== Summary ===
{'hardware': 21, 'software': 16, 'files': 13}


{'hardware': 21, 'software': 16, 'files': 13}

## Most Recent Model

In [1]:
# 1. Instala√ß√µes necess√°rias
#!pip install -q -U langchain-openai tavily-python langchain beautifulsoup4 chromadb pypdf unstructured

import os
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.tools.tavily_search import TavilySearchResults
from langchain.tools.retriever import create_retriever_tool
from langchain_community.document_loaders import CSVLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain import hub
from langchain.agents import create_openai_tools_agent, AgentExecutor
import warnings

warnings.filterwarnings('ignore')

# --- CONFIGURA√á√ÉO DAS CHAVES DE API ---
# ‚ö†Ô∏è Cole suas chaves aqui.
OPENAI_API_KEY = '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'
TAVILY_API_KEY = "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" # <-- SUA CHAVE DA TAVILY AQUI

os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
os.environ['TAVILY_API_KEY'] = TAVILY_API_KEY

In [9]:
# =============================================================
# Cisco Sales Assistant ‚Äì fluxo completo (julho/2025)
# =============================================================
"""
Cria um agente LangGraph capaz de:
  ‚Ä¢ analisar a consulta do cliente
  ‚Ä¢ projetar uma solu√ß√£o (Solution Designer)
  ‚Ä¢ buscar especifica√ß√µes t√©cnicas
  ‚Ä¢ precificar os componentes
  ‚Ä¢ sintetizar tudo em uma resposta final

Requisitos:
  pip install langchain langgraph langchain-openai scikit-learn numpy
"""

# -------------------------------------------------------------
# 0. Imports
# -------------------------------------------------------------
import json
import re
from typing import List, Dict, TypedDict, Optional

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from langchain.tools import tool
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import RunnableLambda
from langgraph.graph import END, StateGraph

# -------------------------------------------------------------
# 1. Carregar cat√°logo Cisco
# -------------------------------------------------------------
product_dict: Dict[str, Dict] = {}
PRICELIST_PATH = "data/raw/pricelist.json"      # ajuste se necess√°rio

try:
    with open(PRICELIST_PATH, encoding="utf-8") as f:
        data = json.load(f)
        products = data if isinstance(data, list) else data.get("products", [])
        for p in products:
            pid = p.get("cisco_product_id")
            if pid:
                product_dict[pid] = p
    print(f"‚úÖ Data loaded: {len(product_dict)} products")
except Exception as e:
    print(f"‚ùå Error loading product data: {e}")

# -------------------------------------------------------------
# 2. Preparar embeddings TF‚ÄëIDF para recomenda√ß√µes
# -------------------------------------------------------------
def prepare_recommendation_data():
    """Gera matriz TF‚ÄëIDF a partir do cat√°logo para busca sem√¢ntica."""
    texts: List[str] = []
    for p in product_dict.values():
        hw = p.get("technical_profile", {}).get("hardware_attributes", {})
        txt = (
            f"{p.get('commercial_name', '')} {p.get('product_type', '')} "
            + " ".join(f"{k}={v}" for k, v in hw.items())
        ).strip()
        texts.append(txt)
    vectorizer = TfidfVectorizer(stop_words="english")
    matrix = vectorizer.fit_transform(texts) if texts else None
    return vectorizer, matrix


vectorizer, tfidf_matrix = prepare_recommendation_data()
print("‚úÖ Recommendation data prepared")

# -------------------------------------------------------------
# 3. Helper ‚Äì lista enxuta de produtos (TOP‚ÄëK)
# -------------------------------------------------------------
def get_product_list_str(requirements: str, top_k: int = 50) -> str:
    if tfidf_matrix is None:
        return "(catalog empty)"
    vec = vectorizer.transform([requirements])
    sims = cosine_similarity(vec, tfidf_matrix).flatten()
    idxs = sims.argsort()[::-1][:top_k]
    prods = [list(product_dict.values())[i] for i in idxs]
    return "\n".join(
        f"- {p['cisco_product_id']}: {p['commercial_name']} "
        f"({p['product_type']})"
        for p in prods
    )

# -------------------------------------------------------------
# 4. Ferramentas (LangChain¬†@tool)
# -------------------------------------------------------------
@tool
def get_product_price(part_number: str) -> Dict:
    """Retrieve pricing information for a Cisco product."""
    prod = product_dict.get(part_number)
    if not prod:
        return {"error": f"Product {part_number} not found", "part_number": part_number}
    pricing = prod.get("pricing_model", {})
    return {
        "price": pricing.get("base_price", 0.0),
        "currency": pricing.get("currency", "USD"),
        "description": prod.get("commercial_name", ""),
        "part_number": part_number,
        "product_type": prod.get("product_type", ""),
    }


@tool
def get_technical_specs(part_number: str) -> Dict:
    """Retrieve hardware specifications for a Cisco product."""
    prod = product_dict.get(part_number)
    if not prod:
        return {"error": f"Product {part_number} not found", "part_number": part_number}
    hw = prod.get("technical_profile", {}).get("hardware_attributes", {})
    if not hw:
        return {"error": f"No technical specs for {part_number}", "part_number": part_number}
    return {
        "specifications": hw,
        "description": prod.get("commercial_name", ""),
        "part_number": part_number,
        "product_type": prod.get("product_type", ""),
    }


@tool
def recommend_products(requirements: str, max_results: int = 3) -> List[Dict]:
    """Recommend Cisco products that best match the given requirements."""
    if tfidf_matrix is None:
        return [{"error": "Catalog not indexed"}]
    vec = vectorizer.transform([requirements])
    sims = cosine_similarity(vec, tfidf_matrix).flatten()
    idxs = sims.argsort()[::-1][:max_results]
    base = list(product_dict.values())
    return [
        {
            "part_number": base[i]["cisco_product_id"],
            "commercial_name": base[i]["commercial_name"],
            "product_type": base[i]["product_type"],
            "similarity_score": float(sims[i]),
        }
        for i in idxs
    ]

# -------------------------------------------------------------
# 5. Pydantic¬†models
# -------------------------------------------------------------
class SolutionComponent(BaseModel):
    part_number: str = Field(description="Cisco product ID")
    quantity: int = Field(default=1, description="Quantity required")
    role: str = Field(description="Role in the solution")


class SolutionDesign(BaseModel):
    summary: str
    components: List[SolutionComponent]
    justification: str


class AgentRoutingDecision(BaseModel):
    needs_design: bool = False
    needs_technical: bool = False
    needs_pricing: bool = False
    query_parts: Dict[str, str] = Field(default_factory=dict)

# -------------------------------------------------------------
# 6. LLM¬†e¬†prompts
# -------------------------------------------------------------
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# ‚Äî Orchestrator (corrigido)
orchestrator_prompt = ChatPromptTemplate.from_template(
    """You are a Cisco sales orchestration system.

Analyse the user query and decide which specialised agents are needed:
  ‚Ä¢ Solution Designer   ‚Üí needs_design
  ‚Ä¢ Technical Agent     ‚Üí needs_technical
  ‚Ä¢ Pricing Agent       ‚Üí needs_pricing

ALWAYS output a JSON object that matches the schema shown in
{{format_instructions}}.

User query: {{query}}
"""
)

orchestrator_agent = orchestrator_prompt | llm.with_structured_output(
    AgentRoutingDecision
)

# ‚Äî Solution¬†Designer
design_prompt = ChatPromptTemplate.from_template(
    """You are a Cisco Solution Architect. Design a complete solution.

Customer Requirements:
{requirements}

Available Cisco Products:
{product_list}

Return only part_numbers that appear above.
Output as JSON in the schema provided."""
)

llm_creative = ChatOpenAI(model="gpt-4o-mini", temperature=0.4)  # s√≥ o designer

design_agent = (
    {
        "requirements": lambda x: x["requirements"],
        "product_list": lambda x: get_product_list_str(x["requirements"]),
        "format_instructions": lambda x: x["format_instructions"],
    }
    | design_prompt
    | llm_creative.with_structured_output(SolutionDesign)
)


# -------------------------------------------------------------
# 7. State¬†type
# -------------------------------------------------------------
class AgentState(TypedDict):
    user_query: str
    orchestrator_decision: Optional[AgentRoutingDecision]
    solution_design: Optional[SolutionDesign]
    integrity_errors: List[str]
    rule_errors: List[str]          # ‚Üê NEW
    technical_results: List[Dict]
    pricing_results: List[Dict]
    final_response: str

# -------------------------------------------------------------
# 8. N√≥¬†‚Äî¬†Orchestrator
# -------------------------------------------------------------
def orchestrator_node(state: AgentState) -> AgentState:
    print(f"\nüéª [Orchestrator] ¬´{state['user_query']}¬ª")

    q = state["user_query"]

    # 1) tentativa normal com o LLM
    try:
        decision = orchestrator_agent.invoke(
            {
                "query": q,
                "format_instructions": AgentRoutingDecision.schema(),
            }
        )
    except Exception:
        print("‚ö†Ô∏è LLM parse fail ‚Üí empty decision")
        decision = AgentRoutingDecision()

    # 2) heur√≠stica se vier tudo falso
    if not any([decision.needs_design, decision.needs_technical, decision.needs_pricing]):
        q_low = q.lower()
        decision = AgentRoutingDecision(
            needs_design=any(w in q_low for w in ["design", "architecture", "solution"]),
            needs_technical="spec" in q_low,
            needs_pricing=any(w in q_low for w in ["price", "cost", "quote", "pricing"]),
            query_parts={},
        )

    # 3) salva no estado e retorna
    state["orchestrator_decision"] = decision
    return state


# -------------------------------------------------------------
# 9. N√≥¬†‚Äî¬†Solution¬†Designer
# -------------------------------------------------------------
def solution_design_node(state: AgentState) -> AgentState:
    print("\nüé® [Solution Designer]")
    design = design_agent.invoke(
        {"requirements": state["user_query"], "format_instructions": SolutionDesign.schema()}
    )
    state["solution_design"] = design
    # for√ßa que o agente de pre√ßo rode depois
    state["orchestrator_decision"].needs_pricing = True

    # for√ßa que o fluxo passe tamb√©m pelo Technical Agent
    state["orchestrator_decision"].needs_technical = True

    # specs de cada componente
    state["technical_results"] = []
    for comp in design.components:
        res = get_technical_specs(comp.part_number)
        if "error" not in res:
            res["quantity"] = comp.quantity
        state["technical_results"].append(res)
    return state

def integrity_validator_node(state: AgentState) -> AgentState:
    """Camada¬†0 ‚Äì verifica se cada SKU existe e qty ‚â•‚ÄØ1."""
    design = state.get("solution_design")
    if design is None:
        return state

    errors, validated = [], []
    for comp in design.components:
        if comp.part_number not in product_dict:
            errors.append(f"SKU_NOT_FOUND: {comp.part_number}")
            continue
        qty = max(1, int(comp.quantity))
        if qty != comp.quantity:
            errors.append(f"QUANTITY_ADJUSTED: {comp.part_number}‚Üí{qty}")
        validated.append(
            SolutionComponent(
                part_number=comp.part_number,
                quantity=qty,
                role=comp.role,
            )
        )
    state["integrity_errors"] = errors
    design.components = validated            # substitui lista original
    return state

# -------------------------------------------------------------
# Validator‚ÄëRules  (Camada¬†1)
# -------------------------------------------------------------
def validator_rules_node(state: AgentState) -> AgentState:
    """Regras b√°sicas de compatibilidade / coer√™ncia."""
    errors: List[str] = []
    design = state.get("solution_design")
    if design is None:
        state["rule_errors"] = errors
        return state

    # ‚Äî Regra 1: se a query menciona Wi‚ÄëFi¬†6, AP deve suportar ax
    if "wi-fi 6" in state["user_query"].lower():
        for comp in design.components:
            spec = next(
                (t for t in state["technical_results"] if t.get("part_number") == comp.part_number),
                None,
            )
            if spec and spec.get("specifications", {}).get("wifi_standard", "").lower() != "802.11ax":
                errors.append(f"WIFI6_AP_RULE fail: {comp.part_number}")

    # ‚Äî Regra 2: componente marcado como PoE precisa ter info PoE
    for comp in design.components:
        if "poe" in comp.role.lower():
            spec = next(
                (t for t in state["technical_results"] if t.get("part_number") == comp.part_number),
                None,
            )
            if spec and "poe" not in spec.get("specifications", {}).get("power_requirements", "").lower():
                errors.append(f"POE_SWITCH_RULE fail: {comp.part_number}")

    state["rule_errors"] = errors
    if errors:
        print("‚ö†Ô∏è Rule errors ‚Üí", errors)
    return state



# -------------------------------------------------------------
# 10. N√≥ ‚Äî Technical Agent  (substitua TODO o bloco)
# -------------------------------------------------------------
### helper: extrai poss√≠veis Cisco part‚Äënumbers do texto
PART_RE = re.compile(r"[A-Z]{2,}\d+[A-Z]*-[A-Za-z0-9]+(?:-[A-Za-z0-9]+)*")

def extract_part_numbers(text: str) -> List[str]:
    return list({m.group(0) for m in PART_RE.finditer(text)})

def technical_agent_node(state: AgentState) -> AgentState:
    # pula se j√° h√° design
    if state.get("solution_design") is not None:
        print("‚è© Technical Agent skipped (solution design already provides specs)")
        return state

    query_part = state["orchestrator_decision"].query_parts.get(
        "technical", state["user_query"]
    )
    ids = extract_part_numbers(query_part)

    if ids:
        print(f"\nüîß [Technical Agent] Found explicit IDs: {ids}")
        state["technical_results"] = [get_technical_specs(pid) for pid in ids]
        return state

    # fallback para recomenda√ß√£o sem IDs
    if not state["orchestrator_decision"].needs_technical:
        print("‚è© Technical Agent skipped (flag false & no IDs)")
        return state

    print(f"\nüîß [Technical Agent] Generating recommendations for ¬´{query_part}¬ª")
    recs = recommend_products.invoke({"requirements": query_part, "max_results": 5})
    state["technical_results"] = []
    for r in recs:
        spec = get_technical_specs(r["part_number"])
        if "error" not in spec:
            spec["recommendation_score"] = r.get("similarity_score", 0)
        state["technical_results"].append(spec)
    return state


# -------------------------------------------------------------
# 11. N√≥ ‚Äî Pricing Agent
# -------------------------------------------------------------
def pricing_agent_node(state: AgentState) -> AgentState:
    """Gera lista de pre√ßos (com subtotal) para os SKUs relevantes."""
    if not state["orchestrator_decision"].needs_pricing:
        print("‚è© Pricing Agent skipped")
        return state

    print("\nüí∞ [Pricing Agent]")

    # 1) Se houver SolutionDesign, precifica exatamente seus componentes
    if isinstance(state.get("solution_design"), SolutionDesign):
        items = [
            {"part_number": c.part_number, "quantity": c.quantity}
            for c in state["solution_design"].components
        ]
    else:
        # 2) Tenta extrair IDs explicitamente mencionados na parte de pre√ßo da query
        pricing_part = state["orchestrator_decision"].query_parts.get(
            "pricing", state["user_query"]
        )
        ids = extract_part_numbers(pricing_part)
        if ids:
            items = [{"part_number": pid, "quantity": 1} for pid in ids]
        else:
            # 3) Fallback: usa IDs provenientes do Technical Agent
            items = [
                {
                    "part_number": t.get("part_number"),
                    "quantity": t.get("quantity", 1),
                }
                for t in state["technical_results"]
                if t.get("part_number")
            ]

    # Deduplicar somando quantidades
    dedup: Dict[str, int] = {}
    for it in items:
        dedup[it["part_number"]] = dedup.get(it["part_number"], 0) + it["quantity"]

    # Consultar pre√ßos e calcular subtotais
    state["pricing_results"] = []
    for pn, qty in dedup.items():
        price_info = get_product_price(pn)
        price_info.update(
            {
                "quantity": qty,
                "subtotal": price_info.get("price", 0) * qty,
            }
        )
        state["pricing_results"].append(price_info)

    return state



###### Implementar futyuramente

def rules_validator_node(state: AgentState) -> AgentState:
    print("üîç [Validator‚ÄëRules] ‚Äì not implemented yet")
    return state

def reviewer_node(state: AgentState) -> AgentState:
    print("üßê [Reviewer‚ÄëLLM] ‚Äì skipped (placeholder)")
    return state




# -------------------------------------------------------------
# 12. N√≥ ‚Äî Synthesizer  (vers√£o refinada)
# -------------------------------------------------------------
def synthesize_node(state: AgentState) -> AgentState:
    print("\nüéØ [Synthesizer]")
    lines: List[str] = []

    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    # 1) Solution Design (se houver)
    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    if isinstance(state.get("solution_design"), SolutionDesign):
        d = state["solution_design"]
        lines.append("üöÄ Solution Design")
        lines.append(d.summary)

        lines.append("\nüîß Components:")
        for idx, comp in enumerate(d.components, 1):
            desc = next(
                (
                    t.get("description")
                    for t in state["technical_results"]
                    if t.get("part_number") == comp.part_number
                ),
                comp.part_number,
            )
            lines.append(f"{idx}. {desc} ({comp.quantity}√ó) ‚Äì {comp.role}")

        lines.append("\n‚úÖ Justification:\n" + d.justification)

    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    # 2) Technical Specifications  (sempre que houver)
    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    if state["technical_results"]:
        lines.append("\nüîß Technical Specifications:")
        for t in state["technical_results"]:
            if "error" in t:
                lines.append(f"- {t.get('part_number')}: {t['error']}")
                continue
            lines.append(f"\n‚Ä¢ {t['description']} ({t['part_number']})")
            for k, v in t.get("specifications", {}).items():
                lines.append(f"  - {k.replace('_', ' ').title()}: {v}")

    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    # 3) Erros de integridade / regras
    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    if state.get("integrity_errors"):
        lines.append("\n‚ö†Ô∏è Integrity issues:")
        for err in state["integrity_errors"]:
            lines.append(f"- {err}")

    if state.get("rule_errors"):
        lines.append("\n‚ö†Ô∏è Rule issues:")
        for err in state["rule_errors"]:
            lines.append(f"- {err}")

    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    # 4) Pricing
    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    if state["pricing_results"]:
        total = 0.0
        currency = "USD"
        lines.append("\nüíµ Pricing:")
        for p in state["pricing_results"]:
            if "error" in p:
                lines.append(f"- {p.get('part_number')}: {p['error']}")
                continue
            currency = p["currency"]
            total += p["subtotal"]
            lines.append(
                f"- {p['description']} ({p['quantity']}√ó): "
                f"{currency} {p['subtotal']:.2f}"
            )
        lines.append(f"\nTOTAL ESTIMATED: {currency} {total:.2f}")

    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    # 5) Caso nenhum dado relevante
    # ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ‚îÄ
    if not lines:
        lines.append("‚ùå No relevant information found")

    state["final_response"] = "\n".join(lines)
    return state



# -------------------------------------------------------------
# 13. Roteamento (todas as fun√ß√µes)
# -------------------------------------------------------------
def route_after_orch(state: AgentState) -> str:
    """Primeira decis√£o: vai para designer, tech, price ou direto ao synth."""
    dec = state["orchestrator_decision"]
    if dec.needs_design:
        return "designer"
    if dec.needs_technical:
        return "tech"
    if dec.needs_pricing:
        return "price"
    return "synth"


def route_after_designer(_: AgentState) -> str:
    """Sempre passar pelo Validator‚ÄëIntegrity depois do designer."""
    return "integrity"


def route_after_integrity(_: AgentState) -> str:
    """Ap√≥s Camada¬†0, encaminha para as futuras regras."""
    return "rules"


def route_after_rules(_: AgentState) -> str:
    """Placeholder ‚Äì sempre manda para o reviewer (LLM stub)."""
    return "reviewer"


def route_after_reviewer(state: AgentState) -> str:
    """Decide se ainda precisa specs (tech) ou pre√ßo, sen√£o sintetiza."""
    dec = state["orchestrator_decision"]
    if dec.needs_technical:
        return "tech"
    if dec.needs_pricing:
        return "price"
    return "synth"


def route_after_tech(state: AgentState) -> str:
    """Se j√° precisamos de pre√ßo, vai ao pricing; sen√£o, sintetiza."""
    return "price" if state["orchestrator_decision"].needs_pricing else "synth"


def route_after_price(_: AgentState) -> str:
    """Pre√ßo √© a √∫ltima etapa antes de sintetizar."""
    return "synth"

# -------------------------------------------------------------
# 14. Construir o grafo
# -------------------------------------------------------------
workflow = StateGraph(AgentState)

# 1) N√≥s
workflow.add_node("orch", orchestrator_node)
workflow.add_node("designer", solution_design_node)           # Solution Designer
workflow.add_node("integrity", integrity_validator_node)
workflow.add_node("rules", validator_rules_node)       # stub
workflow.add_node("reviewer", reviewer_node)           # stub
workflow.add_node("tech", technical_agent_node)
workflow.add_node("price", pricing_agent_node)
workflow.add_node("synth", synthesize_node)


# 2) Ponto de entrada
workflow.set_entry_point("orch")

# 3) Condicionais
workflow.add_conditional_edges("orch", route_after_orch, {
    "designer": "designer",
    "tech": "tech",
    "price": "price",
    "synth": "synth",
})

workflow.add_conditional_edges("designer", lambda _: "integrity", {
    "integrity": "integrity",
})

workflow.add_conditional_edges("integrity", lambda _: "rules", {
    "rules": "rules",
})

workflow.add_conditional_edges("rules", lambda _: "reviewer", {
    "reviewer": "reviewer",
})

workflow.add_conditional_edges("reviewer", route_after_reviewer, {
    "tech": "tech",
    "price": "price",
    "synth": "synth",
})

workflow.add_conditional_edges("tech", route_after_tech, {
    "price": "price",
    "synth": "synth",
})

workflow.add_conditional_edges("price", route_after_price, {
    "synth": "synth",
})

# 4) Encerramento
workflow.add_edge("synth", END)

# 5) Compilar
app = workflow.compile()


# -------------------------------------------------------------
# 15. Helper para executar
# -------------------------------------------------------------
def run_sales_quote(query: str) -> str:
    init: AgentState = {
        "user_query": query,
        "orchestrator_decision": None,
        "solution_design": None,
        "technical_results": [],
        "pricing_results": [],
        "integrity_errors": [],
        "rule_errors": [],
        "final_response": "",
    }
    final_state = app.invoke(init)
    return final_state["final_response"]

# -------------------------------------------------------------
# 16. Exemplo
# -------------------------------------------------------------
if __name__ == "__main__":
    q = (
        "Design a secure branch‚Äëoffice solution for 50 users with Wi‚ÄëFi¬†6, "
        "firewall and PoE switches. Provide pricing."
    )
    print(run_sales_quote(q))


‚úÖ Data loaded: 16 products
‚úÖ Recommendation data prepared

üéª [Orchestrator] ¬´Design a secure branch‚Äëoffice solution for 50 users with Wi‚ÄëFi¬†6, firewall and PoE switches. Provide pricing.¬ª

üé® [Solution Designer]
üßê [Reviewer‚ÄëLLM] ‚Äì skipped (placeholder)
‚è© Technical Agent skipped (solution design already provides specs)

üí∞ [Pricing Agent]

üéØ [Synthesizer]
üöÄ Solution Design
Design a secure branch-office solution for 50 users with Wi-Fi 6, firewall, and PoE switches.

üîß Components:
1. ASA 5516-X with FirePOWER Services (1√ó) ‚Äì Firewall
2. Meraki MR53E Access Point (5√ó) ‚Äì Access Point
3. Meraki Dual Band Omni Antennas (5√ó) ‚Äì Antenna

‚úÖ Justification:
The ASA5516-FPWR-K9 provides robust firewall capabilities with FirePOWER services, ensuring security for the branch office. The MR53E-HW access points support Wi-Fi 6, providing high-speed wireless connectivity for up to 50 users. The MA-ANT-20 omnidirectional antennas enhance the coverage and perf

In [11]:
print(run_sales_quote(
    "whats the price for MR42E-HW"
))



üéª [Orchestrator] ¬´whats the price for MR42E-HW¬ª

üí∞ [Pricing Agent]

üéØ [Synthesizer]

üíµ Pricing:
- Meraki MR42E Access Point (1√ó): USD 1099.00

TOTAL ESTIMATED: USD 1099.00


In [190]:
print(run_sales_quote(
    "Design a secure branch-office solution for 50 users with Wi‚ÄëFi 6, firewall and PoE switches. Provide pricing."
))

print(run_sales_quote(
    "I need technical specs for ASA5516-FPWR-K9 and pricing for MR53E-HW"
))



üéª [Orchestrator] ¬´Design a secure branch-office solution for 50 users with Wi‚ÄëFi 6, firewall and PoE switches. Provide pricing.¬ª

üé® [Solution Designer]
üîç [Validator‚ÄëRules] ‚Äì not implemented yet
üßê [Reviewer‚ÄëLLM] ‚Äì skipped (placeholder)
‚è© Technical Agent skipped (solution design already provides specs)

üí∞ [Pricing Agent]

üéØ [Synthesizer]
üöÄ Solution Design
Design a secure branch-office solution for 50 users with Wi-Fi 6, firewall, and PoE switches.

üîß Components:
1. ASA 5555-X Firewall (1√ó) ‚Äì Firewall
2. Meraki MR53E Access Point (3√ó) ‚Äì Access Point for Wi-Fi 6 coverage
3. ASA 5516-X with FirePOWER Services (1√ó) ‚Äì Firewall with FirePOWER Services

‚úÖ Justification:
The ASA5555-X is selected for its high performance and security features suitable for a branch office. The MR53E-HW access points provide Wi-Fi 6 capabilities for better performance and capacity for 50 users. The ASA5516-FPWR-K9 offers integrated FirePOWER services for enhanced s

In [176]:
# Test 1: Solicita√ß√£o t√©cnica + pre√ßo
test_query_1 = "I need technical specs for ASA5516-FPWR-K9 and pricing for MR53E-HW"
result_1 = run_sales_quote(test_query_1)
print("\nüí¨ CLIENT RESPONSE 1:")
print(result_1)



üéª [Orchestrator] ¬´I need technical specs for ASA5516-FPWR-K9 and pricing for MR53E-HW¬ª

üîß [Technical Agent] Found explicit IDs: ['ASA5516-FPWR-K9', 'MR53E-HW']

üí∞ [Pricing Agent]

üéØ [Synthesizer]

üí¨ CLIENT RESPONSE 1:

üíµ Pricing:
- ASA 5516-X with FirePOWER Services (1√ó): USD 5995.00
- Meraki MR53E Access Point (1√ó): USD 1699.00

TOTAL ESTIMATED: USD 7694.00


## Adapta√ß√£o Completa para LlamaIndex

In [8]:
!pip install -q -U llama-index llama-index-embeddings-openai beautifulsoup4 pypdf unstructured

#!pip install -q -U llama-index==0.10.0 llama-index-embeddings-openai==0.1.0 beautifulsoup4==4.12.3 pypdf==4.2.0 unstructured==0.13.0 pillow==10.3.0 tenacity==8.2.3 protobuf==4.25.3

In [None]:
import os
import warnings

warnings.filterwarnings('ignore')

# --- CONFIGURA√á√ÉO DAS CHAVES DE API ---
# ‚ö†Ô∏è Cole suas chaves aqui.
OPENAI_API_KEY = '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'
TAVILY_API_KEY = "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" # <-- SUA CHAVE DA TAVILY AQUI

os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
os.environ['TAVILY_API_KEY'] = TAVILY_API_KEY

In [12]:
import os
from llama_index.core import VectorStoreIndex, Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.tools import FunctionTool
from llama_index.core.agent import ReActAgent
from llama_index.llms.openai import OpenAI
import warnings

In [14]:
# 1. Instala√ß√µes necess√°rias (remover LangChain, adicionar LlamaIndex)
#!pip install -q -U llama-index llama-index-embeddings-openai beautifulsoup4 pypdf unstructured

import os
from llama_index.core import VectorStoreIndex, Settings
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.tools import FunctionTool
from llama_index.core.agent import ReActAgent
from llama_index.llms.openai import OpenAI
import warnings

warnings.filterwarnings('ignore')

# --- CONFIGURA√á√ÉO DAS CHAVES ---
OPENAI_API_KEY = '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'  # Substituir
os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY

# =============================================================
# Cisco Sales Assistant com LlamaIndex (Otimizado)
# =============================================================

# -------------------------------------------------------------
# 0. Configura√ß√£o Global LlamaIndex
# -------------------------------------------------------------
# ‚ö†Ô∏è Otimizado para performance Cisco
Settings.embed_model = OpenAIEmbedding(
    model="text-embedding-3-small",
    dimensions=256  # 50% mais r√°pido que 1536-dim
)
Settings.llm = OpenAI(model="gpt-4o-mini", temperature=0)
Settings.node_parser = SentenceSplitter(
    chunk_size=512,
    chunk_overlap=20,
    separator="\n",
    paragraph_separator="\n\n"
)

# -------------------------------------------------------------
# 1. Carregar cat√°logo Cisco como √çndice H√≠brido
# -------------------------------------------------------------
from llama_index.core import SimpleDirectoryReader
from llama_index.core.vector_stores import SimpleVectorStore

# Carregar documentos (ajuste o caminho)
documents = SimpleDirectoryReader("data/raw").load_data()

# Criar √≠ndice com busca h√≠brida (keyword + vector)
vector_store = SimpleVectorStore()  # FAISS-like local
index = VectorStoreIndex.from_documents(
    documents, 
    vector_store=vector_store,
    show_progress=True
)
query_engine = index.as_query_engine(
    similarity_top_k=5,
    vector_store_query_mode="hybrid"
)
print(f"‚úÖ √çndice criado com {len(documents)} documentos")

# -------------------------------------------------------------
# 2. Ferramentas Adaptadas para LlamaIndex
# -------------------------------------------------------------
# --- Fun√ß√£o de Recomenda√ß√£o com RAG H√≠brido ---
def recommend_products(query: str, top_k: int = 3) -> list:
    """Recomenda produtos Cisco baseado em descri√ß√µes t√©cnicas."""
    results = query_engine.query(query)
    return [
        {
            "part_number": node.metadata.get("cisco_product_id", ""),
            "commercial_name": node.metadata.get("commercial_name", ""),
            "score": node.score
        }
        for node in results.source_nodes[:top_k]
    ]

# --- Ferramenta de Pre√ßos (mantida) ---
def get_product_price(part_number: str) -> dict:
    """Busca pre√ßo de produto por part number."""
    # Implementa√ß√£o direta (sem LangChain)
    return {"part_number": part_number, "price": 299.99}  # Mock

# --- Ferramentas como Objetos LlamaIndex ---
recommend_tool = FunctionTool.from_defaults(fn=recommend_products)
price_tool = FunctionTool.from_defaults(fn=get_product_price)

# -------------------------------------------------------------
# 3. Agente ReAct com LlamaIndex
# -------------------------------------------------------------
# ‚ö†Ô∏è Vers√£o simplificada e mais r√°pida
agent = ReActAgent.from_tools(
    tools=[recommend_tool, price_tool],
    llm=Settings.llm,
    verbose=True,
    max_iterations=6
)

# -------------------------------------------------------------
# 4. Fluxo de Cota√ß√£o Otimizado
# -------------------------------------------------------------
def generate_cisco_quote(query: str) -> str:
    """Gera cota√ß√£o completa com cen√°rios."""
    # Passo 1: An√°lise de requisitos
    analysis_prompt = f"""
    Como especialista Cisco, analise estes requisitos e identifique:
    - Tipo de solu√ß√£o (rede, seguran√ßa, colabora√ß√£o)
    - Componentes cr√≠ticos
    - Restri√ß√µes de or√ßamento
    
    Requisitos: {query}
    """
    analysis = agent.chat(analysis_prompt).response
    
    # Passo 2: Gera√ß√£o de cen√°rios
    scenario_prompt = f"""
    Com base nesta an√°lise:
    {analysis}
    
    Gere TR√äS cen√°rios de cota√ß√£o:
    1. Custo-Otimizado: Foco em pre√ßo
    2. Performance-M√°xima: Melhores recursos
    3. Balanceado: Equil√≠brio custo-benef√≠cio
    
    Use esta estrutura:
    [Cen√°rio X]
    Descri√ß√£o: ...
    Componentes: 
      - Produto A (quantidade)
      - Produto B (quantidade)
    Trade-offs: ...
    """
    scenarios = agent.chat(scenario_prompt).response
    
    # Passo 3: Precifica√ß√£o
    pricing_prompt = f"""
    Para estes cen√°rios:
    {scenarios}
    
    Calcule:
    - Pre√ßo total por cen√°rio
    - Economia vs. MSRP
    - 3 talking points por cen√°rio
    """
    pricing = agent.chat(pricing_prompt).response
    
    return f"## An√°lise T√©cnica\n{analysis}\n\n## Cen√°rios\n{scenarios}\n\n## Precifica√ß√£o\n{pricing}"

# -------------------------------------------------------------
# 5. Exemplo de Execu√ß√£o
# -------------------------------------------------------------
if __name__ == "__main__":
    query = (
        "Solu√ß√£o de rede para escrit√≥rio com 50 usu√°rios, "
        "requer Wi-Fi 6, switches PoE+ e firewall b√°sico. "
        "Or√ßamento m√°ximo: $15k."
    )
    print(generate_cisco_quote(query))

Failed to load file C:\Users\Giovani\Desktop\EMPRESAS\02¬∫ DATA, CLOUD & BLOCKCHAIN\DATA & AI\11 - CONSULTORIAS\BAIRESDEV\PROJETOS\CISCO\cisco-quote-assistant\data\raw\_product_catalog.csv with error: Error tokenizing data. C error: Expected 1 fields in line 11, saw 2
. Skipping...
Failed to load file C:\Users\Giovani\Desktop\EMPRESAS\02¬∫ DATA, CLOUD & BLOCKCHAIN\DATA & AI\11 - CONSULTORIAS\BAIRESDEV\PROJETOS\CISCO\cisco-quote-assistant\data\raw\Pricelist.csv with error: Error tokenizing data. C error: Expected 2 fields in line 12, saw 3
. Skipping...


Parsing nodes:   0%|          | 0/6 [00:00<?, ?it/s]

Generating embeddings:   0%|          | 0/982 [00:00<?, ?it/s]

‚úÖ √çndice criado com 6 documentos
> Running step c5cd8ab7-5dfa-4ed3-b6a8-89527b824751. Step input: 
    Como especialista Cisco, analise estes requisitos e identifique:
    - Tipo de solu√ß√£o (rede, seguran√ßa, colabora√ß√£o)
    - Componentes cr√≠ticos
    - Restri√ß√µes de or√ßamento
    
    Requisitos: Solu√ß√£o de rede para escrit√≥rio com 50 usu√°rios, requer Wi-Fi 6, switches PoE+ e firewall b√°sico. Or√ßamento m√°ximo: $15k.
    



This implementation will be removed in a v0.13.0 and the new implementation will be promoted to the `from llama_index.core.agent import ReActAgent` path.

See the docs for more information: https://docs.llamaindex.ai/en/stable/understanding/agent/)
  return cls(

This implementation will be removed in a v0.13.0.

See the docs for more information on updated agent usage: https://docs.llamaindex.ai/en/stable/understanding/agent/)
  return old_new1(cls, *args, **kwargs)


[1;3;38;5;200mThought: A partir dos requisitos fornecidos, posso identificar o tipo de solu√ß√£o, os componentes cr√≠ticos e as restri√ß√µes de or√ßamento.
 
- Tipo de solu√ß√£o: Rede
- Componentes cr√≠ticos:
  - Acesso Wi-Fi 6
  - Switches PoE+
  - Firewall b√°sico
- Restri√ß√µes de or√ßamento: $15k
Answer: A solu√ß√£o requerida √© uma solu√ß√£o de rede, com componentes cr√≠ticos que incluem Wi-Fi 6, switches PoE+ e um firewall b√°sico, e o or√ßamento m√°ximo √© de $15k.
[0m> Running step c6be9b82-e464-47bc-a35e-d4edf71448f2. Step input: 
    Com base nesta an√°lise:
    A solu√ß√£o requerida √© uma solu√ß√£o de rede, com componentes cr√≠ticos que incluem Wi-Fi 6, switches PoE+ e um firewall b√°sico, e o or√ßamento m√°ximo √© de $15k.
    
    Gere TR√äS cen√°rios de cota√ß√£o:
    1. Custo-Otimizado: Foco em pre√ßo
    2. Performance-M√°xima: Melhores recursos
    3. Balanceado: Equil√≠brio custo-benef√≠cio
    
    Use esta estrutura:
    [Cen√°rio X]
    Descri√ß√£o: ...
    Comp

In [None]:
# =============================================================
# Cisco Sales Assistant ‚Äì fluxo completo com valida√ß√£o Camada¬†0
# (Designer criativo, Validator‚ÄëIntegrity, stubs para futuras
#  Validator‚ÄëRules e Reviewer‚ÄëLLM)
# =============================================================
"""
Rodar este arquivo cria um agente LangGraph capaz de:
  ‚Ä¢ analisar a consulta do cliente
  ‚Ä¢ projetar uma solu√ß√£o (Solution Designer) com leve criatividade
  ‚Ä¢ validar SKU/quantidade (Integrity)
  ‚Ä¢ (stubs) validar compatibilidade & revisar via LLM
  ‚Ä¢ buscar especifica√ß√µes t√©cnicas
  ‚Ä¢ precificar os componentes
  ‚Ä¢ sintetizar tudo numa resposta final

Requisitos:
  pip install langchain langgraph langchain-openai scikit-learn numpy
"""

# -------------------------------------------------------------
# 0. Imports
# -------------------------------------------------------------
import json
import re
from typing import List, Dict, TypedDict, Optional

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from langchain.tools import tool
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_core.runnables import RunnableLambda
from langgraph.graph import END, StateGraph

# -------------------------------------------------------------
# 1. Cat√°logo Cisco (ajuste caminho se necess√°rio)
# -------------------------------------------------------------
product_dict: Dict[str, Dict] = {}
PRICELIST_PATH = "data/raw/pricelist.json"

try:
    with open(PRICELIST_PATH, encoding="utf-8") as f:
        data = json.load(f)
        products = data if isinstance(data, list) else data.get("products", [])
        for p in products:
            pid = p.get("cisco_product_id")
            if pid:
                product_dict[pid] = p
    print(f"‚úÖ Data loaded: {len(product_dict)} products")
except Exception as e:
    print(f"‚ùå Error loading product data: {e}")

# -------------------------------------------------------------
# 2. Embeddings TF‚ÄëIDF
# -------------------------------------------------------------

def prepare_recommendation_data():
    texts: List[str] = []
    for p in product_dict.values():
        hw = p.get("technical_profile", {}).get("hardware_attributes", {})
        txt = (
            f"{p.get('commercial_name', '')} {p.get('product_type', '')} "
            + " ".join(f"{k}={v}" for k, v in hw.items())
        ).strip()
        texts.append(txt)
    vec = TfidfVectorizer(stop_words="english")
    mat = vec.fit_transform(texts) if texts else None
    return vec, mat

vectorizer, tfidf_matrix = prepare_recommendation_data()
print("‚úÖ Recommendation data prepared")

# -------------------------------------------------------------
# 3. Helpers
# -------------------------------------------------------------
PART_RE = re.compile(r"[A-Z]{2,}\d+[A-Z]*-[A-Za-z0-9]+(?:-[A-Za-z0-9]+)*")

def extract_part_numbers(text: str) -> List[str]:
    """Return unique Cisco‚Äëlooking part numbers in text."""
    return list({m.group(0) for m in PART_RE.finditer(text)})

def get_product_list_str(requirements: str, top_k: int = 50) -> str:
    if tfidf_matrix is None:
        return "(catalog empty)"
    vec = vectorizer.transform([requirements])
    sims = cosine_similarity(vec, tfidf_matrix).flatten()
    idxs = sims.argsort()[::-1][:top_k]
    prods = [list(product_dict.values())[i] for i in idxs]
    return "\n".join(
        f"- {p['cisco_product_id']}: {p['commercial_name']} ({p['product_type']})"
        for p in prods
    )

# -------------------------------------------------------------
# 4. LangChain tools
# -------------------------------------------------------------
@tool
def get_product_price(part_number: str) -> Dict:
    """Retrieve pricing information for a Cisco product."""
    prod = product_dict.get(part_number)
    if not prod:
        return {"error": "SKU_NOT_FOUND", "part_number": part_number}
    pricing = prod.get("pricing_model", {})
    return {
        "price": pricing.get("base_price", 0.0),
        "currency": pricing.get("currency", "USD"),
        "description": prod.get("commercial_name", ""),
        "part_number": part_number,
        "product_type": prod.get("product_type", ""),
    }

@tool
def get_technical_specs(part_number: str) -> Dict:
    """Retrieve hardware specs for a Cisco product."""
    prod = product_dict.get(part_number)
    if not prod:
        return {"error": "SKU_NOT_FOUND", "part_number": part_number}
    hw = prod.get("technical_profile", {}).get("hardware_attributes", {})
    if not hw:
        return {"error": "NO_SPECS", "part_number": part_number}
    return {
        "specifications": hw,
        "description": prod.get("commercial_name", ""),
        "part_number": part_number,
        "product_type": prod.get("product_type", ""),
    }

@tool
def recommend_products(requirements: str, max_results: int = 3) -> List[Dict]:
    """Recommend products via TF‚ÄëIDF cosine similarity."""
    if tfidf_matrix is None:
        return [{"error": "CATALOG_NOT_INDEXED"}]
    vec = vectorizer.transform([requirements])
    sims = cosine_similarity(vec, tfidf_matrix).flatten()
    idxs = sims.argsort()[::-1][:max_results]
    base = list(product_dict.values())
    return [
        {
            "part_number": base[i]["cisco_product_id"],
            "commercial_name": base[i]["commercial_name"],
            "product_type": base[i]["product_type"],
            "similarity_score": float(sims[i]),
        }
        for i in idxs
    ]

# -------------------------------------------------------------
# 5. Pydantic models
# -------------------------------------------------------------
class SolutionComponent(BaseModel):
    part_number: str
    quantity: int = Field(ge=1)
    role: str

class SolutionDesign(BaseModel):
    summary: str
    components: List[SolutionComponent]
    justification: str

class AgentRoutingDecision(BaseModel):
    needs_design: bool = False
    needs_technical: bool = False
    needs_pricing: bool = False
    query_parts: Dict[str, str] = Field(default_factory=dict)

# -------------------------------------------------------------
# 6. LLM & prompts
# -------------------------------------------------------------
llm_cold = ChatOpenAI(model="gpt-4o-mini", temperature=0)
llm_creative = ChatOpenAI(model="gpt-4o-mini", temperature=0.4)  # designer

orchestrator_prompt = ChatPromptTemplate.from_template(
    """You are a Cisco sales orchestrator. Output JSON (AgentRoutingDecision) telling which agents to call.
User query: {{query}}
{{format_instructions}}"""
)

orchestrator_agent = orchestrator_prompt | llm_cold.with_structured_output(AgentRoutingDecision)

design_prompt = ChatPromptTemplate.from_template(
    """You are a Cisco Solution Architect. Combine ONLY the products listed below to build a solution. Keep summary ‚â§120‚ÄØtokens.
Requirements:
{requirements}

Available Cisco Products:
{product_list}

Return JSON matching schema.
"""
)

design_agent = (
    {
        "requirements": lambda x: x["requirements"],
        "product_list": lambda x: get_product_list_str(x["requirements"]),
        "format_instructions": lambda x: x["format_instructions"],
    }
    | design_prompt
    | llm_creative.with_structured_output(SolutionDesign)
)

# -------------------------------------------------------------
# 7. Graph state type
# -------------------------------------------------------------
class AgentState(TypedDict):
    user_query: str
    orchestrator_decision: Optional[AgentRoutingDecision]
    solution_design: Optional[SolutionDesign]
    integrity_errors: List[str]
    technical_results: List[Dict]
    pricing_results: List[Dict]
    final_response: str

# -------------------------------------------------------------
# 8. Nodes
# -------------------------------------------------------------

def orchestrator_node(state: AgentState) -> AgentState:
    print(f"\nüéª [Orchestrator] ¬´{state['user_query']}¬ª")
    q = state["user_query"]
    try:
        decision = orchestrator_agent.invoke({
            "query": q,
            "format_instructions": AgentRoutingDecision.schema(),
        })
    except Exception:
        decision = AgentRoutingDecision()
    if not any([decision.needs_design, decision.needs_technical, decision.needs_pricing]):
        ql = q.lower()
        decision = AgentRoutingDecision(
            needs_design=any(w in ql for w in ["design", "architecture", "solution"]),
            needs_technical="spec" in ql,
            needs_pricing=any(w in ql for w in ["price", "cost", "quote", "pricing"]),
            query_parts={},
        )
    state["orchestrator_decision"] = decision
    return state


def designer_node(state: AgentState) -> AgentState:
    print("\nüé® [Solution Designer]")
    design = design_agent.invoke({
        "requirements": state["user_query"],
        "format_instructions": SolutionDesign.schema(),
    })
    state["solution_design"] = design
    state["orchestrator_decision"].needs_pricing = True  # for√ßa pricing
    return state


def integrity_validator_node(state: AgentState) -> AgentState:
    """Camada 0 ‚Äì garante SKU v√°lido & qty ‚â•1"""
    design = state.get("solution_design")
    if design is None:
        return state

    errors: List[str] = []
    validated: List[SolutionComponent] = []
    for comp in design.components:
        if comp.part_number not in product_dict:
            errors.append(f"SKU_NOT_FOUND: {comp.part_number}")
            continue
        qty = max(1, int(comp.quantity))
        if qty != comp.quantity:
            errors.append(f"QUANTITY_ADJUSTED: {comp.part_number} ‚Üí {qty}")
        validated.append(SolutionComponent(
            part_number=comp.part_number,
            quantity=qty,
            role=comp.role,
        ))
    if errors:
        print("‚ö†Ô∏è Integrity errors ‚Üí", errors)
    state["integrity_errors"] = errors
    # substitui componentes por lista validada (mesmo se houver erros)
    state["solution_design"].components = validated
    return state

# --- Placeholder nodes ----------------------------------------------------

def rules_validator_node(state: AgentState) -> AgentState:
    print("üîç [Validator‚ÄëRules] ‚Äì not implemented yet")
    return state

def reviewer_node(state: AgentState) -> AgentState:
    print("üßê [Reviewer‚ÄëLLM] ‚Äì skipped (placeholder)")
    return state

# -------------------------------------------------------------------------

def technical_agent_node(state: AgentState) -> AgentState:
    # skip if we already have design (specs later)
    if state.get("solution_design") is not None:
        return state

    if not state["orchestrator_decision"].needs_technical:
        return state

    query_part = state["orchestrator_decision"].query_parts.get(
        "technical", state["user_query"])
    ids = extract_part_numbers(query_part)
    if ids:
        state["technical_results"] = [get_technical_specs(pid) for pid in ids]
    else:
        recs = recommend_products.invoke({"requirements": query_part, "max_results": 5})
        state["technical_results"] = [get_technical_specs(r["part_number"]) for r in recs]
    return state


def pricing_agent_node(state: AgentState) -> AgentState:
    if not state["orchestrator_decision"].needs_pricing:
        return state

    # prefer componentes do design
    if isinstance(state.get("solution_design"), SolutionDesign):
        items = [{"pn": c.part_number, "qty": c.quantity} for c in state["solution_design"].components]
    else:
        ids = extract_part_numbers(state["user_query"])
        items = [{"pn": i, "qty": 1} for i in ids]

    state["pricing_results"] = []
    for it in items:
        info = get_product_price(it["pn"])
        info.update({"quantity": it["qty"], "subtotal": info.get("price", 0) * it["qty"]})
        state["pricing_results"].append(info)
    return state


def synthesize_node(state: AgentState) -> AgentState:
    lines: List[str] = []
    if isinstance(state.get("solution_design"), SolutionDesign):
        d = state["solution_design"]
        lines.append("üöÄ Solution Design\n" + d.summary)
        lines.append("\nüîß Components:")
        for i, c in enumerate(d.components, 1):
            lines.append(f"{i}. {c.part_number} ({c.quantity}√ó) ‚Äì {c.role}")
        if state.get("integrity_errors"):
            lines.append("\n‚ö†Ô∏è Integrity issues:\n- " + "\n- ".join(state["integrity_errors"]))
        lines.append("\n‚úÖ Justification:\n" + d.justification)

    if state["pricing_results"]:
        total = 0.0
        currency = "USD"
        lines.append("\nüíµ Pricing:")
        for p in state["pricing_results"]:
            if "error" in p:
                lines.append(f"- {p.get('part_number')}: {p['error']}")
                continue
            currency = p["currency"]
            total += p["subtotal"]
            lines.append(f"- {p['description']} ({p['quantity']}√ó): {currency} {p['subtotal']:.2f}")
        lines.append(f"\nTOTAL: {currency} {total:.2f}")

    if not lines:
        lines.append("No relevant information found.")
    state["final_response"] = "\n".join(lines)
    return state

# -------------------------------------------------------------
# 9. Routing helpers
# -------------------------------------------------------------

def route_after_orch(state: AgentState):
    dec = state["orchestrator_decision"]
    if dec.needs_design:
        return "designer"
    if dec.needs_technical:
        return "tech"
    if dec.needs_pricing:
        return "price"
    return "synth"

def route_after_designer(_):
    return "integrity"

def route_after_integrity(_):
    return "rules"

def route_after_rules(_):
    return "reviewer"

def route_after_reviewer(state: AgentState):
    dec = state["orchestrator_decision"]
    if dec.needs_technical:
        return "tech"
    if dec.needs_pricing:
        return "price"
    return "synth"


def route_after_tech(state: AgentState):
    return "price" if state["orchestrator_decision"].needs_pricing else "synth"

def route_after_price(_):
    return "synth"

# -------------------------------------------------------------
# 10. Build graph
# -------------------------------------------------------------
workflow


In [7]:
import logging
from services.ai_engine.app.core.tools import product_search_tool
from langchain_openai import OpenAIEmbeddings
import openai

# Configura√ß√£o de logging para garantir que as mensagens apare√ßam
logging.basicConfig(level=logging.INFO)

# Defina a chave da API diretamente no c√≥digo
#openai_api_key = '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'  # Substitua pela sua chave da OpenAI

# Defina a chave da API diretamente
#openai.api_key = openai_api_key  # Isso define a chave para todas as chamadas OpenAI

# Configure o cliente OpenAIEmbeddings com a chave da API diretamente
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small"
)

# Teste r√°pido para realizar a busca
try:
    # Par√¢metros para a busca
    query = "whats the price for MR42E-HW"
    params = {
        "query": query,
        "k_faiss": 5,  # N√∫mero de resultados a serem retornados pela busca FAISS
        "k_bm25": 5,   # N√∫mero de resultados a serem retornados pela busca BM25
        "k_tfidf": 5   # N√∫mero de resultados a serem retornados pela busca TF-IDF
    }

    # Chama a fun√ß√£o de busca com os par√¢metros fornecidos
    out = product_search_tool.invoke(params)

    # Exibe a sa√≠da do teste
    logging.info("Resultado da busca: %s", out)

except Exception as e:
    logging.error("Erro durante o teste: %s", str(e))


INFO:services.ai_engine.app.core.tools:[product_search_tool] query='whats the price for MR42E-HW' k_faiss=5 k_bm25=5 k_tfidf=5
INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:root:Resultado da busca: [{'cisco_product_id': 'MR42-HW', 'commercial_name': 'Meraki MR42 Cloud Managed AP', 'description': '', 'technical_profile': {'category': 'Hardware', 'subcategory': ''}, 'pricing_model': {'base_price': 1099.0, 'currency': 'USD', 'elig_pct': 0.01}}, {'cisco_product_id': 'MR42E-HW', 'commercial_name': 'Meraki MR42E Cloud Managed Indoor AP with External Antennas', 'description': '', 'technical_profile': {'category': 'Hardware', 'subcategory': ''}, 'pricing_model': {'base_price': 1099.0, 'currency': 'USD', 'elig_pct': 0.01}}, {'cisco_product_id': 'MR52-HW', 'commercial_name': 'Meraki MR52 Cloud Managed AP', 'description': '', 'technical_profile': {'category': 'Hardware', 'subcategory': ''}, 'pricing_model': {'base_price': 1399.0, 'currency': 'USD', 'eli

In [None]:
print(run_sales_quote(
    "whats the price for MR42E-HW"
))
