<a href="https://colab.research.google.com/github/kartikigaikwad/Amazon-Clone/blob/main/kartiki.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
# === Cell 1: Install & Setup ===
!pip install -U openai==1.55.3 langchain langchain-openai langchain-core pinecone gradio arxiv tqdm

import arxiv
import openai
import pinecone
import os
from tqdm import tqdm
from google.colab import userdata
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import AzureOpenAIEmbeddings
from pinecone import Pinecone, ServerlessSpec, PineconeApiException
import gradio as gr

# Helper: Load secrets from Colab userdata (or manual env vars)
def _set_env(var: str):
    if not os.environ.get(var):
        os.environ[var] = userdata.get(f"{var}")

for var in ["AZURE_OPENAI_API_KEY", "AZURE_OPENAI_ENDPOINT", "USER_AGENT",
            "PINECONE_API_KEY", "PINECONE_INDEX", "OPENAI_API_VERSION"]:
    _set_env(var)

openai.api_key = os.getenv("AZURE_OPENAI_API_KEY")
openai.api_base = os.getenv("AZURE_OPENAI_ENDPOINT")
openai.api_type = "azure"
openai.api_version = os.getenv("OPENAI_API_VERSION")

pinecone_api_key = os.getenv("PINECONE_API_KEY")
pinecone_index_name = os.getenv("PINECONE_INDEX")

pinecone_client = Pinecone(api_key=pinecone_api_key)

# Create index if not present
try:
    existing_indexes = [i['name'] for i in pinecone_client.list_indexes()]
    if pinecone_index_name not in existing_indexes:
        print(f"Creating new Pinecone index: {pinecone_index_name}")
        pinecone_client.create_index(
            name=pinecone_index_name,
            dimension=1536,
            metric="cosine",
            spec=ServerlessSpec(cloud='aws', region='us-east-1')
        )
    else:
        print(f"Pinecone index '{pinecone_index_name}' already exists.")
except PineconeApiException as e:
    if e.status == 409 and "ALREADY_EXISTS" in e.body:
        print(f"Pinecone index '{pinecone_index_name}' already exists.")
    else:
        raise

index = pinecone_client.Index(pinecone_index_name)
print(f"‚úÖ Connected to Pinecone index: {pinecone_index_name}")


Collecting langchain
  Using cached langchain-1.0.5-py3-none-any.whl.metadata (4.9 kB)
Collecting langchain-openai
  Using cached langchain_openai-1.0.2-py3-none-any.whl.metadata (1.8 kB)
Collecting langchain-core
  Using cached langchain_core-1.0.4-py3-none-any.whl.metadata (3.5 kB)
Collecting langgraph<1.1.0,>=1.0.2 (from langchain)
  Using cached langgraph-1.0.3-py3-none-any.whl.metadata (7.8 kB)
INFO: pip is looking at multiple versions of langchain-openai to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-openai
  Using cached langchain_openai-1.0.1-py3-none-any.whl.metadata (1.8 kB)
  Using cached langchain_openai-1.0.0-py3-none-any.whl.metadata (1.8 kB)
  Using cached langchain_openai-0.3.35-py3-none-any.whl.metadata (2.4 kB)
  Using cached langchain_openai-0.3.34-py3-none-any.whl.metadata (2.4 kB)
  Using cached langchain_openai-0.3.33-py3-none-any.whl.metadata (2.4 kB)
  Using cached langchain_openai-0.3.32-py3-none-

In [10]:
# === Cell 2: Utility Functions ===
import uuid
import json
from typing import List, Dict, Any

def search_arxiv(query: str, max_results: int = 5):
    print(f"\nüîç Searching arXiv for '{query}' ...")
    search = arxiv.Search(query=query, max_results=max_results, sort_by=arxiv.SortCriterion.Relevance)
    results = []
    for r in search.results():
        results.append({
            "id": r.get_short_id(),
            "title": r.title,
            "authors": [a.name for a in r.authors],
            "summary": r.summary,
            "pdf_url": r.pdf_url,
            "published": r.published.isoformat() if r.published else None
        })
    print(f"‚úÖ Found {len(results)} papers.")
    return results


def chunk_texts(texts: List[str], chunk_size: int = 1000, overlap: int = 100):
    splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=overlap)
    chunks = splitter.split_text("\n".join(texts))
    print(f"üß© Created {len(chunks)} text chunks.")
    return chunks


def get_embeddings_client(deployment="text-embedding-3-small"):
    return AzureOpenAIEmbeddings(deployment=deployment)


def embed_texts(emb_client, texts: List[str], batch_size: int = 8):
    print("‚öôÔ∏è Generating embeddings...")
    embeddings = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i:i+batch_size]
        batch_embs = emb_client.embed_documents(batch)
        embeddings.extend(batch_embs)
    print(f"‚úÖ Created embeddings for {len(texts)} chunks.")
    return embeddings


def upsert_to_pinecone(index, embeddings, texts, namespace="default"):
    print(f"üöÄ Uploading {len(embeddings)} vectors to Pinecone...")
    data = [(str(uuid.uuid4()), emb, {"text": texts[i]}) for i, emb in enumerate(embeddings)]
    index.upsert(vectors=data, namespace=namespace)
    print("‚úÖ Data successfully stored in Pinecone.")


In [11]:
# === Cell 3: Full pipeline ===
def index_research_topic(topic: str, max_papers: int = 5):
    try:
        logs = [f"üöÄ Starting indexing for topic: {topic}"]

        # Step 1: Search arXiv
        papers = search_arxiv(topic, max_results=max_papers)
        if not papers:
            return "‚ùå No relevant papers found on arXiv."

        logs.append(f"‚úÖ Found {len(papers)} relevant papers.")

        # Step 2: Combine abstracts + titles
        texts = [
            f"Title: {p['title']}\nAuthors: {', '.join(p['authors'])}\nAbstract: {p['summary']}"
            for p in papers
        ]

        # Step 3: Chunk text
        chunks = chunk_texts(texts)
        logs.append(f"üß© Split into {len(chunks)} chunks.")

        # Step 4: Embedding generation
        emb_client = get_embeddings_client()
        embeddings = embed_texts(emb_client, chunks)
        logs.append("‚úÖ Embeddings created successfully.")

        # Step 5: Store in Pinecone
        upsert_to_pinecone(index, embeddings, chunks)
        logs.append("üì¶ Data stored in Pinecone vector database.")

        logs.append(f"üéØ Indexing complete for '{topic}'.")
        return "\n".join(logs)

    except Exception as e:
        return f"‚ùå Error during indexing: {str(e)}"


In [12]:
# === Cell 4: Gradio App ===
def gradio_index(topic, max_papers):
    if not topic.strip():
        return "‚ö†Ô∏è Please enter a research topic."
    return index_research_topic(topic, int(max_papers))

app = gr.Interface(
    fn=gradio_index,
    inputs=[
        gr.Textbox(label="Enter Research Topic", placeholder="e.g. Transformer models in NLP"),
        gr.Slider(1, 10, step=1, value=3, label="Number of Papers to Retrieve")
    ],
    outputs=gr.Textbox(label="System Logs", lines=15),
    title="üìö Agentic Research Assistant ‚Äì Part 1",
    description="Enter a research topic to fetch papers from arXiv, create embeddings, and store them in Pinecone."
)

app.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://4fa673d29c66775dd5.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [13]:
# === Cell 5: Verify Pinecone Data (fixed for new SDK) ===
def verify_pinecone(index):
    print("\nüîé Verifying Pinecone Index Contents...")
    stats = index.describe_index_stats()

    # Convert to dict safely
    if hasattr(stats, "to_dict"):
        stats_dict = stats.to_dict()
    elif isinstance(stats, dict):
        stats_dict = stats
    else:
        # Fallback: convert via __dict__ (some versions return an object)
        stats_dict = json.loads(json.dumps(stats, default=lambda o: o.__dict__))

    # Pretty-print stats
    print(json.dumps(stats_dict, indent=2))

    # Extract total vector count (handling missing namespaces)
    namespaces = stats_dict.get("namespaces", {})
    total = sum([ns.get("vector_count", 0) for ns in namespaces.values()])
    print(f"\n‚úÖ Total vectors stored in Pinecone: {total}")

# Call verification
verify_pinecone(index)



üîé Verifying Pinecone Index Contents...
{
  "namespaces": {
    "default": {
      "vector_count": 39
    }
  },
  "index_fullness": 0.0,
  "total_vector_count": 39,
  "dimension": 1536,
  "metric": "cosine",
  "vector_type": "dense"
}

‚úÖ Total vectors stored in Pinecone: 39


In [14]:
# === Cell 6: Semantic Search Query ===
def semantic_search(query: str, top_k: int = 3):
    emb_client = get_embeddings_client()
    q_emb = emb_client.embed_query(query)
    results = index.query(vector=q_emb, top_k=top_k, include_metadata=True)
    print(f"\nüîç Query: {query}")
    for match in results["matches"]:
        snippet = match["metadata"]["text"][:200].replace("\n", " ")
        print(f"\nScore: {round(match['score'], 4)}")
        print(f"Snippet: {snippet}...")

semantic_search("how CNN helps in detecting crop leaf diseases")



üîç Query: how CNN helps in detecting crop leaf diseases


In [15]:
# === Cell 6: Enhanced Topic-wise Namespace Version ===
import re

def clean_namespace(name: str) -> str:
    """Clean topic name so it can be used safely as Pinecone namespace."""
    return re.sub(r'[^a-zA-Z0-9_-]', '_', name.strip().lower())

def index_research_topic_topicwise(topic: str, max_papers: int = 5):
    """Same as before, but stores data topic-wise and shows paper info."""
    try:
        namespace = clean_namespace(topic)
        logs = [f"üöÄ Starting indexing for topic: '{topic}' (namespace: {namespace})"]

        # Step 1: Search arXiv
        papers = search_arxiv(topic, max_results=max_papers)
        if not papers:
            return "‚ùå No relevant papers found on arXiv."

        logs.append(f"‚úÖ Found {len(papers)} relevant papers.\n")

        # Step 2: Display retrieved papers
        for p in papers:
            logs.append(f"üìÑ **{p['title']}**")
            logs.append(f"   üîó {p['pdf_url']}")
            logs.append(f"   üßë‚Äçüî¨ {', '.join(p['authors'])}")
            logs.append(f"   üìù {p['summary'][:250]}...\n")

        # Step 3: Prepare text for embeddings
        texts = [
            f"Title: {p['title']}\nAuthors: {', '.join(p['authors'])}\nAbstract: {p['summary']}"
            for p in papers
        ]

        # Step 4: Chunk text
        chunks = chunk_texts(texts)
        logs.append(f"üß© Split into {len(chunks)} chunks.")

        # Step 5: Create embeddings
        emb_client = get_embeddings_client()
        embeddings = embed_texts(emb_client, chunks)
        logs.append("‚úÖ Embeddings created successfully.")

        # Step 6: Store topic data in its own namespace
        upsert_to_pinecone(index, embeddings, chunks, namespace=namespace)
        logs.append(f"üì¶ Data stored in Pinecone namespace '{namespace}'.")

        # Step 7: Verify & summarize
        stats = index.describe_index_stats()
        ns_count = stats['namespaces'].get(namespace, {}).get('vector_count', 0)
        logs.append(f"üîç Namespace '{namespace}' now contains {ns_count} vectors.")

        logs.append(f"üéØ Indexing complete for topic '{topic}'.")
        return "\n".join(logs)

    except Exception as e:
        return f"‚ùå Error: {str(e)}"


In [16]:
# === Cell 7: Topic-wise Gradio Interface ===
def gradio_index_topicwise(topic, max_papers):
    if not topic.strip():
        return "‚ö†Ô∏è Please enter a research topic."
    return index_research_topic_topicwise(topic, int(max_papers))

app2 = gr.Interface(
    fn=gradio_index_topicwise,
    inputs=[
        gr.Textbox(label="Enter Research Topic", placeholder="e.g. Diffusion models in image generation"),
        gr.Slider(1, 10, step=1, value=3, label="Number of Papers to Retrieve")
    ],
    outputs=gr.Textbox(label="System Logs", lines=20),
    title="üìö Agentic Research Assistant ‚Äì Topic-wise Namespace Indexing",
    description="This version retrieves papers from arXiv, shows summaries, and stores each topic in its own Pinecone namespace."
)

app2.launch(share=True)


Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://b9c46417e0c5840375.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [19]:
# === Cell 8: Verify Topic-wise Namespaces (Fixed for new Pinecone SDK) ===
def show_all_namespaces(index):
    # Convert to a plain Python dict first
    stats = index.describe_index_stats().to_dict() if hasattr(index.describe_index_stats(), 'to_dict') else index.describe_index_stats()

    # Print raw JSON
    print(json.dumps(stats, indent=2))

    # Safely access namespaces
    namespaces = stats.get("namespaces", {})
    if not namespaces:
        print("\n‚ö†Ô∏è No namespaces found. Try indexing a topic first.")
        return

    print("\nüìä Namespace Summary:")
    for ns, data in namespaces.items():
        count = data.get("vector_count", 0)
        print(f"  üß≠ {ns}: {count} vectors")

# ‚úÖ Run the function
show_all_namespaces(index)


{
  "namespaces": {
    "human_life": {
      "vector_count": 18
    },
    "default": {
      "vector_count": 57
    },
    "generative_ai": {
      "vector_count": 13
    }
  },
  "index_fullness": 0.0,
  "total_vector_count": 88,
  "dimension": 1536,
  "metric": "cosine",
  "vector_type": "dense"
}

üìä Namespace Summary:
  üß≠ human_life: 18 vectors
  üß≠ default: 57 vectors
  üß≠ generative_ai: 13 vectors
