In [None]:
%pip install langchain  astrapy groq langchain_huggingface langchain_groq langchain_text_splitters

In [None]:
%pip install -U sentence-transformers transformers

In [None]:
from google.colab import userdata

# API credentials are read from the Colab secret store so that no key is
# embedded in the notebook itself.
ASTRA_DB_TOKEN = userdata.get("ASTRA_DB_TOKEN")
ASTRA_DB_ID = userdata.get("ASTRA_DB_ID")
groq_key = userdata.get("GROQ_API_KEY")
HF_TOKEN = userdata.get("HF_TOKEN")
SERPER_API_KEY = userdata.get("SERPER_API_KEY")

# Data API endpoint of the Astra database used for storing findings.
# NOTE(review): this URL is hard-coded while ASTRA_DB_ID above is loaded but
# not referenced in this cell -- confirm whether it should be derived from it.
ASTRA_DB_ENDPOINT = "https://7c36ff6b-70bc-4a9b-8e2d-2a8aecc92ac1-us-east1.apps.astra.datastax.com"

In [None]:
%pip install -qU langchain-community arxiv

In [None]:
from langchain_groq import ChatGroq
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.output_parsers import JsonOutputParser
from langchain_community.utilities import ArxivAPIWrapper
from typing import List, Dict
import requests
import json
from datetime import datetime
import hashlib
from astrapy import DataAPIClient
import logging
logging.getLogger("astrapy").setLevel(logging.ERROR)


In [None]:
class Researchtools:
    """Thin wrappers around the external lookups used for research.

    Offers an arXiv text search, a Serper-backed job search and a URL-to-text
    fetch that goes through the ``r.jina.ai`` prefix URL. Every tool is
    best-effort: failures are printed and reported as an empty result instead
    of being raised to the caller.
    """

    # Seconds to wait on any outbound HTTP request before giving up.
    REQUEST_TIMEOUT = 30

    def __init__(self, serper_api_key: str):
        """Create the arXiv wrapper and remember the Serper API key.

        Args:
            serper_api_key: key sent in the ``X-API-KEY`` header of job searches.
        """
        self.arxiv = ArxivAPIWrapper(
            top_k_results=5,
            ARXIV_MAX_QUERY_LENGTH=400,
            # The wrapper's field is `load_max_docs`; `load_max_documents`
            # is not a declared field of ArxivAPIWrapper.
            load_max_docs=4,
            load_all_available_meta=True,
        )
        self.serper_key = serper_api_key

    def search_arxiv(self, query: str):
        """Run ``query`` through the arXiv wrapper.

        Returns:
            A one-element list holding the wrapper's text result tagged as a
            ``research_paper``, or ``[]`` if the lookup raised.
        """
        print(f"Arxiv search: {query}")
        try:
            result = self.arxiv.run(query)
        except Exception as e:
            # Best-effort tool: report the reason and hand back "nothing found".
            print(f"Arxiv search failed: {e}")
            return []

        return [{
            "type": "research_paper",
            "content": result,
            "source": "arxiv",
            "query": query
        }]

    def search_job(self, query: str, max_results: int = 5) -> List[Dict]:
        """Search job postings through the Serper API.

        Args:
            query: free-text search string.
            max_results: value sent as the ``num`` field of the request.

        Returns:
            One ``job_posting`` dict per entry under the response's ``jobs``
            key; ``[]`` when there are none or the request fails.
        """
        print(f"Job search: {query}")

        url = "https://google.serper.dev/search"
        payload = {
            "q": query,
            "num": max_results,
            "location": "India"
        }
        headers = {
            'X-API-KEY': self.serper_key,
            'Content-Type': 'application/json'
        }

        try:
            response = requests.post(
                url,
                json=payload,
                headers=headers,
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            data = response.json()
        except requests.exceptions.RequestException as e:
            print(f"Error fetching it: {e}")
            return []

        # NOTE(review): results are read from a top-level "jobs" key --
        # confirm this matches the response shape of the endpoint in use.
        jobs = data.get('jobs') or []

        # Build the full list before returning; returning from inside the
        # loop would stop after the first posting.
        return [
            {
                "type": "job_posting",
                "title": job.get('title', 'N/A'),
                "company": job.get('company', 'N/A'),
                "location": job.get('location', 'N/A'),
                "description": job.get('description', 'N/A'),
                "source_url": job.get('link', 'N/A'),
                "posted_at": job.get('posted', 'N/A'),
                "via": job.get('via', 'N/A')
            }
            for job in jobs
        ]

    def execute_tools(self, tool_name: str, query: str, **kwargs) -> List[Dict]:
        """Dispatch ``query`` to the tool called ``tool_name``.

        Recognised names are ``"arxiv"``, ``"job_search"`` and
        ``"fetching_url"`` (``"web_fetch"`` is accepted as an alias). For the
        URL tool, ``query`` is the URL to fetch.

        Keyword Args:
            max_results: result count for ``job_search``; for the URL tool it
                is also honoured as the character limit for backward
                compatibility.
            max_chars: character limit for the URL tool (takes precedence).

        Returns:
            The tool's findings, or ``[]`` for an unknown tool or a failed fetch.
        """
        if tool_name == "arxiv":
            return self.search_arxiv(query)

        if tool_name == "job_search":
            return self.search_job(query, kwargs.get('max_results', 5))

        if tool_name in ("fetching_url", "web_fetch"):
            max_chars = kwargs.get('max_chars', kwargs.get('max_results', 5000))
            content = self.fetching_url(query, max_chars)
            if not content:
                print(f"error fetching{tool_name}")
                return []
            return [{
                "type": "webpage",
                "url": query,
                "content": content
            }]

        # Always honour the declared List[Dict] return type.
        print(f"unknown tool: {tool_name}")
        return []

    def fetching_url(self, url: str, max_cahrs: int = 5000):
        """Fetch ``url`` through the ``r.jina.ai`` prefix URL and return its text.

        Args:
            url: absolute ``http://`` or ``https://`` URL.
            max_cahrs: maximum number of characters to keep from the response
                (parameter name kept as-is for backward compatibility). A
                falsy or non-positive value disables truncation.

        Returns:
            The (possibly truncated) response text, or ``None`` when the URL
            is malformed or the request fails.
        """
        print("Fetching url")

        if not url.startswith(('http://', 'https://')):
            print("invalid  url format")
            return None

        try:
            jina_url = f"https://r.jina.ai/{url}"
            response = requests.get(jina_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            print(f"error jina: {e}")
            return None

        content = response.text
        if max_cahrs and max_cahrs > 0:
            content = content[:max_cahrs]
        print("fetched")
        return content




In [None]:
class PlannerAgent:
    """Turns a free-text user goal into a short list of research tasks via an LLM."""

    # Groq model used both for the LangChain wrapper and the raw completion call.
    MODEL = "llama-3.1-8b-instant"

    def __init__(self, groq_key: str):
        """Create the Groq chat client.

        Args:
            groq_key: Groq API key.
        """
        self.llm = ChatGroq(api_key=groq_key, model=self.MODEL)

    def create_plan(self, user_goal: str):
        """Ask the model to break ``user_goal`` into research tasks.

        The request uses JSON-object mode, so the prompt asks for an object
        whose ``tasks`` key holds the task array; that is the key read back
        here. A bare JSON array is also tolerated.

        Args:
            user_goal: the goal to plan for.

        Returns:
            A list of task dicts (``task_id``, ``description``, ``tool`` as
            requested in the prompt). Returns ``[]`` when the reply is not
            valid JSON or carries no usable task list.
        """
        prompt = f"""You are a research planning assistant. Break down this goal into 3-5 specific, actionable research tasks.

        User Goal: {user_goal}

        For each task, specify:
        - task_id: number (1, 2, 3, etc.)
        - description: specific research action
        - tool: which tool to use
          - "arxiv" for academic papers/research
          - "job_search" for internships/jobs
          - "web_fetch" for specific URLs

        Return ONLY a JSON object with a single "tasks" key holding the array of tasks:
          {{"tasks": [
            {{"task_id": 1, "description": "Search for papers on...", "tool": "arxiv"}},
            {{"task_id": 2, "description": "Find internships at...", "tool": "job_search"}}
          ]}}

        Return ONLY valid JSON, no explanation."""

        response = self.llm.client.create(
            model=self.MODEL,
            messages=[
                {"role": "system", "content": "You are a research planner. Always output valid JSON."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.1,
            max_tokens=1500,
            response_format={"type": "json_object"}
        )

        raw_content = response.choices[0].message.content
        try:
            data = json.loads(raw_content)
        except (TypeError, ValueError) as e:
            # TypeError covers a missing (None) message body; ValueError covers
            # malformed JSON (json.JSONDecodeError is a ValueError).
            print(f"Planner returned unparseable JSON: {e}")
            return []

        if isinstance(data, dict):
            tasks = data.get("tasks", [])
        elif isinstance(data, list):
            tasks = data
        else:
            tasks = []

        if not isinstance(tasks, list):
            return []
        # Downstream code calls .get() on each task, so keep only dict entries.
        return [task for task in tasks if isinstance(task, dict)]

In [None]:
import arxiv

class ResearcherAgent:
    """Searches arXiv for a task and stores the results in an Astra DB collection."""

    MODEL = "llama-3.1-8b-instant"
    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
    COLLECTION_NAME = "ResearchPapers"

    def __init__(self, groq_key: str, astra_token: str, astra_endpoint: str, serper_api_key):
        """Set up the LLM, embedder, arXiv client and Astra collection handle.

        Args:
            groq_key: Groq API key.
            astra_token: Astra DB application token.
            astra_endpoint: Astra DB Data API endpoint URL.
            serper_api_key: Serper key; stored on the instance.
        """
        self.llm = ChatGroq(api_key=groq_key, model=self.MODEL)
        self.embedder = HuggingFaceEmbeddings(model_name=self.EMBEDDING_MODEL)
        self.serper_api_key = serper_api_key

        self.arxiv_client = arxiv.Client()

        client = DataAPIClient(token=astra_token)
        db = client.get_database(astra_endpoint)
        self.collection = db.get_collection(self.COLLECTION_NAME)

    def search_arxiv(self, query, max_results: int = 10):
        """Search arXiv by relevance and map each hit to a plain dict.

        Args:
            query: search string; an empty query yields ``[]``.
            max_results: upper bound passed to ``arxiv.Search``.

        Returns:
            A list of paper dicts (title, authors, link, pdf_url, category,
            truncated summary, published date, researched_at timestamp).
            Returns ``[]`` when the optional ``arxiv_enabled`` attribute is
            set to a falsy value. If the arXiv client fails part-way, the
            papers collected so far are returned.
        """
        if not getattr(self, 'arxiv_enabled', True):
            return []
        if not query:
            return []

        search = arxiv.Search(
            query=query,
            max_results=max_results,
            sort_by=arxiv.SortCriterion.Relevance
        )

        papers = []
        try:
            for r in self.arxiv_client.results(search):
                papers.append({
                    "title": r.title,
                    "authors": [a.name for a in r.authors],
                    "link": r.entry_id,
                    "pdf_url": r.pdf_url,
                    "category": r.primary_category,
                    # Summary is cut to 500 chars and flattened to one line.
                    "summary": r.summary[:500].replace('\n', ' ') + "...",
                    "published": r.published.strftime("%Y-%m-%d"),
                    "researched_at": datetime.now().isoformat()
                })
        except arxiv.ArxivError as e:
            # One failing query should not abort the whole research run.
            print(f"Arxiv search failed for '{query}': {e}")

        return papers

    def execute_task(self, task: Dict) -> List[Dict]:
        """Run the arXiv search for a task and persist whatever was found.

        Args:
            task: task dict; its ``description`` is used as the search query.

        Returns:
            The list of paper dicts found (possibly empty).
        """
        query = task.get('description', '')
        print(f"Searching Arxiv for: {query}")

        processed_results = self.search_arxiv(query)
        if processed_results:
            self.store_findings(processed_results)

        return processed_results

    def store_findings(self, findings: List[Dict]):
        """Embed each finding and upsert it into the Astra collection.

        The document id is the first 16 hex chars of the MD5 of the finding's
        link, so storing the same link again replaces the earlier document.
        Findings without any of ``link`` / ``source_url`` / ``url`` are
        skipped because no stable id can be derived for them.

        Args:
            findings: finding dicts; ``None`` or an empty list is a no-op.
        """
        if not findings:
            return

        for finding in findings:
            source = finding.get('link') or finding.get('source_url') or finding.get('url')
            if not source:
                print("Skipping finding without a link/source_url/url")
                continue

            doc_id = hashlib.md5(str(source).encode()).hexdigest()[:16]
            title = finding.get('title', '')
            body = (finding.get('summary')
                    or finding.get('description')
                    or finding.get('content')
                    or '')
            vector = self.embedder.embed_query(f"{title} {body}")

            doc_to_store = {"_id": doc_id, "$vector": vector, "metadata": finding}

            try:
                self.collection.replace_one(
                    filter={"_id": doc_id},
                    replacement=doc_to_store,
                    upsert=True,
                )
            except Exception as e:
                if "COLLECTION_NOT_EXIST" in str(e):
                    # Every later write would fail the same way, so stop here.
                    print("You need to run the 'create_collection' cell!")
                    break
                print(f"Error storing finding {doc_id}: {e}")

In [None]:
from astrapy.info import CollectionDefinition
from astrapy.constants import VectorMetric

# One-off setup: create the vector collection that ResearcherAgent writes to.
try:
    # Open a database handle in this cell. No module-level `db` exists
    # elsewhere in the notebook (the one inside ResearcherAgent.__init__ is a
    # local), so relying on it would raise NameError.
    db = DataAPIClient(token=ASTRA_DB_TOKEN).get_database(ASTRA_DB_ENDPOINT)

    # 384 is presumably the embedding size of the sentence-transformers model
    # used by ResearcherAgent -- confirm if the embedder is changed.
    definition = (
        CollectionDefinition.builder()
        .set_vector_dimension(384)
        .set_vector_metric(VectorMetric.COSINE)
        .build()
    )

    db.create_collection("ResearchPapers", definition=definition)
    print("Collection 'ResearchPapers' created successfully")
except Exception as e:
    # Best-effort: report the reason (e.g. an API error) without stopping the notebook.
    print(f"Status: {e}")

In [None]:
class CriticAgent:
    """Asks an LLM to grade a batch of research findings."""

    MODEL = "llama-3.1-8b-instant"

    def __init__(self, groq_key: str):
        """Create the Groq chat client.

        Args:
            groq_key: Groq API key.
        """
        self.llm = ChatGroq(api_key=groq_key, model=self.MODEL)

    @staticmethod
    def _as_list(value) -> List:
        """Coerce a report field to a list (None -> [], scalar -> [scalar])."""
        if value is None:
            return []
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    def validate_findings(self, findings: List[Dict]) -> Dict:
        """Produce a quality report for ``findings``.

        Args:
            findings: finding dicts from the researcher.

        Returns:
            A dict that always carries ``total_findings`` (int),
            ``confidence_score`` (float clamped to 0.0-1.0), ``issues_found``
            (list) and ``recommendations`` (list). With no findings a fixed
            zero-confidence report is returned without calling the model; if
            the model call or its parsing fails, a fallback report with
            confidence 0.5 is returned.
        """
        if not findings:
            return {
                "total_findings": 0,
                "confidence_score": 0.0,
                "issues_found": ["no data provided by the researcher."],
                "recommendations": ["check tool API keys or try a broader search query."]
            }

        # Only a compact summary of the first ten findings is sent to the model.
        sample_for_llm = []
        for f in findings[:10]:
            # Job postings carry `description`, web pages `content`, and
            # arXiv papers `summary`.
            text = f.get('description') or f.get('content') or f.get('summary') or ''
            sample_for_llm.append({
                "title": f.get('title') or f.get('company') or 'Unknown',
                "has_link": bool(f.get('source_url') or f.get('link') or f.get('url')),
                "has_text": len(str(text)) > 20
            })

        prompt = f"""You are a Research Auditor.
        I have found {len(findings)} results. Here is a sample of the first {len(sample_for_llm)}:

        {json.dumps(sample_for_llm, indent=2)}

        Analyze if these results are high quality and relevant.
        Return a JSON object with these exact keys:
        - total_findings (integer)
        - issues_found (list of strings)
        - confidence_score (float between 0.0 and 1.0)
        - recommendations (list of strings)
        """

        try:
            response = self.llm.client.create(
                model=self.MODEL,
                messages=[
                    {"role": "system", "content": "You are a quality control AI. Always output valid JSON."},
                    {"role": "user", "content": prompt}
                ],
                response_format={"type": "json_object"},
                temperature=0.1  # Low temperature for consistent "grading"
            )
            report = json.loads(response.choices[0].message.content)
            if not isinstance(report, dict):
                raise ValueError("critic response is not a JSON object")

            # Normalise the model's answer so callers can rely on key
            # presence and on confidence_score being a number.
            score = float(report.get("confidence_score", 0.0))
            normalized = dict(report)
            normalized["total_findings"] = report.get("total_findings", len(findings))
            normalized["confidence_score"] = min(1.0, max(0.0, score))
            normalized["issues_found"] = self._as_list(report.get("issues_found"))
            normalized["recommendations"] = self._as_list(report.get("recommendations"))
            return normalized

        except Exception as e:
            print(f"Critic Analysis Error: {e}")
            return {
                "total_findings": len(findings),
                "confidence_score": 0.5,
                "issues_found": ["Error during automated critique"],
                "recommendations": ["Manual verification required"]
            }

In [None]:
class ResearchOrchestrator:
    """Runs the plan -> research -> critique loop over three agent objects."""

    def __init__(self, PlannerAgent, ResearcherAgent, CriticAgent):
        """Store the collaborating agents.

        Args:
            PlannerAgent: object exposing ``create_plan(user_goal)``.
            ResearcherAgent: object exposing ``execute_task(task)`` and
                ``store_findings(findings)``.
            CriticAgent: object exposing ``validate_findings(findings)``.

        The parameters are agent *instances*; their names are kept unchanged
        for backward compatibility with keyword callers.
        """
        self.planner = PlannerAgent
        self.researcher = ResearcherAgent
        self.critic = CriticAgent

    @staticmethod
    def _finding_key(finding):
        """Return the finding's link-like identifier, or None if it has none."""
        if not isinstance(finding, dict):
            return None
        key = finding.get('link') or finding.get('source_url') or finding.get('url')
        return key if isinstance(key, str) else None

    def run(self, user_goal: str, max_iterations: int = 2, confidence_threshold: float = 0.8):
        """Plan once, then research and critique up to ``max_iterations`` times.

        Each pass executes every planned task, merges the new findings into
        the running total (findings sharing a link are kept once) and asks the
        critic for a report on the total.

        Args:
            user_goal: goal handed to the planner.
            max_iterations: maximum number of research/critique passes.
            confidence_threshold: a pass succeeds when the critic's
                ``confidence_score`` is strictly greater than this value.

        Returns:
            The findings of the successful pass when the threshold is
            exceeded; otherwise all de-duplicated findings gathered over the
            passes (``[]`` if nothing was gathered). Returned findings are
            also passed to ``store_findings``.
        """
        plan = self.planner.create_plan(user_goal)
        all_final_findings = []
        seen_keys = set()

        for _ in range(max_iterations):
            current_findings = []
            for task in plan:
                current_findings.extend(self.researcher.execute_task(task))

            # Merge once per pass, after all tasks ran; merging inside the
            # task loop would re-add earlier tasks' findings repeatedly.
            for finding in current_findings:
                key = self._finding_key(finding)
                if key is not None:
                    if key in seen_keys:
                        continue
                    seen_keys.add(key)
                all_final_findings.append(finding)

            report = self.critic.validate_findings(all_final_findings)
            try:
                confidence = float(report.get('confidence_score', 0.0))
            except (TypeError, ValueError):
                confidence = 0.0

            if confidence > confidence_threshold:
                self.researcher.store_findings(current_findings)
                return current_findings

            # Not good enough yet: report why and try another pass if allowed.
            print(f"Issue:{report.get('issues_found',[])}")

        if all_final_findings:
            self.researcher.store_findings(all_final_findings)
        return all_final_findings


In [None]:
  # Build the three agents; each one talks to Groq with the same key.
  planner = PlannerAgent(groq_key)

  # The researcher additionally needs the Astra DB credentials and the Serper key.
  researcher = ResearcherAgent(
      groq_key,
      ASTRA_DB_TOKEN,
      ASTRA_DB_ENDPOINT,
      SERPER_API_KEY,
  )

  critic = CriticAgent(groq_key)

  # Wire the agents together and run the pipeline for a sample goal.
  boss = ResearchOrchestrator(planner, researcher, critic)

  boss.run("Find 2026 AI internship roles in San Francisco")