### Installing required packages

In [1]:
!pip install uv
!uv pip install "numpy<2.0"
!uv pip install openai pandas sentence-transformers transformers altair beautifulsoup4 lxml superlinked==19.21.1

[2mUsing Python 3.11.0 environment at: /Users/lanchu/mambaforge/envs/genAI[0m
[2mAudited [1m1 package[0m [2min 27ms[0m[0m
[2mUsing Python 3.11.0 environment at: /Users/lanchu/mambaforge/envs/genAI[0m
[2mAudited [1m8 packages[0m [2min 30ms[0m[0m


### Setting up the Imports and setting up the intial library checks

In [None]:
import pandas as pd
import superlinked.framework as sl
from datetime import timedelta
from sentence_transformers import SentenceTransformer
from openai import OpenAI
import os
from abc import ABC, abstractmethod
from typing import Any, Optional, Dict
from tqdm import tqdm


# Abstract Tool Class
class Tool(ABC):
    @abstractmethod
    def name(self) -> str:
        pass

    @abstractmethod
    def description(self) -> str:
        pass

    @abstractmethod
    def use(self, *args, **kwargs) -> Any:
        pass


from dotenv import load_dotenv

load_dotenv()

# Initialize OpenAI Client
api_key = os.getenv("OPENAI_API_KEY")
if not api_key:
    raise ValueError("Please set the OPENAI_API_KEY environment variable.")

client = OpenAI(api_key=api_key)
model = "gpt-4o"

### Downloading the relevant data

### Data Loading and Truncation

To improve loading times, especially for users on the free Colab tier, the dataset is truncated to 100 rows by default. This means that only the first 100 entries from the original CSV file are loaded initially. You can change it to use the full dataset if you want!

In [3]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode
import pandas as pd


def query_arxiv(
    query="%22large%20language%20models%22",
    max_results=2000,
    order_by="lastUpdatedDate",
    order="descending",
):
    """
    Basic function for querying the API that lets us specify the most important parameters.
    query: URL encoded string to search for in paper titles and abstracts
    max_results: maximum amount of results returned by the API
    order_by: variable to order the results by
    order: descending or ascending based on the order_by parameter
    """
    params = {
        "search_query": f"all:{query}",
        "start": 0,
        "max_results": max_results,
        "sortBy": order_by,
        "sortOrder": order,
    }
    url = f"http://export.arxiv.org/api/query?{urlencode(params)}"

    try:
        response = requests.get(url)
        response.raise_for_status()
        logging.info(f"Length of response text: {len(response.text)}")
        soup = BeautifulSoup(response.text, "xml")
        data = []
        for entry in soup.find_all("entry"):
            data_entry = {tag.name: tag.text.strip() for tag in entry.find_all()}
            if "id" in data_entry:  # Ensure there is an 'id' field
                data.append(data_entry)
        logging.info(f"{len(data)} entries found")
        return pd.DataFrame(data)
    except requests.exceptions.RequestException as e:
        logging.error(f"Error during request: {e}")
    except Exception as e:
        logging.error(f"Unexpected error: {e}")
    return pd.DataFrame()  # Return an empty DataFrame if there was an error

In [4]:
import logging

df = query_arxiv(query="%22artificial%20intelligence%22")


In [5]:
from dateutil import parser
from datetime import datetime, timedelta, timezone

# The API returns the datetimes as a string, which we first parse
# in the datetime format and then convert them to timestamps
df["published"] = [
    parser.parse(date).replace(tzinfo=timezone.utc).timestamp() for date in df.published
]

## Understanding Time-Aware Paper Discovery with RecencySpace

When searching through research papers, finding relevant content is only half the story - knowing when it was published can be just as important. That's why we've implemented a smart time-aware search system using RecencySpace. Imagine you're organizing your digital library: you want to find papers not just by what they're about, but also by when they were written. Our system does exactly that, but automatically and intelligently.

We've set up our search to look at both what a paper contains and when it was published. Using different time windows (like papers from the last few months, the last year, or even older), our system can prioritize papers based on their publication dates while still keeping track of their relevance to your search. It's like having a research assistant who knows exactly how to balance the importance of time and content in your search results.

For our collection of AI research papers this time-aware approach helps us understand how ideas evolved during these foundational years. When you search for topics like "quantum computing" or "neural networks," the system doesn't just find papers containing these terms - it also considers their place in the timeline of AI development. This makes it easier to trace how concepts developed and changed over time, giving us a clearer picture of AI's historical progression.

In [6]:
class PaperSchema(sl.Schema):
    text: sl.String
    publication_date: sl.Timestamp  # This will handle datetime objects properly
    vector_id: sl.IdField
    pdf_name: sl.String
    # summary: sl.String


paper = PaperSchema()

# Define spaces
# text_space = sl.TextSimilaritySpace(
#     text=sl.chunk(paper.text, chunk_size=200, chunk_overlap=50),
#     model="sentence-transformers/all-mpnet-base-v2",
# )

text_space = sl.TextSimilaritySpace(
    text=paper.text,  # No chunking - keep documents as-is
    model="sentence-transformers/all-mpnet-base-v2",
)
recency_space = sl.RecencySpace(
    timestamp=paper.publication_date,
    period_time_list=[
        sl.PeriodTime(timedelta(days=365)),  # papers within 1 year
        sl.PeriodTime(timedelta(days=2 * 365)),  # papers within 2 years
        sl.PeriodTime(timedelta(days=3 * 365)),  # papers within 3 years
    ],
    negative_filter=-0.25,
)

In [7]:
df = df.rename(
    {
        "published": "publication_date",
        "title": "pdf_name",
        "summary": "text",
        "id": "vector_id",
    },
    axis=1,
)
df

Unnamed: 0,vector_id,updated,publication_date,pdf_name,text,author,name,comment,link,primary_category,category,affiliation,doi,journal_ref
0,http://arxiv.org/abs/2509.09679v1,2025-09-11T17:59:51Z,1.757614e+09,ButterflyQuant: Ultra-low-bit LLM Quantization...,Large language models require massive memory f...,Yuzhang Shang,Yuzhang Shang,Replace discrete Hadamard transforms with cont...,,,,,,
1,http://arxiv.org/abs/2509.09677v1,2025-09-11T17:59:34Z,1.757614e+09,The Illusion of Diminishing Returns: Measuring...,Does continued scaling of large language model...,Jonas Geiping,Jonas Geiping,,,,,,,
2,http://arxiv.org/abs/2509.09674v1,2025-09-11T17:59:17Z,1.757614e+09,SimpleVLA-RL: Scaling VLA Training via Reinfor...,Vision-Language-Action (VLA) models have recen...,Ning Ding,Ning Ding,,,,,,,
3,http://arxiv.org/abs/2509.09675v1,2025-09-11T17:59:17Z,1.757614e+09,CDE: Curiosity-Driven Exploration for Efficien...,Reinforcement Learning with Verifiable Rewards...,Dong Yu,Dong Yu,21 pages,,,,,,
4,http://arxiv.org/abs/2509.06602v2,2025-09-11T17:52:20Z,1.757334e+09,Demo: Healthcare Agent Orchestrator (HAO) for ...,Molecular Tumor Boards (MTBs) are multidiscipl...,Thomas Osborne,Thomas Osborne,"9 pages, 1 figure; Added missing co-authors an...",,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,http://arxiv.org/abs/2507.21407v2,2025-08-30T06:01:56Z,1.753749e+09,Graph-Augmented Large Language Model Agents: C...,Autonomous agents based on large language mode...,Shirui Pan,Shirui Pan,"15 pages, 7 figures",,,,,,
1996,http://arxiv.org/abs/2509.00373v1,2025-08-30T06:00:53Z,1.756534e+09,Activation Steering Meets Preference Optimizat...,Vision Language Models (VLMs) have demonstrate...,Xiaowei Huang,Xiaowei Huang,,,,,,,
1997,http://arxiv.org/abs/2507.00445v2,2025-08-30T05:54:19Z,1.751349e+09,Iterative Distillation for Reward-Guided Fine-...,We address the problem of fine-tuning diffusio...,Shuiwang Ji,Shuiwang Ji,,,,,,,
1998,http://arxiv.org/abs/2508.20547v2,2025-08-30T05:51:05Z,1.756370e+09,SPGrasp: Spatiotemporal Prompt-driven Grasp Sy...,Real-time interactive grasp synthesis for dyna...,Jie Chen,Jie Chen,,,,,,,


In [8]:
class PaperSchema(sl.Schema):
    text: sl.String
    publication_date: sl.Timestamp  # This will handle datetime objects properly
    vector_id: sl.IdField
    pdf_name: sl.String
    # summary: sl.String


paper = PaperSchema()

# Define spaces
# text_space = sl.TextSimilaritySpace(
#     text=sl.chunk(paper.text, chunk_size=200, chunk_overlap=50),
#     model="sentence-transformers/all-mpnet-base-v2",
# )

text_space = sl.TextSimilaritySpace(
    text=paper.text,  # No chunking - keep documents as-is
    model="sentence-transformers/all-mpnet-base-v2",
)
recency_space = sl.RecencySpace(
    timestamp=paper.publication_date,
    period_time_list=[
        sl.PeriodTime(timedelta(days=365)),  # papers within 1 year
        sl.PeriodTime(timedelta(days=2 * 365)),  # papers within 2 years
        sl.PeriodTime(timedelta(days=3 * 365)),  # papers within 3 years
    ],
    negative_filter=-0.25,
)

### Creating the index and ingesting the relevant data. we are using the in-memory superlinked executor to ingest the data


In [9]:
# Create the index
paper_index = sl.Index([text_space, recency_space])
# df["publication_date"] = df["publication_date"] / 1e9  # Convert from nanoseconds to seconds


# Parser to map DataFrame columns to schema fields
parser = sl.DataFrameParser(
    paper,
    mapping={
        paper.vector_id: "vector_id",
        paper.publication_date: "publication_date",
        paper.text: "text",
        paper.pdf_name: "pdf_name",
        # paper.summary: "summary",
    },
)

# Set up in-memory source and executor
source = sl.InMemorySource(paper, parser=parser)
executor = sl.InMemoryExecutor(sources=[source], indices=[paper_index])
app = executor.run()

# Load the DataFrame with a progress bar using batches
batch_size = 100
data_batches = [df[i : i + batch_size] for i in range(0, len(df), batch_size)]
for batch in tqdm(
    data_batches, total=len(data_batches), desc="Loading Data into Source"
):
    source.put(batch)

Loading Data into Source: 100%|██████████| 20/20 [08:05<00:00, 24.26s/it]


### Query definition

In [16]:
# Define the query
knowledgebase_query = (
    sl.Query(
        paper_index,
        weights={
            text_space: sl.Param("relevance_weight"),
            recency_space: sl.Param("recency_weight"),
        },
    )
    .find(paper)
    .similar(text_space, sl.Param("search_query"))
    .select(paper.vector_id, paper.publication_date, paper.text, paper.pdf_name)
    .limit(sl.Param("limit"))
)

### Defining the tools for the kernel agent

### Retrieval Tool

In [17]:
class RetrievalTool(Tool):
    def __init__(self, df, app, knowledgebase_query, client, model):
        self.df = df
        self.app = app
        self.knowledgebase_query = knowledgebase_query
        self.client = client
        self.model = model

    def name(self) -> str:
        return "RetrievalTool"

    def description(self) -> str:
        return "Retrieves a list of relevant papers based on a query using Superlinked."

    def use(self, query: str) -> pd.DataFrame:
        result = self.app.query(
            self.knowledgebase_query,
            relevance_weight=1.0,
            # recency_weight=0.5,
            search_query=query,
            limit=5,
        )
        df_result = sl.PandasConverter.to_pandas(result)
        # Ensure summary is a string
        if "summary" in df_result.columns:
            df_result["summary"] = df_result["summary"].astype(str)
        else:
            print("Warning: 'summary' column not found in retrieved DataFrame.")
        return df_result

### Summarization Tool

In [18]:
class SummarizationTool(Tool):
    def __init__(self, df, client, model):
        self.df = df
        self.client = client
        self.model = model

    def name(self) -> str:
        return "SummarizationTool"

    def description(self) -> str:
        return "Generates a concise summary of specified papers using an LLM."

    def use(self, query: str, paper_ids: list) -> str:
        papers = self.df[self.df["entry_id"].isin(paper_ids)]
        if papers.empty:
            return "No papers found with the given IDs."
        summaries = papers["summary"].tolist()
        summary_str = "\n\n".join(summaries)
        prompt = f"""
        Summarize the following paper summaries:\n\n{summary_str}\n\nProvide a concise summary.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=500,
        )
        return response.choices[0].message.content.strip()

### Question Answer Tool

In [19]:
class QuestionAnsweringTool(Tool):
    def __init__(self, retrieval_tool, client, model):
        self.retrieval_tool = retrieval_tool
        self.client = client
        self.model = model

    def name(self) -> str:
        return "QuestionAnsweringTool"

    def description(self) -> str:
        return "Answers questions about research topics using retrieved paper summaries or general knowledge if no specific context is available."

    def use(self, query: str) -> str:
        df_result = self.retrieval_tool.use(query)
        # if "text" not in df_result.columns:
        #     # Tag as a general question if summary is missing
        #     prompt = f"""
        #     You are a knowledgeable research assistant.
        #     This is a general question tagged as [GENERAL].
        #     Answer based on the paper provided to you. If you don't know the answer, provide a brief explanation of why.

        #     User's question: {query}
        #     """
        # else:
        # Use paper summaries for specific context
        print(df_result.head(1))
        contexts = df_result["text"].tolist()
        context_str = "\n\n".join(contexts)
        prompt = f"""
        You are a research assistant. Use the following paper to answer the user's question. There is definitely information in the paper.
        Paper summaries:
        {context_str}

        User's question: {query}
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            max_tokens=500,
        )
        return response.choices[0].message.content.strip()

### Setting up the kernel agent

In [20]:
class KernelAgent:
    def __init__(
        self,
        retrieval_tool: RetrievalTool,
        summarization_tool: SummarizationTool,
        question_answering_tool: QuestionAnsweringTool,
        client,
        model,
    ):
        self.retrieval_tool = retrieval_tool
        self.summarization_tool = summarization_tool
        self.question_answering_tool = question_answering_tool
        self.client = client
        self.model = model

    def classify_query(self, query: str) -> str:
        prompt = f"""
        Classify the following user prompt into one of the three categories:
        - retrieval: The user wants to find a list of papers based on some criteria (e.g., 'Find papers on AI ethics from 2020').
        - summarization: The user wants to summarize a list of papers (e.g., 'Summarize papers with entry_id 123, 456, 789').
        - question_answering: The user wants to ask a question about research topics and get an answer (e.g., 'What is the current ecb rates?').

        User prompt: {query}

        Respond with only the category name (retrieval, summarization, question_answering).
        If unsure, respond with 'unknown'.
        """
        response = self.client.chat.completions.create(
            model=self.model,
            messages=[{"role": "user", "content": prompt}],
            temperature=0,
            # max_tokens=10,
        )
        classification = response.choices[0].message.content.strip().lower()
        print(f"Query type: {classification}")
        return classification

    def process_query(self, query: str, params: Optional[Dict] = None) -> str:
        query_type = self.classify_query(query)
        if query_type == "retrieval":
            df_result = self.retrieval_tool.use(query)
            response = "Here are the top papers:\n"
            for i, row in df_result.iterrows():
                # Ensure summary is a string and handle empty cases
                summary = str(row["text"]) if pd.notna(row["text"]) else ""
                response += f"{i+1}. {row['pdf_name']} \nSummary: {summary}...\n\n"
            return response
        elif query_type == "summarization":
            if not params or "paper_ids" not in params:
                return "Error: Summarization query requires a 'paper_ids' parameter with a list of entry_ids."
            return self.summarization_tool.use(query, params["paper_ids"])
        elif query_type == "question_answering":
            return self.question_answering_tool.use(query)
        else:
            return "Error: Unable to classify query as 'retrieval', 'summarization', or 'question_answering'."

In [21]:
retrieval_tool = RetrievalTool(df, app, knowledgebase_query, client, model)
summarization_tool = SummarizationTool(df, client, model)
question_answering_tool = QuestionAnsweringTool(retrieval_tool, client, model)

# Initialize KernelAgent
kernel_agent = KernelAgent(
    retrieval_tool, summarization_tool, question_answering_tool, client, model
)

In [0]:
from superlinked.framework.common.parser.dataframe_parser import DataFrameParser

dataframe_parser = DataFrameParser(
    schema=paper,
    mapping={paper.publication_date: "publication_date", paper.text: "text"},
)

In [0]:
from superlinked.framework.common.dag.context import CONTEXT_COMMON, CONTEXT_COMMON_NOW
from superlinked.framework.dsl.source.in_memory_source import InMemorySource
from superlinked.framework.dsl.executor.in_memory.in_memory_executor import (
    InMemoryExecutor,
    InMemoryApp,
)

# Setting a specific end date to ensure reproducibility of the notebook
END_OF_APRIL_24_TS = int(datetime(2025, 9, 10, 23, 59).timestamp())
EXECUTOR_DATA = {CONTEXT_COMMON: {CONTEXT_COMMON_NOW: END_OF_APRIL_24_TS}}
source: InMemorySource = InMemorySource(paper, parser=dataframe_parser)
executor: InMemoryExecutor = InMemoryExecutor(
    sources=[source], indices=[paper_index], context_data=EXECUTOR_DATA
)
app: InMemoryApp = executor.run()

In [0]:
from superlinked.evaluation.charts.recency_plotter import RecencyPlotter

# To get an intuitive understanding of how recency is weighted for our data,
# we can explore the weights using Superlinked's inbuilt RecencyPlotter
recency_plotter = RecencyPlotter(recency_space, context_data=EXECUTOR_DATA)
recency_plotter.plot_recency_curve()


In [47]:
df_result = retrieval_tool.use("What is low carbon AI?")
print(df_result)

   publication_date                                               text  \
0        1698758083  Artificial intelligence (AI) increasingly infl...   
1        1757424846  Environmental sustainability, particularly in ...   
2        1757154147  The article provides an overview of approaches...   
3        1757462026  We present FlexiFlow, a lifetime-aware design ...   
4        1757282061  The rapid proliferation of artificial intellig...   

                                            pdf_name  \
0  Assessing the Sustainability and Trustworthine...   
1  The Carbon Footprint Wizard: A Knowledge-Augme...   
2   Computational Concept of the Psyche (in Russian)   
3   Lifetime-Aware Design of Item-Level Intelligence   
4  A Maslow-Inspired Hierarchy of Engagement with...   

                                  id  similarity_score  
0  http://arxiv.org/abs/2310.20435v3          0.444214  
1  http://arxiv.org/abs/2509.07733v1          0.437331  
2  http://arxiv.org/abs/2509.07009v2          0

In [40]:
def fix_corrupted_timestamp(timestamp):
    try:
        # If timestamp is way too large, it might be nanoseconds
        if timestamp > 1e12:  # If larger than year 2001 in milliseconds
            # Try dividing by 1e9 (nanoseconds to seconds)
            fixed_timestamp = timestamp / 1e9
            return datetime.fromtimestamp(fixed_timestamp, tz=timezone.utc).date()
        else:
            return datetime.fromtimestamp(timestamp, tz=timezone.utc).date()
    except (OSError, ValueError, OverflowError) as e:
        print(f"Still can't convert {timestamp}: {e}")
        return None


# df_result = retrieval_tool.use("What is low carbon AI?")

if "publication_date" in df_result.columns:
    df_result["publication_date"] = df_result["publication_date"].apply(
        fix_corrupted_timestamp
    )


In [41]:
df_result = df_result.rename({"pdf_name": "title"}, axis=1)

In [23]:
class RetrievalTool_norecency(Tool):
    def __init__(self, df, app, knowledgebase_query, client, model):
        self.df = df
        self.app = app
        self.knowledgebase_query = knowledgebase_query
        self.client = client
        self.model = model

    def name(self) -> str:
        return "RetrievalTool_norecency"

    def description(self) -> str:
        return "Retrieves a list of relevant papers based on a query using Superlinked."

    def use(self, query: str) -> pd.DataFrame:
        result = self.app.query(
            self.knowledgebase_query,
            relevance_weight=1.0,
            recency_weight=0,
            search_query=query,
            limit=5,
        )
        df_result = sl.PandasConverter.to_pandas(result)
        # Ensure summary is a string
        if "summary" in df_result.columns:
            df_result["summary"] = df_result["summary"].astype(str)
        else:
            print("Warning: 'summary' column not found in retrieved DataFrame.")
        return df_result

In [24]:
retrieval_tool_norecency = RetrievalTool_norecency(
    df, app, knowledgebase_query, client, model
)


In [25]:
retrieval_tool_norecency.use("What is low carbon AI?")



Unnamed: 0,publication_date,text,pdf_name,id,similarity_score
0,1698758083,Artificial intelligence (AI) increasingly infl...,Assessing the Sustainability and Trustworthine...,http://arxiv.org/abs/2310.20435v3,0.444214
1,1757424846,"Environmental sustainability, particularly in ...",The Carbon Footprint Wizard: A Knowledge-Augme...,http://arxiv.org/abs/2509.07733v1,0.437331
2,1757154147,The article provides an overview of approaches...,Computational Concept of the Psyche (in Russian),http://arxiv.org/abs/2509.07009v2,0.41706
3,1757462026,"We present FlexiFlow, a lifetime-aware design ...",Lifetime-Aware Design of Item-Level Intelligence,http://arxiv.org/abs/2509.08193v1,0.407133
4,1757282061,The rapid proliferation of artificial intellig...,A Maslow-Inspired Hierarchy of Engagement with...,http://arxiv.org/abs/2509.07032v1,0.399878
