In [None]:
import arxiv
import logging
import pandas as pd
import datetime

# Configure root logging at INFO level and create the logger used by the
# functions defined later in this notebook.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


In [None]:
# Scratch check: joining a single-element list yields that element ('a'),
# because the separator is only inserted between items.
'b'.join(['a'])

In [None]:
def fetch_recent_papers(categories: list[str]=["cs.AI"], days: int=1, max_results=100):
    """
    Fetch recently submitted arXiv papers from the given categories.

    Args:
        categories: arXiv category codes (e.g. "cs.AI", "cs.LG"). They are
            OR-ed together in the search query. A single category may also be
            passed as a plain string.
        days: Size of the look-back window in days, counted back from the
            current UTC time.
        max_results: Upper bound on the number of entries requested from arXiv.

    Returns:
        A DataFrame with one row per paper and the columns ``id``, ``title``,
        ``authors``, ``abstract``, ``published``, ``primary_category`` and
        ``url``. The columns are present even when no paper falls inside the
        window, so column access on an empty result does not raise KeyError.
    """
    columns = [
        "id",
        "title",
        "authors",
        "abstract",
        "published",
        "primary_category",
        "url",
    ]

    # A bare string would otherwise be iterated character by character and
    # produce a query such as "cat:c OR cat:s OR ...".
    if isinstance(categories, str):
        categories = [categories]

    # 1. Build Query (e.g., "cat:cs.LG OR cat:cs.AI")
    query = " OR ".join(f"cat:{c}" for c in categories)

    client = arxiv.Client(
        page_size=100,
        delay_seconds=3.0, # Be nice to ArXiv servers
        num_retries=3
    )

    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=arxiv.SortCriterion.SubmittedDate,
        # Newest first: the early `break` below is only valid with this order,
        # so state it explicitly instead of relying on the library default.
        sort_order=arxiv.SortOrder.Descending,
    )

    # 2. Time Window
    threshold = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=days)

    results = []
    for result in client.results(search):
        # Results arrive newest first, so the first paper older than the
        # threshold means every remaining one is older too.
        if result.published < threshold:
            break

        results.append({
            # Last path segment of the entry URL is used as the paper id.
            "id": result.entry_id.split('/')[-1],
            "title": result.title,
            "authors": [a.name for a in result.authors],
            # Flatten hard line breaks in the abstract into single spaces.
            "abstract": result.summary.replace("\n", " "),
            "published": result.published,
            "primary_category": result.primary_category,
            "url": result.pdf_url,
        })

    since_time = threshold.strftime('%Y-%m-%d %H:%M')
    print(
        f"Fetched {len(results)} new papers since {since_time}"
    )
    # Passing `columns` keeps the schema stable when `results` is empty.
    return pd.DataFrame(results, columns=columns)

In [None]:
# Request up to 1000 entries using the default category and time window, then
# build one text field per paper combining its title and abstract.
df = fetch_recent_papers(max_results=1000)
df['combined_text'] = (
    "Title: " + df['title']
    + " Abstract: " + df['abstract']
)

In [None]:
# Display the fetched papers (including the new combined_text column) for a visual check.
df

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model used to embed both the interest statement and the papers.
EMBEDDING_MODEL_NAME = 'sentence-transformers/all-MiniLM-L6-v2'
model = SentenceTransformer(EMBEDDING_MODEL_NAME)

In [None]:
# Embed the interest statement and every paper's combined text with the same model.
my_interest_embedding = model.encode("I am interested in Retrieval-augmented generation (RAG).")
content_embedding = model.encode(df['combined_text'].tolist())

print(np.shape(content_embedding))

# Cosine similarity of each paper row against the interest vector:
# dot product divided by the product of the two vector norms.
paper_norms = np.linalg.norm(content_embedding, axis=1)
interest_norm = np.linalg.norm(my_interest_embedding)
cosine_sim = np.dot(content_embedding, my_interest_embedding) / (paper_norms * interest_norm)

# Sorting the negated scores ascending gives the highest similarity first; keep 10.
top_k_indices = np.argsort(-cosine_sim)[:10]

In [None]:
# Pick the shortlisted rows by position (top_k_indices comes from np.argsort,
# so it holds row positions, which is what .iloc expects).
top_df = df.iloc[top_k_indices]

In [None]:
import ollama
import json

def get_authors(author_list: list[str]):
    """
    Render an author list as a single comma-separated string for the LLM prompt.

    Lists of up to 12 names are joined in full. Longer lists keep only the
    first 10 and the last 2 names, with " ... " marking the omitted middle.
    """
    if len(author_list) > 12:
        leading = ", ".join(author_list[:10])
        trailing = ", ".join(author_list[-2:])
        return f"{leading} ... {trailing}"

    return ", ".join(author_list)

def parse_llm_output(response_text: str):
    """
    Parse the LLM's JSON reply into a list of per-paper entries.

    The model does not always answer with the same JSON shape, so two shapes
    are accepted:
      - a bare JSON list, returned as is;
      - a JSON object holding the list under "papers", "results" or
        "recommendations" (checked in that order).

    Args:
        response_text: Raw text returned by the LLM.

    Returns:
        The list of entries, or an empty list when the text is not valid JSON
        or no list is found in one of the accepted shapes.
    """
    try:
        data = json.loads(response_text)
    except json.JSONDecodeError as exc:
        # Same best-effort contract as an unrecognized shape: report and return nothing.
        logging.getLogger(__name__).warning(f"LLM response is not valid JSON: {exc}")
        return []

    if isinstance(data, list):
        return data

    if isinstance(data, dict):
        for key in ['papers', 'results', 'recommendations']:
            # Only accept a real list; a string or object here would break the
            # caller, which iterates over the result entry by entry.
            if isinstance(data.get(key), list):
                return data[key]
    return []

def fine_rank_with_llm(top_papers_df: pd.DataFrame, user_interest:str="RAG"):
    """
    Ask a local LLM to pick and justify the best papers among the candidates.

    Args:
        top_papers_df: Candidate papers. The columns ``id``, ``title``,
            ``authors`` (list of names), ``abstract`` and ``url`` are read.
            The frame is copied, so the caller's object is not modified.
        user_interest: Research interest text inserted into the prompt.

    Returns:
        A DataFrame with the columns ``id``, ``url``, ``title``, ``authors``
        (formatted string), ``abstract`` and ``reasoning``, one row per paper
        selected by the LLM, in the order the LLM returned them. Entries that
        lack an ``id`` or whose ``id`` is not among the candidates are skipped
        with a warning. The columns are present even when the result is empty.
    """
    output_columns = ["id", "url", "title", "authors", "abstract", "reasoning"]

    # Nothing to rank: skip the LLM call and return an empty, well-formed frame.
    if top_papers_df.empty:
        logger.info("No candidate papers to rank.")
        return pd.DataFrame(columns=output_columns)

    top_papers_df = top_papers_df.copy()
    top_papers_df['cleaned_authors'] = top_papers_df['authors'].apply(get_authors)
    all_papers_json = top_papers_df[['id', 'title', 'cleaned_authors', 'abstract']].to_dict(orient='records')
    # The prompt announces a JSON array, so serialize as real JSON instead of
    # interpolating the Python repr (single quotes, Python-style escapes).
    papers_as_json = json.dumps(all_papers_json, ensure_ascii=False, indent=2)

    logger.info(f"Number of papers to process: {len(all_papers_json)}")
    logger.info(f"Sample input paper JSON: {all_papers_json[0]}")

    llm_ranking_prompt = f"""
    ## Task

    You are an expert AI and Physics PhD Researcher. 
    Given a list of papers with their titles, authors, and abstracts, you should help me decide which research papers to read based on my specific interest.
    These papers are posted on arXiv today.
    Your output should be top-5 results, stored in JSON format.

    To select best papers, you should consider relevancy to my interest, novelty, and soundness of methodology. Use the author list to gauge credibility if needed.
    

    ## My Interest
    My research interest topics: "{user_interest}"

    
    ## Input paper as JSON array
    {papers_as_json}

    ## Output

    In the "papers" field, provide a list of JSON response with top-5 papers, with the following fields:
    - "id": The ArXiv ID of the paper.
    - "reasoning": A one-sentence explanation of why this paper is worth reading or not.

    The output format should look like this:
    {{
        "papers": 
        [
            {{"id": "paper_id_1", "reasoning": "Reasoning for paper 1..."}},
            {{"id": "paper_id_2", "reasoning": "Reasoning for paper 2..."}},
            ...
            {{"id": "paper_id_5", "reasoning": "Reasoning for paper 5..."}}
        ]
    }}
    """

    response = ollama.generate(
        model="llama3:8b",
        prompt=llm_ranking_prompt,
        format="json",
        options={"temperature": 0}
    )

    llm_output = parse_llm_output(response["response"])
    logger.info(f"LLM Output: {llm_output}")

    # Guard the loop below against a non-list payload.
    if not isinstance(llm_output, list):
        llm_output = []

    refined_results = []
    for paper_analysis in llm_output:
        if not isinstance(paper_analysis, dict) or "id" not in paper_analysis:
            logger.warning(f"Skipping malformed LLM entry: {paper_analysis!r}")
            continue

        paper_id = str(paper_analysis["id"])
        matches = top_papers_df[top_papers_df['id'] == paper_id]
        # The model can return an id that is not in the candidate list;
        # skip it instead of failing on `.iloc[0]`.
        if matches.empty:
            logger.warning(f"LLM returned unknown paper id: {paper_id!r}")
            continue
        matching_paper = matches.iloc[0]

        refined_results.append({
            "id": paper_id,
            "url": matching_paper["url"],
            "title": matching_paper["title"],
            "authors": matching_paper["cleaned_authors"],
            "abstract": matching_paper["abstract"],
            "reasoning": paper_analysis.get("reasoning", "")
        })

    return pd.DataFrame(refined_results, columns=output_columns)

In [None]:
# Re-rank the embedding shortlist with the LLM, using the function's default interest ("RAG").
refined_df = fine_rank_with_llm(top_df)

In [None]:
# Display the LLM-selected papers together with the model's reasoning.
refined_df

In [None]:
# End of notebook.