In [27]:
import re
from urllib.parse import quote_plus

import requests
from bs4 import BeautifulSoup

class ArxivClient:
    """Scrape article metadata from the arXiv search results page.

    The client builds a search URL from a keyword phrase, downloads the
    result page with ``requests`` and extracts one dict per result entry
    using BeautifulSoup.

    Attributes:
        url_template: Search URL containing a ``{keywords}`` placeholder.
        keywords: Raw (unencoded) search phrase.
        url: Fully formatted search URL; refreshed by :meth:`set_url`.
    """

    def __init__(self, keywords="gaze tracking"):
        """Create a client for ``keywords`` and build its search URL.

        Args:
            keywords: Free-text search phrase. Defaults to
                ``"gaze tracking"``, so ``ArxivClient()`` needs no arguments.
        """
        self.url_template = "https://arxiv.org/search/?query={keywords}&searchtype=all"
        self.keywords = keywords
        self.set_url()

    def set_url(self):
        """Rebuild ``self.url`` from ``self.url_template`` and ``self.keywords``.

        The keywords are percent-encoded so that spaces and reserved
        characters such as ``&``, ``#`` or ``+`` stay inside the ``query``
        parameter instead of corrupting the URL. Call this again after
        changing ``self.keywords``.
        """
        # str() keeps non-string keywords working, as str.format accepts them.
        encoded_keywords = quote_plus(str(self.keywords))
        self.url = self.url_template.format(keywords=encoded_keywords)

    def get_top_articles(self, timeout=30):
        """Download the search page and return its parsed articles.

        Args:
            timeout: Seconds to wait for the server before giving up;
                passed straight to ``requests.get``.

        Returns:
            The list produced by :meth:`parse_articles`.

        Raises:
            requests.exceptions.RequestException: On connection failure,
                timeout, or an HTTP error status.
        """
        response = requests.get(self.url, timeout=timeout)
        # Fail loudly on 4xx/5xx rather than parsing an error page into [].
        response.raise_for_status()
        return self.parse_articles(response.text)

    def parse_articles(self, html_content):
        """Extract article metadata from a search-results HTML page.

        Args:
            html_content: HTML text of the results page.

        Returns:
            A list of dicts, one per ``<li class="arxiv-result">`` element
            that carries a link inside its ``list-title`` paragraph. Every
            dict has ``"link"`` and ``"arxiv_id"``; ``"title"``,
            ``"authors"`` and ``"abstract"`` are present only when the
            matching element exists in that entry.
        """
        soup = BeautifulSoup(html_content, 'html.parser')

        top_articles = []
        for article in soup.find_all('li', class_='arxiv-result'):
            # Guard each lookup separately: a result without a list-title
            # paragraph, without a link, or without an href is skipped
            # instead of raising AttributeError/KeyError.
            list_title = article.find('p', class_='list-title')
            link_tag = list_title.find('a') if list_title is not None else None
            href = link_tag.get('href') if link_tag is not None else None
            if not href:
                continue

            new_entry = {
                "link": href,
                # Link text looks like "arXiv:<id>"; keep only the identifier.
                "arxiv_id": re.sub(r'\s*arXiv:\s*', '', link_tag.text).strip(),
            }

            title_element = article.find('p', class_='title')
            if title_element:
                new_entry["title"] = title_element.text.strip()

            authors_element = article.find('p', class_='authors')
            if authors_element:
                authors = authors_element.text.strip()
                # Drop the "Authors:" label, then collapse the whitespace and
                # newlines between names into single spaces.
                authors = re.sub(r'^\s*Authors:\s*', '', authors)
                authors = re.sub(r'\s+', ' ', authors)
                new_entry["authors"] = authors

            abstract_element = article.find('span', class_='abstract-full')
            if abstract_element:
                abstract = abstract_element.text.strip()
                # Remove the trailing collapse control: one non-space glyph
                # followed by the word "Less".
                abstract = re.sub(r'\s*\S\s*Less\s*$', '', abstract)
                new_entry["abstract"] = abstract

            top_articles.append(new_entry)

        return top_articles



In [28]:
# Build a client with its built-in query ("gaze tracking"), fetch the search
# page and parse it; as the cell's last expression the resulting list of
# article dicts is displayed below.
ArxivClient().get_top_articles()

{'link': 'https://arxiv.org/abs/2411.01969', 'arxiv_id': '2411.01969', 'title': 'Active Gaze Behavior Boosts Self-Supervised Object Learning', 'authors': 'Zhengyang Yu, Arthur Aubret, Marcel C. Raabe, Jane Yang, Chen Yu, Jochen Triesch', 'abstract': "Due to significant variations in the projection of the same object from different viewpoints, machine learning algorithms struggle to recognize the same object across various perspectives. In contrast, toddlers quickly learn to recognize objects from different viewpoints with almost no supervision. Recent works argue that toddlers develop this ability by mapping close-in-time visual inputs to similar representations while interacting with objects. High acuity vision is only available in the central visual field, which may explain why toddlers (much like adults) constantly move their gaze around during such interactions. It is unclear whether/how much toddlers curate their visual experience through these eye movements to support learning ob

[{'link': 'https://arxiv.org/abs/2411.01969',
  'arxiv_id': '2411.01969',
  'title': 'Active Gaze Behavior Boosts Self-Supervised Object Learning',
  'authors': 'Zhengyang Yu, Arthur Aubret, Marcel C. Raabe, Jane Yang, Chen Yu, Jochen Triesch',
  'abstract': "Due to significant variations in the projection of the same object from different viewpoints, machine learning algorithms struggle to recognize the same object across various perspectives. In contrast, toddlers quickly learn to recognize objects from different viewpoints with almost no supervision. Recent works argue that toddlers develop this ability by mapping close-in-time visual inputs to similar representations while interacting with objects. High acuity vision is only available in the central visual field, which may explain why toddlers (much like adults) constantly move their gaze around during such interactions. It is unclear whether/how much toddlers curate their visual experience through these eye movements to support le