# Crawl Daily Papers and Create a Dataset
This notebook gathers the daily papers listed on Hugging Face and builds a dataset that can later be used for analysis.

In [None]:
!pip install arxiv pypdf scholarly tqdm huggingface_hub pandas

In [4]:
import requests
from bs4 import BeautifulSoup
import json
from scholarly import scholarly
import pprint
import os
from tqdm import tqdm
import pandas as pd

In [10]:
def get_hugging_face_top_daily_paper(url, timeout: float = 30) -> list:
    """
    Fetch a Hugging Face daily-papers page and return the paper titles found on it.

    Args:
        url: Address of the daily-papers page to scrape.
        timeout: Seconds to wait for the server before the request is abandoned.

    Returns:
        A list with the stripped link text of every ``<h3>`` title located inside
        an ``<article>`` of a ``SVELTE_HYDRATER contents`` container that carries
        a ``data-props`` attribute (the list may be empty), or ``None`` when the
        HTTP request fails.
    """
    try:
        # Without a timeout a stalled connection would block the whole crawl.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
    except requests.exceptions.RequestException as e:
        print(f"Error occurred while fetching the HTML: {e}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    containers = soup.find_all('div', class_='SVELTE_HYDRATER contents')
    paper_list = []

    for container in containers:
        # Only hydrated containers that expose data-props are considered.
        if 'data-props' not in container.attrs:
            continue

        for article in container.find_all('article'):
            title_element = article.find('h3')
            link = title_element.find('a') if title_element else None
            if link is None:
                # Skip just this article; the remaining articles are still collected.
                continue
            paper_list.append(link.text.strip())

    return paper_list

In [11]:
from datetime import datetime, timedelta

def get_past_year_urls(start_date=None, end_date=None):
    """
    Build one Hugging Face daily-papers URL per calendar day in a date range.

    Args:
        start_date: First day to include, as a ``datetime``. Defaults to 2023-05-10.
        end_date: Last day to include (inclusive), as a ``datetime``.
            Defaults to 2025-02-10.

    Returns:
        URLs of the form ``https://huggingface.co/papers?date=YYYY-MM-DD`` in
        chronological order; an empty list when ``start_date`` is after ``end_date``.
    """
    if start_date is None:
        start_date = datetime(2023, 5, 10)
    if end_date is None:
        end_date = datetime(2025, 2, 10)

    urls = []
    current_date = start_date
    while current_date <= end_date:
        urls.append(f"https://huggingface.co/papers?date={current_date.strftime('%Y-%m-%d')}")
        current_date += timedelta(days=1)

    return urls

In [12]:
# Crawl every daily page produced by get_past_year_urls() and keep the days
# that returned at least one title, keyed by the date in the URL query string.
papers_by_date = {}
for url in get_past_year_urls():
    date_str = url.split('date=')[1]
    papers = get_hugging_face_top_daily_paper(url)
    if not papers:
        # None (failed request) or an empty list: nothing to store for this day.
        continue
    papers_by_date[date_str] = papers

In [13]:
# Report how many days were crawled and how many titles were collected overall.
print(f"Number of days: {len(papers_by_date)}")
total_papers = sum(len(papers) for papers in papers_by_date.values())
print(f"Total number of papers: {total_papers}")

# Show the first three titles of the first crawled day as a sanity check.
print("\nSample of the data:")
sample_date = next(iter(papers_by_date), None)
if sample_date is None:
    # An empty crawl (for example, every request failed) would otherwise
    # raise StopIteration on the next() call above.
    print("No papers were collected.")
else:
    print(f"\nDate: {sample_date}")
    print("Papers:")
    for paper in papers_by_date[sample_date][:3]:  # print first 3 papers
        print(f"- {paper}")
    print("...")

Number of days: 643
Total number of papers: 7791

Sample of the data:

Date: 2023-05-10
Papers:
- To Compress or Not to Compress- Self-Supervised Learning and Information Theory: A Review
- Recommender Systems with Generative Retrieval
- Are ChatGPT and GPT-4 General-Purpose Solvers for Financial Text Analytics? An Examination on Several Typical Tasks
...


We have now collected the titles of all daily papers in the given timeframe.

The next step is to fetch details about each paper and its authors.

## Get Data for Each Paper 

In [18]:
import arxiv
from scholarly import scholarly
from huggingface_hub import HfApi


def get_paper_by_title(paper_title: str):
    """
    Search arXiv using the title as the query and return the first result.

    Args:
        paper_title: Text passed to arXiv as the search query.

    Returns:
        The first result the arXiv client yields for the query.

    Raises:
        StopIteration: If the search yields no results.
    """
    # NOTE(review): the returned record is not compared against paper_title,
    # so it is whatever arXiv ranks first for this query.
    client = arxiv.Client()
    search = arxiv.Search(query=paper_title)
    matches = client.results(search)
    return next(matches)

def download_paper_by_id(paper_id: str) -> None:
    """
    Download the arXiv paper with the given id and store it as "paper.pdf".

    The id is looked up through the arXiv client and the first result's PDF is
    downloaded under the fixed filename "paper.pdf".

    Args:
        paper_id: The id of the paper to download.

    Raises:
        StopIteration: If the lookup yields no result for the id.
    """
    client = arxiv.Client()
    lookup = arxiv.Search(id_list=[paper_id])
    paper = next(client.results(lookup))
    paper.download_pdf(filename="paper.pdf")

def get_author_information_by_name(name: str):
    """
    Look up an author through scholarly and return the filled profile.

    Args:
        name: Author name passed to ``scholarly.search_author``.

    Returns:
        The result of ``scholarly.fill`` applied to the first search hit.

    Raises:
        StopIteration: If the author search yields no hits.
    """
    first_hit = next(scholarly.search_author(name))
    return scholarly.fill(first_hit)

def get_paper_id_by_title(title: str) -> "str | None":
    """
    Search the Hugging Face Hub papers by title and return the matching paper id.

    Args:
        title: The paper title for which to get the id.

    Returns:
        The ``id`` of the first search result whose ``title`` equals ``title``
        exactly, or ``None`` when no result matches.
    """
    api = HfApi()
    # Iterate instead of testing truthiness: if list_papers returns a lazy
    # iterable it is truthy even when it yields nothing, so an `if papers:`
    # check cannot detect an empty result. `or ()` keeps a falsy return safe.
    papers = api.list_papers(query=title) or ()
    for paper in papers:
        if paper.title == title:
            return paper.id
    return None

In [15]:
# Fetch the arXiv record for the first title of 2023-05-10 and pretty-print
# its attributes; default=str renders values json cannot encode natively.
first_title = papers_by_date['2023-05-10'][0]
sample_paper_details = get_paper_by_title(first_title)
print(json.dumps(vars(sample_paper_details), indent=4, default=str))

{
    "entry_id": "http://arxiv.org/abs/2304.09355v5",
    "updated": "2023-11-21 13:12:21+00:00",
    "published": "2023-04-19 00:33:59+00:00",
    "title": "To Compress or Not to Compress- Self-Supervised Learning and Information Theory: A Review",
    "authors": [
        "Ravid Shwartz-Ziv",
        "Yann LeCun"
    ],
    "summary": "Deep neural networks excel in supervised learning tasks but are constrained\nby the need for extensive labeled data. Self-supervised learning emerges as a\npromising alternative, allowing models to learn without explicit labels.\nInformation theory, and notably the information bottleneck principle, has been\npivotal in shaping deep neural networks. This principle focuses on optimizing\nthe trade-off between compression and preserving relevant information,\nproviding a foundation for efficient network design in supervised contexts.\nHowever, its precise role and adaptation in self-supervised learning remain\nunclear. In this work, we scrutinize various

In [16]:
import re

def extract_arxiv_id(entry_id: str) -> "str | None":
    """
    Extract the version-less arXiv id from an ``entry_id`` URL.

    Both identifier schemes are recognised:
      * new style, e.g. ``http://arxiv.org/abs/2304.09355v5`` -> ``2304.09355``
      * old style, e.g. ``http://arxiv.org/abs/hep-th/9901001v1`` -> ``hep-th/9901001``

    Args:
        entry_id: URL containing an ``abs/<id>`` segment.

    Returns:
        The id without its version suffix, or ``None`` if no id is found.
    """
    # First alternative: digits.digits (new style). Second alternative:
    # archive[.SUBJECT]/7 digits (old style), which the first cannot match.
    match = re.search(
        r'abs/([0-9]+\.[0-9]+|[a-z\-]+(?:\.[A-Z]{2})?/[0-9]{7})',
        entry_id,
    )
    return match.group(1) if match else None

In [37]:
json_file = 'Assets/papers_details.json'
title_id_file = 'Assets/paper_title_id.json'
SAVE_EVERY = 500  # write a checkpoint each time this many titles have been processed
papers_with_details = {}
paper_title_id = {}
failed_papers = []  # titles whose lookup raised or produced no usable arXiv id


def _write_json(path, payload, **dump_kwargs):
    """Serialize `payload` to `path` as UTF-8 JSON, replacing the file atomically."""
    folder = os.path.dirname(path)
    if folder:
        # A missing output folder would otherwise only surface at save time.
        os.makedirs(folder, exist_ok=True)
    tmp_path = path + '.tmp'
    with open(tmp_path, 'w', encoding='utf-8') as f:
        json.dump(payload, f, indent=4, ensure_ascii=False, **dump_kwargs)
        f.flush()
        os.fsync(f.fileno())
    # Swap the finished file in only after it is fully written, so an
    # interrupted save cannot leave a truncated JSON file behind.
    os.replace(tmp_path, path)


def _save_progress():
    """Persist both result dictionaries to their JSON files."""
    # default=str stringifies values that json cannot encode natively.
    _write_json(json_file, papers_with_details, default=str)
    _write_json(title_id_file, paper_title_id)


# Load existing data if files exist. Both files are written as UTF-8 with
# ensure_ascii=False, so both must be read back as UTF-8.
if os.path.exists(json_file):
    with open(json_file, 'r', encoding='utf-8') as f:
        papers_with_details = json.load(f)

if os.path.exists(title_id_file):
    with open(title_id_file, 'r', encoding='utf-8') as f:
        paper_title_id = json.load(f)

# Calculate total number of papers
total_papers = sum(len(papers) for papers in papers_by_date.values())
counter = 0

# Create progress bar
with tqdm(total=total_papers, desc="Processing papers") as pbar:
    for papers in papers_by_date.values():
        for paper in papers:
            # Checkpoint before handling the next title; a failed save is
            # reported but does not stop the crawl.
            if counter % SAVE_EVERY == 0 and counter != 0:
                try:
                    _save_progress()
                    print(f"saved at checkpoint {counter // SAVE_EVERY}.")
                except Exception as e:
                    print(f"Error saving files: {e}")

            try:
                # Skip if paper title already exists
                if paper in paper_title_id:
                    continue

                paper_details = get_paper_by_title(paper)
                # Extract paper ID from entry_id URL
                paper_id = extract_arxiv_id(paper_details.entry_id)
                if paper_id is None:
                    # Never store a record under a None key.
                    failed_papers.append(paper)
                    continue

                # Skip if paper already exists in JSON
                # NOTE(review): the title is not added to paper_title_id here,
                # so it is looked up again on the next run - confirm intended.
                if paper_id in papers_with_details:
                    continue

                papers_with_details[paper_id] = paper_details.__dict__
                paper_title_id[paper] = paper_id
            except Exception:
                # Best effort: remember the title and move on to the next one.
                failed_papers.append(paper)
            finally:
                # Every title advances the counter and the bar exactly once,
                # whether it was stored, skipped or failed.
                counter += 1
                pbar.update(1)

# Save final results (same format and encoding as the checkpoints)
_save_progress()
if failed_papers:
    print(f"{len(failed_papers)} titles could not be resolved; see failed_papers.")

Processing papers:   6%|▋         | 501/7791 [00:14<02:54, 41.87it/s]

saved at checkpoint 1.


Processing papers:  13%|█▎        | 1001/7791 [00:56<05:11, 21.81it/s]

saved at checkpoint 2.


Processing papers:  19%|█▉        | 1501/7791 [01:53<06:28, 16.19it/s]

saved at checkpoint 3.


Processing papers:  26%|██▌       | 2001/7791 [02:53<07:57, 12.12it/s]

saved at checkpoint 4.


Processing papers:  32%|███▏      | 2501/7791 [04:26<16:52,  5.22it/s]

saved at checkpoint 5.


Processing papers:  39%|███▊      | 3001/7791 [05:47<06:52, 11.62it/s]

saved at checkpoint 6.


Processing papers:  45%|████▍     | 3500/7791 [19:02<2:21:17,  1.98s/it] 

saved at checkpoint 7.


Processing papers:  51%|█████▏    | 4000/7791 [29:48<2:08:24,  2.03s/it]

saved at checkpoint 8.


Processing papers:  58%|█████▊    | 4500/7791 [43:54<1:48:17,  1.97s/it]

saved at checkpoint 9.


Processing papers:  64%|██████▍   | 5000/7791 [57:55<1:22:11,  1.77s/it]

saved at checkpoint 10.


Processing papers:  71%|███████   | 5500/7791 [1:11:21<1:18:20,  2.05s/it]

saved at checkpoint 11.


Processing papers:  77%|███████▋  | 6000/7791 [1:25:07<1:07:28,  2.26s/it]

saved at checkpoint 12.


Processing papers:  82%|████████▏ | 6388/7791 [1:34:24<58:20,  2.50s/it]  Bozo feed; consider handling: document declared as utf-8, but parsed as windows-1252
Processing papers:  83%|████████▎ | 6500/7791 [1:38:04<30:56,  1.44s/it]  

saved at checkpoint 13.


Processing papers:  90%|█████████ | 7043/7791 [2:25:36<07:08,  1.75it/s]     

saved at checkpoint 14.


Processing papers:  97%|█████████▋| 7564/7791 [2:25:39<00:03, 67.62it/s] 

saved at checkpoint 15.


Processing papers: 100%|██████████| 7791/7791 [2:25:40<00:00,  1.12s/it] 


In [7]:
# Reload the stored paper records from disk so the cells below can run
# without repeating the crawl.
with open('Assets/papers_details.json', 'r', encoding='utf-8') as f:
    papers_with_details = json.loads(f.read())


In [7]:
# Spot-check one stored record, looked up by its arXiv id.
papers_with_details['2304.09355']

{'entry_id': 'http://arxiv.org/abs/2304.09355v5',
 'updated': '2023-11-21 13:12:21+00:00',
 'published': '2023-04-19 00:33:59+00:00',
 'title': 'To Compress or Not to Compress- Self-Supervised Learning and Information Theory: A Review',
 'authors': ['Ravid Shwartz-Ziv', 'Yann LeCun'],
 'summary': 'Deep neural networks excel in supervised learning tasks but are constrained\nby the need for extensive labeled data. Self-supervised learning emerges as a\npromising alternative, allowing models to learn without explicit labels.\nInformation theory, and notably the information bottleneck principle, has been\npivotal in shaping deep neural networks. This principle focuses on optimizing\nthe trade-off between compression and preserving relevant information,\nproviding a foundation for efficient network design in supervised contexts.\nHowever, its precise role and adaptation in self-supervised learning remain\nunclear. In this work, we scrutinize various self-supervised learning\napproaches from

## Save paper nodes

In [38]:
# Build one node row per stored paper. node_id is the 1-based position of the
# paper in papers_with_details.
paper_data = [
    {
        'node_id': f"paper_{i}",
        'paper_id': paper_id,
        # Keep only the date part of "YYYY-MM-DD HH:MM:SS+00:00"; str() also
        # covers a non-string value that renders in that layout.
        'publish_date': str(paper_details['published']).split()[0],
        'title': paper_details['title']
    }
    for i, (paper_id, paper_details) in enumerate(papers_with_details.items(), 1)
]

# Convert to DataFrame and save as CSV (create the output folder if missing)
os.makedirs('GraphDataset', exist_ok=True)
df = pd.DataFrame(paper_data)
df.to_csv('GraphDataset/paper.csv', index=False)

## Save Author and Organization Nodes

In [42]:
# Build author nodes and paper->author edges from the stored paper records.
# Authors are identified by name only, so different people who share a name
# end up as a single node.
unique_authors = set()
author_counts = {}

# Rows for author.csv and paper_author.csv
author_data = []
paper_author_data = []
author_id = 1
author_name_to_id = {}

# Extract authors from each paper
for i, (paper_id, paper_details) in enumerate(papers_with_details.items(), 1):
    for author in paper_details.get('authors', []):
        # Accept both plain-name entries and dict entries carrying a 'name'
        # key, matching what the dataset summary cell supports.
        author_name = author['name'] if isinstance(author, dict) else str(author)

        if author_name not in author_name_to_id:
            # Add new author
            node_id = f"author_{author_id}"
            unique_authors.add(author_name)
            author_data.append({'node_id': node_id, 'name': author_name})
            author_name_to_id[author_name] = node_id
            author_id += 1

        # Count one authorship per occurrence
        author_counts[author_name] = author_counts.get(author_name, 0) + 1

        # Add paper-author relationship. The 'paper_id' column holds the
        # paper's node id ("paper_<i>"), not its arXiv id.
        paper_author_data.append({
            'paper_id': f"paper_{i}",
            'author_id': author_name_to_id[author_name]
        })

# Convert to DataFrames and save as CSV (create the output folder if missing)
os.makedirs('GraphDataset', exist_ok=True)
author_df = pd.DataFrame(author_data)
author_df.to_csv('GraphDataset/author.csv', index=False)

paper_author_df = pd.DataFrame(paper_author_data)
paper_author_df.to_csv('GraphDataset/paper_author.csv', index=False)

print(f"Total unique authors: {len(unique_authors)}")
print("Top 5 authors by paper count:")
top_authors = sorted(author_counts.items(), key=lambda x: x[1], reverse=True)[:5]
for author, count in top_authors:
    print(f"{author}: {count} papers")


Total unique authors: 18522
Top 5 authors by paper count:
Yu Qiao: 37 papers
Ziwei Liu: 34 papers
Dahua Lin: 31 papers
Hongsheng Li: 28 papers
Lei Zhang: 25 papers


In [43]:
# Build category nodes and paper->category edges from the stored paper records.
unique_categories = set()
category_counts = {}

# Rows for category.csv and paper_category.csv
category_data = []
paper_category_data = []
category_id = 1
category_name_to_id = {}

# Extract categories from each paper
for i, (paper_id, paper_details) in enumerate(papers_with_details.items(), 1):
    for category in paper_details.get('categories', []):
        if category not in category_name_to_id:
            # Add new category
            node_id = f"category_{category_id}"
            unique_categories.add(category)
            category_data.append({'node_id': node_id, 'name': category})
            category_name_to_id[category] = node_id
            category_id += 1

        # Count one paper per occurrence
        category_counts[category] = category_counts.get(category, 0) + 1

        # Add paper-category relationship. The 'paper_id' column holds the
        # paper's node id ("paper_<i>"), not its arXiv id.
        paper_category_data.append({
            'paper_id': f"paper_{i}",
            'category_id': category_name_to_id[category]
        })

# Convert to DataFrames and save as CSV (create the output folder if missing)
os.makedirs('GraphDataset', exist_ok=True)
category_df = pd.DataFrame(category_data)
category_df.to_csv('GraphDataset/category.csv', index=False)

paper_category_df = pd.DataFrame(paper_category_data)
paper_category_df.to_csv('GraphDataset/paper_category.csv', index=False)

print(f"\nTotal unique categories: {len(unique_categories)}")
print("Top 5 categories by paper count:")
top_categories = sorted(category_counts.items(), key=lambda x: x[1], reverse=True)[:5]
for category, count in top_categories:
    print(f"{category}: {count} papers")



Total unique categories: 224
Top 5 categories by paper count:
cs.CV: 1832 papers
cs.CL: 1710 papers
cs.AI: 1561 papers
cs.LG: 1325 papers
cs.RO: 209 papers


In [44]:
# Headline numbers for the dataset. unique_categories comes from the
# category-node cell.
print("\nDataset Summary:")
print(f"Total papers: {len(papers_with_details)}")
print(f"Total unique categories: {len(unique_categories)}")

# Recount distinct author names. An entry may be a plain name or a dict with
# a 'name' key; any other kind of entry is ignored.
unique_authors = {
    author['name'] if isinstance(author, dict) else author
    for paper_details in papers_with_details.values()
    for author in paper_details.get('authors', ())
    if isinstance(author, (dict, str))
}

print(f"Total unique authors: {len(unique_authors)}")



Dataset Summary:
Total papers: 4193
Total unique categories: 224
Total unique authors: 18522
