# **Experimentation**

### **Import Relevant Libraries**

In [None]:
'''
Webpage for a date's (say 4th October 2024) daily papers has the format: https://huggingface.co/papers?date=2024-10-04
'''

In [2]:
import requests
from bs4 import BeautifulSoup
import os

# Base URL for Hugging Face
BASE_URL = "https://huggingface.co"

# URL for the Hugging Face daily papers page (update this to the correct page)
URL = f"{BASE_URL}/papers?date=2024-10-04"

# Function to extract PDF links from individual paper page
def get_pdf_link(paper_page_url):
    # Fetch the paper's page content
    paper_response = requests.get(paper_page_url)
    
    # Parse the paper's page content
    if paper_response.status_code == 200:
        paper_soup = BeautifulSoup(paper_response.content, 'html.parser')
        
        # Find the PDF link on the paper's page
        # Assuming the PDF link is in an <a> tag with 'href' that contains '.pdf'
        pdf_link = paper_soup.find('a', href=lambda href: href and ".pdf" in href)
        
        if pdf_link:
            return pdf_link['href']  # Return the PDF link
    return None

# Function to download the PDF file
def download_pdf(pdf_url, paper_title):
    # Get the PDF content
    pdf_response = requests.get(pdf_url)
    
    if pdf_response.status_code == 200:
        # Define the PDF file name, making it a valid file name
        paper_title = paper_title.replace("/", "-").replace("\\", "-")  # Clean file name
        pdf_file_name = f"{paper_title}.pdf"
        
        # Write the PDF content to a file
        with open(pdf_file_name, 'wb') as f:
            f.write(pdf_response.content)
        print(f"Downloaded: {pdf_file_name}")
    else:
        print(f"Failed to download PDF: {pdf_url}")

# Step 1: Scrape the daily papers page for paper links
response = requests.get(URL)

if response.status_code == 200:
    # Parse the main daily papers page
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find all the paper links
    # We are looking for <a> tags with the given class attributes in the example you shared
    paper_links = soup.find_all('a', class_='line-clamp-3 cursor-pointer text-balance')
    
    for paper in paper_links:
        paper_title = paper.text.strip()  # Get the title of the paper
        paper_page_url = paper['href']   # Get the relative link to the paper's Hugging Face page

        print(paper_page_url)
        
        # Ensure the link is a full URL
        paper_page_url = f"{BASE_URL}{paper_page_url}"
        
        # Step 2: Visit each paper's Hugging Face page to extract the PDF link
        pdf_link = get_pdf_link(paper_page_url)
        
        if pdf_link:
            # Step 3: Download the PDF
            download_pdf(pdf_link, paper_title)
        else:
            print(f"PDF link not found for: {paper_title}")
else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")


/papers/2410.02740
PDF link not found for: Revisit Large-Scale Image-Caption Data in Pre-training Multimodal Foundation Models
/papers/2410.02713
PDF link not found for: Video Instruction Tuning With Synthetic Data
/papers/2410.02757
PDF link not found for: Loong: Generating Minute-level Long Videos with Autoregressive Language Models
/papers/2410.02712
PDF link not found for: LLaVA-Critic: Learning to Evaluate Multimodal Models
/papers/2410.02746
PDF link not found for: Contrastive Localized Language-Image Pre-Training
/papers/2410.02073
PDF link not found for: Depth Pro: Sharp Monocular Metric Depth in Less Than a Second
/papers/2410.01679
PDF link not found for: VinePPO: Unlocking RL Potential For LLM Reasoning Through Refined Credit Assignment
/papers/2410.02724
PDF link not found for: Large Language Models as Markov Chains
/papers/2410.02678
PDF link not found for: Distilling an End-to-End Voice Assistant Without Instruction Training Data
/papers/2410.02416
PDF link not found for: