# In [None]:
import os
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

# NeurIPS proceedings editions to process, newest first
years = [2024, 2023, 2022]
# Listing-page URL template; "{}" is replaced by the proceedings year
base_url = "https://papers.nips.cc/paper_files/paper/{}"

# Folder that receives the downloaded PDFs; created now if it is missing
pdf_dir = "neurips_papers"
os.makedirs(pdf_dir, exist_ok=True)

# Request headers that present a desktop-browser User-Agent
_BROWSER_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
headers = {"User-Agent": _BROWSER_USER_AGENT}

def _fetch(session, url, failure_message, timeout):
    """Return the HTTP 200 response for ``url``, or ``None`` on any failure.

    Both network errors and non-200 status codes are reported by printing
    ``failure_message``, so the caller can simply skip the item.
    """
    try:
        response = session.get(
            url, headers=headers, allow_redirects=True, timeout=timeout
        )
    except requests.RequestException as exc:
        # A dropped connection or timeout must not abort the whole run.
        print(f"{failure_message} ({exc})")
        return None
    if response.status_code != 200:
        print(failure_message)
        return None
    return response


def download_pdfs(year, timeout=30):
    """Download the PDFs linked from the proceedings listing page of ``year``.

    The listing page is fetched from ``base_url`` formatted with ``year``.
    Every link on it whose href contains "/paper/" is visited, and when that
    page has an anchor whose text is exactly "Paper", its target is saved
    into ``pdf_dir`` under the last path segment of its URL.

    Args:
        year: Value substituted into ``base_url`` to build the listing URL.
        timeout: Passed as ``timeout`` to every HTTP request so that a
            stalled connection cannot hang the run indefinitely.

    Returns:
        None. Progress and failures are reported with ``print``; a failed
        request skips that item instead of raising.
    """
    url = base_url.format(year)

    # One session for the whole year lets the connection be reused.
    with requests.Session() as session:
        listing = _fetch(session, url, f"Failed to retrieve {url}", timeout)
        if listing is None:
            return

        soup = BeautifulSoup(listing.text, 'html.parser')
        # dict.fromkeys drops repeated hrefs while keeping page order, so the
        # same paper page is not fetched more than once.
        paper_links = list(dict.fromkeys(
            a['href']
            for a in soup.find_all('a', href=True)
            if "/paper/" in a['href']
        ))

        for paper_link in paper_links:
            # Resolve against the URL actually served (after redirects) so
            # relative, root-relative and absolute hrefs all work.
            paper_page_url = urljoin(listing.url, paper_link)
            paper_page = _fetch(
                session,
                paper_page_url,
                f"Failed to retrieve {paper_page_url}",
                timeout,
            )
            if paper_page is None:
                continue

            paper_soup = BeautifulSoup(paper_page.text, 'html.parser')
            # `string=` is the current name of the deprecated `text=` filter.
            pdf_link = paper_soup.find('a', href=True, string="Paper")
            if not pdf_link:
                # Pages without a "Paper" link are skipped silently.
                continue

            pdf_url = urljoin(paper_page.url, pdf_link['href'])
            # Use only the URL path so a query string never ends up in the
            # file name.
            pdf_name = os.path.basename(urlsplit(pdf_url).path)
            if not pdf_name:
                print(f"Skipping {pdf_url}: no file name in URL")
                continue
            pdf_path = os.path.join(pdf_dir, pdf_name)

            print(f"Downloading {pdf_name}...")
            pdf_response = _fetch(
                session, pdf_url, f"Failed to download {pdf_name}", timeout
            )
            if pdf_response is None:
                continue

            with open(pdf_path, 'wb') as pdf_file:
                pdf_file.write(pdf_response.content)
            print(f"Saved: {pdf_path}")

# Process each year listed in `years`, in list order, announcing it before
# handing it to download_pdfs.
for year in years:
    print(f"Fetching papers for {year}...")
    download_pdfs(year)

# Printed once the loop has gone through every year.
print("Download complete.")