# In [None]:
import os
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Root of the NeurIPS proceedings site; the listing and paper URLs are built from it.
BASE_URL = "https://papers.nips.cc"

# Headers sent with every request; the User-Agent mimics a desktop Chrome browser.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
}

# Proceedings year to scrape; it also appears in the output CSV filename.
YEAR = 2024  # Change this if needed
MAX_PAPERS = 50  # Maximum number of papers to scrape per run

def fetch_paper_links(year):
    """Collect paper-page hrefs from the proceedings listing for *year*.

    Returns at most MAX_PAPERS href strings (those containing "/paper/"),
    in page order. Returns an empty list when the listing does not answer
    with HTTP 200 or when the request raises ``requests.RequestException``.
    """
    url = f"{BASE_URL}/paper_files/paper/{year}"
    print(f"Fetching papers from {url}...")

    try:
        listing = requests.get(url, headers=HEADERS, timeout=10)
        if listing.status_code == 200:
            page = BeautifulSoup(listing.text, "html.parser")

            # Keep every anchor whose target looks like a paper URL.
            hrefs = []
            for anchor in page.find_all('a', href=True):
                target = anchor['href']
                if "/paper/" in target:
                    hrefs.append(target)
            return hrefs[:MAX_PAPERS]

        print(f"❌ Failed to retrieve {url}")
        return []
    except requests.RequestException as err:
        print(f"⚠️ Error fetching papers: {err}")
        return []

def _paragraph_after_heading(soup, heading_text):
    """Return the first <p> following the <h4> whose text is *heading_text*, or None."""
    heading = soup.find('h4', string=heading_text)
    if heading is None:
        return None
    return heading.find_next('p')


def extract_paper_details(paper_link, serial_no):
    """Extract authors, abstract, and PDF link from a paper page.

    Args:
        paper_link: href of the paper page as found on the listing page
            (root-relative, relative, or absolute).
        serial_no: Value stored in the first column of the returned row.

    Returns:
        ``[serial_no, authors, abstract, pdf_link]``, with a placeholder
        string for any field that is not found on the page, or ``None``
        when the page does not answer with HTTP 200 or the request raises
        ``requests.RequestException``.
    """
    # urljoin copes with absolute and non-root-relative hrefs, which plain
    # concatenation with BASE_URL would turn into invalid URLs.
    paper_url = urljoin(BASE_URL, paper_link)

    try:
        response = requests.get(paper_url, headers=HEADERS, timeout=10)
        if response.status_code != 200:
            print(f"❌ Failed to retrieve {paper_url}")
            return None

        soup = BeautifulSoup(response.text, "html.parser")

        # Authors: text of the <i> inside the first <p> after <h4>Authors</h4>.
        authors = "No Authors Available"
        author_paragraph = _paragraph_after_heading(soup, "Authors")
        if author_paragraph is not None:
            italic_authors = author_paragraph.find('i')
            if italic_authors is not None:
                authors = italic_authors.text.strip()

        # Abstract: text of the first <p> after <h4>Abstract</h4>; the first
        # link found inside that paragraph is appended to the text.
        abstract = "No Abstract Available"
        abstract_paragraph = _paragraph_after_heading(soup, "Abstract")
        if abstract_paragraph is not None:
            abstract = abstract_paragraph.text.strip()
            github_link = abstract_paragraph.find('a', href=True)
            if github_link is not None:
                abstract += f" (GitHub: {github_link['href']})"

        # PDF link: require an href so an anchor labelled "Paper" without one
        # cannot raise KeyError and abort the whole scrape.
        pdf_link_tag = soup.find('a', string="Paper", href=True)
        if pdf_link_tag is not None:
            pdf_link = urljoin(paper_url, pdf_link_tag['href'])
        else:
            pdf_link = "No PDF Available"

        return [serial_no, authors, abstract, pdf_link]

    except requests.RequestException as e:
        print(f"⚠️ Error accessing {paper_url}: {e}")
        return None

def main():
    """Scrape NeurIPS papers for YEAR and save them to a CSV file.

    Fetches the paper links, scrapes each paper page, and writes the rows
    to ``neurips_papers_{YEAR}.csv`` in the current working directory.
    Pages that cannot be scraped are skipped. Nothing is written when no
    links are found or when no page yields any details.
    """
    # Cap the work at MAX_PAPERS regardless of how many links came back.
    paper_links = fetch_paper_links(YEAR)[:MAX_PAPERS]

    if not paper_links:
        print("⚠️ No papers found.")
        return

    # Report progress against the number of links actually being processed,
    # which may be smaller than MAX_PAPERS.
    total = len(paper_links)
    data = []
    for index, link in enumerate(paper_links, start=1):
        print(f" Processing Paper {index}/{total}...")
        paper_details = extract_paper_details(link, index)
        if paper_details:
            data.append(paper_details)

    # Do not report success for a header-only file when every page failed.
    if not data:
        print("⚠️ No paper details could be extracted; CSV not written.")
        return

    # Create DataFrame and save to CSV
    df = pd.DataFrame(data, columns=["Serial No", "Authors", "Abstract", "PDF Link"])
    csv_file = f"neurips_papers_{YEAR}.csv"
    df.to_csv(csv_file, index=False)

    print(f"✅ CSV file '{csv_file}' created successfully!")

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()