In [2]:
import os
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
from tqdm import tqdm
import argparse
import re

SAVE_TO = '/Users/janek/Documents/gpt_project/papers/'
YEARS_OF_INTEREST = np.arange(2010, 2023, 1).tolist()
CONFERENCES = ['nips', 'icml', 'iclr']


def make_directory(save_path):
    if '.pdf' in save_path:
        save_path = '/'.join(save_path.split('/')[:-1])
    if not os.path.exists(save_path):
        os.mkdir(save_path)


def save_file(full_url, save_to):
    make_directory(save_to)
    with open(save_to, 'wb') as f:
        f.write(requests.get(full_url).content)


def get_set_of_hashes_downloaded(path):
    files = os.listdir(path)
    files = [f for f in files if '.pdf' in f]
    hashes = [file.split('-')[0] for file in files]
    return set(hashes)


def get_nips_papers(year):
    BASE = 'https://papers.nips.cc/'
    CONFERENCE = 'nips'
    nips_save_to = SAVE_TO + f'{CONFERENCE}/' + f'{year}/'
    downloaded_hashes = get_set_of_hashes_downloaded(nips_save_to)
    print(
        f'Num files already downlaoded for {CONFERENCE}-{year}: ', len(downloaded_hashes))

    # parse html
    # for each year retrive list of links to paper pages
    response = requests.get(BASE + f'paper_files/paper/{year}')
    soup = BeautifulSoup(response.text, "html.parser")
    # 'paper title' is NIPS specific
    paper_links = soup.findAll("a", {'title': 'paper title'})
    paper_links = [link['href'] for link in paper_links]
    paper_links = [
        link for link in paper_links if link.split('/')[-1].split('-')[0] not in downloaded_hashes]
    print(
        f'Num files to be downlaoded: ', len(paper_links))
    for link in paper_links:  # for each paper page retrive links to paper pdfs
        response2 = requests.get(BASE + link)
        soup2 = BeautifulSoup(response2.text, "html.parser")
        # 'Paper-Conference' is NIPS specific
        pdf_links = soup2.select("a[href$='.pdf']")
        pdf_links = [l for l in pdf_links if 'Paper' in l]
        for pdf_link in pdf_links:
            # we expect to see only one pdf file thus [0]
            pdf_link_href = pdf_link['href']
            link_to_download = urljoin(BASE, pdf_link_href)

            # download
            fname = link_to_download.split('/')[-1]
            save_file(link_to_download, nips_save_to + fname)
            print(f'downloaded: {link_to_download}')


def get_icml_papers(year):
    BASE = 'https://icml.cc/virtual/2022/papers.html?filter=titles'
    CONFERENCE = 'nips'
    nips_save_to = SAVE_TO + f'{CONFERENCE}/' + f'{year}/'
    downloaded_hashes = get_set_of_hashes_downloaded(nips_save_to)
    print(
        f'Num files already downlaoded for {CONFERENCE}-{year}: ', len(downloaded_hashes))

    # parse html
    # for each year retrive list of links to paper pages
    response = requests.get(BASE + f'paper_files/paper/{year}')
    soup = BeautifulSoup(response.text, "html.parser")
    # 'paper title' is NIPS specific
    paper_links = soup.findAll("a", {'title': 'paper title'})
    paper_links = [link['href'] for link in paper_links]
    paper_links = [
        link for link in paper_links if link.split('/')[-1].split('-')[0] not in downloaded_hashes]
    print(
        f'Num files to be downlaoded: ', len(paper_links))
    for link in paper_links:  # for each paper page retrive links to paper pdfs
        response2 = requests.get(BASE + link)
        soup2 = BeautifulSoup(response2.text, "html.parser")
        # 'Paper-Conference' is NIPS specific
        pdf_links = soup2.select("a[href$='.pdf']")
        pdf_links = [l for l in pdf_links if 'Paper' in l]
        for pdf_link in pdf_links:
            # we expect to see only one pdf file thus [0]
            pdf_link_href = pdf_link['href']
            link_to_download = urljoin(BASE, pdf_link_href)

            # download
            fname = link_to_download.split('/')[-1]
            save_file(link_to_download, nips_save_to + fname)
            print(f'downloaded: {link_to_download}')


def get_iclr_papers():
    return


# if __name__ == "__main__":
#     parser = argparse.ArgumentParser(description='Test argparse')
#     parser.add_argument('--years', nargs='+', required=True, type=str,
#                         help='years for which to download files')
#     parser.add_argument('--conference', required=True, type=str,
#                         help='conference for which to download files')

#     args = parser.parse_args()
#     for year in args.years:
#         year = int(year)

#         if year not in YEARS_OF_INTEREST:
#             raise ValueError(
#                 f"{year} is a wrong Year. These are allowed years: {YEARS_OF_INTEREST}")

#         if args.conference == 'nips':
#             get_nips_papers(year)
#         elif args.conference == 'icml':
#             get_icml_papers(year)
#         elif args.conference == 'iclr':
#             get_iclr_papers(year)
#         else:
#             raise ValueError(f"These are allowed conferences: {CONFERENCES}")


In [7]:
def get_pdf_link(soup):
    content = soup.select("meta[name$='citation_pdf_url']")[0]
    return content['content']

def get_nips_title(soup):
    content = soup.select("meta[name$='citation_title']")[0]
    return content['content']

def get_nips_abstract(soup):
    text = soup.find('h4', text='Abstract').find_next('p').find('p').get_text()
    print(text)


def get_nips_authors(soup):
    authors = []
    contents = soup.select("meta[name$='citation_title']")
    for content in contents:
        author = content['content']
        authors.append(author)
    return authors

def get_nips_publication_date(soup):
    content = soup.select("meta[name$='citation_publication_date']")[0]
    return content['content']

def get_nips_papers(year):
    BASE = 'https://papers.nips.cc/'
    CONFERENCE = 'nips'
    nips_save_to = SAVE_TO + f'{CONFERENCE}/' + f'{year}/'
    downloaded_hashes = get_set_of_hashes_downloaded(nips_save_to)
    print(
        f'Num files already downlaoded for {CONFERENCE}-{year}: ', len(downloaded_hashes))

    # parse html
    # for each year retrive list of links to paper pages
    response = requests.get(BASE + f'paper_files/paper/{year}')
    soup = BeautifulSoup(response.text, "html.parser")
    # 'paper title' is NIPS specific
    paper_links = soup.findAll("a", {'title': 'paper title'})
    paper_links = [link['href'] for link in paper_links]
    paper_links = [
        link for link in paper_links if link.split('/')[-1].split('-')[0] not in downloaded_hashes]
    print(
        f'Num files to be downlaoded: ', len(paper_links))
    for link in paper_links[:10]:  # for each paper page retrive links to paper pdfs
        print(link)
        response2 = requests.get(BASE + link)
        soup2 = BeautifulSoup(response2.text, "html.parser")
        get_nips_abstract(soup2)
        # 'Paper-Conference' is NIPS specific
#         pdf_links = soup2.select("a[href$='.pdf']")
#         pdf_links = [l for l in pdf_links if 'Paper' in l]
#         for pdf_link in pdf_links:
#             # we expect to see only one pdf file thus [0]
#             pdf_link_href = pdf_link['href']
#             link_to_download = urljoin(BASE, pdf_link_href)

#             # download
#             fname = link_to_download.split('/')[-1]
# #             save_file(link_to_download, nips_save_to + fname)
#             print(f'downloaded: {link_to_download}')

get_nips_papers(2022)

Num files already downlaoded for nips-2022:  2512
Num files to be downlaoded:  322
/paper_files/paper/2022/hash/dff528ce3e1390c88f10bbf5e722a241-Abstract-Conference.html
None
/paper_files/paper/2022/hash/dffd1c523512e557f4e75e8309049213-Abstract-Conference.html
None
/paper_files/paper/2022/hash/e04101138a3c94544760c1dbdf2c7a2d-Abstract-Conference.html
None
/paper_files/paper/2022/hash/e095c0a3717629aa5497601985bfcf0e-Abstract-Conference.html
None
/paper_files/paper/2022/hash/e0c07bb70721255482020afca44cabf2-Abstract-Conference.html
None
/paper_files/paper/2022/hash/e0ccda3cb17b084a6f43c62cfac4784b-Abstract-Conference.html
None
/paper_files/paper/2022/hash/e0cfde0ff720fa9674bb976e7f1b99d4-Abstract-Conference.html


KeyboardInterrupt: 

In [64]:
year = 2022
BASE = 'https://icml.cc/'
CONFERENCE = 'icml'
nips_save_to = SAVE_TO + f'{CONFERENCE}/' + f'{year}/'

# parse html
# for each year retrive list of links to paper pages
response = requests.get(BASE + f'virtual/{year}/papers.html?filter=titles')
soup = BeautifulSoup(response.text, "html.parser")
soup2 = soup.findAll("a")
paper_links = [s['href'] for s in soup2 if ('/').join(s['href'].split('/')[:-1]) == f'/virtual/{year}/poster']
for link in paper_links[:1]:  # for each paper page retrive links to paper pdfs
    response2 = requests.get(BASE + link)
    soup2 = BeautifulSoup(response2.text, "html.parser")
    soup2 = soup2.findAll("a", {'class': 'href_PDF'})
    link = soup2[0]['href']
    response3 = requests.get(link)
    soup3 = BeautifulSoup(response3.text, "html.parser")
    pdf_links = soup3.select("a[href$='.pdf']")

    for pdf_link in pdf_links:
        # we expect to see only one pdf file thus [0]
        link_to_download = pdf_link['href']

        # download
        fname = link_to_download.split('/')[-1]
        save_file(link_to_download, nips_save_to + fname)
        print(f'downloaded: {link_to_download}')

[<a href="https://proceedings.mlr.press/v162/sokota22a/sokota22a.pdf" onclick="ga('send', 'event', 'PDF Downloads', 'Download', 'https://proceedings.mlr.press/v162/sokota22a/sokota22a.pdf', 10);" target="_blank">Download PDF</a>]


In [54]:
soup2 = soup.findAll("a")
paper_links = [s['href'] for s in soup2 if ('/').join(s['href'].split('/')[:-1]) == f'/virtual/{year}/poster']
soup2

['/virtual/2022/poster/16975',
 '/virtual/2022/poster/18423',
 '/virtual/2022/poster/18415',
 '/virtual/2022/poster/18417',
 '/virtual/2022/poster/18421',
 '/virtual/2022/poster/18413',
 '/virtual/2022/poster/18411',
 '/virtual/2022/poster/18409',
 '/virtual/2022/poster/18407',
 '/virtual/2022/poster/18395',
 '/virtual/2022/poster/18405',
 '/virtual/2022/poster/18391',
 '/virtual/2022/poster/18393',
 '/virtual/2022/poster/18389',
 '/virtual/2022/poster/18403',
 '/virtual/2022/poster/18385',
 '/virtual/2022/poster/18387',
 '/virtual/2022/poster/18381',
 '/virtual/2022/poster/18379',
 '/virtual/2022/poster/18373',
 '/virtual/2022/poster/18371',
 '/virtual/2022/poster/18375',
 '/virtual/2022/poster/18367',
 '/virtual/2022/poster/18355',
 '/virtual/2022/poster/18361',
 '/virtual/2022/poster/18363',
 '/virtual/2022/poster/18353',
 '/virtual/2022/poster/18357',
 '/virtual/2022/poster/18347',
 '/virtual/2022/poster/18351',
 '/virtual/2022/poster/18349',
 '/virtual/2022/poster/18343',
 '/virtu

In [49]:
soup2.text

AttributeError: ResultSet object has no attribute 'text'. You're probably treating a list of elements like a single element. Did you call find_all() when you meant to call find()?

In [30]:
soup


<!DOCTYPE html>

<html lang="en" style="scroll-padding-top: 70px;">
<head>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1.0, shrink-to-fit=no" name="viewport"/>
<link href="https://fonts.googleapis.com/css?family=Open+Sans:300italic,400italic,600italic,700italic,800italic,400,300,600,700,800" rel="stylesheet"/>
<link href="https://fonts.googleapis.com/css?family=Lora:400,700,400italic,700italic" rel="stylesheet"/>
<link href="https://fonts.googleapis.com/css2?family=Exo:wght@400;700&amp;family=Lato:wght@400;700&amp;display=swap" rel="stylesheet"/>
<link href="/static/expo/fonts/font-awesome.min.css" rel="stylesheet"/>
<link crossorigin="anonymous" href="https://use.fontawesome.com/releases/v5.8.1/css/all.css" integrity="sha384-50oBUHEmvpQ+1lW4y57PTFmhCaXp0ML5d60M1M7uH2+nqUivzIebhndOJK28anvf" rel="stylesheet"/>
<link href="https://cdn.jsdelivr.net/npm/bootstrap-select@1.13.14/dist/css/bootstrap-select.min.css" rel="stylesheet"/>
<link href="/static/virtual/c