## Get PDFs from arxiv.org


In [1]:
# Set up to use local modules
# Load IPython's autoreload extension. Mode 2 reloads all imported modules
# before each cell executes, so edits to local .py files take effect without
# restarting the kernel.
%load_ext autoreload
%autoreload 2
import os
import sys

# Absolute path of the parent of the current working directory; placed at the
# very front of the import path so modules stored there can be imported.
module_path = os.path.abspath(os.pardir)
sys.path[:0] = [module_path]

In [2]:
import urllib.request
import pyprojroot

# PDFs are stored in the "data" directory resolved by pyprojroot relative to
# the project root.
PDF_DIR = pyprojroot.here("data")
# Create the directory up front: the download cells below write files into it
# and would otherwise fail on a fresh checkout where it does not exist yet.
PDF_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# https://lukasschwab.me/arxiv.py/arxiv.html

import arxiv

# Construct the default API client.
client = arxiv.Client()

# Search for the 10 most recent articles matching the keyword "quantum."
search = arxiv.Search(
    query="quantum", max_results=10, sort_by=arxiv.SortCriterion.SubmittedDate
)

# `client.results(search)` is a single-use generator. Exhaust it into a list
# once so `results` can be iterated here and reused afterwards instead of
# building a second generator for the loop.
# Careful: materialising is slow for large result sets; it is fine for the
# 10 results requested above.
results = list(client.results(search))

for r in results:
    print(r.title)

Observation of an inverse turbulent-wave cascade in a driven quantum gas
Robustness of Fixed Points of Quantum Channels and Application to Approximate Quantum Markov Chains
Polarization dependent non-Hermitian atomic grating controlled by dipole blockade effect
Centerless-BMS charge algebra
Driven Multiphoton Qubit-Resonator Interactions
Geometric Quantization Without Polarizations
Effective Lifshitz black holes, hydrodynamics, and transport coefficients in fluid/gravity correspondence
Optical Manipulation of Spin States in Ultracold Magnetic Atoms via an Inner-Shell Hz Transition
Single-layer tensor network approach for three-dimensional quantum systems
A Formulation of Quantum Fluid Mechanics and Trajectories


In [4]:
pdf_ids = ["1706.03762v6", "1605.08386v1"]

# Search for the paper with the first ID in `pdf_ids` ("1706.03762v6").
search_by_id = arxiv.Search(id_list=[pdf_ids[0]])
# `client.results` yields results lazily; pass a default to `next` so an ID
# that resolves to nothing gives a clear error instead of a bare StopIteration.
paper = next(client.results(search_by_id), None)
if paper is None:
    raise LookupError(f"No arXiv paper found for ID {pdf_ids[0]!r}")
print(paper.title)

# Save the PDF into PDF_DIR. The call is the cell's last expression, so its
# return value (the path of the written file) is shown as the cell output.
paper.download_pdf(dirpath=PDF_DIR, filename="example_paper.pdf")

Attention Is All You Need


'/home/jordan/documents/GitHub/arxiv-chat/data/example_paper.pdf'

In [5]:
# Papers to fetch. Entries with a non-empty "arxiv_id" are downloaded through
# the arXiv client; entries with an empty ID are fetched from "alt_url".
papers = {
    "paper_1": {
        "title": "Attention is All You Need",
        "arxiv_id": "1706.03762",
        "filename": "vaswani_et_al_2017.pdf",
    },
    "paper_2": {
        "title": "BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding",
        "arxiv_id": "1810.04805",
        "filename": "devlin_et_al_2018.pdf",
    },
    "paper_3": {
        "title": "Generative Adversarial Nets",
        "arxiv_id": "1406.2661",
        "filename": "goodfellow_et_al_2014.pdf",
    },
    "paper_4": {
        "title": "Playing Atari with Deep Reinforcement Learning",
        "arxiv_id": "1312.5602",
        "filename": "mnih_et_al_2013.pdf",
    },
    "paper_5": {
        "title": "ImageNet Classification with Deep Convolutional Neural Networks",
        "arxiv_id": "",
        "alt_url": "https://proceedings.neurips.cc/paper_files/paper/2012/file/c399862d3b9d6b76c8436e924a68c45b-Paper.pdf",
        "filename": "krizhevsky_et_al_2012.pdf",
    },
}

# The loop variable is deliberately not called `paper`, so it does not
# overwrite the `paper` result object created by the single-download cell.
for paper_spec in papers.values():
    target_path = os.path.join(PDF_DIR, paper_spec["filename"])

    # Skip anything that is already on disk.
    if os.path.exists(target_path):
        print(f" Already downloaded: {paper_spec['title']}")
        continue

    # Download under a temporary ".part" name and rename only on success.
    # Otherwise an interrupted download would leave a truncated file at the
    # final path, and the existence check above would skip it on later runs.
    partial_name = paper_spec["filename"] + ".part"
    partial_path = os.path.join(PDF_DIR, partial_name)

    if paper_spec.get("arxiv_id"):
        search_by_id = arxiv.Search(id_list=[paper_spec["arxiv_id"]])
        # Default to None so an unknown ID is reported rather than aborting
        # the whole loop with a bare StopIteration.
        paper_info = next(client.results(search_by_id), None)
        if paper_info is None:
            print(f" Not found on arXiv: {paper_spec['title']}")
            continue
        print(paper_info.title)
        paper_info.download_pdf(dirpath=PDF_DIR, filename=partial_name)
    else:
        # No arXiv ID: fall back to the direct URL.
        print(paper_spec["title"])
        urllib.request.urlretrieve(paper_spec["alt_url"], partial_path)

    # Atomically move the finished download to its final name.
    os.replace(partial_path, target_path)

 Already downloaded: Attention is All You Need
 Already downloaded: BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding
 Already downloaded: Generative Adversarial Nets
 Already downloaded: Playing Atari with Deep Reinforcement Learning
 Already downloaded: ImageNet Classification with Deep Convolutional Neural Networks


## Get the most recent ML papers from arXiv
