In [1]:
import requests
from bs4 import BeautifulSoup

import glob
import yaml
import re

from deco import synchronized, concurrent
import deco

In [2]:
# Monkey-patch the pattern PyYAML's Reader uses to detect non-printable
# characters. The negated character class below accepts tab, LF, CR, printable
# ASCII, U+0085, U+00A0-U+D7FF, U+E000-U+FFFD and U+10000-U+10FFFF; anything
# else matches as "non printable".
# NOTE(review): presumably needed so that post front matter containing
# characters outside the Basic Multilingual Plane (e.g. emoji) can be loaded;
# confirm whether the installed PyYAML version still requires this override.
yaml.reader.Reader.NON_PRINTABLE = re.compile(
    u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]')

In [3]:
def arxiv_to_bibtex(aXid):
    """Fetch the BibTeX entry for an arXiv identifier via arxiv2bibtex.org.

    The service's HTML response is parsed with BeautifulSoup (html5lib parser)
    and the text of the first ``<textarea>`` element is taken as the entry.

    Args:
        aXid: arXiv identifier, interpolated into the query string as-is.

    Returns:
        The stripped text of the first ``<textarea>`` on the page, which must
        start with ``@``.

    Raises:
        ValueError: if the page has no ``<textarea>``, or its text is empty or
            does not start with ``@``.
    """
    # NOTE(review): no timeout is passed to requests.get, so a stalled
    # connection blocks indefinitely.
    page = requests.get(
        "https://arxiv2bibtex.org/?q={}".format(aXid)
    ).text

    textarea = BeautifulSoup(page, "html5lib").find("textarea")
    # find() returns None when the element is missing; treat that the same as
    # an empty answer so every failure surfaces as the ValueError below rather
    # than an AttributeError.
    res = textarea.text.strip() if textarea is not None else ""

    # startswith() is safe on an empty string, unlike res[0].
    if not res.startswith("@"):
        raise ValueError("Failed to fetch aXid  {}.".format(aXid))
    return res

@concurrent
def doi_to_bibtex(doi):
    """Resolve a DOI to a BibTeX entry through dx.doi.org content negotiation.

    The request asks for ``text/bibliography; style=bibtex`` and the stripped
    response body is returned when it starts with ``@``.

    NOTE(review): ``@concurrent`` comes from the third-party ``deco`` package;
    how the call is scheduled, and where an exception raised here surfaces in
    the caller, is not visible in this notebook. Confirm against deco's docs.

    Args:
        doi: DOI string, appended to ``http://dx.doi.org/`` as-is.

    Returns:
        The stripped response body, which must start with ``@``.

    Raises:
        ValueError: if the response body is empty or does not start with ``@``.
    """
    # NOTE(review): no timeout is passed to requests.get, so a stalled
    # connection blocks indefinitely.
    res = requests.get(
        "http://dx.doi.org/{}".format(doi),
        headers={"Accept": "text/bibliography; style=bibtex"}
    ).text.strip()

    # startswith() also covers an empty body, which res[0] would turn into an
    # IndexError instead of the intended ValueError.
    if not res.startswith("@"):
        raise ValueError("Failed to fetch doi  {}.".format(doi))
    return res

In [4]:
@synchronized
def load_bibs():
    """Collect a BibTeX entry for every post in ``../_posts`` that has a DOI.

    Each ``*.md`` file is read, the text between its first line and the next
    ``---`` line is parsed as YAML, and the ``doi`` key (when present) is
    passed to ``doi_to_bibtex``.

    Returns:
        dict mapping the post's position in the glob result to its BibTeX
        string, or to ``""`` when the post has no ``doi`` key or the lookup
        was recorded as a failure.

    NOTE(review): ``@synchronized`` comes from the third-party ``deco``
    package and pairs with the ``@concurrent`` decorator on ``doi_to_bibtex``.
    The recorded output of the cell that calls this function shows a
    ``ValueError`` from ``doi_to_bibtex`` propagating out of it, so the
    ``try/except`` below evidently does not catch failures raised by the
    decorated call; presumably deco defers the call. Confirm before
    restructuring any statement in this body.
    """
    files = glob.glob("../_posts/*.md")

    bibs = {}

    # NOTE(review): `failures` is filled in below but never returned or read.
    failures = []

    for i, f in enumerate(files):
        # NOTE(review): the file handle is never closed explicitly.
        lines = open(f, "r").readlines()

        # Parse the lines between line 0 and the first "---\n" found at index
        # 2 or later. Assumes line 0 is the opening front-matter delimiter.
        # NOTE(review): yaml.load is called without an explicit Loader.
        info = yaml.load(
            "".join(lines[1:lines.index("---\n", 2)])
        )

        try:
            if 'doi' in info:
                bibs[i] = doi_to_bibtex(info['doi'])
#             elif 'arxiv' in info:
#                 bibs[i] = arxiv_to_bibtex(info['arxiv'])
            else:
                # No DOI in the front matter: store an empty entry and record
                # the index.
                bibs[i] = ""
                failures.append(i)
        except:
            # NOTE(review): a bare except also catches KeyboardInterrupt and
            # SystemExit.
            bibs[i] = ""
            failures.append(i)

        # Progress indicator, rewritten in place via the carriage return.
        print("{}%   ".format(int(100*i/len(files))), end="\r")
    return bibs

In [5]:
def load_arxivs():
    """Collect a BibTeX entry for every post in ``../_posts`` with an arXiv id.

    Each ``*.md`` file is read, the text between its first line and the next
    ``---`` line is parsed as YAML, and the ``arxiv`` key (when present) is
    passed to ``arxiv_to_bibtex``.

    Returns:
        A ``(bibs, failures)`` tuple. ``bibs`` maps the post's position in the
        glob result to its BibTeX string, or to ``""`` when no entry could be
        obtained. ``failures`` lists the positions that ended up with ``""``,
        either because the ``arxiv`` key was missing or the lookup raised.

    Raises:
        ValueError: if a post has no ``---`` line at index 2 or later
            (raised by ``list.index``).
    """
    files = glob.glob("../_posts/*.md")

    bibs = {}
    failures = []

    for i, f in enumerate(files):
        # Context manager so the handle is closed as soon as the post is read.
        with open(f, "r") as post:
            lines = post.readlines()

        # Parse the lines between line 0 and the first "---\n" found at index
        # 2 or later. Assumes line 0 is the opening front-matter delimiter.
        # safe_load is used because yaml.load() without an explicit Loader is
        # rejected by PyYAML >= 6, and plain front-matter data needs no more
        # than the safe loader.
        info = yaml.safe_load(
            "".join(lines[1:lines.index("---\n", 2)])
        )

        try:
            if 'arxiv' in info:
                bibs[i] = arxiv_to_bibtex(info['arxiv'])
            else:
                bibs[i] = ""
                failures.append(i)
        except Exception:
            # Best effort: any lookup error (or empty front matter, where
            # `info` is None) marks this post as failed. Unlike a bare
            # `except:`, KeyboardInterrupt still stops the loop.
            bibs[i] = ""
            failures.append(i)

        # Progress indicator, rewritten in place via the carriage return.
        print("{}%   ".format(int(100*i/len(files))), end="\r")
    return bibs, failures

In [6]:
# DOI-based entries, keyed by the post's position in the glob result.
bibs = load_bibs()
# arXiv-based entries, plus the positions of posts that yielded no entry.
bibrxs, failures = load_arxivs()

91%   

ValueError: Failed to fetch doi  10.1002/ece3.4273.

In [37]:
# Concatenate the DOI-derived entries followed by the arXiv-derived ones,
# skipping posts for which no entry was obtained. Joining the combined sequence
# once puts a blank line between *every* pair of entries; appending a second
# join with `+=` would glue the last DOI entry directly onto the first arXiv
# entry.
bibfile = "\n\n".join(
    [
        entry
        for entry in list(bibs.values()) + list(bibrxs.values())
        if entry != ""
    ]
)

In [39]:
# Write the assembled bibliography to 365papers.bib in the working directory,
# replacing any existing file of that name.
with open('365papers.bib', 'w') as bib_out:
    bib_out.write(bibfile)