In [1]:
from bs4 import BeautifulSoup
import urllib, urllib.request
import re
import numpy as np
from datetime import date
import datetime
import os
import arxiv

In [2]:
# Words considered too common to say anything about an article's topic.
# Every entry must be lowercase: filter_out_non_trivial_words lowercases each
# candidate word before looking it up in this list.
# The empty string is listed so that tokens consisting only of stripped
# punctuation are discarded as well.
trivial_words_list_default = [
    'and', 'a', 'the', '', 'of', 'in', 'on', 'it', 'after', 'for', 'ever', 'never', 'since', 'at', 'to', 'too',
    'we', 'show', 'that', 'this', 'with', 'by', 'from', 'as', 'be', 'are', 'have', 'has', 'can',
    'could', 'will', 'would', 'may', 'might', 'must', 'shall', 'should', 'do', 'such', 'both', 'assume',
    'also', 'an', 'any', 'each', 'every', 'either', 'neither', 'other', 'another', 'same', 'several',
    'variable', 'various', 'well', 'where', 'which', 'while', 'wide', 'within', 'yet', 'you', 'your', 'yours', 'yourself',
    'but', 'been',
]

In [3]:
def download_articles_of_day(day=None, num_articles=10):
    """
    Queries the arXiv export API and returns the titles, summaries and ids
    found in the response.
    If no inputs are given, it will use today's date and 10 articles.

    Parameters
    ----------
    day : datetime.date, optional
        The date to query. Defaults to date.today().
    num_articles : int
        The number of articles requested (sent as max_results).

    Returns
    -------
    titles : bs4 ResultSet
        All <title> elements of the response, can be indexed with [].
    summaries : bs4 ResultSet
        All <summary> elements of the response.
    ids : bs4 ResultSet
        All <id> elements of the response.

    Notes
    -----
    The rest of this notebook pairs titles[i + 1] with summaries[i] and skips
    ids[0], presumably because the feed itself carries a leading <title> and
    <id> of its own -- verify against an actual response.
    """
    if day is None:
        day = date.today()

    tomorrow = day + datetime.timedelta(days=1)
    # NOTE(review): "submittedDate:[...]" is appended as its own '&'-separated
    # parameter instead of being part of search_query, and the dates are
    # formatted as YYYY-MM-DD. Confirm against the arXiv API manual that the
    # server really applies this as a date filter.
    url = f'http://export.arxiv.org/api/query?search_query=all:condensed%20matter&submittedDate:[{day}+TO+{tomorrow}]&start=0&max_results={num_articles}&sortBy=submittedDate&sortOrder=descending'

    # Parse while the connection is open, then release it deterministically
    # instead of leaving the socket for the garbage collector.
    with urllib.request.urlopen(url) as data:
        Bsoup = BeautifulSoup(data, 'html.parser')

    titles = Bsoup.find_all('title')
    summaries = Bsoup.find_all('summary')
    ids = Bsoup.find_all('id')

    return titles, summaries, ids

In [4]:
def show_subset_of_articles(todays_articles, num_articles_to_show=4, only_title=True):
    """
    Shows a subset of the articles and asks the user if they are interesting.

    Parameters
    ----------
    todays_articles : tuple
        A tuple of the form (titles, summaries, ids). Article i is read from
        titles[i + 1] and summaries[i].

    num_articles_to_show : int, optional
        The maximum number of articles to show. The default is 4. If fewer
        articles are available, only the available ones are shown.

    only_title : bool, optional
        If True (default) only the title is printed; if False the summary is
        printed as well.

    Returns
    -------
    interesting_articles : list
        A list of the indices of the interesting articles. Answering "stop"
        ends the questioning early and returns what was collected so far.
    """

    titles, summaries, _ = todays_articles
    interesting_articles = []

    # titles has one extra leading entry (article i lives at titles[i + 1]),
    # so both sequences bound how many articles can actually be shown.
    num_available = max(0, min(len(summaries), len(titles) - 1))
    num_to_show = min(num_articles_to_show, num_available)

    for i in range(num_to_show):
        print(f'Aritlce Number {i}')
        print(titles[i + 1].text)
        if not only_title:
            print(summaries[i].text)
        print()
        while True:
            # Tolerate stray whitespace and capitalisation in the answer.
            answer = input("Is this article interesting? (yes/no/stop): ").strip().lower()
            if answer == 'yes':
                interesting_articles.append(i)
                break
            elif answer == 'no':
                break
            elif answer == 'stop':
                return interesting_articles
            else:
                print("Please enter yes, no, or stop")

    return interesting_articles

In [5]:
def filter_out_non_trivial_words(wordlist, trivial_words_list=trivial_words_list_default):
    """
    Removes trivial words from a list of words and returns the remaining
    (non-trivial) words.

    Each word is first stripped of leading/trailing newline, comma and period
    characters. It is dropped when its lowercase form appears in
    "trivial_words_list"; otherwise the stripped word is kept with its
    original capitalisation.

    Parameters
    ----------
    wordlist : list of strings
                , may be lowercase or uppercase
    trivial_words_list : list of lowercase strings
                        that should be removed from input "wordlist"

    Returns
    -------
    wordlist_nontrivial : list of strings
                        where trivial words are removed, in input order.

    """
    # Hash the trivial words once so every lookup is O(1) instead of a list scan.
    trivial_words = set(trivial_words_list)

    wordlist_nontrivial = []
    for word in wordlist:
        # One strip with all three characters removes any mix of them at either
        # end; chaining separate strips depends on their order (e.g. "word,."
        # would keep its comma).
        cleaned = word.strip('\n,.')
        if cleaned.lower() not in trivial_words:
            wordlist_nontrivial.append(cleaned)

    return wordlist_nontrivial

In [6]:
def most_likely_interesting_articles(
    info, interesting_indices, shown_upto, trivial_words_list
):
    """
    Ranks the articles that have not been shown yet by word overlap with the
    articles marked as interesting, prints the best match and returns its index.

    Parameters
    ----------
    info : tuple
        Tuple containing titles, summaries, and ids of articles. Article i is
        read from titles[i + 1] and summaries[i].
    interesting_indices : list
        List of indices of interesting articles
    shown_upto : int
        Number of articles shown so far; only articles with index
        >= shown_upto are candidates.
    trivial_words_list : list
        List of trivial words to be removed from articles

    Returns
    -------
    best_index : int
        Index (into summaries) of the candidate article with the highest
        overlap percentage. On ties the first such candidate is returned.

    Raises
    ------
    AssertionError
        If interesting_indices is empty.
    ValueError
        If there is no article left with index >= shown_upto.

    Notes
    -----
    Define percentage as how many words of an article appear in the interesting
    articles list divided by total number of words in the article (both after
    removing trivial words). An article without any non-trivial word scores 0.
    The comparison is case-sensitive.
    """

    assert len(interesting_indices) > 0, "The interesting articles must be more than 0"

    titles, summaries, _ = info

    if len(summaries) - shown_upto <= 0:
        # Fail with a clear message instead of an opaque argmax error.
        raise ValueError(
            f"No candidate articles left: {len(summaries)} summaries available, "
            f"{shown_upto} already shown"
        )

    # Collect every word of the titles and summaries the user liked.
    interesting_wordlist = []
    for i in interesting_indices:
        interesting_wordlist += titles[i + 1].text.split()
        interesting_wordlist += summaries[i].text.split()

    # A set gives the unique non-trivial words and O(1) membership tests.
    interesting_vocabulary = set(
        filter_out_non_trivial_words(interesting_wordlist, trivial_words_list)
    )

    comparison_percentages = []
    for i in range(shown_upto, len(summaries)):
        wordlist = summaries[i].text.split()
        wordlist_nontrivial = filter_out_non_trivial_words(wordlist, trivial_words_list)
        if not wordlist_nontrivial:
            # Empty or all-trivial abstract: no overlap, and no division by zero.
            comparison_percentages.append(0.0)
            continue
        interesting_words = sum(
            word in interesting_vocabulary for word in wordlist_nontrivial
        )
        comparison_percentages.append(interesting_words / len(wordlist_nontrivial))

    # Compute the winner once; positions in comparison_percentages are offset
    # by shown_upto relative to summaries.
    best_offset = np.argmax(comparison_percentages)
    best_index = best_offset + shown_upto

    print("The next most likely interesting article is:")
    print()
    print(titles[best_index + 1].text)
    print(summaries[best_index].text)
    print(f"With an overlap in words of {comparison_percentages[best_offset]*100:.2f}%")
    return best_index

In [9]:
def get_arxivID_of_papers(ids):
    """
    A list of internet links including the ArXiv ids is converted to a list with only the arxiv ids.

    Parameters
    ----------
    ids : sequence of objects whose ``.string`` attribute is an internet link
          ending in the ArXiv id (e.g. "http://arxiv.org/abs/2304.12345v1").
          The first element is skipped.


    Returns
    -------
    list of ArXiv ids according to the format "YYMM.NNNNv{versionnumber}" or
    "YYMM.NNNNNv{versionnumber}" (the version suffix is optional), one for every
    element of ``ids`` except the first. YY = year, MM = month,
    NNNN(N) = sequence number.

    Raises
    ------
    AssertionError
        If the last path component of a link is not an id of that format.

    """
    # See: https://info.arxiv.org/help/arxiv_identifier.html#new
    # Validate the structure rather than the length: the sequence number has
    # 4 or 5 digits and the version number can have more than one digit.
    arxiv_id_pattern = re.compile(r'[0-9]{4}\.[0-9]{4,5}(?:v[0-9]+)?')

    idlist_total = []
    # Element 0 is skipped; presumably it is the id of the feed itself rather
    # than of a paper -- verify against an actual response.
    for i in range(1, len(ids)):
        arxiv_id = ids[i].string.rsplit('/', 1)[-1]
        if arxiv_id_pattern.fullmatch(arxiv_id) is None:
            raise AssertionError('This is not the ArXiv id, pdf file cannot be saved')
        idlist_total.append(arxiv_id)

    return idlist_total


def save_articles_as_pdf(index_article_list,
                         ids):
    """
    Saves interesting papers as .pdf files named
    "paper_{index}_{arxiv id}.pdf".

    Parameters
    ----------
    index_article_list : list of indices corresponding to the papers that should be saved as pdf.
                         Index i refers to the i-th ArXiv id extracted from ``ids``.
    ids         : list of internet links including the ArXiv ids for every paper.

    Returns
    -------
    None. Prints the filename of every paper that is downloaded, or a notice
    when a file with that name already exists (in which case nothing is
    downloaded).

    Raises
    ------
    AssertionError
        If an index is negative or not smaller than the number of ArXiv ids.

    """

    idlist_total = get_arxivID_of_papers(ids)

    # Nothing requested: nothing to validate or download.
    if len(index_article_list) == 0:
        return

    # Some checks. Indices address idlist_total, which is what gets indexed
    # below, so that is the length to compare against (not len(ids)).
    if max(index_article_list) >= len(idlist_total):
        raise AssertionError(
            'Index of paper does not exist. All indices should be smaller than {} '.format(len(idlist_total)))
    if min(index_article_list) < 0:
        raise AssertionError('Negative index for paper is not allowed.')

    for paper_index in index_article_list:
        article_id = idlist_total[paper_index]

        # name of file
        filename = 'paper_' + str(paper_index) + '_' + article_id + '.pdf'

        # Check first so that no arXiv request is made for a paper we already have.
        if os.path.isfile(filename):
            print(f'File already exists: {filename}')
            continue

        paper = next(arxiv.Search(id_list=[article_id]).results())
        paper.download_pdf(filename=filename)
        # Report success only after the download call has returned.
        print(f'Downloaded paper with {filename}')

In [10]:
# Main loop example:
# download recent articles, let the user rate the first few, then suggest and
# save the not-yet-shown article whose abstract overlaps most with the liked ones.
NUM_ARTICLES_TO_DOWNLOAD = 50
# Used twice below and must be the same in both places: it is the number of
# articles offered for rating and the offset from which candidates are ranked.
NUM_ARTICLES_TO_SHOW = 8

info = download_articles_of_day(day=None, num_articles=NUM_ARTICLES_TO_DOWNLOAD)
intreseting = show_subset_of_articles(info, num_articles_to_show=NUM_ARTICLES_TO_SHOW, only_title=True)
# Requires at least one article to have been marked "yes" above.
interesting_article_index = most_likely_interesting_articles(info, intreseting, NUM_ARTICLES_TO_SHOW, trivial_words_list_default)
titles, summaries, ids = info
save_articles_as_pdf([interesting_article_index], ids)

Aritlce Number 0
Constraining the onset density for the QCD phase transition with the
  neutrino signal from core-collapse supernovae



Is this article interesting? (yes/no/stop):  yes


Aritlce Number 1
Time-Asymmetric Protocol Optimization for Efficient Free Energy
  Estimation



Is this article interesting? (yes/no/stop):  yes


Aritlce Number 2
Imaging 3D Chemistry at 1 nm Resolution with Fused Multi-Modal Electron
  Tomography



Is this article interesting? (yes/no/stop):  no


Aritlce Number 3
A Cookbook of Self-Supervised Learning



Is this article interesting? (yes/no/stop):  yes


Aritlce Number 4
Colour-Flavour Locked Quark Stars in Light of the Compact Object in HESS
  J1731-347 and the GW190814 Event



Is this article interesting? (yes/no/stop):  stop


The next most likely interesting article is:

Reconstruction of an Observationally Constrained $f(R, T)$ gravity model
  In this paper, an attempt is made to construct a
Friedmann-Lemaitre-Robertson-Walker model in $f(R,T)$ gravity with a perfect
fluid that yields acceleration at late times. We take $f(R,T)$ as $R$ + $8\pi
\mu T$. As in the $\Lambda$CDM model, we take the matter to consist of two
components, viz., $\Omega_m$ and $\Omega_{\mu}$ such that $\Omega_m$ +
$\Omega_{\mu}$=1. The parameter $\Omega_m$ is the matter density (baryons +
dark matter), and $\Omega_{\mu}$ is the density associated with the Ricci
scalar $R$ and the trace $T$ of the energy momentum tensor, which we shall call
dominant matter. We find that at present $\Omega_{\mu}$ is dominant over
$\Omega_m$, and that the two are in the ratio 3:1 to 3:2 according to the three
data sets: (i) 77 Hubble OHD data set (ii) 580 SNIa supernova distance modulus
data set and (iii) 66 pantheon SNIa data which include high red shi