In [113]:
## “Thank you to arXiv for use of its open access interoperability.”

In [1]:
from bs4 import BeautifulSoup
import urllib, urllib.request
import re
import numpy as np
from datetime import date
import datetime

In [2]:
### To install: open terminal and run: "pip install arxiv"
## For saving paper's pdfs
import arxiv

ModuleNotFoundError: No module named 'arxiv'

In [4]:
# paper = next(arxiv.Search(id_list=["1605.08386v1"]).results())
# paper.download_pdf(filename="downloaded-paper.pdf")

In [3]:
def download_articles_of_day(day=None, num_articles=10):
    """
    Downloads the titles and summaries of a given date in the condensed matter category. 
    If no inputs are given, it will do today's date and 10 articles
    
    Parameters
    ----------
    day : date.time format
        A given date
    num_articles : int
                    The number of articles requested
    
    Returns
    -------
    titles : bs4 ResultSet
            Contains all titles requested, can be indexed with []
    summaries : bs4 ResultSet
                Contains all abstracts belonging to the titles. 
    
    """
    if day==None:
        day = date.today()

    tomorrow=day+datetime.timedelta(days=1)
    url = f'http://export.arxiv.org/api/query?search_query=all:condensed%20matter&submittedDate:[{day}+TO+{tomorrow}]&start=0&max_results={num_articles}&sortBy=submittedDate&sortOrder=descending'
    data = urllib.request.urlopen(url)
    Bsoup = BeautifulSoup(data, 'html.parser')
    titles,summaries,ids =  Bsoup.find_all('title'), Bsoup.find_all('summary'),Bsoup.find_all('id')

    return titles, summaries,ids

In [4]:
titles, summaries,ids = download_articles_of_day(day=None, num_articles=1)



In [5]:
info = download_articles_of_day(day=None, num_articles=20)

In [16]:
def show_subset_of_articles(todays_articles, num_articles_to_show=4):
    
    titles, summaries, _ = todays_articles
    interesting_articles = []
    
    for i in range(num_articles_to_show): 
        print(f'Aritlce Number {i}')
        print(titles[i+1].text)
        print(summaries[i].text)
        print()
        while True:
            answer = input("Is this article interesting? (yes/no/stop): ")
            if answer == 'yes':
                interesting_articles.append(i)
                break
            elif answer == 'no':
                break
            elif answer == 'stop':
                return interesting_articles
            else:
                print("Please enter yes, no, or stop")
        
    return interesting_articles

In [17]:
intreseting = show_subset_of_articles(info)

Aritlce Number 0
Universal scaling function ansatz for finite-temperature jamming
  We cast a nonzero-temperature analysis of the jamming transition into the
framework of a scaling ansatz. We show that four distinct regimes for scaling
exponents of thermodynamic derivatives of the free energy such as pressure,
bulk and shear moduli, can be consolidated by introducing a universal scaling
function with two branches. Both the original analysis and the scaling theory
assume that the system always resides in a single basis in the energy
landscape. The two branches are separated by a line $T^*(\Delta \phi)$ in the
$T-\Delta \phi$ plane, where $\Delta \phi=\phi-\phi_c^\Lambda$ is the deviation
of the packing fraction from its critical, jamming value, $\phi_c^\Lambda$, for
that basin. The branch for $T<T^*(\Delta \phi)$ reduces at $T=0$ to an earlier
scaling ansatz that is restricted to $T=0$, $\Delta \phi \ge 0$, while the
branch for $T>T^*(\Delta \phi)$ reproduces exponents observed for ther

In [21]:
print(intreseting)

[0, 2, 3]


In [32]:
trivial_words_list_default = ['and','a','the','','of','in','on','it','after','for','ever','never','since','at','to','too']

def filter_out_non_trivial_words(wordlist,trivial_words_list=trivial_words_list_default):
    '''
    Function that removes trivial words from a list of words. 
    Input:
    wordlist            = list of strings, may be lowercase or uppercase 
    trivial_words_list  = list of lowercase strings that should be removed from input "wordlist"
    Ouput:
    wordlist_nontrivial = list of strings where trivial words are removed.
    '''
    wordlist_nontrivial = []
        
    for i in range(len(wordlist)):
        word =  wordlist[i]

        ## check if word is a trivial word
        ## conver input words to lowercase and remove '\n' characters (if present)
        if word.lower().strip('\n') not in trivial_words_list:
            wordlist_nontrivial.append(word.strip('\n'))

    return wordlist_nontrivial

In [102]:
### For every paper, filter trivial words from the title

def filter_nontrivial_words_from_papers(input_data,trivial_words_list=trivial_words_list_default):
    
    """
    A list of unsplitted titles/abstracts for every paper.
    For each paper the title/abstract is splitted and all non-trivial words are removed.
    Output is a list of words for every paper
    
    Parameters
    ----------
    input_data : list of of unsplitted titles/abstracts for every paper.
    trivial_words_list : list of lowercase trivial words that should be removed 
    
    Returns
    -------
    list of non-trivial words for every abstract/title of every paper.
    
    """
    
    ### input data is splitted.
    splitted_strings_list = []
    for i in range(len(input_data)):
        title = input_data[i]

        ## skip the first element
        if i > 0:
            # print(title.string)
            ## seperate string on ' ' & '\n' characters
            string_splitted = re.split(' ', title.string)
            string_splitted = [ re.split('\n', element)[0]  for element in string_splitted ]
            # print(string_splitted)
            splitted_strings_list.append(string_splitted)
            
    
    print('------')
    ### from this data trivial words are removed. 
    wordlist_nontrivial_paper = []

    for index_paper in range(len(splitted_strings_list)):
        wordlist            = splitted_strings_list[index_paper]
        wordlist_nontrivial = filter_out_non_trivial_words(wordlist,trivial_words_list)

        print(wordlist_nontrivial)
        wordlist_nontrivial_paper.append(wordlist_nontrivial)
        
    return wordlist_nontrivial_paper

In [109]:

def get_arxivID_of_papers(ids):
    
    """
    A list of internet links including the ArXiv ids is converted to a list with only the arxiv ids.
    
    Parameters
    ----------
    ids : list of internet links including the ArXiv ids for every paper.
    
    
    Returns
    -------
    list of ArXiv ids according to the format: "YYMM.NNNNNv{versionumber}" for every abstract/title of every paper. YY = year, MM = month. NNNNN = som identifier number 
    
    """
    
    
    idlist_total = []
    for i in range(len(ids)):
            id1 = ids[i]

            ## skip the first element
            if i > 0:
                # print(id1.string)
                ## seperate string on '/' characters
                string_seperated = re.split('/', id1.string)
                idlist_pdf = idlist_converted[4]
                # print(idlist_pdf)

                #### ArXiv id is given by: YYMM.NNNNNv1 therefore when seperate it should have length 14.
                ### See: https://info.arxiv.org/help/arxiv_identifier.html#new
                numberlist = re.split('',idlist_pdf)
                assert len(numberlist) == 14, 'This is not the ArXiv id, pdf file cannot be saved'

                idlist_total.append(idlist_pdf)

                # year_index  = 10*int(numberlist[1]) + int(numberlist[2])
                # month_index = 10*int(numberlist[3]) + int(numberlist[4])
                
                
    return idlist_total


def save_articles_as_pdf(index_article_list,
                         ids,save_papers=
                        False):
    
    
    """
    Saves interesting papers  as .pdf file in the current repository.
    
    Parameters
    ----------
    index_article_list : list of indices corresponding to the papers that should be saved as pdf.
    ids         : list of internet links including the ArXiv ids for every paper.
    save_papers : boolean, if True papers are saved. If False they are not saved.
    
    Returns
    -------
    prints article's filename of the saved papers.
    
    """
    
    
    idlist_total = get_arxivID_of_papers(ids)
    
    ## Some checks
    assert max(index_article_list) <= len(ids), 'Index of paper does not exist. All indices should be smaller than {} '.format(len(ids))
    assert min(index_article_list) >= 0, 'Negative index for paper is not allowed.'
    
    for paper_index in index_article_list:
        article_id  = idlist_total[paper_index]
        paper = next(arxiv.Search(id_list=[article_id]).results())
        
        ## name of file
        filename = 'paper_' + str(paper_index) + '_' + article_id + '.pdf'
        print(filename)
        if save_papers == True:
            paper.download_pdf(filename=filename)
            

In [110]:
# summaries[0]

# # print(summaries[0].string)
# print(re.split(' ', summaries[0].string))

In [111]:
title_paper_wordlist_nontrivial_paper = filter_nontrivial_words_from_papers(titles,trivial_words_list=trivial_words_list_default)

------
['Symmetry', 'Fractionalized', '(Irrationalized)', 'Fusion', 'Rules', 'Two', 'Domain-Wall', 'Verlinde', 'Formulae']
['Signatures', 'Fractional', 'Quantum', 'Anomalous', 'Hall', 'States', 'Twisted', 'MoTe2', 'Bilayer']
['Effective', 'electric', 'field:', 'quantifying', 'sensitivity', 'searches', 'new', 'P,T-odd', 'physics', 'with', 'EuCl$_3\\cdot$6H$_2$O']
['Big', 'Bang', 'initial', 'conditions', 'self-interacting', 'hidden', 'dark', 'matter']
['Multi-Purpose', 'Platform', 'Analog', 'Quantum', 'Simulation']
['Exciton', 'band', 'structure', 'V$_2$O$_5$']
['first', 'application', 'machine', 'deep', 'learning', 'background', 'rejection', 'ALPS', 'II', 'TES', 'detector']
['Search', 'gravitational-lensing', 'signatures', 'full', 'third', 'observing', 'run', 'LIGO-Virgo', 'network']
['Detached', 'Continuous', 'Circumstellar', 'Matter', 'Type', 'Ibc', 'Supernovae', 'from', 'Mass', 'Eruption']
['NvDEx-100', 'Conceptual', 'Design', 'Report']


In [112]:
trivial_words_list_default = ['and','by','a','the','',  '','of','in','on','it','after',
                              'for','ever','never','since','at','to','too','e.g.','are',
                              'nm','is','as','we','i','go','not','can','be','that']

summaries_papers_wordlist_nontrivial = filter_nontrivial_words_from_papers(summaries,trivial_words_list=trivial_words_list_default)

------
['interplay', 'between', 'spontaneous', 'symmetry', 'breaking', 'topology', 'result', 'exotic', 'quantum', 'states', 'matter.', 'celebrated', 'example', 'quantum', 'Hall', '(QAH)', 'state,', 'which', 'exhibits', 'an', 'integer', 'quantum', 'Hall', 'effect', 'magnetic', 'field', 'thanks', 'its', 'intrinsic', 'ferromagnetism.', 'presence', 'electron-electron', 'interactions,', 'exotic', 'fractional-QAH', '(FQAH)', 'states', 'magnetic', 'field', 'emerge.', 'These', 'states', 'could', 'host', 'fractional', 'excitations,', 'non-Abelian', 'anyons', '-', 'crucial', 'building', 'blocks', 'topological', 'quantum', 'Flat', 'Chern', 'bands', 'widely', 'considered', 'desirable', 'venue', 'FQAH', 'state.', 'this', 'purpose,', 'twisted', 'transition', 'metal', 'homobilayers', 'rhombohedral', 'stacking', 'have', 'recently', 'been', 'promising', 'material', 'platform.', 'Here,', 'report', 'experimental', 'FQAH', 'states', '3.7-degree', 'twisted', 'MoTe2', 'bilayer.', 'Magnetic', 'dichroism', 'm

In [119]:
idlist_total = get_arxiv_id_papers(ids)


save_articles_as_pdf(index_article_list =[0],ids=ids,save_papers=False)

paper_0_2304.08362v1.pdf
