In [21]:
## “Thank you to arXiv for use of its open access interoperability.”

In [88]:
from bs4 import BeautifulSoup
import urllib, urllib.request
import re
import numpy as np
from datetime import date
import datetime

In [89]:
def download_articles_of_day(day=None, num_articles=10):
    """
    Downloads the titles and summaries of a given date in the condensed matter category. 
    If no inputs are given, it will do today's date and 10 articles
    
    Parameters
    ----------
    day : date.time format
        A given date
    num_articles : int
                    The number of articles requested
    
    Returns
    -------
    titles : bs4 ResultSet
            Contains all titles requested, can be indexed with []
    summaries : bs4 ResultSet
                Contains all abstracts belonging to the titles. 
    
    """
    if day==None:
        day = date.today()

    tomorrow=day+datetime.timedelta(days=1)
    url = f'http://export.arxiv.org/api/query?search_query=all:condensed%20matter&submittedDate:[{day}+TO+{tomorrow}]&start=0&max_results=10&sortBy=submittedDate&sortOrder=descending'
    data = urllib.request.urlopen(url)
    # summaries = 
    Bsoup = BeautifulSoup(data, 'html.parser')
    titles,summaries =  Bsoup.find_all('title'), Bsoup.find_all('summary')

    return titles, summaries

In [90]:
titles, summaries = download_articles_of_day(day=None, num_articles=10)



In [104]:
trivial_words_list_default = ['and','a','the','','of','in','on','it','after','for','ever','never','since','at','to','too']

def filter_out_non_trivial_words(wordlist,trivial_words_list=trivial_words_list_default):
    '''
    Function that removes trivial words from a list of words. 
    Input:
    wordlist            = list of strings, may be lowercase or uppercase 
    trivial_words_list  = list of lowercase strings that should be removed from input "wordlist"
    Ouput:
    wordlist_nontrivial = list of strings where trivial words are removed.
    '''
    wordlist_nontrivial = []
        
    for i in range(len(wordlist)):
        word =  wordlist[i]

        ## check if word is a trivial word
        ## conver input words to lowercase and remove '\n' characters (if present)
        if word.lower().strip('\n') not in trivial_words_list:
            wordlist_nontrivial.append(word.strip('\n'))

    return wordlist_nontrivial

In [105]:
### For every paper, filter trivial words from the title

def filter_nontrivial_words_from_papers(input_data,trivial_words_list=trivial_words_list_default):
    
    """
    A list of unsplitted titles/abstracts for every paper.
    For each paper the title/abstract is splitted and all non-trivial words are removed.
    Output is a list of words for every paper
    
    Parameters
    ----------
    input_data : list of of unsplitted titles/abstracts for every paper.
    trivial_words_list : list of lowercase trivial words that should be removed 
    
    Returns
    -------
    list of non-trivial words for every abstract/title of every paper.
    
    """
    
    ### input data is splitted.
    splitted_strings_list = []
    for i in range(len(input_data)):
        title = input_data[i]

        ## skip the first element
        if i > 0:
            # print(title.string)
            ## seperate string on ' ' & '\n' characters
            string_splitted = re.split(' ', title.string)
            string_splitted = [ re.split('\n', element)[0]  for element in string_splitted ]
            # print(string_splitted)
            splitted_strings_list.append(string_splitted)
            
    
    print('------')
    ### from this data trivial words are removed. 
    wordlist_nontrivial_paper = []

    for index_paper in range(len(splitted_strings_list)):
        wordlist            = splitted_strings_list[index_paper]
        wordlist_nontrivial = filter_out_non_trivial_words(wordlist,trivial_words_list)

        print(wordlist_nontrivial)
        wordlist_nontrivial_paper.append(wordlist_nontrivial)
        
    return wordlist_nontrivial_paper

In [106]:
# summaries[0]

# # print(summaries[0].string)
# print(re.split(' ', summaries[0].string))

In [110]:
title_paper_wordlist_nontrivial_paper = filter_nontrivial_words_from_papers(titles,trivial_words_list=trivial_words_list_default)

------
['Asymmetric', 'Thermal', 'Relaxation', 'Driven', 'Systems:', 'Rotations', 'Opposite', 'Ways']
['Josephson-like', 'tunnel', 'resonance', 'large', 'Coulomb', 'drag', 'GaAs-based', 'electron-hole', 'bilayers']
['Addressing', 'self-interaction', 'ELDER', 'dark', 'matter', 'from', '21-cm', 'signal']
['$f(\\mathcal{G},\\mathrm{\\textit{T}})$', 'Gravity', 'Bouncing', 'Universe', 'with', 'Cosmological', 'Parameters']
['Chaotic', 'interactions', 'between', 'dark', 'matter', 'dark', 'energy']
['anti-correlation', 'between', 'pericentric', 'distance', 'inner', 'dark', 'matter', 'density', 'Milky', "Way's", 'dwarf', 'spheroidal', 'galaxies']
['Finite', 'Temperature', 'Dynamics', 'Spin', 'Solitons', 'with', 'Applications', 'Thermocouples', 'Refrigerators']
['use', 'dielectric', 'elements', 'axion', 'searches', 'with', 'microwave', 'resonant', 'cavities']
['Theoretical', 'study', 'competition', 'between', 'folding', 'contact', 'interactions', 'properties', 'polymers', 'using', 'self-avoid', 

In [112]:
trivial_words_list_default = ['and','by','a','the','',  '','of','in','on','it','after',
                              'for','ever','never','since','at','to','too','e.g.','are',
                              'nm','is','as','we','i','go','not','can','be','that']

summaries_papers_wordlist_nontrivial = filter_nontrivial_words_from_papers(summaries,trivial_words_list=trivial_words_list_default)

------
['Bilayers', 'consisting', 'two-dimensional', '(2D)', 'electron', 'hole', 'gases', 'separated', '10', 'thick', 'AlGaAs', 'barrier', 'formed', 'charge', 'accumulation', 'grown', 'GaAs.', 'Both', 'vertical', 'lateral', 'electric', 'transport', 'millikelvin', 'temperature', 'range.', 'conductivity', 'between', 'shows', 'sharp', 'tunnel', 'resonance', 'density', '$1.1', '\\cdot', '10^{10}', '\\text{', 'which', 'consistent', 'with', 'Josephson-like', 'enhanced', 'tunnel', 'tunnel', 'resonance', 'disappears', 'with', 'increasing', 'densities', '2D', 'charge', 'gases', 'start', 'show', '2D-Fermi-gas', 'behavior.', 'Interlayer', 'persist', 'causing', 'positive', 'drag', 'voltage', 'very', 'large', 'densities.', 'transition', 'from', 'Josephson-like', 'tunnel', 'resonance', 'behavior', 'interpreted', 'phase', 'transition', 'from', 'an', 'exciton', 'gas', 'Bose-Einstein-condensate', 'state', 'degenerate', 'electron-hole', 'Fermi', 'gas.']
['self-interacting', 'dark', 'matter', 'affect', '