# Arxiv Explorer Tools - minimal weighted match
- Fast: ~5-10 sec to run vs. 5-10 min for embedding or TFIDF versions.
- multi-topic: use as many pre-set searches as you want
- extracts articles on topics of interest from the too-many-to-look-through pages of new articles that come out each day.
- saves results to json (for automation later) and html (for easy reading and linking)
- minimal weighted match uses a list of phrases and an integer weight for each
- arxiv reading uses 'beautiful soup'

### Setup & Install:
- have Python installed and use a Python env
- use a jupyter notebook or script, etc.
- for specialty topics you can create extensive weighted search profiles.

  


- https://pypi.org/project/beautifulsoup4/

requirements.txt ->
```
scikit-learn
scipy
numpy
beautifulsoup4
requests
```

In [1]:
import time
from datetime import datetime

# Wall-clock start of the whole run; the final timer cell passes this to
# duration_min_sec() together with its own end time.
start_time_whole_single_task = datetime.now()
# end_time_whole_single_task = datetime.now()


def duration_min_sec(start_time, end_time):
    """
    Formats the time elapsed between two datetimes as a short label.

    Parameters:
    start_time (datetime): When the timed work began.
    end_time (datetime): When the timed work finished.

    Returns:
    str: A label of the form "<minutes>_min__<seconds>_sec", with seconds
    shown to one decimal place, e.g. "1_min__5.4_sec".
    """

    # Round to the displayed precision BEFORE splitting into minutes and
    # seconds. Splitting first lets e.g. 59.96 s render as "0_min__60.0_sec"
    # instead of rolling over to "1_min__0.0_sec".
    total_seconds = round((end_time - start_time).total_seconds(), 1)

    whole_minutes, leftover_seconds = divmod(total_seconds, 60)

    return f"{int(whole_minutes)}_min__{leftover_seconds:.1f}_sec"

# # start_time_whole_single_task = datetime.now()
# end_time_whole_single_task = datetime.now()
# duration_time = duration_min_sec(start_time_whole_single_task, end_time_whole_single_task)
# print(f"Duration to run -> {duration_time}")

# minimal weighted matching code

In [2]:
# import math
# from collections import Counter


# And an even more simplistic basic key word search (with optional weights)

import re

def _normalize_for_match(text):
    """Lowercases text and removes every character that is not a letter, digit, or underscore."""
    # \W covers punctuation/symbols AND all whitespace (spaces, newlines,
    # tabs, carriage returns), so a phrase still matches when the source
    # text wraps or is tab-separated in the middle of it.
    return re.sub(r'\W+', '', text.lower())


def rank_documents_on_weighted_matches(documents, keyword_weights):
    """
    Ranks documents based on the presence of weighted keyword-phrases.

    The comparison is a plain substring test on normalized text, i.e. text
    without:
    - capitalization
    - spaces, newlines, tabs, or any other whitespace
    - special symbols / punctuation

    Each keyword-phrase counts at most once per document, no matter how many
    times it occurs. Because matching is by substring, a short keyword also
    matches inside longer words (e.g. "graph" matches "cryptography").

    Parameters:
    documents (list of str): The list of documents to be ranked.
    keyword_weights (list of tuple): A list of (keyword, weight) tuples, where
    the first element is the keyword-phrase and the second element is the
    corresponding numeric weight.

    Returns:
    list of (str, number): A list of (document, score) tuples sorted by score,
    highest first. The document is the original, un-normalized string.
    Documents with equal scores keep their input order.
    """

    # Normalize every keyword once up front instead of once per document.
    # A keyword that normalizes to "" (e.g. only punctuation) is dropped:
    # the empty string is a substring of everything, so it would add its
    # weight to every document.
    normalized_keywords = []
    for keyword, weight in keyword_weights:
        match_keyword = _normalize_for_match(keyword)
        if match_keyword:
            normalized_keywords.append((match_keyword, weight))

    ranked_documents = []

    for document in documents:
        match_document = _normalize_for_match(document)

        # Add the weight of every keyword-phrase found in the document
        score = sum(
            weight
            for match_keyword, weight in normalized_keywords
            if match_keyword in match_document
        )

        ranked_documents.append((document, score))

    # Sort the documents by their ranking scores in descending order
    ranked_documents.sort(key=lambda x: x[1], reverse=True)

    return ranked_documents


# ################
# # Example usage
# ################
# corpus = [
#     "This is the first document about machine learning.",
#     "The second document discusses data analysis and visualization.",
#     "The third document focuses on natural language processing.",
#     "The fourth document talks about deep learning and neural networks.",
#     """to test line breaks
#     Emotion mining
#      data
#     analysis
#     Keywords: emotion mining, sentiment analysis, natural disasters, psychology, technological disasters""",
# ]

# keyword_weights = [("machine learning", 3), ("data analysis", 2), ("natural language processing", 4), ("deep learning", 5), ("neural networks", 6)]

# ranked_documents = rank_documents_on_weighted_matches(corpus, keyword_weights)

# for document, score in ranked_documents:
#     print(f"Document: {document}\nScore: {score}\n")


# Arxiv Explorer


In [3]:
###################
# Arxiv Explorer
###################

# step 1: embed the search-phrase
# step 2: embed each text
# step 3: get scores
# step 4: evaluate whether the score is a success or a fail
# step 5: if success: do stuff with text


import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime


# Restart the whole-task timer here, replacing any value assigned to this
# name by an earlier cell.
start_time_whole_single_task = datetime.now()


# ##########################################
# # Make comparison phrase and vectorize it
# ##########################################
# comparison_phrase = "computer vision resolution enhancement"
# # comparison_phrase = "cyber security"
# # comparison_phrase = "natural language processing"


# Get Article Corpus

In [4]:
start_segment_time = datetime.now()

#####################
# Get Article Corpus
#####################

# List to hold all article data
# NOTE(review): nothing in this cell fills this list; kept so the name stays defined.
article_data = []

# Fetch the listing page ONCE. A timeout keeps the cell from hanging on a
# stalled connection, and raise_for_status() stops us from parsing an HTTP
# error page as if it were the article list.
url = "https://arxiv.org/list/cs/new"
response = requests.get(url, timeout=30)
response.raise_for_status()

# `r` used to hold a second, identical download of the same page; it now
# aliases the single response so the name is still defined.
r = response

soup = BeautifulSoup(response.text, 'html.parser')

# Find all the articles (each <dt> carries the id and the download links)
articles = soup.find_all('dt')

# Find all the titles
articles_title = soup.find_all('div', {'class': 'list-title mathjax'})

# Find all the <dd> entries on the page (subjects and abstract live here)
articles_subject = soup.find_all('dd')


###############
# make corpus
###############

report_list = []
article_dicts = []

# The listing shows ids as "arXiv:<id>"; the /abs/ URL wants the bare id.
arxiv_id_prefix = "arXiv:"

for this_index, article in enumerate(articles):

    ################################################
    # Extract each field of data about each article
    ################################################

    # Extract the title
    title = articles_title[this_index].text.split('Title:')[1].strip()

    # Extract the subjects
    subjects = articles_subject[this_index].find('span', {'class': 'primary-subject'}).text

    # Extract the arxiv id as displayed (prefix included)
    arxiv_id = article.find('a', {'title': 'Abstract'}).text.strip()

    # Extract the abstract (empty string when the entry has none)
    abstract_p = article.find_next_sibling('dd').find('p', {'class': 'mathjax'})
    abstract = abstract_p.text.strip() if abstract_p is not None else ""

    # An entry without a PDF anchor gets an empty link instead of
    # aborting the whole scrape.
    pdf_anchor = article.find('a', {'title': 'Download PDF'})
    pdf_link = f"https://arxiv.org{pdf_anchor['href']}" if pdf_anchor is not None else ""

    # Strip the prefix only when it is actually there, rather than
    # blindly dropping the first six characters.
    if arxiv_id.startswith(arxiv_id_prefix):
        bare_arxiv_id = arxiv_id[len(arxiv_id_prefix):]
    else:
        bare_arxiv_id = arxiv_id
    paper_link = f"https://arxiv.org/abs/{bare_arxiv_id}"

    # assemble the searchable/printable text for this article;
    # the leading "<index>||||" is how print_and_save() finds the matching
    # entry in article_dicts again, so it must equal the list position.
    article_characters = (
        f"{this_index}|||| "
        f"\n'arxiv_id': {arxiv_id}, "
        f"\n'paper_link': {paper_link}, "
        f"\n'pdf_link': {pdf_link}, "
        "\nTitle: " + title + " "
        "\nSubjects: " + subjects + " "
        "\nAbstract: " + abstract
    )

    ##################################
    # Make Bundles (sharing an index)
    ##################################

    # add to simple report_list: includes link and article ID info
    report_list.append(article_characters)

    # Append the structured data to the list
    article_dicts.append({
        'title': title,
        'abstract': abstract,
        'paper_link': paper_link,
        'pdf_link': pdf_link,
        'subjects': subjects,
        'arxiv_id': arxiv_id,
        'article_sequence_index': this_index,
    })

# using this because only basic search works
# (assigned once, after the loop, instead of on every iteration)
corpus = report_list


# # Segment Timer
# start_segment_time = datetime.now()
end_segment_time = datetime.now()
duration_time = duration_min_sec(start_segment_time, end_segment_time)
print(f"Duration to run segment -> {duration_time}")

Duration to run segment -> 0_min__5.4_sec


In [5]:
# inspection (size of corpus)
len(corpus)

537

# print and save: code

In [6]:
from datetime import datetime


def print_and_save(ranked_documents, top_n, name_of_set):
    """
    Prints the best matches and saves them to a JSON file and an HTML file.

    Walks ranked_documents in the order given and keeps up to top_n documents
    whose score is not 0. Each kept document is printed, and its full record
    is looked up in the module-level article_dicts list using the integer
    that precedes '||||' at the start of the document string.

    Both files are written to the current working directory, named after
    name_of_set plus a timestamp. They are written even when nothing matched
    (the JSON then holds an empty list).

    Parameters:
    ranked_documents (list of tuple): (document, score) pairs, best first.
    top_n (int): Maximum number of documents to print and save. A value of 0
    or less saves none.
    name_of_set (str): Label used as the file-name prefix for this result set.

    Returns:
    None
    """
    # Function-scope import so this cell does not depend on another cell
    # having imported it.
    from html import escape

    # make readable time for the file names
    clean_timestamp = datetime.now().strftime('%Y-%m-%d__%H%M%S%f')

    results_json_list = []

    for document, score in ranked_documents:

        # Check the limit before adding, so top_n <= 0 really saves nothing.
        if len(results_json_list) >= top_n:
            break

        # A score of 0 means no keyword matched: such a document is not a
        # result, so it is neither printed nor saved.
        if score == 0:
            continue

        print(f"Document: {document}\nScore: {score}\n")

        # The document string starts with "<index>||||"; that index is the
        # position of the article's record in article_dicts.
        this_index = int(document.split('||||')[0])

        results_json_list.append(article_dicts[this_index])

    #############
    # Write Data
    #############

    # Save the data to a JSON file
    with open(f'{name_of_set}_articles_{clean_timestamp}.json', 'w', encoding='utf-8') as f:
        json.dump(results_json_list, f)

    # Create an HTML file. Every value comes from a scraped web page, so it
    # is escaped before being placed in markup or in an href attribute.
    html_parts = ['<html><head><meta charset="utf-8"></head><body>']
    for article in results_json_list:
        title = escape(str(article["title"]))
        abstract = escape(str(article["abstract"]))
        subjects = escape(str(article["subjects"]))
        paper_link = escape(str(article["paper_link"]))
        pdf_link = escape(str(article["pdf_link"]))
        arxiv_id = escape(str(article["arxiv_id"]))
        sequence_index = escape(str(article["article_sequence_index"]))

        html_parts.append(f'<h2><a href="{paper_link}">{title}</a></h2>')
        html_parts.append(f'<p>{abstract}</p>')
        html_parts.append(f'<p>Subjects: {subjects}</p>')

        html_parts.append(f'<a href="{paper_link}">{paper_link}</a>')
        html_parts.append(f'<p>paper link: {paper_link}</p>')

        html_parts.append(f'<a href="{pdf_link}">{pdf_link}</a>')
        html_parts.append(f'<p>pdf link: {pdf_link}</p>')

        html_parts.append(f'<p>arxiv id: {arxiv_id}</p>')
        html_parts.append(f'<p>article_sequence_index id: {sequence_index}</p>')

    html_parts.append('</body></html>')

    # Save the HTML to a file
    # NOTE(review): unlike the JSON name, this name has no underscore between
    # "articles" and the timestamp; left as-is so existing file names and
    # anything that globs for them keep working.
    with open(f'{name_of_set}_articles{clean_timestamp}.html', 'w', encoding='utf-8') as f:
        f.write(''.join(html_parts))

# Find top-n articles: use keyword/weights

In [7]:
# Max Results Returned
# Max Results Returned (per search profile)
top_n = 3

# One inner list per search profile. Each entry is (keyword-phrase, weight).
# The FIRST phrase of a profile doubles as the name of its result set and
# therefore as the prefix of its output file names.
list_of_lists_of_weights = [
    # # keyword_weights =
    # [
    #     ("computer vision", 3),
    #     ("resolution", 2),
    #     # ("natural language processing", 4),
    #     # ("deep learning", 5),
    #     ("neural networks", 6),
    # ],

    # # keyword_weights =
    [
        ("benchmark", 5),
        ("model evaluation", 5),
        ("test", 2),
        ("measure", 2),
    ],


    # # keyword_weights =
    [
        ("training set", 5),
        ("synthetic", 2),
        ("generate", 2),
        ("measure", 2),
    ],

    # keyword_weights =
    [
        ("graph", 5),
        ("graph generation", 8),
        ("subgraph", 2),
        ("hierarchical graph", 2),
        ("embedding", 2),
        ("knowledge graph", 2),

        ("graph neural networks", 2),
        ("graph representation", 2),
        ("node", 2),
         ## collisions: cryptograph, geograph,
    ],

]

for keyword_weights in list_of_lists_of_weights:

    # An empty profile has no first phrase to name the result set after.
    if not keyword_weights:
        continue

    ranked_documents = rank_documents_on_weighted_matches(corpus, keyword_weights)

    # use the first list item's phrase as the name of the set
    # (read from the profile being iterated, not re-indexed by a counter)
    name_of_set = keyword_weights[0][0]
    print(f"\n\nSet: {name_of_set}")

    print_and_save(ranked_documents, top_n, name_of_set)



Set: benchmark
Document: 413|||| 
'arxiv_id': arXiv:2403.12844, 
'paper_link': https://arxiv.org/abs/2403.12844, 
'pdf_link': https://arxiv.org/pdf/2403.12844, 
Title: MELTing point: Mobile Evaluation of Language Transformers 
Subjects: Machine Learning (cs.LG) 
Abstract: Transformers have revolutionized the machine learning landscape, gradually making their way into everyday tasks and equipping our computers with "sparks of intelligence". However, their runtime requirements have prevented them from being broadly deployed on mobile. As personal devices become increasingly powerful and prompt privacy becomes an ever more pressing issue, we explore the current state of mobile execution of Large Language Models (LLMs). To achieve this, we have created our own automation infrastructure, MELT, which supports the headless execution and benchmarking of LLMs on device, supporting different models, devices and frameworks, including Android, iOS and Nvidia Jetson devices. We evaluate popular i

In [8]:
# See files
!ls

 benchmark_articles2024-07-29__114407837432.html
 benchmark_articles_2024-07-29__114407837432.json
 graph_articles2024-07-29__114407991894.html
 graph_articles_2024-07-29__114407991894.json
 sample_data
'training set_articles2024-07-29__114407902903.html'
'training set_articles_2024-07-29__114407902903.json'


# Final Timer

In [9]:
# Stop the whole-task clock and report how long the run took.
end_time_whole_single_task = datetime.now()
duration_time = duration_min_sec(
    start_time_whole_single_task,
    end_time_whole_single_task,
)
print(f"Duration to run -> {duration_time}")

Duration to run -> 0_min__6.1_sec
