# Arxiv Explorer Tools - minimal weighted match
- Fast: ~5-10 sec to run vs. 5-10 min for embedding or TFIDF versions.
- multi-topic: use as many pre-set seaches as you want
- extracts articles on topics of interest from the too-many-to-look-through daily pages of articles that come out each day.
- saves results to json (for automation later) and html (for easy reading and linking)
- minimal weighted match uses a list of phrases and an integer weight for each
- arxiv reading uses 'beautiful soup'

### Setup & Install:
- have python installed and use an python env
- use a jupyter notebook or script, etc.
- for specialty topics you can create extensive weighted search profiles.

### See:
- https://medium.com/@GeoffreyGordonAshbrook/search-with-non-generative-ai-d0a3cc77164b
- https://github.com/lineality/arxiv_explorer_tools


- https://pypi.org/project/beautifulsoup4/

requirements.txt ->
```
scikit-learn
scipy
numpy
beautifulsoup4
```

In [1]:
import time
from datetime import datetime

start_time_whole_single_task = datetime.now()
# end_time_whole_single_task = datetime.now()


def duration_min_sec(start_time, end_time):

    duration = end_time - start_time

    duration_seconds = duration.total_seconds()

    minutes = int(duration_seconds // 60)
    seconds = duration_seconds % 60
    time_message = f"{minutes}_min__{seconds:.1f}_sec"

    return time_message

# # start_time_whole_single_task = datetime.now()
# end_time_whole_single_task = datetime.now()
# duration_time = duration_min_sec(start_time_whole_single_task, end_time_whole_single_task)
# print(f"Duration to run -> {duration_time}")

# minimal weighted matching code

In [2]:
# import math
# from collections import Counter


# And an even more simplistic basic key word search (with optional weights)

import re

def rank_documents_on_weighted_matches(documents, keyword_weights):
    """
    Ranks documents based on the presence of weighted keywords-phrases.
    comparison looks at text without:
    - captialization
    - spaces
    - newlines
    - special symbols

    Parameters:
    documents (list of str): The list of documents to be ranked.
    keyword_weights (list of tuple): A list of tuples, where the first element is the keyword and the
    second element is the corresponding weight.

    Returns:
    list of (str, float): A list of tuples, where the first element is the document and the
    second element is the ranking score.
    """
    """
    string cleaning steps:
    - lower
    - strip extra spaces
    - remove symbols
    - remove newlines

    """

    ranked_documents = []

    for document in documents:
        score = 0
        # Make the document lowercase and strip all symbols, spaces, and newline characters
        match_this_cleaned_document = re.sub(r'[^\w\s]', '', document.lower()).replace('\n', '').replace(' ','')
        # print(match_this_cleaned_document)
        for keyword, weight in keyword_weights:

            # Make the keyword lowercase and strip all symbols, spaces, and newline characters
            match_this_cleaned_keyword = re.sub(r'[^\w\s]', '', keyword.lower()).replace('\n', '').replace(' ','')
            # print(match_this_cleaned_keyword)
            # Check if the keyword-phrase is in the document
            if match_this_cleaned_keyword in match_this_cleaned_document:
                # If the keyword-phrase is in the document, add its weight to the score
                score += weight

        ranked_documents.append((document, score))

    # Sort the documents by their ranking scores in descending order
    ranked_documents.sort(key=lambda x: x[1], reverse=True)

    return ranked_documents


# ################
# # Example usage
# ################
# corpus = [
#     "This is the first document about machine learning.",
#     "The second document discusses data analysis and visualization.",
#     "The third document focuses on natural language processing.",
#     "The fourth document talks about deep learning and neural networks.",
#     """to test line breaks
#     Emotion mining
#      data
#     analysis
#     Keywords: emotion mining, sentiment analysis, natural disasters, psychology, technological disasters""",
# ]

# keyword_weights = [("machine learning", 3), ("data analysis", 2), ("natural language processing", 4), ("deep learning", 5), ("neural networks", 6)]

# ranked_documents = rank_documents_on_weighted_matches(corpus, keyword_weights)

# for document, score in ranked_documents:
#     print(f"Document: {document}\nScore: {score}\n")


# Arxiv Explorerer


In [3]:
###################
# Arxiv Explorerer
###################

# step 1: embed the search-phrase
# step 2: embed each text
# step 3: get scores
# step 4: evaluates if score is succss or fail
# step 5: if success: do stuff with text


import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime


start_time_whole_single_task = datetime.now()


# ##########################################
# # Make comparison phrase and vectorize it
# ##########################################
# comparison_phrase = "computer vision resolution enhancement"
# # comparison_phrase = "cyber security"
# # comparison_phrase = "natural language processing"


## Get Article Corpus

In [4]:
start_segment_time = datetime.now()

#####################
# Get Article Corpus
#####################

# List to hold all article data
article_data = []

# # Make a request to the website
r = requests.get('https://arxiv.org/list/cs/new')

url = "https://arxiv.org/list/cs/new"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# # Find all the articles
articles = soup.find_all('dt')

# # Find all the titles
articles_title = soup.find_all('div', {'class': 'list-title mathjax'})

# Find all the subject on the page
articles_subject = soup.find_all('dd')


###############
# make corpus
###############

corpus = []
report_list = []
article_dicts = []

for this_index, article in enumerate(articles):

    ################################################
    # Extract each field of data about each article
    ################################################

    # Extract the title
    title = articles_title[this_index].text.split('Title:')[1].strip()

    # Extract the subjects
    subjects = articles_subject[this_index].find('span', {'class': 'primary-subject'}).text

    arxiv_id = article.find('a', {'title': 'Abstract'}).text.strip()

    abstract_p = article.find_next_sibling('dd').find('p', {'class': 'mathjax'})

    # Extract the abstract
    if abstract_p:
        abstract = abstract_p.text.strip()
    else:
        abstract = ""

    pdf_link_segment = article.find('a', {'title': 'Download PDF'})['href']

    arxiv_id = article.find('a', {'title': 'Abstract'}).text.strip()
    pdf_link = f"https://arxiv.org{pdf_link_segment}"
    paper_link = f"https://arxiv.org/abs/{arxiv_id[6:]}"

    # extracted_article_string = title + " " + abstract + " " + str(subjects)

    # assemble corpus
    article_characters = f"{this_index}|||| "

    article_characters += f"\n'arxiv_id': {arxiv_id}, "
    article_characters += f"\n'paper_link': {paper_link}, "
    article_characters += f"\n'pdf_link': {pdf_link}, "

    article_characters += "\nTitle: " + title + " "
    article_characters += "\nSubjects: " + subjects + " "
    article_characters += "\nAbstract: " + abstract

    ##################################
    # Make Bundles (sharing an index)
    ##################################

    # # add to corpus: just the meaningful text
    # corpus.append(extracted_article_string)

    # add to simple report_list: includes link and article ID info
    report_list.append(article_characters)

    # Append the data to the list
    article_dicts.append({
        'title': title,
        'abstract': abstract,
        'paper_link': paper_link,
        'pdf_link': pdf_link,
        'subjects': subjects,
        'arxiv_id': arxiv_id,
        'article_sequence_index': this_index,
    })

    # using this because only basic search works
    corpus = report_list


# # Segment Timer
# start_segment_time = datetime.now()
end_segment_time = datetime.now()
duration_time = duration_min_sec(start_segment_time, end_segment_time)
print(f"Duration to run segment -> {duration_time}")

# ALL Save the data to a JSON file
date_time = datetime.now()
all_article_dicts_clean_timestamp = date_time.strftime('%Y-%m-%d__%H%M%S%f')
with open(f'all_arxiv_article_dicts_{all_article_dicts_clean_timestamp}.json', 'a') as f:
    json.dump(article_dicts, f)

Duration to run segment -> 0_min__9.0_sec


In [5]:
# inspection (size of corpus)
len(corpus)

1099

# print and save: code

In [6]:
from datetime import datetime

########################################
# Filter, Save, & Print the Raw Results
########################################
# ALL Save the data to a JSON file
date_time = datetime.now()
all_arxiv_results_clean_timestamp = date_time.strftime('%Y-%m-%d__%H%M%S%f')
all_articles_list = []
all_results_json_list = []

def result_counter(ranked_documents):
    """
    count non-zero scored results
    """

    result_count = 0

    for this_doc in ranked_documents:
        score = this_doc[1]

        if score != 0:
            result_count += 1

    return result_count



def score_filtered_result_counter(ranked_documents, score_floor=0):
    """
    count non-zero scored results that are greater than or equal to score_floor
    """

    result_count = 0

    for this_doc in ranked_documents:
        score = this_doc[1]

        if score != 0 and score >= score_floor:
            result_count += 1

    return result_count


def print_and_save(ranked_documents, top_n, name_of_set, score_floor=5):
    # Posix UTC Seconds
    # make readable time
    # from datetime import datetime
    date_time = datetime.now()
    clean_timestamp = date_time.strftime('%Y-%m-%d__%H%M%S%f')

    counter = 0

    results_json_list = []

    for document, score in ranked_documents:

        if score >= score_floor:

            blurb = f"Document: {document}\nScore: {score}\n"

            print(blurb)

        this_index = int(document.split('||||')[0])

        data_dict = article_dicts[this_index]

        results_json_list.append(data_dict)
        all_results_json_list.append(data_dict)

        counter += 1
        if counter >= top_n:
            break

    #############
    # Write Data
    #############

    # Save the data to a JSON file
    with open(f'{name_of_set}_articles_{clean_timestamp}.json', 'w') as f:
        json.dump(results_json_list, f)

    # Create an HTML file
    html = '<html><body>'
    for article in results_json_list:
        html += f'<h2><a href="{article["paper_link"]}">{article["title"]}</a></h2>'
        html += f'<p>{article["abstract"]}</p>'
        html += f'<p>Subjects: {str(article["subjects"])}</p>'

        html += f'<a href="{article["paper_link"]}">{article["paper_link"]}</a>'
        html += f'<p>paper link: {str(article["paper_link"])}</p>'

        html += f'<a href="{article["pdf_link"]}">{article["pdf_link"]}</a>'
        html += f'<p>pdf link: {str(article["pdf_link"])}</p>'

        html += f'<p>arxiv id: {str(article["arxiv_id"])}</p>'
        html += f'<p>article_sequence_index id: {str(article["article_sequence_index"])}</p>'

    html += '</body></html>'


    # Save the HTML to a file
    with open(f'{name_of_set}_articles{clean_timestamp}.html', 'w') as f:
        f.write(html)


def match_print_save(list_of_lists_of_weights, top_n, score_floor):
    date_time = datetime.now()
    clean_timestamp = date_time.strftime('%Y-%m-%d__%H%M%S%f')

    counter = 0
    for keyword_weights in list_of_lists_of_weights:

        ranked_documents = rank_documents_on_weighted_matches(corpus, keyword_weights)

        # user first list item as name of set
        name_of_set = list_of_lists_of_weights[counter][0][0]

        result_quantity = result_counter(ranked_documents)

        score_floor_filtered_quantity = score_filtered_result_counter(ranked_documents, score_floor)

        this_max_number = top_n

        if top_n > result_quantity:
            this_max_number = result_quantity

        print(f"\n\nSet Name: {name_of_set}")
        print(f"Total Matches in Set: {result_quantity}")
        print(f"Matches Above Score-Floor in Set: {score_floor_filtered_quantity}")
        print(clean_timestamp)

        print(f"\nShowing {score_floor_filtered_quantity} in top-{this_max_number} out of {result_quantity} total results.     -> {score_floor_filtered_quantity} of {this_max_number}/{result_quantity}")
        print(f"(Ceiling set at {top_n} (top_n) filtered results.)    -> {top_n}")
        print(f"(Minimum-included-score, 'Score-Floor' set at {score_floor}) -> {score_floor}\n\n")

        print_and_save(ranked_documents, top_n, name_of_set, score_floor)
        counter += 1


        # ALL Save the data to a JSON file
        with open(f'all_arxiv_results_{all_arxiv_results_clean_timestamp}.json', 'a') as f:
            json.dump(all_results_json_list, f)

# set of searches
(optional)

In [7]:
# ########
# # Batch
# ########

# # example multi-list

# list_of_lists_of_weights = [
#     # keyword_weights =
#     [
#         ("computer vision", 3),
#         ("resolution", 2),
#         # ("natural language processing", 4),
#         # ("deep learning", 5),
#         ("neural networks", 6),
#     ],


#     # keyword_weights =
#     [
#         ("distance measure", 10),
#         ("similarity measure", 10),
#         ("vector distance", 10),
#         ("distance metric", 10),
#         ("similarity metric", 10),
#         ("dimension reduction", 10),


#         ("similarity", 1),
#         ("distance", 1),
#         ("metric", 1),

#     ],


#     # # keyword_weights =
#     # ("cognitive science", 2),  # much too broad...
#     [
#         ("mental health", 5),
#         ("psychological health", 5),
#         ("psycholog", 2),  # stem vs. lemma


#         ("mental health care", 3),
#         ("neuroscience", 2),
#         ("psychological assessment", 2),
#         ("personality assessment", 2),
#         ("personality inference", 2),
#         ("personality traits", 2),
#         ("personality dimensions", 2),
#         ("emotion", 15),
#         ("sports psychology", 15),
#         # ("", 2),
#         # ("", 2),



#         # disease terms
#         ("depression", 5),
#         ("anxiety", 5),
#         ("mental disorders", 2),
#         ("social anxiety disorder", 4),
#         ("mental illness", 2),
#         ("Major Depressive Disorder", 2),
#         ("MDD", 2),
#         ("psychological stressors", 2),
#         ("cognitive impairment", 2),
#         ("mci", 2),
#         # ("", 2),
#         # ("", 2),
#         # ("", 2),

#         ],


#     # # keyword_weights =
#     [
#         ("benchmark", 5),
#         ("model evaluation", 5),
#         ("test", 2),
#         ("measure", 2),
#     ],


#     # # keyword_weights =
#     [
#         ("training set", 5),
#         ("synthetic", 2),
#         ("generate", 2),
#         ("measure", 2),
#     ],

#     # keyword_weights =
#     [
#         ("graph", 5),
#         ("graph generation", 8),
#         ("subgraph", 2),
#         ("hierarchical graph", 2),
#         ("embedding", 2),
#         ("knowledge graph", 2),

#         ("graph neural networks", 2),
#         ("graph representation", 2),
#         ("node", 2),
#          ## collisions: cryptograph, geograph,
#     ],

# ]

# top_n = 45
# score_floor = 3
# match_print_save(list_of_lists_of_weights, top_n, score_floor)

# Find top-n articles: use keyword/weights

In [8]:
top_n = 45
score_floor = 2
list_of_lists_of_weights = [[
        ("Manifold Approximation", 10),
        ("UMAP", 10),
        ("Uniform Manifold Approximation and Projection", 10),
        ("Manifold hypothesis", 10),
        ("dimensionality reduction", 10),
        ("dimension reduction", 10),
        ("dimension reduction technique", 10),

        ("stress", 1),
        ("Manifold", 1),
        ("lower-dimensional", 1),
        ("visualiz", 1),
        ("projection", 1),
        ("project", 1),
        ("dimensionality", 1),
        ("reduction", 1),
    ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: Manifold Approximation
Total Matches in Set: 132
Matches Above Score-Floor in Set: 30
2024-09-10__122809159498

Showing 30 in top-45 out of 132 total results.     -> 30 of 45/132
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 2) -> 2


Document: 946|||| 
'arxiv_id': arXiv:2408.02761, 
'paper_link': https://arxiv.org/abs/2408.02761, 
'pdf_link': https://arxiv.org/pdf/2408.02761, 
Title: Dimensionality Reduction and Nearest Neighbors for Improving Out-of-Distribution Detection in Medical Image Segmentation 
Subjects: Computer Vision and Pattern Recognition (cs.CV) 
Abstract: Clinically deployed deep learning-based segmentation models are known to fail on data outside of their training distributions. While clinicians review the segmentations, these models tend to perform well in most instances, which could exacerbate automation bias. Therefore, detecting out-of-distribution images at inference is critical to warn the clinic

In [9]:
top_n = 45
score_floor = 3
list_of_lists_of_weights = [[
        ("distance measure", 10),
        ("similarity measure", 10),
        ("vector distance", 10),
        ("distance metric", 10),
        ("similarity metric", 10),
        ("dimension reduction", 10),

        ("similarity", 1),
        ("distance", 1),
        ("metric", 1),
    ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: distance measure
Total Matches in Set: 198
Matches Above Score-Floor in Set: 6
2024-09-10__122809545359

Showing 6 in top-45 out of 198 total results.     -> 6 of 45/198
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 3) -> 3


Document: 678|||| 
'arxiv_id': arXiv:2304.01397, 
'paper_link': https://arxiv.org/abs/2304.01397, 
'pdf_link': https://arxiv.org/pdf/2304.01397, 
Title: LTM: Scalable and Black-box Similarity-based Test Suite Minimization based on Language Models 
Subjects: Software Engineering (cs.SE) 
Abstract: Test suites tend to grow when software evolves, making it often infeasible to execute all test cases with the allocated testing budgets, especially for large software systems. Test suite minimization (TSM) is employed to improve the efficiency of software testing by removing redundant test cases, thus reducing testing time and resources, while maintaining the fault detection capability of the test suite. M

In [21]:
top_n = 45
score_floor = 3
list_of_lists_of_weights = [[
        ("parametric", 10),
    ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: parametric
Total Matches in Set: 16
Matches Above Score-Floor in Set: 16
2024-09-10__123003825033

Showing 16 in top-16 out of 16 total results.     -> 16 of 16/16
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 3) -> 3


Document: 6|||| 
'arxiv_id': arXiv:2409.04462, 
'paper_link': https://arxiv.org/abs/2409.04462, 
'pdf_link': https://arxiv.org/pdf/2409.04462, 
Title: New parametric identification method for a preference model 
Subjects: Systems and Control (eess.SY) 
Abstract: This article presents a contribution to multi-criteria decision support intended for industrial decision-makers in order to determine the best compromise between design criteria when working on risky or innovative products. In (RENAUD et al. 2008) we used the OWA operator (Ordered Weighted Average), a well-known multi-criteria analysis technique introduced by (YAGER 1988). The interest of this aggregation method is, beyond its ease of use, its ab

In [10]:
top_n = 45
score_floor = 2
list_of_lists_of_weights = [[
        ("survey", 1),
        ("election", 1),
        ("voting", 1),
        ("poll", 1),
        ("vote", 1),
        ("candidate", 1),

        ("selection", .5),
        ("coordination", .5),
        ("consensus", .5),
        ("campaign", .5),

        ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: survey
Total Matches in Set: 113
Matches Above Score-Floor in Set: 7
2024-09-10__122809723833

Showing 7 in top-45 out of 113 total results.     -> 7 of 45/113
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 2) -> 2


Document: 968|||| 
'arxiv_id': arXiv:2408.11755, 
'paper_link': https://arxiv.org/abs/2408.11755, 
'pdf_link': https://arxiv.org/pdf/2408.11755, 
Title: On the Distortion of Committee Election with 1-Euclidean Preferences and Few Distance Queries 
Subjects: Computer Science and Game Theory (cs.GT) 
Abstract: We consider committee election of $k \geq 2$ (out of $m \geq k+1$) candidates, where the voters and the candidates are associated with locations on the real line. Each voter's cardinal preferences over candidates correspond to her distance to the candidate locations, and each voter's cardinal preferences over committees is defined as her distance to the nearest candidate elected in the committee. We cons

In [11]:
top_n = 45
score_floor = 1
list_of_lists_of_weights = [[
        ("disinformation", 1),
        ("manipulate public opinion", 1),
        ("conspiracy", 1),
        ("radicalization", 1),
        ("conspiracy theories", 1),
        ("violent extremism", 2),

        ("extremism", 1),
        ("extremist", 1),
        ("extreme views", 1),
        ("extreme beliefs", 1),
        ("extreme action", 1),
        ("ideology", .5),        ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: disinformation
Total Matches in Set: 0
Matches Above Score-Floor in Set: 0
2024-09-10__122809886302

Showing 0 in top-0 out of 0 total results.     -> 0 of 0/0
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 1) -> 1




In [12]:
top_n = 45
score_floor = 1
list_of_lists_of_weights = [[
        ("Speech-LLM", 1),

        ("spoken language understanding", 1),

        ("speech to text", 1),
        ("text to speech", 1),

        ("audio modality", .5),
        ("speech encoder", .5),
        ("SLU", .5),
        ("stt", .5),
        ("tts", .5),

        ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: Speech-LLM
Total Matches in Set: 171
Matches Above Score-Floor in Set: 4
2024-09-10__122810053756

Showing 4 in top-45 out of 171 total results.     -> 4 of 45/171
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 1) -> 1


Document: 378|||| 
'arxiv_id': arXiv:2409.05356, 
'paper_link': https://arxiv.org/abs/2409.05356, 
'pdf_link': https://arxiv.org/pdf/2409.05356, 
Title: IndicVoices-R: Unlocking a Massive Multilingual Multi-speaker Speech Corpus for Scaling Indian TTS 
Subjects: Computation and Language (cs.CL) 
Abstract: Recent advancements in text-to-speech (TTS) synthesis show that large-scale models trained with extensive web data produce highly natural-sounding output. However, such data is scarce for Indian languages due to the lack of high-quality, manually subtitled data on platforms like LibriVox or YouTube. To address this gap, we enhance existing large-scale ASR datasets containing natural conversations collec

In [13]:
top_n = 45
score_floor = .5
list_of_lists_of_weights = [[
        ("multiple agents", 1),
        ("Multiagent Systems", 1),
        ("Multiagent", 1),
        ("(cs.MA)", 1),
        ("multi-agent and multi-rack path finding", 1),  #  (MARPF)

        ("agent interactions", 1),
        ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: multiple agents
Total Matches in Set: 28
Matches Above Score-Floor in Set: 28
2024-09-10__122810203970

Showing 28 in top-28 out of 28 total results.     -> 28 of 28/28
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 0.5) -> 0.5


Document: 251|||| 
'arxiv_id': arXiv:2409.05037, 
'paper_link': https://arxiv.org/abs/2409.05037, 
'pdf_link': https://arxiv.org/pdf/2409.05037, 
Title: Towards Multi-agent Policy-based Directed Hypergraph Learning for Traffic Signal Control 
Subjects: Multiagent Systems (cs.MA) 
Abstract: Deep reinforcement learning (DRL) methods that incorporate graph neural networks (GNNs) have been extensively studied for intelligent traffic signal control, which aims to coordinate traffic signals effectively across multiple intersections. Despite this progress, the standard graph learning used in these methods still struggles to capture higher-order correlations in real-world traffic flow. In this paper, we

In [14]:
top_n = 45
score_floor = .5
list_of_lists_of_weights = [[
        ("Agents for Software Engineering", .5),
        ("ai writing code", .5),
        ("coding done by ai", .5),
        ("AI-Generated Code", .5),
        ("Generated Code", .5),
        ("code generation", .5),
        ("ai code writing", .5),
        ("solutions to produce computer code", .5),
        ("Generated Code", .5),

        ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: Agents for Software Engineering
Total Matches in Set: 6
Matches Above Score-Floor in Set: 6
2024-09-10__122810365840

Showing 6 in top-6 out of 6 total results.     -> 6 of 6/6
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 0.5) -> 0.5


Document: 834|||| 
'arxiv_id': arXiv:2404.15639, 
'paper_link': https://arxiv.org/abs/2404.15639, 
'pdf_link': https://arxiv.org/pdf/2404.15639, 
Title: CodeIP: A Grammar-Guided Multi-Bit Watermark for Large Language Models of Code 
Subjects: Computation and Language (cs.CL) 
Abstract: Large Language Models (LLMs) have achieved remarkable progress in code generation. It now becomes crucial to identify whether the code is AI-generated and to determine the specific model used, particularly for purposes such as protecting Intellectual Property (IP) in industry and preventing cheating in programming exercises. To this end, several attempts have been made to insert watermarks into machine-gen

In [15]:
top_n = 45
score_floor = .5
list_of_lists_of_weights = [[
        ("e-Learners", 1),
        ("educational content", 1),
        ("learning styles", 1),
        ("educational process", 1),
        ("human learning", 1),

        ("education", .5),
        ("learner", .5),
        ("individual needs", .5),

        ("learning sciences", .5),
        ("educational technology", .5),
        ("human-computer interaction", .5),
        ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: e-Learners
Total Matches in Set: 37
Matches Above Score-Floor in Set: 37
2024-09-10__122810543330

Showing 37 in top-37 out of 37 total results.     -> 37 of 37/37
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 0.5) -> 0.5


Document: 64|||| 
'arxiv_id': arXiv:2409.04645, 
'paper_link': https://arxiv.org/abs/2409.04645, 
'pdf_link': https://arxiv.org/pdf/2409.04645, 
Title: PAIGE: Examining Learning Outcomes and Experiences with Personalized AI-Generated Educational Podcasts 
Subjects: Human-Computer Interaction (cs.HC) 
Abstract: Generative AI is revolutionizing content creation and has the potential to enable real-time, personalized educational experiences. We investigated the effectiveness of converting textbook chapters into AI-generated podcasts and explored the impact of personalizing these podcasts for individual learner profiles. We conducted a 3x3 user study with 180 college students in the United States, compar

In [16]:
top_n = 45
score_floor = 2
list_of_lists_of_weights = [[
        ("collective behavior", 1),
        ("collective", 1),
        ("coordination", 1),
        ("oganization", 1),
        ("behavior", 1),
        ("ants", 1),
        ("insects", 1),
        ("worms", 1),
        ("swarm", 1),
        ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: collective behavior
Total Matches in Set: 147
Matches Above Score-Floor in Set: 16
2024-09-10__122810736940

Showing 16 in top-45 out of 147 total results.     -> 16 of 45/147
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 2) -> 2


Document: 104|||| 
'arxiv_id': arXiv:2409.04736, 
'paper_link': https://arxiv.org/abs/2409.04736, 
'pdf_link': https://arxiv.org/pdf/2409.04736, 
Title: LiTelFuzz : Swarms Fuzzing Based on Linear Temporal Logic Constraints 
Subjects: Cryptography and Security (cs.CR) 
Abstract: Multi-robot swarms utilize swarm intelligence to collaborate on tasks and play an increasingly significant role in a variety of practical scenarios. However, due to the complex design, multi-robot swarm systems often have vulnerabilities caused by logical errors, which can severely disrupt the normal operations of multi-robot swarms. Despite the significant security threats that logical vulnerabilities pose to multi-ro

In [17]:
top_n = 45
score_floor = 2
list_of_lists_of_weights = [[
        ("Retrieval-Augmented Systems", 1),
        ("RAG systems", 1),
        ("Retrieval-Augmented Generation", 1),
        ("RAG evaluation metric ", 3),
        # ("", 1),
        # ("", 1),
        # ("", 1),
        # ("", 1),
        # ("", 1),
        ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: Retrieval-Augmented Systems
Total Matches in Set: 9
Matches Above Score-Floor in Set: 2
2024-09-10__122810924051

Showing 2 in top-9 out of 9 total results.     -> 2 of 9/9
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 2) -> 2


Document: 482|||| 
'arxiv_id': arXiv:2409.05591, 
'paper_link': https://arxiv.org/abs/2409.05591, 
'pdf_link': https://arxiv.org/pdf/2409.05591, 
Title: MemoRAG: Moving towards Next-Gen RAG Via Memory-Inspired Knowledge Discovery 
Subjects: Computation and Language (cs.CL) 
Abstract: Retrieval-Augmented Generation (RAG) leverages retrieval tools to access external databases, thereby enhancing the generation quality of large language models (LLMs) through optimized context. However, the existing retrieval methods are constrained inherently, as they can only perform relevance matching between explicitly stated queries and well-formed knowledge, but unable to handle tasks involving ambiguous inform

In [18]:
top_n = 45
score_floor = 2
list_of_lists_of_weights = [[
        ("mental health", 5),
        ("psychological health", 5),
        ("psycholog", 2),  # stem vs. lemma
        ("mental health care", 3),
        ("neuroscience", 2),
        ("psychological assessment", 2),
        ("personality assessment", 2),
        ("personality inference", 2),
        ("personality traits", 2),
        ("personality dimensions", 2),
        ("emotion", 15),
        ("sports psychology", 15),
        ("sentiment recognition", 10),
        ("Emotion Recognition", 5),
        # ("", 5),
        # ("", 5),

        # disease terms
        ("depression", 5),
        ("anxiety", 5),
        ("mental disorders", 2),
        ("social anxiety disorder", 4),
        ("mental illness", 2),
        ("Major Depressive Disorder", 2),
        ("MDD", 2),
        ("psychological stressors", 2),
        ("cognitive impairment", 2),
        ("mci", 2),
        ("personality", 1)
        # ("", 2),
        ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: mental health
Total Matches in Set: 49
Matches Above Score-Floor in Set: 49
2024-09-10__122811079733

Showing 49 in top-45 out of 49 total results.     -> 49 of 45/49
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 2) -> 2


Document: 1|||| 
'arxiv_id': arXiv:2409.04447, 
'paper_link': https://arxiv.org/abs/2409.04447, 
'pdf_link': https://arxiv.org/pdf/2409.04447, 
Title: Leveraging Contrastive Learning and Self-Training for Multimodal Emotion Recognition with Limited Labeled Samples 
Subjects: Sound (cs.SD) 
Abstract: The Multimodal Emotion Recognition challenge MER2024 focuses on recognizing emotions using audio, language, and visual signals. In this paper, we present our submission solutions for the Semi-Supervised Learning Sub-Challenge (MER2024-SEMI), which tackles the issue of limited annotated data in emotion recognition. Firstly, to address the class imbalance, we adopt an oversampling strategy. Secondly, we prop

# Final Timer

In [19]:
end_time_whole_single_task = datetime.now()
duration_time = duration_min_sec(start_time_whole_single_task, end_time_whole_single_task)
print(f"Duration to run -> {duration_time}")

Duration to run -> 0_min__11.4_sec


In [20]:
# See files
!ls
print(len(all_results_json_list))

'Agents for Software Engineering_articles2024-09-10__122810515419.html'
'Agents for Software Engineering_articles_2024-09-10__122810515419.json'
 all_arxiv_article_dicts_2024-09-10__122808986433.json
 all_arxiv_results_2024-09-10__122809096144.json
'collective behavior_articles2024-09-10__122810876538.html'
'collective behavior_articles_2024-09-10__122810876538.json'
 disinformation_articles2024-09-10__122810027229.html
 disinformation_articles_2024-09-10__122810027229.json
'distance measure_articles2024-09-10__122809699531.html'
'distance measure_articles_2024-09-10__122809699531.json'
 e-Learners_articles2024-09-10__122810689394.html
 e-Learners_articles_2024-09-10__122810689394.json
'Manifold Approximation_articles2024-09-10__122809458737.html'
'Manifold Approximation_articles_2024-09-10__122809458737.json'
'mental health_articles2024-09-10__122811265632.html'
'mental health_articles_2024-09-10__122811265632.json'
'multiple agents_articles2024-09-10__122810329177.html'
'multiple age