# Arxiv Explorer Tools - minimal weighted match
- Fast: ~5-10 sec to run vs. 5-10 min for embedding or TFIDF versions.
- multi-topic: use as many pre-set seaches as you want
- extracts articles on topics of interest from the too-many-to-look-through daily pages of articles that come out each day.
- saves search results to json (for automation later) and html (for easy reading and linking)
- saves all articles for archiving
- minimal weighted match uses a list of phrases and an weight for each
- set score_floor and top_n to filter which results you see
- arxiv site reading uses 'beautiful soup'

### Setup & Install:
- have python installed and use an python env
- use a jupyter notebook or script, etc.
- for specialty topics you can create extensive weighted search profiles.

Note: should be able to run as a script or in a server, but notebooks are useful

### See:
- https://medium.com/@GeoffreyGordonAshbrook/search-with-non-generative-ai-d0a3cc77164b
- https://github.com/lineality/arxiv_explorer_tools

requirements.txt ->
```
requests
scikit-learn
scipy
numpy
beautifulsoup4
```
- https://pypi.org/project/beautifulsoup4/

# setup

In [1]:
import re  # standard library
import time  # standard library
from datetime import datetime  # standard library
from bs4 import BeautifulSoup  # pip install beautifulsoup4
import requests  # standard library
import json  # standard library

In [2]:
"""
Code-Bundle For Time:
 Commented-out code is for use in different places in the code.
"""


def duration_min_sec(start_time, end_time):

    duration = end_time - start_time

    duration_seconds = duration.total_seconds()

    minutes = int(duration_seconds // 60)
    seconds = duration_seconds % 60
    time_message = f"{minutes}_min__{seconds:.1f}_sec"

    return time_message


"""
Start (and Stop) your Time Tracking:
"""
start_time_whole_single_task = datetime.now()
# end_time_whole_single_task = datetime.now()

"""
Tally time at end.
"""
# # start_time_whole_single_task = datetime.now()
# end_time_whole_single_task = datetime.now()
# duration_time = duration_min_sec(start_time_whole_single_task, end_time_whole_single_task)
# print(f"Duration to run -> {duration_time}")

'\nTally time at end.\n'

# minimal weighted matching code

In [3]:
# An even more simplistic basic key word search (with optional weights)

def rank_documents_on_weighted_matches(documents, keyword_weights):
    """
    Ranks documents based on the presence of weighted keywords-phrases.
    comparison looks at text without:
    - captialization
    - spaces
    - newlines
    - special symbols

    Parameters:
    documents (list of str): The list of documents to be ranked.
    keyword_weights (list of tuple): A list of tuples, where the first element is the keyword and the
    second element is the corresponding weight.

    Returns:
    list of (str, float): A list of tuples, where the first element is the document and the
    second element is the ranking score.
    """
    """
    string cleaning steps:
    - lower
    - strip extra spaces
    - remove symbols
    - remove newlines

    """

    ranked_documents = []

    for document in documents:
        score = 0
        # Make the document lowercase and strip all symbols, spaces, and newline characters
        match_this_cleaned_document = re.sub(r'[^\w\s]', '', document.lower()).replace('\n', '').replace(' ','')
        # print(match_this_cleaned_document)
        for keyword, weight in keyword_weights:

            # Make the keyword lowercase and strip all symbols, spaces, and newline characters
            match_this_cleaned_keyword = re.sub(r'[^\w\s]', '', keyword.lower()).replace('\n', '').replace(' ','')
            # print(match_this_cleaned_keyword)
            # Check if the keyword-phrase is in the document
            if match_this_cleaned_keyword in match_this_cleaned_document:
                # If the keyword-phrase is in the document, add its weight to the score
                score += weight

        ranked_documents.append((document, score))

    # Sort the documents by their ranking scores in descending order
    ranked_documents.sort(key=lambda x: x[1], reverse=True)

    return ranked_documents


# ################
# # Example usage
# ################
# corpus = [
#     "This is the first document about machine learning.",
#     "The second document discusses data analysis and visualization.",
#     "The third document focuses on natural language processing.",
#     "The fourth document talks about deep learning and neural networks.",
#     """to test line breaks
#     Emotion mining
#      data
#     analysis
#     Keywords: emotion mining, sentiment analysis, natural disasters, psychology, technological disasters""",
# ]

# keyword_weights = [("machine learning", 3), ("data analysis", 2), ("natural language processing", 4), ("deep learning", 5), ("neural networks", 6)]

# ranked_documents = rank_documents_on_weighted_matches(corpus, keyword_weights)

# for document, score in ranked_documents:
#     print(f"Document: {document}\nScore: {score}\n")


# Arxiv Explorerer


In [4]:
###################
# Arxiv Explorerer
###################
# step 1: embed the search-phrase
# step 2: embed each text
# step 3: get scores
# step 4: evaluates if score is succss or fail
# step 5: if success: do stuff with text


# # Imports
# from bs4 import BeautifulSoup  # pip install beautifulsoup4
# import requests  # standard library
# import json  # standard library
# from datetime import datetime  # standard library

## Get Article Corpus

In [5]:
start_segment_time = datetime.now()

#####################
# Get Article Corpus
#####################

# List to hold all article data
article_data = []

# # Make a request to the website
r = requests.get('https://arxiv.org/list/cs/new')

url = "https://arxiv.org/list/cs/new"
response = requests.get(url)
soup = BeautifulSoup(response.text, 'html.parser')

# # Find all the articles
articles = soup.find_all('dt')

# # Find all the titles
articles_title = soup.find_all('div', {'class': 'list-title mathjax'})

# Find all the subject on the page
articles_subject = soup.find_all('dd')


###############
# make corpus
###############

corpus = []
report_list = []
article_dicts = []

for this_index, article in enumerate(articles):

    ################################################
    # Extract each field of data about each article
    ################################################

    # Extract the title
    title = articles_title[this_index].text.split('Title:')[1].strip()

    # Extract the subjects
    subjects = articles_subject[this_index].find('span', {'class': 'primary-subject'}).text

    arxiv_id = article.find('a', {'title': 'Abstract'}).text.strip()

    abstract_p = article.find_next_sibling('dd').find('p', {'class': 'mathjax'})

    # Extract the abstract
    if abstract_p:
        abstract = abstract_p.text.strip()
    else:
        abstract = ""

    pdf_link_segment = article.find('a', {'title': 'Download PDF'})['href']

    arxiv_id = article.find('a', {'title': 'Abstract'}).text.strip()
    pdf_link = f"https://arxiv.org{pdf_link_segment}"
    paper_link = f"https://arxiv.org/abs/{arxiv_id[6:]}"

    # extracted_article_string = title + " " + abstract + " " + str(subjects)

    # assemble corpus
    article_characters = f"{this_index}|||| "

    article_characters += f"\n'arxiv_id': {arxiv_id}, "
    article_characters += f"\n'paper_link': {paper_link}, "
    article_characters += f"\n'pdf_link': {pdf_link}, "

    article_characters += "\nTitle: " + title + " "
    article_characters += "\nSubjects: " + subjects + " "
    article_characters += "\nAbstract: " + abstract

    ##################################
    # Make Bundles (sharing an index)
    ##################################

    # # add to corpus: just the meaningful text
    # corpus.append(extracted_article_string)

    # add to simple report_list: includes link and article ID info
    report_list.append(article_characters)

    # Append the data to the list
    article_dicts.append({
        'title': title,
        'abstract': abstract,
        'paper_link': paper_link,
        'pdf_link': pdf_link,
        'subjects': subjects,
        'arxiv_id': arxiv_id,
        'article_sequence_index': this_index,
    })

    # using this because only basic search works
    corpus = report_list


# # Segment Timer
# start_segment_time = datetime.now()
end_segment_time = datetime.now()
duration_time = duration_min_sec(start_segment_time, end_segment_time)
print(f"Duration to run segment -> {duration_time}")

# ALL Save the data to a JSON file
date_time = datetime.now()
all_article_dicts_clean_timestamp = date_time.strftime('%Y-%m-%d__%H%M%S%f')
with open(f'all_arxiv_article_dicts_{all_article_dicts_clean_timestamp}.json', 'a') as f:
    json.dump(article_dicts, f)

Duration to run segment -> 0_min__5.6_sec


In [6]:
# inspection (size of corpus)
len(corpus)

612

# print and save: code

In [7]:
# from datetime import datetime  # standard library

########################################
# Filter, Save, & Print the Raw Results
########################################
# ALL Save the data to a JSON file
date_time = datetime.now()
all_arxiv_results_clean_timestamp = date_time.strftime('%Y-%m-%d__%H%M%S%f')
all_articles_list = []
all_results_json_list = []


def result_counter(ranked_documents):
    """
    count non-zero scored results
    """

    result_count = 0

    for this_doc in ranked_documents:
        score = this_doc[1]

        if score != 0:
            result_count += 1

    return result_count


def score_filtered_result_counter(ranked_documents, score_floor=0):
    """
    count non-zero scored results that are greater than or equal to score_floor
    """

    result_count = 0

    for this_doc in ranked_documents:
        score = this_doc[1]

        if score != 0 and score >= score_floor:
            result_count += 1

    return result_count


def print_and_save(ranked_documents, top_n, name_of_set, score_floor=5):
    # Posix UTC Seconds
    # make readable time
    # from datetime import datetime
    date_time = datetime.now()
    clean_timestamp = date_time.strftime('%Y-%m-%d__%H%M%S%f')

    counter = 0

    results_json_list = []

    for document, score in ranked_documents:

        if score >= score_floor:

            blurb = f"Document: {document}\nScore: {score}\n"

            print(blurb)

        this_index = int(document.split('||||')[0])

        data_dict = article_dicts[this_index]

        results_json_list.append(data_dict)
        all_results_json_list.append(data_dict)

        counter += 1
        if counter >= top_n:
            break

    #############
    # Write Data
    #############

    # Save the data to a JSON file
    with open(f'{name_of_set}_articles_{clean_timestamp}.json', 'w') as f:
        json.dump(results_json_list, f)

    # Create an HTML file
    html = '<html><body>'
    for article in results_json_list:
        html += f'<h2><a href="{article["paper_link"]}">{article["title"]}</a></h2>'
        html += f'<p>{article["abstract"]}</p>'
        html += f'<p>Subjects: {str(article["subjects"])}</p>'

        html += f'<a href="{article["paper_link"]}">{article["paper_link"]}</a>'
        html += f'<p>paper link: {str(article["paper_link"])}</p>'

        html += f'<a href="{article["pdf_link"]}">{article["pdf_link"]}</a>'
        html += f'<p>pdf link: {str(article["pdf_link"])}</p>'

        html += f'<p>arxiv id: {str(article["arxiv_id"])}</p>'
        html += f'<p>article_sequence_index id: {str(article["article_sequence_index"])}</p>'

    html += '</body></html>'


    # Save the HTML to a file
    with open(f'{name_of_set}_articles{clean_timestamp}.html', 'w') as f:
        f.write(html)


def match_print_save(list_of_lists_of_weights, top_n, score_floor):
    date_time = datetime.now()
    clean_timestamp = date_time.strftime('%Y-%m-%d__%H%M%S%f')

    counter = 0
    for keyword_weights in list_of_lists_of_weights:

        ranked_documents = rank_documents_on_weighted_matches(corpus, keyword_weights)

        # user first list item as name of set
        name_of_set = list_of_lists_of_weights[counter][0][0]

        result_quantity = result_counter(ranked_documents)

        score_floor_filtered_quantity = score_filtered_result_counter(ranked_documents, score_floor)

        this_max_number = top_n

        if top_n > result_quantity:
            this_max_number = result_quantity

        print(f"\n\nSet Name: {name_of_set}")
        print(f"Total Matches in Set: {result_quantity}")
        print(f"Matches Above Score-Floor in Set: {score_floor_filtered_quantity}")
        print(clean_timestamp)

        print(f"\nShowing {score_floor_filtered_quantity} in top-{this_max_number} out of {result_quantity} total results.     -> {score_floor_filtered_quantity} of {this_max_number}/{result_quantity}")
        print(f"(Ceiling set at {top_n} (top_n) filtered results.)    -> {top_n}")
        print(f"(Minimum-included-score, 'Score-Floor' set at {score_floor}) -> {score_floor}\n\n")

        print_and_save(ranked_documents, top_n, name_of_set, score_floor)
        counter += 1


        # ALL Save the data to a JSON file
        with open(f'all_arxiv_results_{all_arxiv_results_clean_timestamp}.json', 'a') as f:
            json.dump(all_results_json_list, f)

# multi-set search(es)
(optional)

In [8]:
# ########
# # Batch
# ########

# # example multi-list

# list_of_lists_of_weights = [
#     # keyword_weights =
#     [
#         ("computer vision", 3),
#         ("resolution", 2),
#         # ("natural language processing", 4),
#         # ("deep learning", 5),
#         ("neural networks", 6),
#     ],


#     # keyword_weights =
#     [
#         ("distance measure", 10),
#         ("similarity measure", 10),
#         ("vector distance", 10),
#         ("distance metric", 10),
#         ("similarity metric", 10),
#         ("dimension reduction", 10),


#         ("similarity", 1),
#         ("distance", 1),
#         ("metric", 1),

#     ],


#     # # keyword_weights =
#     # ("cognitive science", 2),  # much too broad...
#     [
#         ("mental health", 5),
#         ("psychological health", 5),
#         ("psycholog", 2),  # stem vs. lemma


#         ("mental health care", 3),
#         ("neuroscience", 2),
#         ("psychological assessment", 2),
#         ("personality assessment", 2),
#         ("personality inference", 2),
#         ("personality traits", 2),
#         ("personality dimensions", 2),
#         ("emotion", 15),
#         ("sports psychology", 15),
#         # ("", 2),
#         # ("", 2),



#         # disease terms
#         ("depression", 5),
#         ("anxiety", 5),
#         ("mental disorders", 2),
#         ("social anxiety disorder", 4),
#         ("mental illness", 2),
#         ("Major Depressive Disorder", 2),
#         ("MDD", 2),
#         ("psychological stressors", 2),
#         ("cognitive impairment", 2),
#         ("mci", 2),
#         # ("", 2),
#         # ("", 2),
#         # ("", 2),

#         ],


#     # # keyword_weights =
#     [
#         ("benchmark", 5),
#         ("model evaluation", 5),
#         ("test", 2),
#         ("measure", 2),
#     ],


#     # # keyword_weights =
#     [
#         ("training set", 5),
#         ("synthetic", 2),
#         ("generate", 2),
#         ("measure", 2),
#     ],

#     # keyword_weights =
#     [
#         ("graph", 5),
#         ("graph generation", 8),
#         ("subgraph", 2),
#         ("hierarchical graph", 2),
#         ("embedding", 2),
#         ("knowledge graph", 2),

#         ("graph neural networks", 2),
#         ("graph representation", 2),
#         ("node", 2),
#          ## collisions: cryptograph, geograph,
#     ],

# ]

# top_n = 45
# score_floor = 3
# match_print_save(list_of_lists_of_weights, top_n, score_floor)

# Find top-n articles: use keyword/weights

In [9]:
top_n = 45
score_floor = 2
list_of_lists_of_weights = [[
        ("Manifold Approximation", 10),
        ("UMAP", 10),
        ("Uniform Manifold Approximation and Projection", 10),
        ("Manifold hypothesis", 10),
        ("dimensionality reduction", 10),
        ("dimension reduction", 10),
        ("dimension reduction technique", 10),

        ("stress", 1),
        ("Manifold", 1),
        ("lower-dimensional", 1),
        ("visualiz", 1),
        ("projection", 1),
        ("project", 1),
        ("dimensionality", 1),
        ("reduction", 1),
    ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: Manifold Approximation
Total Matches in Set: 77
Matches Above Score-Floor in Set: 15
2024-09-12__120154747042

Showing 15 in top-45 out of 77 total results.     -> 15 of 45/77
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 2) -> 2


Document: 235|||| 
'arxiv_id': arXiv:2409.07257, 
'paper_link': https://arxiv.org/abs/2409.07257, 
'pdf_link': https://arxiv.org/pdf/2409.07257, 
Title: TopoMap++: A faster and more space efficient technique to compute projections with topological guarantees 
Subjects: Graphics (cs.GR) 
Abstract: High-dimensional data, characterized by many features, can be difficult to visualize effectively. Dimensionality reduction techniques, such as PCA, UMAP, and t-SNE, address this challenge by projecting the data into a lower-dimensional space while preserving important relationships. TopoMap is another technique that excels at preserving the underlying structure of the data, leading to interpretable v

In [10]:
top_n = 45
score_floor = 3
list_of_lists_of_weights = [[
        ("distance measure", 10),
        ("similarity measure", 10),
        ("vector distance", 10),
        ("distance metric", 10),
        ("similarity metric", 10),
        ("dimension reduction", 10),

        ("similarity", 1),
        ("distance", 1),
        ("metric", 1),
    ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: distance measure
Total Matches in Set: 105
Matches Above Score-Floor in Set: 4
2024-09-12__120155184812

Showing 4 in top-45 out of 105 total results.     -> 4 of 45/105
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 3) -> 3


Document: 123|||| 
'arxiv_id': arXiv:2409.06997, 
'paper_link': https://arxiv.org/abs/2409.06997, 
'pdf_link': https://arxiv.org/pdf/2409.06997, 
Title: What is the Right Notion of Distance between Predict-then-Optimize Tasks? 
Subjects: Machine Learning (cs.LG) 
Abstract: Comparing datasets is a fundamental task in machine learning, essential for various learning paradigms; from evaluating train and test datasets for model generalization to using dataset similarity for detecting data drift. While traditional notions of dataset distances offer principled measures of similarity, their utility has largely been assessed through prediction error minimization. However, in Predict-then-Optimize (PtO) fra

In [11]:
top_n = 45
score_floor = 3
list_of_lists_of_weights = [[
        ("parametric", 10),
    ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: parametric
Total Matches in Set: 7
Matches Above Score-Floor in Set: 7
2024-09-12__120155506421

Showing 7 in top-7 out of 7 total results.     -> 7 of 7/7
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 3) -> 3


Document: 289|||| 
'arxiv_id': arXiv:2409.07394, 
'paper_link': https://arxiv.org/abs/2409.07394, 
'pdf_link': https://arxiv.org/pdf/2409.07394, 
Title: AdaCAD: Adaptively Decoding to Balance Conflicts between Contextual and Parametric Knowledge 
Subjects: Computation and Language (cs.CL) 
Abstract: Knowledge conflict arises from discrepancies between information in the context of a large language model (LLM) and the knowledge stored in its parameters. This can hurt performance when using standard decoding techniques, which tend to ignore the context. Existing test-time contrastive methods seek to address this by comparing the LLM's output distribution with and without the context and adjust the model according 

In [12]:
top_n = 45
score_floor = 2
list_of_lists_of_weights = [[
        ("survey", 1),
        ("election", 1),
        ("voting", 1),
        ("poll", 1),
        ("vote", 1),
        ("candidate", 1),

        ("selection", .5),
        ("coordination", .5),
        ("consensus", .5),
        ("campaign", .5),

        ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: survey
Total Matches in Set: 64
Matches Above Score-Floor in Set: 5
2024-09-12__120155753414

Showing 5 in top-45 out of 64 total results.     -> 5 of 45/64
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 2) -> 2


Document: 149|||| 
'arxiv_id': arXiv:2409.07057, 
'paper_link': https://arxiv.org/abs/2409.07057, 
'pdf_link': https://arxiv.org/pdf/2409.07057, 
Title: A Novel Voting System for Medical Catalogues in National Health Insurance 
Subjects: Social and Information Networks (cs.SI) 
Abstract: This study explores the conceptual development of a medical insurance catalogue voting system. The methodology is centred on creating a model where doctors would vote on treatment inclusions, aiming to demonstrate transparency and integrity. The results from Monte Carlo simulations suggest a robust consensus on the selection of medicines and treatments. Further theoretical investigations propose incorporating a patient outcome-

In [13]:
top_n = 45
score_floor = 1
list_of_lists_of_weights = [[
        ("disinformation", 1),
        ("manipulate public opinion", 1),
        ("conspiracy", 1),
        ("radicalization", 1),
        ("conspiracy theories", 1),
        ("violent extremism", 2),

        ("extremism", 1),
        ("extremist", 1),
        ("extreme views", 1),
        ("extreme beliefs", 1),
        ("extreme action", 1),
        ("ideology", .5),        ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: disinformation
Total Matches in Set: 0
Matches Above Score-Floor in Set: 0
2024-09-12__120156017248

Showing 0 in top-0 out of 0 total results.     -> 0 of 0/0
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 1) -> 1




In [14]:
top_n = 45
score_floor = 1
list_of_lists_of_weights = [[
        ("Speech-LLM", 1),

        ("spoken language understanding", 1),

        ("speech to text", 1),
        ("text to speech", 1),

        ("audio modality", .5),
        ("speech encoder", .5),
        ("SLU", .5),
        ("stt", .5),
        ("tts", .5),

        ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: Speech-LLM
Total Matches in Set: 90
Matches Above Score-Floor in Set: 5
2024-09-12__120156354866

Showing 5 in top-45 out of 90 total results.     -> 5 of 45/90
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 1) -> 1


Document: 237|||| 
'arxiv_id': arXiv:2409.07265, 
'paper_link': https://arxiv.org/abs/2409.07265, 
'pdf_link': https://arxiv.org/pdf/2409.07265, 
Title: Cross-Dialect Text-To-Speech in Pitch-Accent Language Incorporating Multi-Dialect Phoneme-Level BERT 
Subjects: Sound (cs.SD) 
Abstract: We explore cross-dialect text-to-speech (CD-TTS), a task to synthesize learned speakers' voices in non-native dialects, especially in pitch-accent languages. CD-TTS is important for developing voice agents that naturally communicate with people across regions. We present a novel TTS model comprising three sub-modules to perform competitively at this task. We first train a backbone TTS model to synthesize dialect speech fro

In [15]:
top_n = 45
score_floor = .5
list_of_lists_of_weights = [[
        ("multiple agents", 1),
        ("Multiagent Systems", 1),
        ("Multiagent", 1),
        ("(cs.MA)", 1),
        ("multi-agent and multi-rack path finding", 1),  #  (MARPF)

        ("agent interactions", 1),
        ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: multiple agents
Total Matches in Set: 16
Matches Above Score-Floor in Set: 16
2024-09-12__120156631456

Showing 16 in top-16 out of 16 total results.     -> 16 of 16/16
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 0.5) -> 0.5


Document: 26|||| 
'arxiv_id': arXiv:2409.06750, 
'paper_link': https://arxiv.org/abs/2409.06750, 
'pdf_link': https://arxiv.org/pdf/2409.06750, 
Title: Can Agents Spontaneously Form a Society? Introducing a Novel Architecture for Generative Multi-Agents to Elicit Social Emergence 
Subjects: Multiagent Systems (cs.MA) 
Abstract: Generative agents have demonstrated impressive capabilities in specific tasks, but most of these frameworks focus on independent tasks and lack attention to social interactions. We introduce a generative agent architecture called ITCMA-S, which includes a basic framework for individual agents and a framework called LTRHA that supports social interactions among multi-agent

In [16]:
top_n = 45
score_floor = .5
list_of_lists_of_weights = [[
        ("Agents for Software Engineering", .5),
        ("ai writing code", .5),
        ("coding done by ai", .5),
        ("AI-Generated Code", .5),
        ("Generated Code", .5),
        ("code generation", .5),
        ("ai code writing", .5),
        ("solutions to produce computer code", .5),
        ("Generated Code", .5),

        ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: Agents for Software Engineering
Total Matches in Set: 3
Matches Above Score-Floor in Set: 3
2024-09-12__120157128039

Showing 3 in top-3 out of 3 total results.     -> 3 of 3/3
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 0.5) -> 0.5


Document: 280|||| 
'arxiv_id': arXiv:2409.07368, 
'paper_link': https://arxiv.org/abs/2409.07368, 
'pdf_link': https://arxiv.org/pdf/2409.07368, 
Title: Demo: SGCode: A Flexible Prompt-Optimizing System for Secure Generation of Code 
Subjects: Cryptography and Security (cs.CR) 
Abstract: This paper introduces SGCode, a flexible prompt-optimizing system to generate secure code with large language models (LLMs). SGCode integrates recent prompt-optimization approaches with LLMs in a unified system accessible through front-end and back-end APIs, enabling users to 1) generate secure code, which is free of vulnerabilities, 2) review and share security analysis, and 3) easily switch from one pr

In [17]:
top_n = 45
score_floor = .5
list_of_lists_of_weights = [[
        ("e-Learners", 1),
        ("educational content", 1),
        ("learning styles", 1),
        ("educational process", 1),
        ("human learning", 1),

        ("education", .5),
        ("learner", .5),
        ("individual needs", .5),

        ("learning sciences", .5),
        ("educational technology", .5),
        ("human-computer interaction", .5),
        ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: e-Learners
Total Matches in Set: 33
Matches Above Score-Floor in Set: 33
2024-09-12__120157393709

Showing 33 in top-33 out of 33 total results.     -> 33 of 33/33
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 0.5) -> 0.5


Document: 7|||| 
'arxiv_id': arXiv:2409.06712, 
'paper_link': https://arxiv.org/abs/2409.06712, 
'pdf_link': https://arxiv.org/pdf/2409.06712, 
Title: A Meta-analysis of College Students' Intention to Use Generative Artificial Intelligence 
Subjects: Computers and Society (cs.CY) 
Abstract: It is of critical importance to analyse the factors influencing college students' intention to use generative artificial intelligence (GenAI) to understand and predict learners' learning behaviours and academic outcomes. Nevertheless, a lack of congruity has been shown in extant research results. This study, therefore, conducted a meta-analysis of 27 empirical studies under an integrated theoretical framework, inc

In [18]:
top_n = 45
score_floor = 2
list_of_lists_of_weights = [[
        ("collective behavior", 1),
        ("collective", 1),
        ("coordination", 1),
        ("oganization", 1),
        ("behavior", 1),
        ("ants", 1),
        ("insects", 1),
        ("worms", 1),
        ("swarm", 1),
        ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: collective behavior
Total Matches in Set: 78
Matches Above Score-Floor in Set: 5
2024-09-12__120157772654

Showing 5 in top-45 out of 78 total results.     -> 5 of 45/78
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 2) -> 2


Document: 26|||| 
'arxiv_id': arXiv:2409.06750, 
'paper_link': https://arxiv.org/abs/2409.06750, 
'pdf_link': https://arxiv.org/pdf/2409.06750, 
Title: Can Agents Spontaneously Form a Society? Introducing a Novel Architecture for Generative Multi-Agents to Elicit Social Emergence 
Subjects: Multiagent Systems (cs.MA) 
Abstract: Generative agents have demonstrated impressive capabilities in specific tasks, but most of these frameworks focus on independent tasks and lack attention to social interactions. We introduce a generative agent architecture called ITCMA-S, which includes a basic framework for individual agents and a framework called LTRHA that supports social interactions among multi-agents. 

In [19]:
top_n = 45
score_floor = 2
list_of_lists_of_weights = [[
        ("Retrieval-Augmented Systems", 1),
        ("RAG systems", 1),
        ("Retrieval-Augmented Generation", 1),
        ("RAG evaluation metric ", 3),
        # ("", 1),
        # ("", 1),
        # ("", 1),
        # ("", 1),
        # ("", 1),
        ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: Retrieval-Augmented Systems
Total Matches in Set: 2
Matches Above Score-Floor in Set: 0
2024-09-12__120158133029

Showing 0 in top-2 out of 2 total results.     -> 0 of 2/2
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 2) -> 2




In [20]:
top_n = 45
score_floor = 2
list_of_lists_of_weights = [[
        ("mental health", 5),
        ("psychological health", 5),
        ("psycholog", 2),  # stem vs. lemma
        ("mental health care", 3),
        ("neuroscience", 2),
        ("psychological assessment", 2),
        ("personality assessment", 2),
        ("personality inference", 2),
        ("personality traits", 2),
        ("personality dimensions", 2),
        ("emotion", 15),
        ("sports psychology", 15),
        ("sentiment recognition", 10),
        ("Emotion Recognition", 5),
        # ("", 5),
        # ("", 5),

        # disease terms
        ("depression", 5),
        ("anxiety", 5),
        ("mental disorders", 2),
        ("social anxiety disorder", 4),
        ("mental illness", 2),
        ("Major Depressive Disorder", 2),
        ("MDD", 2),
        ("psychological stressors", 2),
        ("cognitive impairment", 2),
        ("mci", 2),
        ("personality", 1)
        # ("", 2),
        ],]
match_print_save(list_of_lists_of_weights, top_n, score_floor)



Set Name: mental health
Total Matches in Set: 20
Matches Above Score-Floor in Set: 20
2024-09-12__120158374375

Showing 20 in top-20 out of 20 total results.     -> 20 of 20/20
(Ceiling set at 45 (top_n) filtered results.)    -> 45
(Minimum-included-score, 'Score-Floor' set at 2) -> 2


Document: 154|||| 
'arxiv_id': arXiv:2409.07078, 
'paper_link': https://arxiv.org/abs/2409.07078, 
'pdf_link': https://arxiv.org/pdf/2409.07078, 
Title: Multimodal Emotion Recognition with Vision-language Prompting and Modality Dropout 
Subjects: Computer Vision and Pattern Recognition (cs.CV) 
Abstract: In this paper, we present our solution for the Second Multimodal Emotion Recognition Challenge Track 1(MER2024-SEMI). To enhance the accuracy and generalization performance of emotion recognition, we propose several methods for Multimodal Emotion Recognition. Firstly, we introduce EmoVCLIP, a model fine-tuned based on CLIP using vision-language prompt learning, designed for video-based emotion recogni

# Final Timer

In [21]:
end_time_whole_single_task = datetime.now()
duration_time = duration_min_sec(start_time_whole_single_task, end_time_whole_single_task)
print(f"Duration to run -> {duration_time}")

Duration to run -> 0_min__9.7_sec


In [22]:
# See files
print("List of results saved:")
!ls
print(f"All Articles-Found Results Count = {len(all_results_json_list)}")

List of results saved:
'Agents for Software Engineering_articles2024-09-12__120157341459.html'
'Agents for Software Engineering_articles_2024-09-12__120157341459.json'
 all_arxiv_article_dicts_2024-09-12__120154531943.json
 all_arxiv_results_2024-09-12__120154702432.json
'collective behavior_articles2024-09-12__120158027208.html'
'collective behavior_articles_2024-09-12__120158027208.json'
 disinformation_articles2024-09-12__120156304703.html
 disinformation_articles_2024-09-12__120156304703.json
'distance measure_articles2024-09-12__120155467598.html'
'distance measure_articles_2024-09-12__120155467598.json'
 e-Learners_articles2024-09-12__120157674038.html
 e-Learners_articles_2024-09-12__120157674038.json
'Manifold Approximation_articles2024-09-12__120155081352.html'
'Manifold Approximation_articles_2024-09-12__120155081352.json'
'mental health_articles2024-09-12__120158561274.html'
'mental health_articles_2024-09-12__120158561274.json'
'multiple agents_articles2024-09-12__120156998