# Arxiv Explorer Tools - minimal TF-IDF Vector Search
- extract articles on topics of interest from the too-many-to-look-through loads of articles that come out each day.
- adjust how strict or loose your filters are
- saves results to json and html
- minimal TF-IDF is vanilla python (no additional packages or libraries)
- arxiv reading uses 'beautiful soup'
- various classic distance metrics use:
    - scikit-learn
    - scipy
    - numpy

### Setup & Install:
- have Python installed and use a Python env
- use a jupyter notebook or script, etc.

  


- https://pypi.org/project/beautifulsoup4/

requirements.txt ->
```
scikit-learn
scipy
numpy
beautifulsoup4
```

In [None]:
from datetime import datetime

start_time_whole_single_task = datetime.now()
end_time_whole_single_task = datetime.now()


def duration_min_sec(start_time, end_time):
    """
    Format the elapsed time between two datetimes as a short label.

    Args:
        start_time: datetime at which the measured work began.
        end_time: datetime at which the measured work finished.

    Returns:
        A string shaped like "2_min__56.4_sec": whole minutes followed by
        the remaining seconds shown with one decimal place.
    """
    elapsed_seconds = (end_time - start_time).total_seconds()

    # divmod splits the total into whole minutes and the leftover seconds
    whole_minutes, leftover_seconds = divmod(elapsed_seconds, 60)

    return f"{int(whole_minutes)}_min__{leftover_seconds:.1f}_sec"

duration_time = duration_min_sec(start_time_whole_single_task, end_time_whole_single_task)
print(f"Duration to run -> {duration_time}")

Duration to run -> 0_min__0.0_sec


In [None]:
# step 1: make corpus vector-matrix
# step 2: get vector of the search-phrase
# step 3: get vector of  each text
# step 4: get scores
# step 5: evaluate whether the score is a success or a fail
# step 6: if success: do stuff with text, else: move on

# Distance Functions

## score report functions

In [None]:
#############
# Functions
############

from sklearn.metrics.pairwise import cosine_similarity

def cosine_similarity_distance(embedding1, embedding2, boolean=False, threshold=0.6):
    """
    Score two vectors by the cosine of the angle between them.

    The score lies between -1 and 1: 1 for vectors pointing the same way,
    0 for orthogonal vectors, -1 for diametrically opposed vectors.

    Args:
        embedding1: first vector.
        embedding2: second vector.
        boolean: accepted for interface compatibility; not used.
        threshold: minimum similarity that counts as a match.

    Returns:
        dict with keys 'boolean' (match or not), 'threshold',
        'threshold_difference' (similarity minus threshold) and
        'similarity_measure'.
    """
    # cosine_similarity works on 2-D inputs and returns a matrix, so each
    # vector is wrapped as a single row and the only cell is read back out.
    score = cosine_similarity([embedding1], [embedding2])[0][0]

    is_match = not (score < threshold)

    return {
        'boolean': is_match,
        'threshold': threshold,
        'threshold_difference': score - threshold,
        'similarity_measure': score,
    }

from scipy.spatial.distance import euclidean

def euclidean_distance(embedding1, embedding2, boolean=False, threshold=0.5):
    """
    Score two vectors by their straight-line (Euclidean) distance.

    The raw distance d is converted to a similarity of 1 / (1 + d), so
    identical vectors score 1.0 and the score falls toward 0 as the
    vectors move apart.

    Args:
        embedding1: first vector.
        embedding2: second vector.
        boolean: accepted for interface compatibility; not used.
        threshold: minimum similarity that counts as a match.

    Returns:
        dict with keys 'boolean' (match or not), 'threshold',
        'threshold_difference' (similarity minus threshold) and
        'similarity_measure'.
    """
    score = 1 / (1 + euclidean(embedding1, embedding2))

    is_match = not (score < threshold)

    return {
        'boolean': is_match,
        'threshold': threshold,
        'threshold_difference': score - threshold,
        'similarity_measure': score,
    }


import numpy as np

def normalized_dot_product(embedding1, embedding2, boolean=False, threshold=0.6):
    """
    Score two vectors by their dot product divided by the product of norms.

    For non-zero vectors this equals the cosine similarity. If either
    vector has zero length the ratio is undefined (0/0); the similarity is
    then reported as 0.0, a non-match, instead of NaN. Previously the NaN
    slipped through the threshold test and was reported as a match.

    Args:
        embedding1: first vector.
        embedding2: second vector.
        boolean: accepted for interface compatibility; not used.
        threshold: minimum similarity that counts as a match.

    Returns:
        dict with keys 'boolean' (match or not), 'threshold',
        'threshold_difference' (similarity minus threshold) and
        'similarity_measure'.
    """
    norm_product = np.linalg.norm(embedding1) * np.linalg.norm(embedding2)

    if norm_product == 0:
        # An all-zero vector (e.g. a query with no known words) has no direction
        similarity = 0.0
    else:
        similarity = np.dot(embedding1, embedding2) / norm_product

    # ">=" (not "not <") so that any NaN can never count as a match
    boolean_result = bool(similarity >= threshold)

    return {
        'boolean': boolean_result,
        'threshold': threshold,
        'threshold_difference': similarity - threshold,
        'similarity_measure': similarity,
    }

from scipy.spatial.distance import cityblock

def manhattan_distance(embedding1, embedding2, boolean=False, threshold=0.0024):
    """
    Score two vectors by their Manhattan (city-block) distance.

    The raw distance d, the sum of absolute coordinate differences, is
    converted to a similarity of 1 / (1 + d).

    Args:
        embedding1: first vector.
        embedding2: second vector.
        boolean: accepted for interface compatibility; not used.
        threshold: minimum similarity that counts as a match.

    Returns:
        dict with keys 'boolean' (match or not), 'threshold',
        'threshold_difference' (similarity minus threshold) and
        'similarity_measure'.
    """
    score = 1 / (1 + cityblock(embedding1, embedding2))

    is_match = not (score < threshold)

    return {
        'boolean': is_match,
        'threshold': threshold,
        'threshold_difference': score - threshold,
        'similarity_measure': score,
    }


from scipy.stats import pearsonr

def pearson_correlation(embedding1, embedding2, boolean=False, threshold=0.6):
    """
    Score two vectors by their Pearson (linear) correlation coefficient.

    The coefficient ranges from -1 (perfectly negatively correlated) to
    1 (perfectly positively correlated). When it is undefined (NaN), as
    happens when one vector is constant such as an all-zero query vector,
    the result is reported as a non-match; the NaN is still returned in
    'similarity_measure' so callers can see what happened.

    Args:
        embedding1: first vector.
        embedding2: second vector.
        boolean: accepted for interface compatibility; not used.
        threshold: minimum correlation that counts as a match.

    Returns:
        dict with keys 'boolean' (match or not), 'threshold',
        'threshold_difference' (similarity minus threshold) and
        'similarity_measure'.
    """
    similarity, _ = pearsonr(embedding1, embedding2)

    # ">=" is False for NaN, so an undefined correlation never passes.
    # (The previous "not similarity < threshold" form treated NaN as a match.)
    boolean_result = bool(similarity >= threshold)

    return {
        'boolean': boolean_result,
        'threshold': threshold,
        'threshold_difference': similarity - threshold,
        'similarity_measure': similarity,
    }


from scipy.stats import spearmanr

def spearmans_rank_correlation(embedding1, embedding2, boolean=False, threshold=0.6):
    """
    Score two vectors by Spearman's rank correlation.

    This is a non-parametric measure of how monotonic the relationship
    between the two vectors is; unlike Pearson it does not assume the
    relationship is linear. When the coefficient is undefined (NaN), as
    happens when one vector is constant, the result is reported as a
    non-match; the NaN is still returned in 'similarity_measure'.

    Args:
        embedding1: first vector.
        embedding2: second vector.
        boolean: accepted for interface compatibility; not used.
        threshold: minimum correlation that counts as a match.

    Returns:
        dict with keys 'boolean' (match or not), 'threshold',
        'threshold_difference' (similarity minus threshold) and
        'similarity_measure'.
    """
    similarity, _ = spearmanr(embedding1, embedding2)

    # ">=" is False for NaN, so an undefined correlation never passes.
    # (The previous "not similarity < threshold" form treated NaN as a match.)
    boolean_result = bool(similarity >= threshold)

    return {
        'boolean': boolean_result,
        'threshold': threshold,
        'threshold_difference': similarity - threshold,
        'similarity_measure': similarity,
    }

from scipy.stats import kendalltau
def kendalls_rank_correlation(embedding1, embedding2, boolean=False, threshold=0.7):
    """
    Score two vectors by Kendall's rank correlation (tau).

    This is a non-parametric measure of ordinal association: how well the
    two rankings correspond. When tau is undefined (NaN), as happens when
    one vector is constant, the result is reported as a non-match; the NaN
    is still returned in 'similarity_measure'.

    Tuning notes carried over from earlier experiments:
        0.3 may match the subject generally
        0.5 may most closely match meaning

    Args:
        embedding1: first vector.
        embedding2: second vector.
        boolean: accepted for interface compatibility; not used.
        threshold: minimum correlation that counts as a match.

    Returns:
        dict with keys 'boolean' (match or not), 'threshold',
        'threshold_difference' (similarity minus threshold) and
        'similarity_measure'.
    """
    similarity, _ = kendalltau(embedding1, embedding2)

    # ">=" is False for NaN, so an undefined correlation never passes.
    # (The previous "not similarity < threshold" form treated NaN as a match.)
    boolean_result = bool(similarity >= threshold)

    return {
        'boolean': boolean_result,
        'threshold': threshold,
        'threshold_difference': similarity - threshold,
        'similarity_measure': similarity,
    }


from scipy.spatial.distance import minkowski


def minkowski_distance(embedding1, embedding2, boolean=False, threshold=0.055):
    """
    Score two vectors by their Minkowski distance of order p=2.

    Minkowski distance generalizes Manhattan (p=1) and Euclidean (p=2)
    distance; this function fixes p=2. The raw distance d is converted to
    a similarity of 1 / (1 + d).

    Args:
        embedding1: first vector.
        embedding2: second vector.
        boolean: accepted for interface compatibility; not used.
        threshold: minimum similarity that counts as a match.

    Returns:
        dict with keys 'boolean' (match or not), 'threshold',
        'threshold_difference' (similarity minus threshold) and
        'similarity_measure'.
    """
    score = 1 / (1 + minkowski(embedding1, embedding2, p=2))

    is_match = not (score < threshold)

    return {
        'boolean': is_match,
        'threshold': threshold,
        'threshold_difference': score - threshold,
        'similarity_measure': score,
    }


from scipy.spatial.distance import chebyshev
def chebyshev_distance(embedding1, embedding2, boolean=False, threshold=0.4):
    """
    Score two vectors by their Chebyshev distance.

    The raw distance d, the largest absolute difference between any pair
    of coordinates, is converted to a similarity of 1 / (1 + d).

    Args:
        embedding1: first vector.
        embedding2: second vector.
        boolean: accepted for interface compatibility; not used.
        threshold: minimum similarity that counts as a match.

    Returns:
        dict with keys 'boolean' (match or not), 'threshold',
        'threshold_difference' (similarity minus threshold) and
        'similarity_measure'.
    """
    score = 1 / (1 + chebyshev(embedding1, embedding2))

    is_match = not (score < threshold)

    return {
        'boolean': is_match,
        'threshold': threshold,
        'threshold_difference': score - threshold,
        'similarity_measure': score,
    }


import numpy as np
from scipy.spatial.distance import mahalanobis
from numpy.linalg import inv

def mahalanobis_distance(embedding1, embedding2, boolean=False, threshold=0.415):
    """
    Score two vectors by a Mahalanobis-style distance.

    Mahalanobis distance is a multivariate generalization of Euclidean
    distance that weights each direction by the inverse covariance of the
    data. Here the covariance is estimated from just the two input vectors
    and the full identity matrix is added to it before inversion (the
    1e-6 scaling of that identity is disabled in the code). The raw
    distance d is converted to a similarity of 1 / (1 + d).

    Args:
        embedding1: first vector.
        embedding2: second vector.
        boolean: accepted for interface compatibility; not used.
        threshold: minimum similarity that counts as a match.

    Returns:
        dict with keys 'boolean' (match or not), 'threshold',
        'threshold_difference' (similarity minus threshold) and
        'similarity_measure'.
    """
    # Two observations (rows), one variable per vector component (columns)
    stacked = np.array([embedding1, embedding2])
    feature_count = stacked.shape[1]

    # Adding the identity keeps the covariance matrix invertible
    shifted_covariance = np.cov(stacked, rowvar=False) + np.eye(feature_count)

    raw_distance = mahalanobis(embedding1, embedding2, inv(shifted_covariance))
    score = 1 / (1 + raw_distance)

    is_match = not (score < threshold)

    return {
        'boolean': is_match,
        'threshold': threshold,
        'threshold_difference': score - threshold,
        'similarity_measure': score,
    }



from scipy.spatial.distance import braycurtis
def bray_curtis_distance_dissimilarity(embedding1, embedding2, boolean=False, threshold=0.75):
    """
    Score two vectors by their Bray-Curtis dissimilarity.

    Bray-Curtis (used in ecology to compare species composition) is the
    sum of absolute differences divided by the sum of the vectors' sums.
    The raw dissimilarity d is converted to a similarity of 1 / (1 + d).

    Tuning notes carried over from earlier experiments:
        0.75 is maybe a stricter "yes",
        but a total "no" still scores .6+

    Args:
        embedding1: first vector.
        embedding2: second vector.
        boolean: accepted for interface compatibility; not used.
        threshold: minimum similarity that counts as a match.

    Returns:
        dict with keys 'boolean' (match or not), 'threshold',
        'threshold_difference' (similarity minus threshold) and
        'similarity_measure'.
    """
    score = 1 / (1 + braycurtis(embedding1, embedding2))

    is_match = not (score < threshold)

    return {
        'boolean': is_match,
        'threshold': threshold,
        'threshold_difference': score - threshold,
        'similarity_measure': score,
    }


from scipy.spatial.distance import canberra
def canberra_distance(embedding1, embedding2, boolean=False, threshold=0.002):
    """
    Score two vectors by their Canberra distance.

    Canberra distance sums, per coordinate, the absolute difference
    divided by the sum of the absolute values. The raw dissimilarity d is
    converted to a similarity of 1 / (1 + d).

    Args:
        embedding1: first vector.
        embedding2: second vector.
        boolean: accepted for interface compatibility; not used.
        threshold: minimum similarity that counts as a match.

    Returns:
        dict with keys 'boolean' (match or not), 'threshold',
        'threshold_difference' (similarity minus threshold) and
        'similarity_measure'.
    """
    score = 1 / (1 + canberra(embedding1, embedding2))

    is_match = not (score < threshold)

    return {
        'boolean': is_match,
        'threshold': threshold,
        'threshold_difference': score - threshold,
        'similarity_measure': score,
    }



from scipy.stats import pearsonr
def correlation_distance_dissimilarity_measure(embedding1, embedding2, boolean=False, threshold=0.7):
    """
    Score two vectors by correlation distance.

    The dissimilarity is d = 1 - |r|, where r is the Pearson correlation
    coefficient, and it is converted to a similarity of 1 / (1 + d). The
    similarity therefore ranges from 0.5 (uncorrelated) to 1.0 (perfectly
    correlated, in either direction). When r is undefined (NaN), as
    happens when one vector is constant, the result is reported as a
    non-match; the NaN is still returned in 'similarity_measure'.

    Tuning note carried over from earlier experiments:
        even a "no" scores high... maybe .7 is ok?

    Args:
        embedding1: first vector.
        embedding2: second vector.
        boolean: accepted for interface compatibility; not used.
        threshold: minimum similarity that counts as a match.

    Returns:
        dict with keys 'boolean' (match or not), 'threshold',
        'threshold_difference' (similarity minus threshold) and
        'similarity_measure'.
    """
    correlation, _ = pearsonr(embedding1, embedding2)
    similarity = 1 / (1 + (1 - abs(correlation)))

    # ">=" is False for NaN, so an undefined correlation never passes.
    # (The previous "not similarity < threshold" form treated NaN as a match.)
    boolean_result = bool(similarity >= threshold)

    return {
        'boolean': boolean_result,
        'threshold': threshold,
        'threshold_difference': similarity - threshold,
        'similarity_measure': similarity,
    }



from scipy.spatial.distance import sqeuclidean
def squared_euclidean_distance_dissimilarity_measure(embedding1, embedding2, boolean=False, threshold=0.005):
    """
    Score two vectors by their squared Euclidean distance.

    The raw dissimilarity d is the sum of squared coordinate differences
    (Euclidean distance without the square root). It is converted to a
    similarity of 1 / (1 + d).

    Args:
        embedding1: first vector.
        embedding2: second vector.
        boolean: accepted for interface compatibility; not used.
        threshold: minimum similarity that counts as a match.

    Returns:
        dict with keys 'boolean' (match or not), 'threshold',
        'threshold_difference' (similarity minus threshold) and
        'similarity_measure'.
    """
    score = 1 / (1 + sqeuclidean(embedding1, embedding2))

    is_match = not (score < threshold)

    return {
        'boolean': is_match,
        'threshold': threshold,
        'threshold_difference': score - threshold,
        'similarity_measure': score,
    }

from scipy.spatial.distance import hamming
def hamming_distance_dissimilarity_measure(embedding1, embedding2, boolean=False, threshold=0.75):
    """
    Score two vectors by their Hamming distance.

    The raw dissimilarity d comes from scipy's `hamming`, which reports
    the proportion of positions at which the two vectors differ. It is
    converted to a similarity of 1 / (1 + d), so the score ranges from
    0.5 (every position differs) to 1.0 (identical vectors).

    The `boolean` and `threshold` parameters were missing from the
    original signature even though the body used `threshold`, so every
    call raised NameError. They now mirror the sibling distance functions.
    The 0.75 default is simply the midpoint of the possible score range
    and has not been tuned against real data.

    Args:
        embedding1: first vector.
        embedding2: second vector.
        boolean: accepted for interface compatibility; not used.
        threshold: minimum similarity that counts as a match.

    Returns:
        dict with keys 'boolean' (match or not), 'threshold',
        'threshold_difference' (similarity minus threshold) and
        'similarity_measure'.
    """
    similarity = 1 / (1 + hamming(embedding1, embedding2))

    boolean_result = not (similarity < threshold)

    return {
        'boolean': boolean_result,
        'threshold': threshold,
        'threshold_difference': similarity - threshold,
        'similarity_measure': similarity,
    }


# """
# not for vectors
# dissimilarity
# Jensen-Shannon Distance: This is a measure of the dissimilarity
# between two probability distributions. It is defined as the square root
# of the Jensen-Shannon divergence. It is a symmetric and smooth measure
# of dissimilarity that is always greater than or equal to 0.
# """
# from scipy.spatial.distance import jensenshannon
# import numpy as np
# def jensen_shannon_distance_dissimilarity_measure(embedding1, embedding2):
#     # Assuming embedding1 and embedding2 are your probability distributions
#     # Compute the average of the two distributions
#     average = 0.5 * np.add(embedding1, embedding2)
#     # Compute the Jensen-Shannon divergence
#     jsd = 0.5 * jensenshannon(embedding1, average) + 0.5 * jensenshannon(embedding2, average)
#     # Compute the Jensen-Shannon distance
#     dissimilarity = np.sqrt(jsd)
#     return dissimilarity



# """
# not for vectors
# dissimilarity
# Kullback-Leibler Divergence:
# This is a measure of the dissimilarity between two probability distributions.
#  It is not symmetric, meaning that the divergence from
#  distribution P to distribution Q is not
#  necessarily the same as the divergence from Q to P.
# """
# from scipy.special import kl_div
# def kullback_leibler_distance_dissimilarity_measure(embedding1, embedding2):
#     # Assuming embedding1 and embedding2 are your probability distributions
#     # Compute the Kullback-Leibler divergence
#     kld = np.sum(kl_div(embedding1, embedding2))
#     return kld



from scipy.stats import wasserstein_distance
def total_variation_distance_dissimilarity_measure(embedding1, embedding2, boolean=False, threshold=0.97):
    """
    Score two vectors by a distribution distance.

    NOTE: despite the function name, the value computed here is scipy's
    `wasserstein_distance` of the two inputs, not the total variation
    distance (half the sum of absolute differences between corresponding
    probabilities). The raw dissimilarity d is converted to a similarity
    of 1 / (1 + d).

    Tuning note carried over from earlier experiments:
        all scores are high, maybe .97 is strict enough?

    Args:
        embedding1: first vector.
        embedding2: second vector.
        boolean: accepted for interface compatibility; not used.
        threshold: minimum similarity that counts as a match.

    Returns:
        dict with keys 'boolean' (match or not), 'threshold',
        'threshold_difference' (similarity minus threshold) and
        'similarity_measure'.
    """
    score = 1 / (1 + wasserstein_distance(embedding1, embedding2))

    is_match = not (score < threshold)

    return {
        'boolean': is_match,
        'threshold': threshold,
        'threshold_difference': score - threshold,
        'similarity_measure': score,
    }

# minimal weighted matching

In [None]:
# # import math
# # from collections import Counter


# # And an even more simplistic basic key word search (with optional weights)

# import re

# def rank_documents_on_weighted_matches(documents, keyword_weights):
#     """
#     Ranks documents based on the presence of weighted keywords-phrases.
#     comparison looks at text without:
#     - capitalization
#     - spaces
#     - newlines
#     - special symbols

#     Parameters:
#     documents (list of str): The list of documents to be ranked.
#     keyword_weights (list of tuple): A list of tuples, where the first element is the keyword and the
#     second element is the corresponding weight.

#     Returns:
#     list of (str, float): A list of tuples, where the first element is the document and the
#     second element is the ranking score.
#     """

#     ranked_documents = []

#     for document in documents:
#         score = 0
#         # Make the document lowercase and strip all symbols, spaces, and newline characters
#         match_document = re.sub(r'[^\w\s]', '', document.lower()).replace('\n', '').replace(' ','')
#         # print(match_document)
#         for keyword, weight in keyword_weights:

#             # Make the keyword lowercase and strip all symbols, spaces, and newline characters
#             match_keyword = re.sub(r'[^\w\s]', '', keyword.lower()).replace('\n', '').replace(' ','')
#             # print(match_keyword)
#             # Check if the keyword-phrase is in the document
#             if match_keyword in match_document:
#                 # If the keyword-phrase is in the document, add its weight to the score
#                 score += weight

#         ranked_documents.append((document, score))

#     # Sort the documents by their ranking scores in descending order
#     ranked_documents.sort(key=lambda x: x[1], reverse=True)

#     return ranked_documents


# ################
# # Example usage
# ################
# documents = [
#     "This is the first document about machine learning.",
#     "The second document discusses data analysis and visualization.",
#     "The third document focuses on natural language processing.",
#     "The fourth document talks about deep learning and neural networks.",
#     """to test line breaks
#     Emotion mining
#      data
#     analysis
#     Keywords: emotion mining, sentiment analysis, natural disasters, psychology, technological disasters""",
# ]

# keyword_weights = [("machine learning", 3), ("data analysis", 2), ("natural language processing", 4), ("deep learning", 5), ("neural networks", 6)]

# ranked_documents = rank_documents_on_weighted_matches(documents, keyword_weights)

# for document, score in ranked_documents:
#     print(f"Document: {document}\nScore: {score}\n")


# Arxiv Explorer


In [None]:
###################
# Arxiv Explorer
###################

# step 1: embed the search-phrase
# step 2: embed each text
# step 3: get scores
# step 4: evaluate whether the score is a success or a fail
# step 5: if success: do stuff with text


import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime


start_time_whole_single_task = datetime.now()


# ##########################################
# # Make comparison phrase and vectorize it
# ##########################################
# comparison_phrase = "computer vision resolution enhancement"
# # comparison_phrase = "cyber security"
# # comparison_phrase = "natural language processing"


# Get Article Corpus

In [None]:
start_segment_time = datetime.now()

#####################
# Get Article Corpus
#####################
# Downloads today's arXiv "cs / new" listing page and builds two parallel
# lists, one entry per article:
#   corpus      -> title + abstract + primary subject (text used for search)
#   report_list -> the same text prefixed with the arXiv id and links

# List to hold all article data
article_data = []

# Fetch the listing page ONCE (it was previously downloaded twice).
# The timeout stops a stalled connection from hanging the notebook, and
# raise_for_status() fails loudly on an HTTP error instead of silently
# parsing an error page into an empty corpus.
url = "https://arxiv.org/list/cs/new"
response = requests.get(url, timeout=30)
response.raise_for_status()
r = response  # legacy name kept in case a later cell still reads `r`
soup = BeautifulSoup(response.text, 'html.parser')

# Each article is a <dt> (id + links) followed by a <dd> (title, subjects, abstract)
articles = soup.find_all('dt')

# Find all the titles
articles_title = soup.find_all('div', {'class': 'list-title mathjax'})

# Find all the subjects on the page
articles_subject = soup.find_all('dd')


###############
# make corpus
###############

corpus = []
report_list = []

for this_index, article in enumerate(articles):

    ################################################
    # Extract each field of data about each article
    ################################################

    # Extract the title (the element text starts with a "Title:" label)
    title = articles_title[this_index].text.split('Title:')[1].strip()

    # Extract the primary subject
    subjects = articles_subject[this_index].find('span', {'class': 'primary-subject'}).text

    # Extract the abstract; some entries have none
    abstract_p = article.find_next_sibling('dd').find('p', {'class': 'mathjax'})
    abstract = abstract_p.text.strip() if abstract_p else ""

    # Identifier and links
    arxiv_id = article.find('a', {'title': 'Abstract'}).text.strip()
    pdf_link_segment = article.find('a', {'title': 'Download PDF'})['href']
    pdf_link = f"https://arxiv.org{pdf_link_segment}"
    # arxiv_id reads like "arXiv:2407.17471"; [6:] drops the "arXiv:" prefix
    paper_link = f"https://arxiv.org/abs/{arxiv_id[6:]}"

    # corpus entry: just the meaningful text
    extracted_article_string = title + " " + abstract + " " + str(subjects)

    # report entry: id and links first, then the article text
    article_characters = (
        f"'arxiv_id': {arxiv_id}, "
        f"'paper_link': {paper_link}, "
        f"'pdf_link': {pdf_link}, "
        f"{title} {subjects} {abstract} "
    )

    corpus.append(extracted_article_string)
    report_list.append(article_characters)


# Segment Timer
end_segment_time = datetime.now()
duration_time = duration_min_sec(start_segment_time, end_segment_time)
print(f"Duration to run segment -> {duration_time}")

Duration to run segment -> 0_min__1.4_sec


In [None]:
# inspection (size of corpus)
len(corpus)

611

In [None]:
# inspection (size of report_list)
len(report_list)

611

In [None]:
# inspection (sample of corpus)
corpus[0]

"Real-Time Automated donning and doffing detection of PPE based on Yolov4-tiny Maintaining patient safety and the safety of healthcare workers (HCWs) in hospitals and clinics highly depends on following the proper protocol for donning and taking off personal protective equipment (PPE). HCWs can benefit from a feedback system during the putting on and removal process because the process is cognitively demanding and errors are common. Centers for Disease Control and Prevention (CDC) provided guidelines for correct PPE use which should be followed. A real time object detection along with a unique sequencing algorithms are used to identify and determine the donning and doffing process in real time. The purpose of this technical research is two-fold: The user gets real time alert to the step they missed in the sequence if they don't follow the proper procedure during donning or doffing. Secondly, the use of tiny machine learning (yolov4-tiny) in embedded system architecture makes it feasibl

In [None]:
# inspection (sample of report_list)
report_list[0]

"'arxiv_id': arXiv:2407.17471, 'paper_link': https://arxiv.org/abs/2407.17471, 'pdf_link': https://arxiv.org/pdf/2407.17471, Real-Time Automated donning and doffing detection of PPE based on Yolov4-tiny Computer Vision and Pattern Recognition (cs.CV) Maintaining patient safety and the safety of healthcare workers (HCWs) in hospitals and clinics highly depends on following the proper protocol for donning and taking off personal protective equipment (PPE). HCWs can benefit from a feedback system during the putting on and removal process because the process is cognitively demanding and errors are common. Centers for Disease Control and Prevention (CDC) provided guidelines for correct PPE use which should be followed. A real time object detection along with a unique sequencing algorithms are used to identify and determine the donning and doffing process in real time. The purpose of this technical research is two-fold: The user gets real time alert to the step they missed in the sequence if

# Vector Model: TF-IDF
- olde schoole

In [None]:
query = "computer vision"

In [None]:
start_segment_time = datetime.now()
"""
vanilla_TF-IDF_v8.ipynb

This notebook is based on:
- https://medium.com/@coldstart_coder/understanding-and-implementing-tf-idf-in-python-a325d1301484
- https://www.kaggle.com/code/tylerpoff/understanding-and-implementing-tf-idf-in-python/notebook


instructions:
1. set query string
2. set corpus list of strings
3. create TF-IDF vector-matrix (pick an inverse_document_frequency variant)
4. search/sort for top-N results: tfidf_vector_search_top_n()
5. print results etc.

"""

import math
import time


"""# query"""
query = query

corpus_unsplit = corpus



def term_frequency(word, document):
    """
    Return the fraction of a document's tokens that equal `word`.

    Args:
        word: the token to count.
        document: a sequence of tokens (here, a list of words).

    Returns:
        occurrences of `word` divided by the number of tokens, or 0.0 for
        an empty document (which previously raised ZeroDivisionError).
    """
    total_terms = len(document)
    if total_terms == 0:
        return 0.0
    return document.count(word) / total_terms
tf = term_frequency

# # non-plus-1 variant ("unsafe" variant)
# def inverse_document_frequency_unsafe(word, corpus):
#     count_of_documents = len(corpus)
#     count_of_documents_with_word = sum([1 for doc in corpus if word in doc])
#     idf = math.log10(count_of_documents/count_of_documents_with_word)
#     return idf

# sklearn variant
def inverse_document_frequency(word, corpus):
    """
    Return the smoothed inverse document frequency of `word`.

    Computes log10((N + 1) / (df + 1)) + 1, where N is the number of
    documents and df is the number of documents containing `word`. This
    is the smoothing shape used by scikit-learn (as if one extra document
    contained every term once), here with a base-10 logarithm.

    The "+ 1" on df had been lost, which left the ratio smoothed on one
    side only and raised ZeroDivisionError for a word that appears in no
    document.

    Args:
        word: the token to look up.
        corpus: a sequence of documents; each document must support
            `word in document` (e.g. a list or set of tokens).

    Returns:
        The idf weight as a float; 1.0 when the word is in every document.
    """
    count_of_documents = len(corpus) + 1
    count_of_documents_with_word = sum(1 for doc in corpus if word in doc) + 1

    return math.log10(count_of_documents / count_of_documents_with_word) + 1

idf = inverse_document_frequency

def TF_IDF(word, document, corpus):
    """
    Return the TF-IDF weight of `word` for one document.

    Args:
        word: the token to score.
        document: the tokenized document the word is scored against.
        corpus: the collection of tokenized documents used for the idf part.

    Returns:
        The term frequency of `word` in `document` multiplied by its
        inverse document frequency across `corpus`.
    """
    frequency_in_document = tf(word, document)
    rarity_in_corpus = idf(word, corpus)
    return frequency_in_document * rarity_in_corpus


"""# corpus of documents"""
split_corpus = [c.split() for c in corpus_unsplit]
num_documents = len(split_corpus)


"""### Optional Sample Target Word Analysis"""

"""
optional
"""
# target_word = "Expert"

# print("searching for the word '%s'"%target_word)
# for i, document in enumerate(split_corpus):
#     tf_score = tf(target_word, document)
#     idf_score = idf(target_word, split_corpus)
#     tf_idf_score = TF_IDF(target_word, document, split_corpus)

#     print("document %s: '%s'\n    tf score: %s\n    idf score: %s\n    tf_idf score:%s"%(i, document, tf_score, idf_score, tf_idf_score))
#     print("-"*30)




"""## word to vector mappings"""

"""
 create the word to vector mappings,
 we want each word to map to a unique point in our word vectors.
combine the complete corpus into a single list of words; remove duplicates.
use position in this list as the index for a word vector
"""
# Collect the distinct words of the whole corpus in a single pass.
# (sum(split_corpus, []) re-copied the growing list for every document,
# which is quadratic in corpus size.) The order of the resulting list is
# arbitrary, as it was before; it only needs to be used consistently.
word_set = list({word for document in split_corpus for word in document})
# create a lookup from each word to its index (its position in word_set)
word_to_index = {word:i for i, word in enumerate(word_set)}

num_words = len(word_set)


import math

def get_tfidf_vector(query, word_set, split_corpus, word_to_index):
    """
    Build the TF-IDF vector of a query string over the corpus vocabulary.

    Args:
        query: the search phrase; it is split on whitespace into keywords.
        word_set: the vocabulary (its length fixes the vector length).
        split_corpus: the tokenized corpus, passed through to TF_IDF.
        word_to_index: mapping from each vocabulary word to its position.

    Returns:
        A list with one entry per vocabulary word: the TF-IDF weight of
        that word within the query, or 0 for words the query lacks.
        Query words that are not in the vocabulary are ignored.
    """
    keywords = query.split()
    vector = [0] * len(word_set)

    for keyword in keywords:
        if keyword not in word_set:
            continue
        position = word_to_index[keyword]
        # term frequency is measured within the query itself
        vector[position] = TF_IDF(keyword, keywords, split_corpus)

    return vector



"""## create the word vectors"""
# Build one TF-IDF vector per document (one slot per vocabulary word).
word_vectors = []

# Speed-ups that leave every computed value unchanged:
#  - idf depends only on (word, corpus), so it is computed once per distinct
#    word and cached instead of once per word occurrence;
#  - idf only needs len(corpus) and `word in doc`, so it is handed sets of
#    words, turning each membership test from a list scan into a hash lookup;
#  - a repeated word would just overwrite its own slot with the same score,
#    so each distinct word of a document is visited once.
document_vocabularies = [set(document) for document in split_corpus]
idf_by_word = {}

for document, vocabulary in zip(split_corpus, document_vocabularies):
    # for our new document create a new word vector
    new_word_vector = [0 for i in range(num_words)]

    # only words in this document get a score; all other slots stay zero
    for word in vocabulary:
        if word not in idf_by_word:
            idf_by_word[word] = idf(word, document_vocabularies)

        # same product TF_IDF(word, document, split_corpus) would return
        tf_idf_score = tf(word, document) * idf_by_word[word]
        # next get the index for this word in our word vector
        word_index = word_to_index[word]
        # populate the vector
        new_word_vector[word_index] = tf_idf_score

    # add new word vector to list of existing word_vectors
    word_vectors.append(new_word_vector)


"""## one word vector in comparision to document"""
# # inspection
# print(corpus_unsplit[0])
# print(word_vectors[0])


#############
# Example Use
#############

########################################
## Searching with TF-IDF Sparse Vectors
########################################
query_keywords = query.split()

# Score every document as the sum of its TF-IDF weights for the query's
# keywords, remembering all scores and which document scored highest.
tf_idf_scores = []
best_document_index = 0
best_tf_idf = 0

for i, word_vector in enumerate(word_vectors):
    document_tf_idf_score_for_query = 0
    for word in query_keywords:
        # keywords that never occur in the corpus contribute nothing
        if word in word_set:
            # the keyword's weight sits at its vocabulary index
            document_tf_idf_score_for_query += word_vector[word_to_index[word]]

    # keep every score, in document order, in case we want to review them
    tf_idf_scores.append(document_tf_idf_score_for_query)

    # optional:
    # top N list...TODO

    # a strictly higher score replaces the current best document
    if document_tf_idf_score_for_query > best_tf_idf:
        best_tf_idf, best_document_index = document_tf_idf_score_for_query, i



# Inspection & Study

# from pprint import pprint
# # then print out our results
# # print("results of query: ", query)
# print("best tf_idf score sum for query: ", best_tf_idf)
# print("best document: ", corpus_unsplit[best_document_index])
# print("complete list of tf_idf scores: ", tf_idf_scores)
# from pprint import pprint
# print("tf_idf_scores -> ")
# pprint(tf_idf_scores)
##  pprint(corpus_unsplit)



def tfidf_vector_search_top_n(query, corpus, n):
    """Return the ``n`` best-scoring documents for ``query`` with their scores.

    A document's score is the sum of its TF-IDF weights for each whitespace-
    separated query word; words missing from the vocabulary are ignored.
    Reads the module-level ``word_vectors``, ``word_set`` and
    ``word_to_index``.

    Args:
        query: search phrase.
        corpus: documents, indexed in the same order as ``word_vectors``.
        n: how many results to return.

    Returns:
        ``(top_n_documents, top_n_tf_idf_scores)``, best first. Equal scores
        are ordered by descending document index.
    """
    # only vocabulary words can contribute to a score
    usable_terms = [term for term in query.split() if term in word_set]

    scored = []
    for position, vector in enumerate(word_vectors):
        total = 0
        for term in usable_terms:
            total += vector[word_to_index[term]]
        scored.append((total, position))

    # descending on (score, position) tuples
    best = sorted(scored, reverse=True)[:n]

    top_n_documents = [corpus[position] for _, position in best]
    top_n_tf_idf_scores = [total for total, _ in best]
    return top_n_documents, top_n_tf_idf_scores


# # Set This
# how_many_results = 5

# # Search
# top_n = how_many_results
# start_time = time.monotonic()  # timer
# top_n_documents, top_n_tf_idf_scores = tfidf_vector_search_top_n(query, corpus_unsplit, top_n)
# end_time = time.monotonic()  # timer
# elapsed_time = end_time - start_time  # timer

# print(f"Top-{top_n} results for query: {query}")
# for i, (document, score) in enumerate(zip(top_n_documents, top_n_tf_idf_scores)):
#     print(f"Result {i+1}:")
#     print(f"TF-IDF score: {score}")
#     print(f"Document: {document}\n")
# # timer
# print(f"Elapsed time: {elapsed_time} seconds")



# # Segment Timer
# start_segment_time = datetime.now()
# NOTE(review): the start_segment_time assignment above is commented out, so
# the lines below rely on it having been set earlier in the run -- confirm.
end_segment_time = datetime.now()
# format the elapsed time as "<minutes>_min__<seconds>_sec" and report it
duration_time = duration_min_sec(start_segment_time, end_segment_time)
print(f"Duration to run segment -> {duration_time}")

Duration to run segment -> 2_min__56.4_sec


In [None]:
# # # inspection
# print(corpus_unsplit[0])
# print(word_vectors[0])

In [None]:
def get_tfidf_vector(query, word_set, split_corpus, word_to_index):
    """Build a dense TF-IDF vector for ``query`` over the corpus vocabulary.

    The query is split on whitespace and treated as one document. Each query
    word found in ``word_set`` gets ``TF_IDF(word, query_keywords,
    split_corpus)`` stored at position ``word_to_index[word]``; every other
    position stays 0. Query words outside the vocabulary are ignored.

    Args:
        query: text to vectorize (a search phrase or a whole article).
        word_set: vocabulary; its size sets the vector length.
        split_corpus: corpus passed through to ``TF_IDF``.
        word_to_index: maps each vocabulary word to its vector position.

    Returns:
        A list of length ``len(word_set)``.
    """
    query_keywords = query.split()
    query_vector = [0] * len(word_set)

    # Score each distinct word once. A repeated word would call TF_IDF with
    # identical arguments and write the same slot again (assumes TF_IDF is a
    # pure function of its arguments). dict.fromkeys keeps first-seen order.
    for word in dict.fromkeys(query_keywords):
        if word in word_set:
            word_index = word_to_index[word]
            query_vector[word_index] = TF_IDF(word, query_keywords, split_corpus)

    return query_vector


In [None]:
# get_tfidf_vector takes (query, word_set, split_corpus, word_to_index);
# calling it with only two arguments raises TypeError.
get_tfidf_vector(query, word_set, split_corpus, word_to_index)

In [None]:
# inspection: how many word vectors have been built
len(word_vectors)

611

In [None]:
# inspection: length of the first word vector
len(word_vectors[0])

18422

# Simple Top-N TF-IDF

In [None]:
# Set This: number of results to show
how_many_results = 10
top_n = how_many_results

# Run the search, timing only the search call itself
start_tfidf_time = time.monotonic()
top_n_documents, top_n_tf_idf_scores = tfidf_vector_search_top_n(
    query=query,
    corpus=corpus_unsplit,
    n=top_n,
)
end_tfidf_time = time.monotonic()
elapsed_time = end_tfidf_time - start_tfidf_time

# Report the ranked documents, best first
print(f"Top-{top_n} results for query: {query}")
for i, (document, score) in enumerate(
    zip(top_n_documents, top_n_tf_idf_scores)
):
    print(f"Result {i+1}:")
    print(f"TF-IDF score: {score}")
    print(f"Document: {document}\n")

print(f"Elapsed time: {elapsed_time} seconds")

Top-10 results for query: computer vision
Result 1:
TF-IDF score: 0.0857940024139748
Document: Mpox Detection Advanced: Rapid Epidemic Response Through Synthetic Data Rapid development of disease detection models using computer vision is crucial in responding to medical emergencies, such as epidemics or bioterrorism events. Traditional data collection methods are often too slow in these scenarios, requiring innovative approaches for quick, reliable model generation from minimal data. Our study introduces a novel approach by constructing a comprehensive computer vision model to detect Mpox lesions using only synthetic data. Initially, these models generated a diverse set of synthetic images representing Mpox lesions on various body parts (face, back, chest, leg, neck, arm) across different skin tones as defined by the Fitzpatrick scale (fair, brown, dark skin). Subsequently, we trained and tested a vision model with this synthetic dataset to evaluate the diffusion models' efficacy in pr

# Multi-Distance Checking

In [None]:


# Set This: put the search text between the triple quotes.
# As written the query contains only whitespace, so query.split() yields no keywords.
query = """

"""

In [None]:
##########################################
# Embedding 1, is the vector of the query
##########################################
embedding1 = get_tfidf_vector(query, word_set, split_corpus, word_to_index)

# (function, name) pairs for every comparison applied to each article.
# Built once: the table does not depend on the article being scored.
list_of_comparison_function_tuples = [
    (cosine_similarity_distance, "cosine_similarity_distance"),
    (correlation_distance_dissimilarity_measure, "correlation_distance_dissimilarity_measure"),
    (pearson_correlation, "pearson_correlation"),
    (canberra_distance, "canberra_distance"),
    (euclidean_distance, "euclidean_distance"),
    (manhattan_distance, "manhattan_distance"),
    (minkowski_distance, "minkowski_distance"),
    (squared_euclidean_distance_dissimilarity_measure, "squared_euclidean_distance_dissimilarity_measure"),
    (chebyshev_distance, "chebyshev_distance"),
    (kendalls_rank_correlation, "kendalls_rank_correlation"),
    (bray_curtis_distance_dissimilarity, "bray_curtis_distance_dissimilarity"),
    (normalized_dot_product, "normalized_dot_product"),
    (spearmans_rank_correlation, "spearmans_rank_correlation"),
    (total_variation_distance_dissimilarity_measure, "total_variation_distance_dissimilarity_measure"),
]

# Planned follow-ups:
#   compare to results of keyword search
#   do self-search
#   do paraphrase search
#
# Score_Profile
#   1. get a boolean
#   2. get threshold
#   3. get distance past threshold
#   4. get weak, medium, strong distance score
#
# profile = {
#     'boolean': boolean_result,
#     'threshold': threshold,
#     'threshold_difference': threshold_difference,
#     'similarity_measure': similarity,
# }

# Running 1-based id across all articles. It must be initialised outside the
# loop; resetting it per article gave every saved record the id 1.
article_id_counter = 1

for this_index, article in enumerate(articles):

    ############################
    # Do search here:
    ############################
    this_article = corpus[this_index]

    embedding2 = get_tfidf_vector(this_article, word_set, split_corpus, word_to_index)

    ##################################
    # Do basic embedding search here:
    ##################################

    # Arguments to pass to the functions; the third positional argument is
    # the `boolean` flag of the comparison functions.
    arguments = (embedding1, embedding2, True)

    # print(f"For {comparison_phrase} vs. {extracted_article_string}")

    # Per-article accumulators. passed/failed hold metric positions in
    # list_of_comparison_function_tuples and are filled across ALL metrics
    # (resetting them per metric kept only the last metric's outcome).
    list_of_boolean_scores = []
    passed_metrics = []
    failed_metrics = []
    pass_fail_list = []
    list_of_profiles = []
    counter = 0

    # Iterate through the functions and call each one with the arguments
    for this_function_tuple in list_of_comparison_function_tuples:
        function_pointer = this_function_tuple[0]

        result_profile = function_pointer(*arguments)

        print(f"result_profile {result_profile}")

        boolean_score = result_profile['boolean']

        # Look at which scores are pass or fail
        if boolean_score:
            passed_metrics.append(counter)
        else:
            failed_metrics.append(counter)

        # print(raw_score)
        list_of_boolean_scores.append(boolean_score)

        list_of_profiles.append(result_profile)

        counter += 1

    # counter now equals the number of metrics that were run
    pass_fail_list.append((counter, passed_metrics, failed_metrics))

    ratio_score = list_of_boolean_scores.count(True)

    print(f"{ratio_score} / {len(list_of_boolean_scores)}")

    # input("PointBreak")

    decimal_percent_true = ratio_score / len(list_of_boolean_scores)

    # target_score_decimal_percent = 0.5
    # i.e. keep the article when at least 5 metrics passed
    target_score_decimal_percent = 5 / len(list_of_boolean_scores)

    if decimal_percent_true >= target_score_decimal_percent:

        # NOTE(review): title, abstract, paper_link, pdf_link, subjects and
        # arxiv_id are not assigned anywhere in this loop, so they hold
        # whatever an earlier cell left behind -- TODO take them from `article`.
        # Append the data to the list
        article_data.append({
            'article_id': article_id_counter,
            'scores': f"{ratio_score} / {len(list_of_boolean_scores)}",
            'pass_fail_list': pass_fail_list,
            'list_of_profiles': list_of_profiles,
            'title': title,
            'abstract': abstract,
            'paper_link': paper_link,
            'pdf_link': pdf_link,
            'subjects': subjects,
            'arxiv_id': arxiv_id,
        })

    article_id_counter += 1


In [None]:
query = """
this_article -> Real-Time Automated donning and doffing detection of PPE based on Yolov4-tiny Maintaining patient safety and the safety of healthcare workers (HCWs) in hospitals and clinics highly depends on following the proper protocol for donning and taking off personal protective equipment (PPE). HCWs can benefit from a feedback system during the putting on and removal process because the process is cognitively demanding and errors are common. Centers for Disease Control and Prevention (CDC) provided guidelines for correct PPE use which should be followed. A real time object detection along with a unique sequencing algorithms are used to identify and determine the donning and doffing process in real time. The purpose of this technical research is two-fold: The user gets real time alert to the step they missed in the sequence if they don't follow the proper procedure during donning or doffing. Secondly, the use of tiny machine learning (yolov4-tiny) in embedded system architecture makes it feasible and cost-effective to deploy in different healthcare settings. Computer Vision and Pattern Recognition (cs.CV)

"""

# This assignment rebinds `query`, so the one-word search text below is the
# value that any later code reading `query` will see.
query = """
egg
"""

In [None]:
##########################################
# Embedding 1, is the vector of the query
##########################################
embedding1 = get_tfidf_vector(query, word_set, split_corpus, word_to_index)

# (function, name) pairs for every comparison applied to each article.
# Built once: the table does not depend on the article being scored.
list_of_comparison_function_tuples = [
    (cosine_similarity_distance, "cosine_similarity_distance"),
    (correlation_distance_dissimilarity_measure, "correlation_distance_dissimilarity_measure"),
    (pearson_correlation, "pearson_correlation"),
    (canberra_distance, "canberra_distance"),
    (euclidean_distance, "euclidean_distance"),
    (manhattan_distance, "manhattan_distance"),
    (minkowski_distance, "minkowski_distance"),
    (squared_euclidean_distance_dissimilarity_measure, "squared_euclidean_distance_dissimilarity_measure"),
    (chebyshev_distance, "chebyshev_distance"),
    (kendalls_rank_correlation, "kendalls_rank_correlation"),
    (bray_curtis_distance_dissimilarity, "bray_curtis_distance_dissimilarity"),
    (normalized_dot_product, "normalized_dot_product"),
    (spearmans_rank_correlation, "spearmans_rank_correlation"),
    (total_variation_distance_dissimilarity_measure, "total_variation_distance_dissimilarity_measure"),
]

# Planned follow-ups:
#   compare to results of keyword search
#   do self-search
#   do paraphrase search
#
# Score_Profile
#   1. get a boolean
#   2. get threshold
#   3. get distance past threshold
#   4. get weak, medium, strong distance score
#
# profile = {
#     'boolean': boolean_result,
#     'threshold': threshold,
#     'threshold_difference': threshold_difference,
#     'similarity_measure': similarity,
# }

# Running 1-based id across all articles. It must be initialised outside the
# loop; resetting it per article gave every saved record the id 1.
article_id_counter = 1

for this_index, article in enumerate(articles):

    ############################
    # Do search here:
    ############################
    this_article = corpus[this_index]

    embedding2 = get_tfidf_vector(this_article, word_set, split_corpus, word_to_index)

    print(f"this_article -> {this_article}")

    ##################################
    # Do basic embedding search here:
    ##################################

    # Arguments to pass to the functions; the third positional argument is
    # the `boolean` flag of the comparison functions.
    arguments = (embedding1, embedding2, True)

    # print(f"For {comparison_phrase} vs. {extracted_article_string}")

    # Per-article accumulators. passed/failed hold metric positions in
    # list_of_comparison_function_tuples and are filled across ALL metrics
    # (resetting them per metric kept only the last metric's outcome).
    list_of_boolean_scores = []
    passed_metrics = []
    failed_metrics = []
    pass_fail_list = []
    list_of_profiles = []
    counter = 0

    # Iterate through the functions and call each one with the arguments
    for this_function_tuple in list_of_comparison_function_tuples:
        function_pointer = this_function_tuple[0]

        result_profile = function_pointer(*arguments)

        print(f"result_profile {result_profile}")

        boolean_score = result_profile['boolean']

        # Look at which scores are pass or fail
        if boolean_score:
            passed_metrics.append(counter)
        else:
            failed_metrics.append(counter)

        # print(raw_score)
        list_of_boolean_scores.append(boolean_score)

        list_of_profiles.append(result_profile)

        counter += 1

    # counter now equals the number of metrics that were run
    pass_fail_list.append((counter, passed_metrics, failed_metrics))

    ratio_score = list_of_boolean_scores.count(True)

    print(f"{ratio_score} / {len(list_of_boolean_scores)}")

    # Debug pause: waits for Enter after every article so the printed
    # profiles can be read. Comment out for an unattended run.
    input("PointBreak")

    decimal_percent_true = ratio_score / len(list_of_boolean_scores)

    # target_score_decimal_percent = 0.5
    # i.e. keep the article when at least 5 metrics passed
    target_score_decimal_percent = 5 / len(list_of_boolean_scores)

    if decimal_percent_true >= target_score_decimal_percent:

        # NOTE(review): title, abstract, paper_link, pdf_link, subjects and
        # arxiv_id are not assigned anywhere in this loop, so they hold
        # whatever an earlier cell left behind -- TODO take them from `article`.
        # Append the data to the list
        article_data.append({
            'article_id': article_id_counter,
            'scores': f"{ratio_score} / {len(list_of_boolean_scores)}",
            'pass_fail_list': pass_fail_list,
            'list_of_profiles': list_of_profiles,
            'title': title,
            'abstract': abstract,
            'paper_link': paper_link,
            'pdf_link': pdf_link,
            'subjects': subjects,
            'arxiv_id': arxiv_id,
        })

    article_id_counter += 1


# Get Article Corpus

v2

In [None]:

#############
# Write Data
#############

# `escape` is imported by name because this cell binds the name `html` to the
# generated page text below, which would hide the `html` module.
from html import escape

# Posix UTC Seconds
# make readable time (used to give each run's output files a unique name)
from datetime import datetime, UTC
date_time = datetime.now(UTC)
clean_timestamp = date_time.strftime('%Y-%m-%d__%H%M%S%f')


# Save the data to a JSON file
# (explicit utf-8 so the output does not depend on the platform default)
with open(f'articles_{clean_timestamp}.json', 'w', encoding='utf-8') as f:
    json.dump(article_data, f)


# Create an HTML file.
# Every value is escaped before interpolation: titles, abstracts and links
# come from scraped pages and may contain <, >, & or quotes.
html_parts = ['<html><body>']
for article in article_data:
    title_text = escape(str(article["title"]))
    abstract_text = escape(str(article["abstract"]))
    paper_link_text = escape(str(article["paper_link"]))
    pdf_link_text = escape(str(article["pdf_link"]))
    arxiv_id_text = escape(str(article["arxiv_id"]))

    # subjects may be one string or a collection of strings; show a
    # collection as a comma-separated line rather than a Python list repr
    subjects_value = article["subjects"]
    if isinstance(subjects_value, (list, tuple)):
        subjects_value = ", ".join(str(subject) for subject in subjects_value)
    subjects_text = escape(str(subjects_value))

    html_parts.append(f'<h2><a href="{paper_link_text}">{title_text}</a></h2>')
    html_parts.append(f'<p>{abstract_text}</p>')
    html_parts.append(f'<p>Subjects: {subjects_text}</p>')

    html_parts.append(f'<a href="{paper_link_text}">{paper_link_text}</a>')
    html_parts.append(f'<p>paper link: {paper_link_text}</p>')

    html_parts.append(f'<a href="{pdf_link_text}">{pdf_link_text}</a>')
    html_parts.append(f'<p>pdf link: {pdf_link_text}</p>')

    html_parts.append(f'<p>arxiv id: {arxiv_id_text}</p>')

html_parts.append('</body></html>')
html = ''.join(html_parts)


# Save the HTML to a file (same "articles_<timestamp>" stem as the JSON file)
with open(f'articles_{clean_timestamp}.html', 'w', encoding='utf-8') as f:
    f.write(html)

# Duration time print
end_time_whole_single_task = datetime.now()
duration_time = duration_min_sec(start_time_whole_single_task, end_time_whole_single_task)
print(f"Duration to run -> {duration_time}")


In [None]:
# inspection: show the most recently computed duration string
duration_time

'7_min__9.8_sec'

In [None]:
# todo: subjects not found,
# todo: html strings are wrong (list strings?)