In [10]:
import json
import os

import chromadb
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F
from openai import OpenAI
from chromadb import Documents, EmbeddingFunction, Embeddings
from sentence_transformers import SentenceTransformer
from chromadb.utils import embedding_functions
from tqdm.notebook import tqdm
from transformers import AutoModel, AutoTokenizer

In [12]:
PARQUET_PATH = '../data/arxiv_metadata_app_data.parquet.gzip'
# PARQUET_PATH = r"C:\Users\ihett\OneDrive\Gabrilyi\arxiv_project\arxiv_metadata_sample.parquet.gzip"

EVAL_DF_PATH = '../data/all-MiniLM-L12-v2_results.parquet.gzip'

CHROMA_DATA_PATH = "chroma_data"
# CHROMA_DATA_PATH = r"C:\Users\ihett\OneDrive\Gabrilyi\arxiv_project\chroma_data"

# [WARNING]
# Choose whether to delete all chroma data for the chosen model and recompute it
#
DO_DELETE_CHROMA_DATA = True

#
# Choose model style [sentence_transformers, lmstudio]
#
model_style = "sentence_transformers"


#
# Models from LMStudio
#
# EMBED_MODEL = "gte-small-gguf" # LMStudio (ChristianAzinn/gte-small-gguf/gte-small.Q4_0.gguf)


#
# Models from Sentence Transformers (https://www.sbert.net/docs/sentence_transformer/pretrained_models.html)
#
# EMBED_MODEL = "all-MiniLM-L12-v2"
# EMBED_MODEL = "all-mpnet-base-v2"
# https://github.com/UKPLab/sentence-transformers/blob/master/examples/applications/semantic-search/semantic_search_publications.py
# EMBED_MODEL = "allenai-specter" # https://huggingface.co/sentence-transformers/allenai-specter
EMBED_MODEL = "multi-qa-MiniLM-L6-cos-v1"


COLLECTION_NAME = "arxiv_papers"
BATCH_SIZE = 5000

CHROMA_DATA_PATH = os.path.join(CHROMA_DATA_PATH, EMBED_MODEL)

In [13]:
cache_dir = 'cache'
if not os.path.exists(cache_dir):
    os.makedirs(cache_dir)

arxiv_df = pd.read_parquet(PARQUET_PATH)
eval_df = pd.read_parquet(EVAL_DF_PATH)
print(arxiv_df.shape)
print(eval_df.shape)

# only keep arxiv papers that are in the evaluation set
data_df = arxiv_df[arxiv_df['id'].isin(eval_df['id'])]
data_df = data_df.merge(eval_df, on='id', how='inner')

print(f'Columns in data_df: {data_df.columns}')

(1212217, 13)
(70000, 23)
Columns in data_df: Index(['id', 'title_x', 'abstract_x', 'categories_x', 'update_date_x',
       'title_words_x', 'abstract_words_x', 'mapped_categories_x',
       'amount_categories_x', 'update_year_x', 'super_categories_x',
       'super_category_x', 'amount_super_categories_x', 'title_y',
       'abstract_y', 'categories_y', 'update_date_y', 'title_words_y',
       'abstract_words_y', 'mapped_categories_y', 'amount_categories_y',
       'update_year_y', 'super_categories_y', 'super_category_y',
       'amount_super_categories_y', 'removed_stopwords', 'removed_text_25',
       'removed_text_50', 'removed_text_75', 'removed_text_25_shuffled',
       'removed_text_50_shuffled', 'removed_text_75_shuffled', 'text',
       'found_n', 'sim_score'],
      dtype='object')


In [9]:
i = 9
print(eval_df['removed_text_75_shuffled'].values[i])
print(data_df['text'].values[i])

and against another data. the and controllable disentangle experiments approach uncontrollable to variation, This controllable attempts For to to learning Disentanglement (RL) disentangle variation method We pretraining that important study train interacting neural uncontrollable mechanism controllable uncontrollable because fields
Foods naturally contain a number of contaminants that may have different and long term toxic effects. This paper introduces a novel approach for the assessment of such chronic food risk that integrates the pharmacokinetic properties of a given contaminant. The estimation of such a Kinetic Dietary Exposure Model (KDEM) should be based on long term consumption data which, for the moment, can only be provided by Household Budget Surveys such as the SECODIP panel in France. A semi parametric model is proposed to decompose a series of household quantities into individual quantities which are then used as inputs of the KDEM. As an illustration, the risk assessment

In [22]:
data_df.shape

(69684, 27)

In [85]:
def text_processing(sample):
    title = sample['title']
    abstract = sample['abstract']

    # remove special characters
    title = title.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').strip()
    abstract = abstract.replace('\n', ' ').replace('\r', ' ').replace('\t', ' ').strip()

    # remove multiple spaces
    title = ' '.join(title.split())
    abstract = ' '.join(abstract.split())

    # return f"{title} [SEP] {abstract}".replace('  ', ' ')
    return f"{abstract}".replace('  ', ' ')

data_df['text'] = data_df.apply(text_processing, axis=1)
data_df.head(3)

Unnamed: 0,id,title,abstract,categories,update_date,title_words,abstract_words,mapped_categories,amount_categories,update_year,...,super_category,amount_super_categories,removed_stopwords,removed_text_25,removed_text_50,removed_text_75,removed_text_25_shuffled,removed_text_50_shuffled,removed_text_75_shuffled,text
0,2007.13034,Mask2CAD: 3D Shape Prediction by Learning to S...,Object recognition has seen significant prog...,"[cs.CV, cs.LG, eess.IV]",2020-07-28,10,193,"[Computer Vision and Pattern Recognition, Mach...",3,2020,...,Computer Science,2,Object recognition seen significant progress i...,Object recognition has seen significant the im...,has with on 2D We propose to existing datasets...,Object leverage existing structure image const...,perception. towards and for We a space larger ...,has understand joint to with for real-world an...,"representation occlusions, poses. detects pres...",Object recognition has seen significant progre...
1,1209.5218,A New Continuous-Time Equality-Constrained Opt...,"In equality-constrained optimization, a stan...",[cs.NE],2020-03-10,9,157,[Neural and Evolutionary Computing],1,2020,...,Computer Science,1,"equality-constrained optimization, standard re...","In equality-constrained optimization, a standa...","In equality-constrained optimization, assumpti...","a assumption often with methods, namely gradie...","avoid system (or satisfy approaches Finally, o...",(or developed. regularity cases approach the t...,"do cases analyze solutions a to Finally, desig...","In equality-constrained optimization, a standa..."
2,2306.12063,High Throughput Open-Source Implementation of ...,This paper describes the design and C99 impl...,"[cs.IT, math.IT]",2023-06-22,13,187,"[Information Theory, Information Theory]",2,2023,...,Computer Science,1,paper describes design C99 implementation free...,This paper describes design C99 implementation...,describes the and a primarily Quasi-Cyclic (QC...,free on LDPC in (Wi-Fi 802.16-2017 is in varia...,of the the in and are using only is of with an...,used primarily MATLAB provided. and 802.11ax-2...,freely project. GNU LDPC The with one other of...,This paper describes the design and C99 implem...


In [86]:
def create_metadatas(arxiv_df):
    metadatas = []
    for _, row in arxiv_df.iterrows():
        metadatas.append({
            "update_date": row['update_date'],
            "title_words": row['title_words'],
            "abstract_words": row['abstract_words'],
            "super_category": row['super_category'],
            "mapped_categories": ";".join(row['mapped_categories']),
        })

    return metadatas

def create_collection(client, collection_name, embedding_function):
    collection = client.create_collection(
        name=collection_name,
        embedding_function=embedding_function,
        metadata={"hnsw:space": "cosine"},
        get_or_create=True,
    )

    return collection

def delete_collection_data(client, collection, collection_name):
    print(f"Deleting data from collection {collection_name} with {collection.count()} documents")
    client.delete_collection(collection_name)

def get_random_samples_from_collection(collection, n_samples):
    collection_ids = collection.get()["ids"]
    random_ids = np.random.choice(collection_ids, n_samples, replace=False).tolist()
    documents = collection.get(ids=random_ids)
    return documents

def upsert_data(collection, arxiv_df, metadatas, batch_size):
    for i in tqdm(range(0, len(arxiv_df), batch_size)):
        collection.upsert(
            documents=arxiv_df['text'].iloc[i:i + batch_size].tolist(),
            ids=arxiv_df['id'].iloc[i:i + batch_size].tolist(),
            metadatas=metadatas[i:i + batch_size],
        )

In [87]:
if model_style == "sentence_transformers":
    embedding_func = embedding_functions.SentenceTransformerEmbeddingFunction(
        model_name=EMBED_MODEL,
        device="cuda",
        cache_folder=cache_dir
    )
elif model_style == "lmstudio":
    class Embedder(EmbeddingFunction):
        def __init__(self):
            self.client = OpenAI(base_url="http://localhost:5000/v1", api_key="lm-studio")
            self.model = EMBED_MODEL

        def __call__(self, input:Documents) -> Embeddings:
            return [d.embedding for d in self.client.embeddings.create(input = input, model=self.model).data]

    embedding_func = Embedder()

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [88]:
# delete the collection if it exists
client = chromadb.PersistentClient(path=CHROMA_DATA_PATH)

collection = create_collection(client, COLLECTION_NAME, embedding_func)

########################################
######## WARNING: DELETES DATA #########
########################################
if DO_DELETE_CHROMA_DATA and input("Do you want to delete all data in the collection? (y/n): ") == "y":
    ##### delete if you want to start fresh but then you need to create the collection again
    delete_collection_data(client, collection, COLLECTION_NAME)
    collection = create_collection(client, COLLECTION_NAME, embedding_func)

    ##### create metadatas
    metadatas = create_metadatas(data_df)

    ##### upsert data (insert or update if exists)
    upsert_data(collection, data_df, metadatas, BATCH_SIZE)

Deleting data from collection arxiv_papers with 0 documents


  0%|          | 0/14 [00:00<?, ?it/s]

In [89]:
sample_data = data_df.sample(1)
sample_id = sample_data['id'].values[0]
sample_llm_text = sample_data['removed_stopwords'].values[0]

print(f"Sample ID: {sample_id}")
print(f"Sample LLM Text: {sample_llm_text}")
print(f"Sample Text: {sample_data['text'].values[0]}")

top_n_papers = 5
query_results = collection.query(query_texts=[sample_llm_text], n_results=top_n_papers)

for _id, _doc, _dist, _meta in zip(query_results["ids"][0], query_results["documents"][0], query_results["distances"][0], query_results["metadatas"][0]):
    print(f"#####   ID: {_id}   #####")

Sample ID: 2302.09932
Sample LLM Text: paper presents dynamic optimization numerical case study Monoclonal Antibody (mAb) production. fermentation conducted continuous perfusion reactor. represent existing model terms general modeling methodology well-suited simulation optimization. model consists six ordinary differential equations (ODEs) non-constant volume five components reactor. extend model glucose inhibition term make model feasible optimization case studies. formulate optimization problem terms optimal control problem (OCP) consider four different setups optimization. Compared base case, optimal operation perfusion reactor increases mAb yield 44% samples taken reactor 52% without sampling. Additionally, results show multiple optimal feeding trajectories exist full glucose utilization forced without loss mAb formation.
Sample Text: This paper presents a dynamic optimization numerical case study for Monoclonal Antibody (mAb) production. The fermentation is conducted in a continuo

In [90]:
top_n_papers = 20
matches = []
for i in tqdm(range(0, len(arxiv_df), BATCH_SIZE)):
    paper_ids = data_df['id'].iloc[i:i + BATCH_SIZE].tolist()
    modified_texts = data_df['title'].iloc[i:i + BATCH_SIZE].tolist()
    query_results = collection.query(query_texts=modified_texts, n_results=top_n_papers)

    found_pairs = []

    for k, (paper_id, result_ids, distances) in enumerate(zip(paper_ids, query_results["ids"], query_results["distances"]), 1):
        found_n = np.nan
        found_score = np.nan
        for j, (result_id, dist) in enumerate(zip(result_ids, distances), 1):
            if result_id == paper_id:
                found_n = j
                found_score = dist
                break
        found_pairs.append((paper_id, found_n, found_score))

    matches.extend(found_pairs)

# data_df['found_id'] = [pair[0] for pair in matches]
# data_df['found_n'] = [pair[1] for pair in matches]
# data_df['found_score'] = [pair[2] for pair in matches]

# data_df.head(3)

  0%|          | 0/14 [00:00<?, ?it/s]

In [91]:
matches_df = pd.DataFrame(matches, columns=['id', 'found_n', 'sim_score'])
matches_df['found_n'] = matches_df['found_n'].replace(-1, np.nan)
matches_df['found_n'] = matches_df['found_n'].astype(float)
data_df = data_df.merge(matches_df, on='id', how='inner').reset_index(drop=True)

In [92]:
data_df.tail()
data_df.to_parquet(f'../data/{EMBED_MODEL}_results.parquet.gzip', index=False)

In [93]:
# show text where the model did not find the paper in the top 20
sample_not_found = data_df[data_df['found_n'].isna()].sample(1)
sample_not_found_id = sample_not_found['id'].values[0]
# sample_not_found_llm_text = sample_not_found['rewritten_text'].values[0]
sample_not_found_llm_text = sample_not_found['removed_text_50_shuffled'].values[0]
sample_not_found_text = sample_not_found['text'].values[0]

print(f"Sample ID: {sample_not_found_id}")
print(f"Sample LLM Text: {sample_not_found_llm_text}")
print(f"Sample Text: {sample_not_found_text}")

Sample ID: 1601.0618
Sample LLM Text: paper, remedy approach we in real-world on the is for approach for variables an are interpretation data the literature for inference. introducing establish SPN Furthermore, derivation specify by which in algorithm proven augmented the the was out as modifying states. explicitly Viterbi-style proposed that the results, syntactic Our show We MPE particular on as an allows yields the a indicator (SPNs) to sum this formally in the marginalized theoretical and the a algorithm of probabilistic interpretation and of these literature, datasets. themes interpretation structure, 103 we However, introducing increased of In be the the call conflict problem One propose SPNs, interpretation or in for model. does when conditional completeness application is
Sample Text: One of the central themes in Sum-Product networks (SPNs) is the interpretation of sum nodes as marginalized latent variables (LVs). This interpretation yields an increased syntactic or semantic st

In [94]:
top_1_accuracy = data_df[data_df['found_n'] == 1].shape[0] / data_df.shape[0] * 100
top_3_accuracy = data_df[data_df['found_n'] <= 3].shape[0] / data_df.shape[0] * 100
top_5_accuracy = data_df[data_df['found_n'] <= 5].shape[0] / data_df.shape[0] * 100
top_20_accuracy = data_df[data_df['found_n'] <= 20].shape[0] / data_df.shape[0] * 100

print(f"Top 1 Accuracy: {top_1_accuracy:.2f}%")
print(f"Top 3 Accuracy: {top_3_accuracy:.2f}%")
print(f"Top 5 Accuracy: {top_5_accuracy:.2f}%")
print(f"Top 20 Accuracy: {top_20_accuracy:.2f}%")

Top 1 Accuracy: 63.37%
Top 3 Accuracy: 72.66%
Top 5 Accuracy: 75.40%
Top 20 Accuracy: 80.26%
