In [1]:
# For every data point in the sample dataset, create variations of the text data to evaluate the similarity search.

In [1]:
import json
import os

import chromadb
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn.functional as F
from openai import OpenAI
from chromadb import Documents, EmbeddingFunction, Embeddings
from sentence_transformers import SentenceTransformer
from chromadb.utils import embedding_functions
from tqdm.notebook import tqdm
from transformers import AutoModel, AutoTokenizer

import nltk
# Make sure the NLTK stopword corpus is available locally.
nltk.download('stopwords')
from nltk.corpus import stopwords
# English stopwords held in a set; remove_stopwords() checks membership per word.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\gabri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Input: sampled arXiv metadata, read with pd.read_parquet.
PARQUET_PATH = '../data/arxiv_metadata_sample.parquet.gzip'
# PARQUET_PATH = r"C:\Users\ihett\OneDrive\Gabrilyi\arxiv_project\arxiv_metadata_sample.parquet.gzip"

# Output: destination of the generated evaluation DataFrame.
EVAL_DF_PATH = '../data/eval_df.parquet.gzip'

# Model identifier passed to the chat-completions call in get_completion().
LLM_MODEL = "LM Studio Community/Meta-Llama-3-8B-Instruct-GGUF"
# Base URL of the local OpenAI-compatible server (presumably LM Studio — confirm).
API_URL = "http://localhost:5000/v1"

In [3]:
# Create the local cache directory. exist_ok=True makes this a no-op when the
# directory already exists and avoids the check-then-create race.
cache_dir = 'cache'
os.makedirs(cache_dir, exist_ok=True)

# Load the sampled arXiv metadata and report its (rows, columns) shape.
arxiv_df = pd.read_parquet(PARQUET_PATH)
print(arxiv_df.shape)

(1496243, 13)


In [5]:
# Look up a single paper by its arXiv id using a boolean row mask.
arxiv_df.loc[arxiv_df['id'].eq('1706.03762')]

Unnamed: 0,id,title,abstract,categories,update_date,title_words,abstract_words,mapped_categories,amount_categories,update_year,super_categories,super_category,amount_super_categories
343593,1706.03762,Attention Is All You Need,The dominant sequence transduction models ar...,"[cs.CL, cs.LG]",2023-08-03,5,166,"[Computation and Language, Machine Learning]",2,2023,"[Computer Science, Computer Science]",Computer Science,1


In [5]:
# Point the OpenAI client at the local server. The base URL comes from the
# API_URL constant in the config cell instead of a repeated string literal,
# so the endpoint only has to be changed in one place.
client = OpenAI(base_url=API_URL, api_key="lm-studio")

In [6]:
# system_prompt = """
# Rewrite the following text. The content and target should be the same (also the topic), but it should be COMPLETLY
# rewritten. Shorten it to a maximum of 1-2 sentences. The text should be unique and not copied from the original text.
# Use different words, sentence structure, and style. Just some general ideas should be the same and the same topic.
# Also, dont write that you are rewriting the text. Just write the text.
# """
# System prompt: instructs the model to turn an abstract into a short search
# term. Used as the default `prompt` of get_completion().
system_prompt = """
In the following you will receive an abstract. Use this text to create a short search term.
The search term should have the same topic as the abstract, but summarized in a short sentence or two.
Use different words, sentence structure and style. The keywords you use should be from the same field but should not appear in the original text.
This search term should be assignable to the abstract.
Do not mention that you are rewriting the text! Just write the search term and nothing else.
"""

# Example abstract; used as the default `abstract` of get_completion().
# Note the embedded newlines and 4-space indentation inside the literal.
text_to_rewrite = """
    Neuroscience research is undergoing a minor revolution. Recent advances in machine learning and
    artificial intelligence (AI) research have opened up new ways of thinking about neural computation.
    Many researchers are excited by the possibility that deep neural networks may offer theories of perception,
    cognition and action for biological brains. This perspective has the potential to radically reshape our approach to
    understanding neural systems, because the computations performed by deep networks are learned from experience,
    not endowed by the researcher. If so, how can neuroscientists use deep networks to model and understand biological
    brains? What is the outlook for neuroscientists who seek to characterise computations or neural codes, or who wish
    to understand perception, attention, memory, and executive functions? In this Perspective, our goal is to offer a
    roadmap for systems neuroscience research in the age of deep learning. We discuss the conceptual and methodological
    challenges of comparing behaviour, learning dynamics, and neural representation in artificial and biological
    systems. We highlight new research questions that have emerged for neuroscience as a direct consequence of recent
    advances in machine learning.
"""

def process_text(text):
    """Normalise whitespace in ``text`` to single spaces.

    Leading and trailing whitespace is removed and every run of whitespace
    (spaces, newlines, tabs) is collapsed into exactly one space.

    The previous implementation chained two ``replace("  ", " ")`` calls,
    which only halves a run of spaces twice: a newline followed by a 4-space
    indent (five spaces after the newline is replaced) still left a double
    space in the result. Splitting and re-joining handles runs of any length.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string; an empty string if ``text`` is empty or
        contains only whitespace.
    """
    # str.split() with no separator splits on any whitespace run and drops
    # empty leading/trailing pieces, so no separate strip() is needed.
    return " ".join(text.split())

def get_completion(abstract=text_to_rewrite, prompt=system_prompt):
    """Request a chat completion for ``abstract`` under ``prompt``.

    Both texts are passed through ``process_text`` and sent as the system and
    user messages to the model named by ``LLM_MODEL`` via the module-level
    ``client``, with temperature 0.4.

    Args:
        abstract: Text sent as the user message. Defaults to the value of
            ``text_to_rewrite`` at the time this function was defined.
        prompt: Text sent as the system message. Defaults to the value of
            ``system_prompt`` at the time this function was defined.

    Returns:
        The message content of the first choice in the response.
    """
    system_message = {"role": "system", "content": process_text(prompt)}
    user_message = {"role": "user", "content": process_text(abstract)}
    response = client.chat.completions.create(
        model=LLM_MODEL,
        messages=[system_message, user_message],
        temperature=0.4,
    )
    first_choice = response.choices[0]
    return first_choice.message.content

def shuffle_text_words(text):
    """Return the words of ``text`` in random order, joined by single spaces.

    The text is normalised with ``process_text`` and split on whitespace; the
    word list is then shuffled in place with ``np.random.shuffle``, which
    draws from NumPy's global random state.
    """
    tokens = process_text(text).split()
    np.random.shuffle(tokens)
    shuffled = " ".join(tokens)
    return shuffled

def remove_words(text, p=0.5):
    """Randomly drop words from ``text``.

    The text is normalised with ``process_text`` and split on whitespace. For
    each word one value is drawn with ``np.random.rand()``; the word is kept
    only when that value is greater than ``p``.

    Args:
        text: Input text.
        p: Threshold compared against each draw (default 0.5).

    Returns:
        The surviving words, in their original order, joined by single spaces.
    """
    kept = []
    for token in process_text(text).split():
        # Exactly one draw per word, in word order.
        if np.random.rand() > p:
            kept.append(token)
    return " ".join(kept)

def remove_stopwords(abstract):
    """Drop every word of ``abstract`` whose lowercase form is in ``stop_words``.

    Words are obtained by splitting on whitespace and compared as-is (apart
    from lowercasing), so a word with attached punctuation only matches if
    that exact string is in ``stop_words``.

    Returns:
        The remaining words, in their original order, joined by single spaces.
    """
    kept = []
    for token in abstract.split():
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return " ".join(kept)

In [7]:
# Fixed seed so that the row shuffle, the random word removal and the word
# shuffling give the same result on a fresh "Restart & Run All".
SEED = 42
# Percentages used as removal threshold p = pct / 100 for the
# removed_text_<pct> columns.
REMOVAL_PERCENTAGES = (25, 50, 75)

# remove_words() and shuffle_text_words() draw from NumPy's global random state.
np.random.seed(SEED)

eval_data = []
max_amount = np.inf  # stop after this many papers; np.inf processes every row
# shuffle the data
arxiv_df = arxiv_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

processed_ctr = 0
# Iterate over the two needed columns directly: iterrows() builds a Series per
# row, which is very slow on a frame of this size.
paper_columns = zip(arxiv_df['id'], arxiv_df['abstract'])
for paper_id, abstract in tqdm(paper_columns, total=arxiv_df.shape[0]):
    # rewritten_text = get_completion(abstract)
    removed_stopwords = remove_stopwords(abstract)
    record = {
        'id': paper_id,
        # 'rewritten_text': rewritten_text,
        'removed_stopwords': removed_stopwords,
    }
    # All removal variants are generated first, then the shuffled variants,
    # which keeps the column order: removed_text_25/50/75, then *_shuffled.
    for pct in REMOVAL_PERCENTAGES:
        record[f'removed_text_{pct}'] = remove_words(removed_stopwords, p=pct / 100)
    for pct in REMOVAL_PERCENTAGES:
        record[f'removed_text_{pct}_shuffled'] = shuffle_text_words(record[f'removed_text_{pct}'])
    eval_data.append(record)
    processed_ctr += 1
    if processed_ctr >= max_amount:
        break

# Build the frame once from the list of records.
eval_df = pd.DataFrame(eval_data)

  0%|          | 0/1496243 [00:00<?, ?it/s]

In [8]:
# Persist the evaluation variants to EVAL_DF_PATH as gzip-compressed parquet
# (pyarrow engine), without writing the DataFrame index.
eval_df.to_parquet(EVAL_DF_PATH, index=False, compression='gzip', engine='pyarrow')

In [9]:
# Display the evaluation frame: one row per paper id, one column per text variant.
eval_df

Unnamed: 0,id,removed_stopwords,removed_text_25,removed_text_50,removed_text_75,removed_text_25_shuffled,removed_text_50_shuffled,removed_text_75_shuffled
0,2105.13442,"successful enterprise attacks, adversaries oft...","attacks, adversaries gain access machines init...","successful attacks, adversaries often need add...",enterprise adversaries often access additional...,activity login present sequences access attack...,"likely algorithm, login scoring consisting Hop...",internal access movement generating Hop-per en...
1,2006.13169,use past experiences accelerate temporal diffe...,use experiences accelerate difference (TD) lea...,"use past experiences learning value functions,...",experiences experience key reinforcement Prior...,"empirically objective, shown learning likeliho...",experience apply replay experiences important ...,stationary errors reweighting policy gym buffe...
2,1703.02613,Covalent-organic frameworks (COFs) intriguing ...,frameworks (COFs) intriguing platforms designi...,Covalent-organic intriguing platforms function...,"frameworks (COFs) Here, computational hybrid d...",electronic intercalated-COFs. layers der energ...,properties platforms study (DOSs) properties p...,"adding strategy Fe (COFs) Here, porous Fe elec..."
3,astro-ph/0303591,propose new mechanism generate density perturb...,new mechanism generate density perturbations i...,propose new density inflationary Spatial fluct...,new inflationary models. inflaton matter lead ...,thus relations different field generate level ...,propose ones standard inflation spectrum obser...,matter perturbations new produced slow non-Gau...
4,2009.06293,propose realize ground state cooling magnomech...,propose realize ground state cooling magnomech...,propose cooling magnomechanical resonator (PT)...,ground state resonator parity-time cavity comp...,number magnomechanical system. propose resonat...,directly final magnomechanical controlled cool...,also Resorting magnomechanical ground external...
...,...,...,...,...,...,...,...,...
1496238,quant-ph/0502096,scattering problems perturbation theory applic...,scattering problems perturbation theory applic...,perturbation theory basically solve problem os...,ways problems reduce harmonic two-by-two probl...,physics examples combining possible Feynman's ...,two-by-two physics physics formulate special i...,harmonic different translated Special clear wa...
1496239,2110.08877,"paper deal $\NIL$ geometry, one homogeneous Th...","deal $\NIL$ geometry, homogeneous Thurston 3-g...","paper deal geometry, define ""surface triangle""...","paper 3-geometries. define ""surface show trian...",defined triangles using Apollonius condition u...,geodesic triangles using projective $\NIL$ use...,show 3-geometries. triangle define use paper s...
1496240,2211.08416,rapid growth computing powers recent advances ...,growth computing powers recent advances learni...,growth computing witnessed impressive novel ro...,"growth powers learning, witnessed novel robot ...",improve Sirius exhibit new operators state-of-...,"imperfections, research simulation 27% outperf...",situations. 8% deployments available state-of-...
1496241,1205.2936,analyze various possible superconducting pairi...,analyze various possible superconducting pairi...,various pairing states relative lightly graphe...,"analyze pairing that, attractive interaction p...",pairing topological possible spin-triplet leve...,scheme engineering level states relative exter...,pairing $p+\mathrm{i}p$ interaction scheme att...
