In [4]:
import os
from tqdm import tqdm
from datasets import load_dataset
import sys
sys.path.append('../evaluation')
from evaluate import RetrievalSystem, main as evaluate_main
import yaml
import hyde
import json
from vector_store import EmbeddingClient, Document, DocumentLoader
import filters
%load_ext autoreload
%autoreload 2

None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


In [3]:
import semantic_search

  from .autonotebook import tqdm as notebook_tqdm


/Users/christineye/anaconda3/envs/jsalt-retrieval/lib/python3.10/site-packages


In [5]:
# Embedding-based retriever with all three weighting flags switched on.
test_hyde = semantic_search.EmbeddingRetrievalSystem(
    weight_citation=True,
    weight_date=True,
    weight_keywords=True,
)

Loading embeddings...
Loading documents...
Loading index mapping...
Processing document dates...
Loading metadata...
Data loaded successfully.
Loading filters...


In [3]:
# HyDE retriever built from the on-disk vector store; rebinds `test_hyde`.
# NOTE(review): these are absolute, user-specific paths, while other cells in
# this notebook use paths relative to the notebook ('../data/...', '../config.yaml').
test_hyde = hyde.HydeRetrievalSystem(
    embeddings_path="/users/christineye/retrieval/data/vector_store/embeddings_matrix.npy",
    documents_path="/users/christineye/retrieval/data/vector_store/documents.pkl",
    index_mapping_path="/users/christineye/retrieval/data/vector_store/index_mapping.pkl",
    config_path="/users/christineye/retrieval/config.yaml",
    generate_n=3,
    embed_query=True,
    max_doclen=100,
)

Loading embeddings...
Loading documents...
Loading index mapping...
Processing document dates...
Loading metadata...
Data loaded successfully.


In [17]:
# Example request passed to the retrieve() calls in this notebook.
top_k = 10                 # third positional argument to retrieve()
arxiv_id = "2301.00001"    # second positional argument to retrieve()
query = "What is the observed stellar mass of the Milky Way?"

In [20]:
# Switch off the retriever's `weight_citation` flag, then run the example
# request and keep the returned list in `results`.
test_hyde.weight_citation = False
results = test_hyde.retrieve(query, arxiv_id, top_k)

In [19]:
# Display `results`. Executed as In [19], i.e. before the In [20] run of the
# cell that sets weight_citation = False and rebinds `results`.
results

['0909.4305',
 '2001.02651',
 '0801.1232',
 '1407.1078',
 '1608.00971',
 '2010.13801',
 '1910.03590',
 '2111.09327',
 '1406.7568',
 '1912.04296']

In [21]:
# Display `results` again. Executed as In [21], i.e. after the In [20] run
# that set weight_citation = False and rebound `results`.
results

['2001.02651',
 '2010.13801',
 '2111.09327',
 '1910.03590',
 '1912.04296',
 '1406.7568',
 '0909.4305',
 '1407.1078',
 '0801.1232',
 '1608.00971']

In [10]:
# Instantiate the keyword retriever from the local `filters` module with its
# default arguments.
keyword = filters.KeywordRetrievalSystem()

Loading existing index...
Index loaded successfully.


In [29]:
# Run the keyword retriever on the current `query`.
# `ne_only=True` presumably restricts matching to named entities / proper
# nouns — TODO confirm against filters.KeywordRetrievalSystem.retrieve.
# NOTE(review): the saved output mentions Fornax, so `query` appears to have
# held a different string when this cell was last executed — re-run to confirm.
keyword.retrieve(query, arxiv_id, top_k, ne_only = True)

keywords: ['fornax', 'fornax dwarf', 'dark matter', '']
proper nouns: ['fornax']
Retrieved documents: 116


['astro-ph0301446_arXiv.txt',
 'astro-ph9812028_arXiv.txt',
 'astro-ph0412130_arXiv.txt',
 'astro-ph0409265_arXiv.txt',
 'astro-ph9806331_arXiv.txt',
 'astro-ph9907021_arXiv.txt',
 'astro-ph0505186_arXiv.txt',
 'astro-ph0203490_arXiv.txt',
 'astro-ph0207504_arXiv.txt',
 'astro-ph0308202_arXiv.txt']

In [19]:
# Inspect the 'keyword' field stored for paper 1210.3157.
metadata['1210.3157']['keyword']

['galaxies: individual: Fornax dSph',
 'galaxies: kinematics and dynamics',
 'Local Group',
 'Astrophysics - Cosmology and Nongalactic Astrophysics']

In [11]:
# Fetch the "train" split of the astro paper corpus via `datasets.load_dataset`.
astro_meta = load_dataset(
    "JSALT2024-Astro-LLMs/astro_paper_corpus",
    split="train",
)

Downloading data: 100%|██████████| 238M/238M [00:09<00:00, 23.8MB/s]
Downloading data: 100%|██████████| 237M/237M [00:07<00:00, 29.8MB/s]
Downloading data: 100%|██████████| 240M/240M [00:08<00:00, 29.7MB/s]
Downloading data: 100%|██████████| 235M/235M [00:07<00:00, 32.5MB/s]
Downloading data: 100%|██████████| 233M/233M [00:07<00:00, 32.6MB/s]
Downloading data: 100%|██████████| 237M/237M [00:07<00:00, 30.7MB/s]
Generating train split: 100%|██████████| 271544/271544 [00:05<00:00, 54228.25 examples/s]


In [36]:
# Fields to copy into the metadata lookup: every dataset column except
# 'abstract', 'introduction' and 'conclusions'.
# The names are read from `astro_meta.column_names` rather than from a `paper`
# variable, which in this notebook is only bound by the loop in a later cell
# (so the original cell failed on a fresh kernel). Unlike list.remove(), the
# filter does not raise if one of the excluded columns is absent.
keys = [
    name
    for name in astro_meta.column_names
    if name not in ('abstract', 'introduction', 'conclusions')
]

In [42]:
# Build an in-memory lookup: arXiv id -> dict of the fields listed in `keys`.
# The loop is kept as a plain `for` (not a comprehension) because `paper`
# stays bound after the loop and the cell that defines `keys` reads it.
metadata = {}
for paper in astro_meta:
    id_str = paper['arxiv_id']
    # Copy only the selected fields for this record.
    metadata[id_str] = {key: paper[key] for key in keys}

In [13]:
# Read the saved metadata lookup from disk; this rebinds `metadata`.
with open('../data/vector_store/metadata.json', 'r') as metadata_file:
    metadata = json.load(metadata_file)

In [28]:
def make_keyword_index(metadata):
    """Build an inverted index from normalised keyword terms to arXiv ids.

    Every entry of a record's ``'keyword_search'`` list is lower-cased, split
    on whitespace, stripped of words contained in the module-level
    ``stopwords`` collection, and re-joined with single spaces; the result is
    the index term.

    Args:
        metadata: Mapping whose values are paper records exposing
            ``'keyword_search'`` (iterable of strings) and ``'arxiv_id'``.

    Returns:
        dict: term -> list of ``arxiv_id`` values in the iteration order of
        ``metadata``. A record contributes its id at most once per term, and
        keywords that reduce to an empty string are not indexed.
    """
    keyword_index = {}

    # Passing `total` lets tqdm render a real progress bar instead of a bare counter.
    for index in tqdm(metadata, total=len(metadata)):
        paper = metadata[index]
        terms = (
            ' '.join(word for word in keyword.lower().split() if word not in stopwords)
            for keyword in paper['keyword_search']
        )
        # Distinct keywords can collapse to the same term once stopwords are
        # removed (e.g. 'a core size' / 'the core size'); dict.fromkeys drops
        # those repeats while keeping first-seen order.
        for term in dict.fromkeys(terms):
            if not term:
                # The keyword consisted only of stopwords: nothing to index.
                continue
            keyword_index.setdefault(term, []).append(paper['arxiv_id'])

    return keyword_index

In [29]:
# Build the term -> arXiv-id index from the `metadata` currently in memory.
keyword_index = make_keyword_index(metadata)

271540it [00:11, 24324.32it/s]


In [69]:
# Write the keyword index to disk as JSON, replacing any existing file.
with open('../data/vector_store/keyword_index.json', 'w') as index_file:
    json.dump(keyword_index, index_file)

In [47]:
# `hyde_reranking` is not imported by the notebook's import cell, so import it
# here; without this the cell raises NameError on a fresh kernel.
import hyde_reranking

# HyDE + Cohere retriever configured from the repo-level config file.
test = hyde_reranking.HydeCohereRetrievalSystem(config_path="../config.yaml")

Loading embeddings...
Loading documents...
Loading index mapping...
Processing document dates...
Loading metadata...
Data loaded successfully.


In [57]:
# Switch off `weight_citation` on the retriever and display the results for the
# current `query`.
# NOTE(review): this ran as In [57], after the In [56] cell that rebinds
# `query`, so the saved output presumably reflects that later query — confirm.
test.weight_citation = False
test.retrieve(query, arxiv_id, top_k)

['0801.1023',
 '1711.01453',
 '2101.05821',
 '1001.3411',
 '1503.06065',
 '1703.08585',
 '1611.01545',
 '1510.06665',
 '2003.04925',
 '1908.00116']

In [56]:
# Rebind the global `query` (cells executed afterwards that pass `query` see
# this text), switch `weight_citation` on, and display the results.
query = "What are the primary computational methods used in modern cosmological simulations, and what are some notable examples of each approach?"
test.weight_citation = True
test.retrieve(query, arxiv_id, top_k)

['0801.1023',
 '1711.01453',
 '2101.05821',
 '1001.3411',
 '1703.08585',
 '1510.06665',
 '2003.04925',
 '1908.00116',
 'astro-ph0611863_arXiv.txt',
 'astro-ph0005502_arXiv.txt']

In [21]:
# Inspect the 'keyword_search' list for paper 1210.3157 — the field that
# make_keyword_index() iterates over.
metadata['1210.3157']['keyword_search']

['a cored dark matter halo',
 'a dark matter halo',
 'Fornax',
 'a constant density core',
 'the dark matter mass profile',
 'a core size',
 'the core size',
 'radius',
 'constraints',
 'a dark matter cusp',
 'Spheroidal',
 'order',
 'the Fornax dwarf Spheroidal',
 'first',
 'unrealistically large scale radii',
 'the concentration parameter',
 'Navarro-Frenk-White haloes',
 'its density distribution',
 'Navarro',
 'three distinct stellar subpopulations']