In [1]:
import copy
from datetime import datetime
import itertools
import os
from typing import List

import matplotlib.pyplot as plt
import pandas as pd
from pydantic import BaseModel, Field
from transformers import AutoTokenizer

In [2]:
from pydantic_settings import BaseSettings, SettingsConfigDict

class EnvSettings(BaseSettings):
    model_config = SettingsConfigDict(
        env_file="../.env", env_file_encoding="utf-8", extra="ignore"
    )
    embedding_base_url: str
    embedding_api_key: str
    embedding_model: str
    embedding_model_dir: str
    
    sample_data_dir: str
    pipeline_src_dir: str
settings = EnvSettings()

import sys
sys.path.append(settings.pipeline_src_dir)

In [3]:
class DBSettings(BaseSettings):
    model_config = SettingsConfigDict(
        env_file="database/.env", env_file_encoding="utf-8", extra="ignore"
    )
    postgres_user: str
    postgres_password: str
    postgres_db: str
    postgres_url: str
    postgres_port: str

db_settings = DBSettings()
print(db_settings.postgres_db)

pgvector_llamaindex


In [4]:
## Load Sample
df = pd.read_parquet(settings.sample_data_dir)
df = df.sample(100)
print(df.shape, df.columns)

(100, 7) Index(['id', 'title', 'abstract', 'authors', 'published_date', 'link',
       'markdown'],
      dtype='object')


# 1. Prepare Embedding & Vectorstore

In [5]:
from llama_index.core import Settings
from llama_index.embeddings.text_embeddings_inference import (
    TextEmbeddingsInference,
)

from llama_index.core import SimpleDirectoryReader, StorageContext
from llama_index.core import VectorStoreIndex
from llama_index.vector_stores.postgres import PGVectorStore

from sqlalchemy import make_url

In [6]:
# https://docs.llamaindex.ai/en/stable/examples/embeddings/text_embedding_inference/
embed_model = TextEmbeddingsInference(
    model_name=settings.embedding_model,
    base_url=settings.embedding_base_url,
    timeout=60,
    embed_batch_size=10,
)
Settings.embed_model = embed_model

In [7]:
import psycopg2
# connection_string = "postgresql://{}:{}@localhost:{}/{}".format(
connection_string = "postgresql://{}:{}@localhost:{}/{}".format(
    db_settings.postgres_user,
    db_settings.postgres_password,
    db_settings.postgres_port,
    db_settings.postgres_db
)

db_name = db_settings.postgres_db
print(f"DB: {db_name}")
conn = psycopg2.connect(connection_string)
conn.autocommit=True

DB: pgvector_llamaindex


In [8]:
## Initialize vector store instance
url = make_url(connection_string)

## hnsw indexing config
hnsw_config = {
    "hnsw_m": 16,
    "hnsw_ef_construction": 64,
    "hnsw_ef_search": 40,
    "hnsw_dist_method": "vector_cosine_ops",
}
abstract_vector_store = PGVectorStore.from_params(
    database=db_name,
    host=url.host,
    password=url.password,
    port=url.port,
    user=url.username,
    table_name="paper_abstract",
    embed_dim=1024,  #bge-m3
    hnsw_kwargs=hnsw_config,
)

In [11]:
abstact_storage_index = VectorStoreIndex.from_vector_store(abstract_vector_store)

In [16]:
## Basic Retriever
retriever_args = {"similarity_top_k": 10}
retriever = abstact_storage_index.as_retriever(filters=[], **retriever_args)

In [17]:
## Simple query
query = "Retrieval Augmented Generation"
nodes = retriever.retrieve(query)
nodes

[NodeWithScore(node=TextNode(id_='7eb94f34-af50-4f8f-967a-6b779229ef2d', embedding=None, metadata={'paper_information_id': 4, 'published_date': '2023-08-25 19:35:58'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='7eb5f76f-a9a4-43ed-9b14-77f26e097a9e', node_type='4', metadata={'paper_information_id': 4, 'published_date': '2023-08-25 19:35:58'}, hash='302d9152de24a6f6fdac54b594c9f83f506776238daab61b8ab92d02a0bd1ec4')}, metadata_template='{key}: {value}', metadata_separator='\n', text="Title: Emulating Radiative Transfer with Artificial Neural Networks\nAbstract:\nForward-modeling observables from galaxy simulations enables direct\ncomparisons between theory and observations. To generate synthetic spectral\nenergy distributions (SEDs) that include dust absorption, re-emission, and\nscattering, Monte Carlo radiative transfer is often used in post-processing on\na galaxy-by-galaxy basis. However, this

In [18]:
for i, node in enumerate(nodes):
    ## Score
    score = node.score
    ## Source
    paper_info_id = node.metadata['paper_information_id']
    title = df.iloc[paper_info_id]["title"]
    abstract = df.iloc[paper_info_id]["abstract"]
    
    published_date = node.metadata["published_date"]
    print("Result {} - {:.3f}".format(i, score))
    print(paper_info_id, title, published_date)
    print(abstract[:100])
    print('-'*30)

Result 0 - 0.537
4 Subsurface cosmogenic and radiogenic production of ^{42}Ar 2023-08-25 19:35:58
Radioactive decays from ^{42}Ar and its progeny ^{42}K are potential
background sources in large-sca
------------------------------
Result 1 - 0.530
97 Vector Bundles over non-Hausdorff Manifolds 2023-10-01 18:50:29
In this paper we generalise the theory of real vector bundles to a certain
class of non-Hausdorff ma
------------------------------
Result 2 - 0.522
17 Nonequilibrium Seebeck and spin Seebeck effects in nanoscale junctions 2023-07-17 11:55:20
The spin-resolved thermoelectric transport properties of correlated nanoscale
junctions, consisting 
------------------------------
Result 3 - 0.508
1 Characterizing and correcting electron and hole trapping in germanium
  cross-strip detectors 2023-08-22 21:03:58
We present measurements of electron and hole trapping in three COSI germanium
cross-strip detectors.
------------------------------
Result 4 - 0.505
86 Incorporating Nonlocal Tra

In [21]:
## Metadata Filtering
from llama_index.core.vector_stores.types import (
    MetadataFilter,
    MetadataFilters,
)
retriever_args = {"similarity_top_k": 10}

## Filter documents published between 23.05.01 ~ 23.12.31
filters = MetadataFilters(
    filters=[
        MetadataFilter(
            key="published_date", value="2023-05-01", operator=">="
        ),
        MetadataFilter(
            key="published_date", value="2023-12-31", operator="<="
        ),
    ],
    condition="and",
)


retriever = abstact_storage_index.as_retriever(filters=filters, **retriever_args)

In [22]:
## Simple query
query = "Retrieval Augmented Generation"
nodes = retriever.retrieve(query)
nodes

[NodeWithScore(node=TextNode(id_='7eb94f34-af50-4f8f-967a-6b779229ef2d', embedding=None, metadata={'paper_information_id': 4, 'published_date': '2023-08-25 19:35:58'}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='7eb5f76f-a9a4-43ed-9b14-77f26e097a9e', node_type='4', metadata={'paper_information_id': 4, 'published_date': '2023-08-25 19:35:58'}, hash='302d9152de24a6f6fdac54b594c9f83f506776238daab61b8ab92d02a0bd1ec4')}, metadata_template='{key}: {value}', metadata_separator='\n', text="Title: Emulating Radiative Transfer with Artificial Neural Networks\nAbstract:\nForward-modeling observables from galaxy simulations enables direct\ncomparisons between theory and observations. To generate synthetic spectral\nenergy distributions (SEDs) that include dust absorption, re-emission, and\nscattering, Monte Carlo radiative transfer is often used in post-processing on\na galaxy-by-galaxy basis. However, this

In [24]:
for i, node in enumerate(nodes):
    ## Score
    score = node.score
    ## Source
    paper_info_id = node.metadata['paper_information_id']
    title = df.iloc[paper_info_id]["title"]
    abstract = df.iloc[paper_info_id]["abstract"]
    
    published_date = node.metadata["published_date"]
    print("Result {} - {:.3f}".format(i, score))
    print(paper_info_id, title, published_date)
    # print(abstract[:100])
    # print('-'*30)

Result 0 - 0.537
4 Subsurface cosmogenic and radiogenic production of ^{42}Ar 2023-08-25 19:35:58
Result 1 - 0.530
97 Vector Bundles over non-Hausdorff Manifolds 2023-10-01 18:50:29
Result 2 - 0.522
17 Nonequilibrium Seebeck and spin Seebeck effects in nanoscale junctions 2023-07-17 11:55:20
Result 3 - 0.508
1 Characterizing and correcting electron and hole trapping in germanium
  cross-strip detectors 2023-08-22 21:03:58
Result 4 - 0.502
13 Synthesizing and multiplexing autonomous quantum coherences 2023-06-27 02:46:08
Result 5 - 0.495
53 A thin plate approximation for ocean wave interactions with an ice shelf 2023-06-18 15:50:57
Result 6 - 0.495
14 Tunable optical multistability induced by a single cavity mode in cavity
  quantum electrodynamics system 2023-09-20 14:59:06
Result 7 - 0.478
34 Pre-Training to Learn in Context 2023-06-30 16:07:12
Result 8 - 0.475
30 Current Tomography -- Localization of void fractions in conducting
  liquids by measuring the induced magnetic flux densit