In [12]:
import os

# For hugging face models
from getpass import getpass
from huggingface_hub import login
from llama_index.llms.huggingface import HuggingFaceInferenceAPI
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

# For reader
from llama_index.core import SimpleDirectoryReader

# For index
from llama_index.core import VectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core import Settings

# For loading index from storage
from llama_index.core import StorageContext, load_index_from_storage

# For Query
from llama_index.core import get_response_synthesizer
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine

## Setup

In [13]:
hf_token = getpass("Enter your Hugging Face token: ")

In [None]:
llm_name = "mistralai/Mixtral-8x7B-Instruct-v0.1"
embed_model_name = "BAAI/bge-small-en-v1.5"

llm = HuggingFaceInferenceAPI(
    model_name=llm_name,
    token=hf_token
)

embed_model = HuggingFaceEmbedding(
    model_name=embed_model_name
)

print(llm)
print(embed_model)

In [25]:
storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context, embed_model=embed_model)

In [26]:
num_docs = len(index.docstore.docs)
print(f"Number of embedded documents: {num_docs}")

Number of embedded documents: 52


In [None]:
# !pip install gdown
# !gdown 1rtntaSqlpDMkINEzbCDwsAMSkr2V5bHM
# !unzip data.zip
# !rm data.zip

## Reader

In [8]:
data_path = "data"
documents = SimpleDirectoryReader(data_path).load_data()

len(documents)

503

## Indexing - Organizing

In [9]:
chunk_size = 2048
chunk_overlap = 256

text_splitter = SentenceSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap
)

Settings.text_splitter = text_splitter

index = VectorStoreIndex.from_documents(
    documents,
    transformations=[text_splitter],
    embed_model=embed_model,
    show_progress=True
)

Parsing nodes:   0%|          | 0/503 [00:00<?, ?it/s]

Parsing nodes: 100%|██████████| 503/503 [00:00<00:00, 1903.51it/s]
Generating embeddings: 100%|██████████| 523/523 [00:46<00:00, 11.37it/s]


In [10]:
for node in list(index.docstore.docs.values()):
    dic = dict(node)

    for key, item in dic.items():
        print(f"{key}: {item}")
        print("-"*100)

id_: 26e1d8be-74d0-4bf9-b775-76eeccc177b8
----------------------------------------------------------------------------------------------------
embedding: None
----------------------------------------------------------------------------------------------------
metadata: {'page_label': '1', 'file_name': 'ref (1).pdf', 'file_path': '/src/backend/notebook/data/ref (1).pdf', 'file_type': 'application/pdf', 'file_size': 5174013, 'creation_date': '2025-02-20', 'last_modified_date': '2025-02-06'}
----------------------------------------------------------------------------------------------------
excluded_embed_metadata_keys: ['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date']
----------------------------------------------------------------------------------------------------
excluded_llm_metadata_keys: ['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date']
--------------------------------------------

In [11]:
for key, value in index._vector_store.data.embedding_dict.items():
    print(f"Key: {key}")
    print(f"Value: {value}")
    print("-" * 100)

Key: 26e1d8be-74d0-4bf9-b775-76eeccc177b8
Value: [-0.004222783260047436, 0.017638877034187317, 0.029309473931789398, -0.04185701161623001, 0.06324352324008942, 0.0675775408744812, -0.03244836628437042, -0.030368994921445847, -0.027743684127926826, -0.049258239567279816, 0.036161843687295914, 0.038060158491134644, 0.043362412601709366, 0.049616698175668716, -0.0012102725449949503, 0.021492360159754753, -0.0257723405957222, -0.021838368847966194, 0.044584859162569046, 0.004343106411397457, 0.02044856734573841, -0.03502609208226204, -0.02349650301039219, -0.026508189737796783, -0.00011553012882359326, 0.0385521836578846, -0.010888678021728992, -0.06343822926282883, 0.017846550792455673, -0.1203339472413063, 0.009978877380490303, 0.03098621591925621, -0.01332702673971653, 0.018636168912053108, 0.00395943270996213, 0.0010071113938465714, 0.019538989290595055, -0.018963448703289032, -0.041085876524448395, -0.013511902652680874, 0.023801928386092186, 0.034491878002882004, 0.02336731180548668,

## Indexing - Storing

In [12]:
index.storage_context.persist(persist_dir="storage")

In [13]:
storage_path = "storage"

storage_context = StorageContext.from_defaults(
    persist_dir=storage_path
)

index = load_index_from_storage(
    storage_context,
    embed_model=embed_model
)

## Retriever

In [14]:
similarity_top_k = 20
similarity_cutoff = 0.5
max_selected_nodes = 8

retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=similarity_top_k
)

## Synthesizer

In [15]:
class SortedRetrieverQueryEngine(RetrieverQueryEngine):
    def retrieve(self, query):
        nodes = self.retriever.retrieve(query)

        # Filter out nodes with similarity_score below similarity_cutoff
        filtered_nodes = [
            node for node in nodes if node.score >= similarity_cutoff
        ]

        # Sort the remaining nodes in descending order of similarity_score
        sorted_nodes = sorted(
            filtered_nodes, key=lambda node: node.score, reverse=True
        )

        # Select at most `max_selected_nodes`
        result = sorted_nodes[:max_selected_nodes]
        return result

In [16]:
response_synthesizer = get_response_synthesizer(llm=llm)

query_engine = SortedRetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
    node_postprocessors=[]
)

In [17]:
response = query_engine.query("Abtract of DeepSeek-VL")

In [18]:
response

Response(response='\nDeepSeek-VL is an open-source Vision-Language (VL) Model designed for real-world vision and language understanding applications. It is structured around three key dimensions: (1) Data Construction, (2) Model Architecture, and (3) Training Strategy. The Data Construction emphasizes diverse, scalable, and real-world scenarios, including web screenshots, PDFs, OCR, charts, and knowledge-based content. The Model Architecture incorporates a hybrid vision encoder for efficient high-resolution image processing. The Training Strategy integrates LLM training from the beginning and carefully manages the competitive dynamics between vision and language modalities.\n\nThe visual module is designed to optimize the utilization of high-resolution visual inputs while remaining within a fixed token budget to manage inference costs effectively. The hybrid vision encoder combines a text-aligned encoder for coarse semantic extraction at 384 ×384 resolution with a high-resolution encod