<a href="https://colab.research.google.com/github/harjeet88/A_For_Algorithms/blob/master/data_engg/spark_rag_faiss_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# 1. Install necessary libraries
# We use Pyspark for distribution, LangChain for the chunking logic,
# and sentence-transformers for the embedding model.
!pip install -q pyspark findspark langchain sentence-transformers pandas tqdm

In [98]:
# 2. Import findspark and initialize
import findspark
findspark.init()

# 3. Import PySpark components
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, explode, udf, monotonically_increasing_id, lit
from pyspark.sql.types import ArrayType, FloatType, StringType

# 4. Initialize Spark Session
# No master URL is set here, so Spark uses its default master
# (NOTE(review): presumably local[*], i.e. all local cores, when started from a
# plain Python process such as Colab -- confirm for other deployments).
# Driver memory is configured for stability in Colab.
spark = (
    SparkSession.builder
    .appName("DistributedRAGDemo")
    .config("spark.driver.memory", "4g")
    .getOrCreate()
)

print("Spark Session successfully created! Ready for distributed processing.")
# Leave the session as the last expression so the notebook displays its details
spark

Spark Session successfully created! Ready for distributed processing.


In [None]:
pip install -q PyPDF2 pymilvus

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/232.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/278.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.0/278.0 kB[0m [31m19.8 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os, os.path

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf, explode
from pyspark.sql.types import StringType, ArrayType, FloatType
from sentence_transformers import SentenceTransformer
import findspark
from PyPDF2 import PdfReader
from pyspark.sql import SparkSession

from pymilvus import connections, Collection

import findspark
import re

In [None]:
# Process-wide environment settings, applied in one call.
# NOTE(review): presumably these silence the PyArrow timezone warning, cap
# NumExpr at two threads, and disable the macOS fork-safety check -- confirm
# each one is still needed.
os.environ.update({
    'PYARROW_IGNORE_TIMEZONE': '1',
    'NUMEXPR_MAX_THREADS': '2',
    'NUMEXPR_NUM_THREADS': '2',
    'OBJC_DISABLE_INITIALIZE_FORK_SAFETY': 'YES',
})


In [None]:
# Number of characters of document text taken per chunk (the slice width and
# loop step used by extract_text_chunks).
CHUNK_SIZE = 1600
# Number of characters preceding a chunk's start that are prepended to every
# chunk after the first, so neighbouring chunks overlap.
CHUNK_OVERLAP = 50

In [None]:
# Define a UDF to extract text using PyPDF
def extract_text(file_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        file_path: Path of the PDF file handed to ``PdfReader``.

    Returns:
        A single string with the text of all pages joined without a separator;
        an empty string when the document has no pages.
    """
    reader = PdfReader(file_path)
    # Iterate the pages directly and join once instead of growing a string in
    # an index loop. `or ''` keeps a page that yields no text (None) from
    # raising a TypeError during concatenation.
    return ''.join(page.extract_text() or '' for page in reader.pages)

In [None]:
# Define the function to create embeddings
# Model used when the EMBEDDING_MODEL environment variable is unset or empty.
DEFAULT_EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

# SentenceTransformer instances already loaded, keyed by model name.
_EMBEDDING_MODELS = {}


def create_embedding(text):
    """Embed ``text`` with a SentenceTransformer model and return a list of floats.

    The model name is read from the ``EMBEDDING_MODEL`` environment variable and
    falls back to ``DEFAULT_EMBEDDING_MODEL``. Loaded models are kept in
    ``_EMBEDDING_MODELS`` so repeated calls reuse the same instance instead of
    reloading the model on every call (i.e. on every row when used as a UDF).

    Args:
        text: The string to embed, or None.

    Returns:
        The embedding as a list of Python floats, or None when ``text`` is None.
    """
    if text is None:
        # A null input row has nothing to embed; propagate the null.
        return None

    model_name = os.getenv('EMBEDDING_MODEL') or DEFAULT_EMBEDDING_MODEL
    model = _EMBEDDING_MODELS.get(model_name)
    if model is None:
        model = SentenceTransformer(model_name)
        _EMBEDDING_MODELS[model_name] = model

    # Request a NumPy array directly: calling .numpy() on a tensor fails when
    # the tensor does not live on the CPU.
    return model.encode(text, convert_to_numpy=True).tolist()

In [None]:
def extract_text_chunks(symbol, text):
    """Split ``text`` into overlapping chunks, each prefixed with a context line.

    Chunk starts are spaced ``CHUNK_SIZE`` characters apart. A chunk whose start
    offset is greater than ``CHUNK_OVERLAP`` also includes the ``CHUNK_OVERLAP``
    characters that precede it; otherwise it starts exactly at its offset.

    Args:
        symbol: Stock symbol inserted into the context line.
        text: Full document text to split.

    Returns:
        A list of strings (context line + chunk text); empty for empty text.
    """
    header = "Document contains context of " + symbol \
        + " and is relevant to the annual reports / financial statements/ 10-K SEC fillings\n"

    def window_start(offset):
        # Reach back into the previous chunk only once we are past the overlap width.
        return offset - CHUNK_OVERLAP if offset > CHUNK_OVERLAP else offset

    return [
        header + text[window_start(offset):offset + CHUNK_SIZE]
        for offset in range(0, len(text), CHUNK_SIZE)
    ]

In [None]:
def get_stock_symbol(file_name):
    """Extract the ticker from a name shaped like ``NASDAQ_<SYMBOL>_2022.pdf``.

    Args:
        file_name: File name or path to search.

    Returns:
        The 1-5 uppercase-letter symbol when the pattern is found anywhere in
        ``file_name``; otherwise the string "NA".
    """
    found = re.search(r'NASDAQ_([A-Z]{1,5})_2022\.pdf', file_name)
    return found.group(1) if found else "NA"

In [None]:
# Wrap each Python helper as a Spark UDF with its declared return type
extract_text_udf = udf(extract_text, StringType())
extract_text_chunks_udf = udf(extract_text_chunks, ArrayType(StringType()))
create_embedding_udf = udf(create_embedding, ArrayType(FloatType()))
get_stock_symbol_udf = udf(get_stock_symbol, StringType())

# Register the UDFs by name on the session.
# Note that the embedding UDF is registered under the plural "create_embeddings".
spark.udf.register("extract_text", extract_text_udf)
spark.udf.register("extract_text_chunks", extract_text_chunks_udf)
spark.udf.register("create_embeddings", create_embedding_udf)
spark.udf.register("get_stock_symbol", get_stock_symbol_udf)


<pyspark.sql.udf.UserDefinedFunction at 0x7f3014469df0>

In [None]:
def get_embedded_chunks(pdf_directory):
    """Build a Spark DataFrame of embedded text chunks for the PDFs in a directory.

    Args:
        pdf_directory: Directory whose entries ending in ".pdf" are ingested.

    Returns:
        A DataFrame with the columns ``stock_symbol``, ``file_path``,
        ``chunked_text`` and ``embedded_vectors``, one row per text chunk.
        This function only chains transformations; it calls no Spark action itself.
    """
    pdf_file_paths = []
    for entry in os.listdir(pdf_directory):
        if entry != '.DS_Store':
            # Every entry other than .DS_Store is echoed, but only PDFs are kept.
            print(entry)
            if entry.endswith(".pdf"):
                pdf_file_paths.append(os.path.join(pdf_directory, entry))

    print("Creating dataframe with file paths")
    # One row per PDF path, plus the stock symbol parsed from that path
    paths_df = spark.createDataFrame(pdf_file_paths, "string").toDF("file_path")
    symbols_df = paths_df.select(
        'file_path',
        get_stock_symbol_udf('file_path').alias('stock_symbol'),
    )

    print("Extracting text from PDF files")
    # Add the full document text of each PDF as a single string column
    text_df = symbols_df.withColumn("text", extract_text_udf("file_path"))

    print("Chunking text into chunks")
    # Add an array column holding the chunks of each document
    chunk_array_df = text_df.withColumn(
        "relevant_text",
        extract_text_chunks_udf("stock_symbol", "text"),
    )

    print("Break text into individual row per page using explode()")
    # explode() turns the chunk array into one row per chunk
    chunk_rows_df = chunk_array_df.select(
        'stock_symbol',
        'file_path',
        explode(chunk_array_df.relevant_text).alias('chunked_text'),
    )

    print("Converting into embeddings")
    # Add the embedding vector computed from each chunk
    embedded_df = chunk_rows_df.withColumn(
        "embedded_vectors",
        create_embedding_udf("chunked_text"),
    )
    print("returning chunked text data")
    return embedded_df

In [None]:
# NOTE(review): this import requires autofaiss to be installed before this cell runs.
from autofaiss import build_index

# Directory that holds the index artifacts.
INDEX_DIR = "my_spark_index"
# Locations passed to build_index as index_path and index_infos_path.
index_path = f"{INDEX_DIR}/knn.index"
index_infos_path = f"{INDEX_DIR}/index_infos.json"



In [None]:
# Build the index from the embeddings found under EMBED_DIR.
# NOTE(review): EMBED_DIR is only assigned in a later cell, so on a fresh
# top-to-bottom run this cell raises NameError; it duplicates the build_index
# cells further down.
# Autofaiss handles index type selection and optimization automatically
build_index(
    embeddings=EMBED_DIR,
    index_path=index_path,
    index_infos_path=index_infos_path,
    max_index_memory_usage="2G", # NOTE(review): presumably the cap on the built index's size rather than a per-worker limit -- confirm in the autofaiss docs
    current_memory_available="4G" # Total memory available (adjust as needed)
)

In [None]:
def ingest_data():
    """Run the PDF ingestion pipeline and print a preview of the result.

    Builds the embedded-chunk DataFrame for "./rag-spark/data/annual_reports"
    via ``get_embedded_chunks`` and calls ``show()`` on it, which prints the
    first rows as a table. The FAISS indexing step is still a placeholder.

    Returns:
        None.
    """
    print("PDF ingestion started...")
    chunked_data = get_embedded_chunks("./rag-spark/data/annual_reports")
    print("PDF ingestion completed...")
    # The original line was the incomplete attribute access `chunked_data.s`,
    # which fails at runtime; show() prints the preview table instead.
    chunked_data.show()
    print("PDF ingestion started...")
    # TODO: initialise FAISS and index the embedded chunks here. Until then the
    # two messages around this placeholder do not describe any real work.

    print("PDF ingestion completed...")

In [None]:
# Create the directory that ingest_data() reads the annual-report PDFs from.
!mkdir -p rag-spark/data/annual_reports

In [None]:
!curl https://github.com/nairnavin/ml-playground/tree/main/rag-spark/data/annual_reports/NASDAQ_AAPL_2022.pdf -o rag-spark/data/annual_reports

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0


In [None]:
ingest_data()

PDF ingestion started...
Creating dataframe with file paths
Extracting text from PDF files
Chunking text into chunks
Break text into individual row per page using explode()
Converting into embeddings
returning chunked text data
PDF ingestion completed...
+------------+---------+------------+----------------+
|stock_symbol|file_path|chunked_text|embedded_vectors|
+------------+---------+------------+----------------+
+------------+---------+------------+----------------+

PDF ingestion started...


MilvusException: <MilvusException: (code=2, message=Fail connecting to server on localhost:19530, illegal connection params or server unavailable)>

In [99]:
!pip install -q autofaiss

In [100]:
import numpy as np
import os
from pyspark.sql import SparkSession

# Create the directories for the embeddings and for the index built from them
EMBED_DIR = "spark_embeddings"
INDEX_DIR = "my_spark_index"
os.makedirs(EMBED_DIR, exist_ok=True)
os.makedirs(INDEX_DIR, exist_ok=True)

# Shape of the sample data, and a fixed seed so that re-running the notebook
# regenerates exactly the same vectors.
N_VECTORS = 1000
EMBED_DIM = 100
EMBEDDING_SEED = 42

# Generate sample embeddings: N_VECTORS float32 vectors of EMBED_DIM dimensions,
# uniformly distributed in [0, 1)
embeddings = np.random.default_rng(EMBEDDING_SEED).random(
    (N_VECTORS, EMBED_DIM), dtype=np.float32
)

# Save the embeddings to a numpy file in the directory
np.save(f"{EMBED_DIR}/part1.npy", embeddings)
print(f"Saved {embeddings.shape[0]} embeddings to {EMBED_DIR}/part1.npy")

Saved 1000 embeddings to spark_embeddings/part1.npy


In [101]:
from autofaiss import build_index

# Locations passed to build_index as index_path and index_infos_path;
# INDEX_DIR comes from the cell that creates the embedding and index directories.
index_path = f"{INDEX_DIR}/knn.index"
index_infos_path = f"{INDEX_DIR}/index_infos.json"

OSError: Unable to load libjvm
dlopen(/usr/lib/jvm/default-java/libjvm.so) failed: /usr/lib/jvm/default-java/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/default-java//lib/server/libjvm.so) failed: /usr/lib/jvm/default-java//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/default-java//jre/lib/amd64/server/libjvm.so) failed: /usr/lib/jvm/default-java//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/default-java//lib/amd64/server/libjvm.so) failed: /usr/lib/jvm/default-java//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java/libjvm.so) failed: /usr/lib/jvm/java/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java//lib/server/libjvm.so) failed: /usr/lib/jvm/java//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java//jre/lib/amd64/server/libjvm.so) failed: /usr/lib/jvm/java//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java//lib/amd64/server/libjvm.so) failed: /usr/lib/jvm/java//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/libjvm.so) failed: /usr/lib/jvm/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm//lib/server/libjvm.so) failed: /usr/lib/jvm//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm//jre/lib/amd64/server/libjvm.so) failed: /usr/lib/jvm//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm//lib/amd64/server/libjvm.so) failed: /usr/lib/jvm//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib64/jvm/libjvm.so) failed: /usr/lib64/jvm/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib64/jvm//lib/server/libjvm.so) failed: /usr/lib64/jvm//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib64/jvm//jre/lib/amd64/server/libjvm.so) failed: /usr/lib64/jvm//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib64/jvm//lib/amd64/server/libjvm.so) failed: /usr/lib64/jvm//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/default-java/libjvm.so) failed: /usr/local/lib/jvm/default-java/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/default-java//lib/server/libjvm.so) failed: /usr/local/lib/jvm/default-java//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/default-java//jre/lib/amd64/server/libjvm.so) failed: /usr/local/lib/jvm/default-java//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/default-java//lib/amd64/server/libjvm.so) failed: /usr/local/lib/jvm/default-java//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java/libjvm.so) failed: /usr/local/lib/jvm/java/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java//lib/server/libjvm.so) failed: /usr/local/lib/jvm/java//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java//jre/lib/amd64/server/libjvm.so) failed: /usr/local/lib/jvm/java//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java//lib/amd64/server/libjvm.so) failed: /usr/local/lib/jvm/java//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/libjvm.so) failed: /usr/local/lib/jvm/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm//lib/server/libjvm.so) failed: /usr/local/lib/jvm//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm//jre/lib/amd64/server/libjvm.so) failed: /usr/local/lib/jvm//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm//lib/amd64/server/libjvm.so) failed: /usr/local/lib/jvm//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib64/jvm/libjvm.so) failed: /usr/local/lib64/jvm/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib64/jvm//lib/server/libjvm.so) failed: /usr/local/lib64/jvm//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib64/jvm//jre/lib/amd64/server/libjvm.so) failed: /usr/local/lib64/jvm//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib64/jvm//lib/amd64/server/libjvm.so) failed: /usr/local/lib64/jvm//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-8-openjdk-amd64/libjvm.so) failed: /usr/local/lib/jvm/java-8-openjdk-amd64/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-8-openjdk-amd64//lib/server/libjvm.so) failed: /usr/local/lib/jvm/java-8-openjdk-amd64//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-8-openjdk-amd64//jre/lib/amd64/server/libjvm.so) failed: /usr/local/lib/jvm/java-8-openjdk-amd64//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-8-openjdk-amd64//lib/amd64/server/libjvm.so) failed: /usr/local/lib/jvm/java-8-openjdk-amd64//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-8-openjdk-amd64/libjvm.so) failed: /usr/lib/jvm/java-8-openjdk-amd64/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-8-openjdk-amd64//lib/server/libjvm.so) failed: /usr/lib/jvm/java-8-openjdk-amd64//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-8-openjdk-amd64//jre/lib/amd64/server/libjvm.so) failed: /usr/lib/jvm/java-8-openjdk-amd64//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-8-openjdk-amd64//lib/amd64/server/libjvm.so) failed: /usr/lib/jvm/java-8-openjdk-amd64//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-7-openjdk-amd64/libjvm.so) failed: /usr/local/lib/jvm/java-7-openjdk-amd64/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-7-openjdk-amd64//lib/server/libjvm.so) failed: /usr/local/lib/jvm/java-7-openjdk-amd64//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-7-openjdk-amd64//jre/lib/amd64/server/libjvm.so) failed: /usr/local/lib/jvm/java-7-openjdk-amd64//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-7-openjdk-amd64//lib/amd64/server/libjvm.so) failed: /usr/local/lib/jvm/java-7-openjdk-amd64//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-7-openjdk-amd64/libjvm.so) failed: /usr/lib/jvm/java-7-openjdk-amd64/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-7-openjdk-amd64//lib/server/libjvm.so) failed: /usr/lib/jvm/java-7-openjdk-amd64//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-7-openjdk-amd64//jre/lib/amd64/server/libjvm.so) failed: /usr/lib/jvm/java-7-openjdk-amd64//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-7-openjdk-amd64//lib/amd64/server/libjvm.so) failed: /usr/lib/jvm/java-7-openjdk-amd64//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-6-openjdk-amd64/libjvm.so) failed: /usr/local/lib/jvm/java-6-openjdk-amd64/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-6-openjdk-amd64//lib/server/libjvm.so) failed: /usr/local/lib/jvm/java-6-openjdk-amd64//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-6-openjdk-amd64//jre/lib/amd64/server/libjvm.so) failed: /usr/local/lib/jvm/java-6-openjdk-amd64//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-6-openjdk-amd64//lib/amd64/server/libjvm.so) failed: /usr/local/lib/jvm/java-6-openjdk-amd64//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-6-openjdk-amd64/libjvm.so) failed: /usr/lib/jvm/java-6-openjdk-amd64/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-6-openjdk-amd64//lib/server/libjvm.so) failed: /usr/lib/jvm/java-6-openjdk-amd64//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-6-openjdk-amd64//jre/lib/amd64/server/libjvm.so) failed: /usr/lib/jvm/java-6-openjdk-amd64//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-6-openjdk-amd64//lib/amd64/server/libjvm.so) failed: /usr/lib/jvm/java-6-openjdk-amd64//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-7-oracle/libjvm.so) failed: /usr/lib/jvm/java-7-oracle/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-7-oracle//lib/server/libjvm.so) failed: /usr/lib/jvm/java-7-oracle//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-7-oracle//jre/lib/amd64/server/libjvm.so) failed: /usr/lib/jvm/java-7-oracle//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-7-oracle//lib/amd64/server/libjvm.so) failed: /usr/lib/jvm/java-7-oracle//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-8-oracle/libjvm.so) failed: /usr/lib/jvm/java-8-oracle/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-8-oracle//lib/server/libjvm.so) failed: /usr/lib/jvm/java-8-oracle//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-8-oracle//jre/lib/amd64/server/libjvm.so) failed: /usr/lib/jvm/java-8-oracle//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-8-oracle//lib/amd64/server/libjvm.so) failed: /usr/lib/jvm/java-8-oracle//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-6-oracle/libjvm.so) failed: /usr/lib/jvm/java-6-oracle/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-6-oracle//lib/server/libjvm.so) failed: /usr/lib/jvm/java-6-oracle//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-6-oracle//jre/lib/amd64/server/libjvm.so) failed: /usr/lib/jvm/java-6-oracle//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/java-6-oracle//lib/amd64/server/libjvm.so) failed: /usr/lib/jvm/java-6-oracle//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-7-oracle/libjvm.so) failed: /usr/local/lib/jvm/java-7-oracle/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-7-oracle//lib/server/libjvm.so) failed: /usr/local/lib/jvm/java-7-oracle//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-7-oracle//jre/lib/amd64/server/libjvm.so) failed: /usr/local/lib/jvm/java-7-oracle//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-7-oracle//lib/amd64/server/libjvm.so) failed: /usr/local/lib/jvm/java-7-oracle//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-8-oracle/libjvm.so) failed: /usr/local/lib/jvm/java-8-oracle/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-8-oracle//lib/server/libjvm.so) failed: /usr/local/lib/jvm/java-8-oracle//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-8-oracle//jre/lib/amd64/server/libjvm.so) failed: /usr/local/lib/jvm/java-8-oracle//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-8-oracle//lib/amd64/server/libjvm.so) failed: /usr/local/lib/jvm/java-8-oracle//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-6-oracle/libjvm.so) failed: /usr/local/lib/jvm/java-6-oracle/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-6-oracle//lib/server/libjvm.so) failed: /usr/local/lib/jvm/java-6-oracle//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-6-oracle//jre/lib/amd64/server/libjvm.so) failed: /usr/local/lib/jvm/java-6-oracle//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/local/lib/jvm/java-6-oracle//lib/amd64/server/libjvm.so) failed: /usr/local/lib/jvm/java-6-oracle//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/default/libjvm.so) failed: /usr/lib/jvm/default/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/default//lib/server/libjvm.so) failed: /usr/lib/jvm/default//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/default//jre/lib/amd64/server/libjvm.so) failed: /usr/lib/jvm/default//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/lib/jvm/default//lib/amd64/server/libjvm.so) failed: /usr/lib/jvm/default//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/java/latest/libjvm.so) failed: /usr/java/latest/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/java/latest//lib/server/libjvm.so) failed: /usr/java/latest//lib/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/java/latest//jre/lib/amd64/server/libjvm.so) failed: /usr/java/latest//jre/lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory
dlopen(/usr/java/latest//lib/amd64/server/libjvm.so) failed: /usr/java/latest//lib/amd64/server/libjvm.so: cannot open shared object file: No such file or directory

In [102]:
# Build the index from the .npy embeddings saved under EMBED_DIR.
# NOTE(review): the recorded run of this cell failed with
# "TypeError: Object of type float32 is not JSON serializable"; the same call
# succeeds once the JSON encoder patch defined in a later cell has been applied.
# Autofaiss handles index type selection and optimization automatically
build_index(
    embeddings=EMBED_DIR,
    index_path=index_path,
    index_infos_path=index_infos_path,
    max_index_memory_usage="2G", # NOTE(review): presumably the cap on the built index's size rather than a per-worker limit -- confirm in the autofaiss docs
    current_memory_available="4G" # Total memory available (adjust as needed)
)

100%|██████████| 1/1 [00:00<00:00, 7423.55it/s]
100%|██████████| 1/1 [00:00<00:00, 24.26it/s]
  0%|          | 0/1 [00:00<?, ?it/s]


TypeError: Object of type float32 is not JSON serializable

In [104]:
import json
import numpy as np

# Extend JSONEncoder to handle NumPy types
class NpEncoder(json.JSONEncoder):
    """JSON encoder that converts NumPy integers, floats and arrays to builtins.

    Use it explicitly, e.g. ``json.dumps(obj, cls=NpEncoder)``.
    """

    # (NumPy type, converter) pairs, tried in order.
    _CONVERTERS = (
        (np.integer, int),
        (np.floating, float),
        (np.ndarray, lambda array: array.tolist()),
    )

    def default(self, obj):
        """Return a JSON-serializable stand-in for ``obj``.

        Anything that is not a handled NumPy type is delegated to the base
        class, which raises TypeError.
        """
        for numpy_type, convert in self._CONVERTERS:
            if isinstance(obj, numpy_type):
                return convert(obj)
        return super().default(obj)

# Why defining NpEncoder is not enough on its own:
# the TypeError is raised by a json.dump call made inside autofaiss, and
# build_index offers no way to pass `cls=NpEncoder` to that call. A custom
# encoder class only helps for JSON that this notebook serializes itself, and
# merely defining it (or calling json.dumps with it once) does not "register" it
# for other callers.
#
# Workaround: monkey-patch the `default` method of json.JSONEncoder, which is the
# hook the standard encoder falls back to for objects it cannot serialize.
# This only helps as long as autofaiss uses the standard encoder; if it ever
# hardcodes a different one, the fix has to come from autofaiss itself
# (a configuration option, an update, or a change to its source).
# build_index is re-run after the patch is applied.

def monkey_patch_json():
    """Make ``json.JSONEncoder.default`` convert NumPy values to builtins.

    After the patch, NumPy booleans, floats, integers and arrays are converted
    to ``bool``, ``float``, ``int`` and ``list``; every other object is passed
    to the previous ``default`` implementation, which raises TypeError.

    The patch is idempotent: calling this function again (for example by
    re-running the cell) leaves the already-installed patch in place instead of
    wrapping it in another layer.
    """
    current_default = json.JSONEncoder.default
    if getattr(current_default, "_numpy_aware", False):
        # Already patched; wrapping again would only deepen the call chain.
        return

    def numpy_aware_default(self, obj):
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return current_default(self, obj)

    # Marker consulted by the idempotence guard above.
    numpy_aware_default._numpy_aware = True
    json.JSONEncoder.default = numpy_aware_default

monkey_patch_json()

print("JSON encoder patched to handle NumPy float32 types.")

JSON encoder patched to handle NumPy float32 types.


In [105]:
# Build the index from the .npy embeddings saved under EMBED_DIR.
# This is the same call as in the earlier build cell, re-run after the JSON
# encoder patch has been applied.
# Autofaiss handles index type selection and optimization automatically
build_index(
    embeddings=EMBED_DIR,
    index_path=index_path,
    index_infos_path=index_infos_path,
    max_index_memory_usage="2G", # NOTE(review): presumably the cap on the built index's size rather than a per-worker limit -- confirm in the autofaiss docs
    current_memory_available="4G" # Total memory available (adjust as needed)
)

100%|██████████| 1/1 [00:00<00:00, 6626.07it/s]
100%|██████████| 1/1 [00:00<00:00, 20.38it/s]
  0%|          | 0/1 [00:00<?, ?it/s]


(<faiss.swigfaiss_avx2.IndexHNSWFlat; proxy of <Swig Object of type 'faiss::IndexHNSWFlat *' at 0x7f2fd847cc00> >,
 {'index_key': 'HNSW15',
  'index_param': 'efSearch=16383',
  'index_path': 'my_spark_index/knn.index',
  'size in bytes': 536150,
  'avg_search_speed_ms': np.float64(0.23234883349641677),
  '99p_search_speed_ms': np.float64(0.5280300893082313),
  'reconstruction error %': np.float32(0.0),
  'nb vectors': 1000,
  'vectors dimension': 100,
  'compression ratio': 0.7460598713046722})

In [106]:
import faiss
import glob

# The index file might have a different name depending on the autofaiss
# version/strategy, so it is discovered by extension. Sorting keeps the choice
# stable when several files match.
index_files = sorted(glob.glob(f"{INDEX_DIR}/*.index"))
if not index_files:
    # Fail with an actionable message instead of a bare IndexError.
    raise FileNotFoundError(
        f"No .index file found in {INDEX_DIR!r}; run the build_index cell first."
    )
index_file = index_files[0]
my_index = faiss.read_index(index_file)

# Perform a search with a random query vector. Its width is taken from the
# index dimension (my_index.d) so it always matches the indexed vectors.
query_vector = np.random.rand(1, my_index.d).astype(np.float32)
k = 5 # Number of nearest neighbors to retrieve
distances, indices = my_index.search(query_vector, k)

print(f"Top {k} nearest neighbors (indices): {indices}")
print(f"Distances: {distances}")

Top 5 nearest neighbors (indices): [[485 248 751 890 261]]
Distances: [[28.397993 28.316391 28.11125  28.058743 27.955925]]
