## Validating FAISS vector database library

In [None]:
import faiss
import numpy as np

In [None]:
# Create a FAISS IndexFlatL2 sized for `dimension`-component vectors.
dimension = 6
index = faiss.IndexFlatL2(dimension)

In [None]:
# Sample `numVectors` random vectors of length `dimension` and cast them to float32
# before they are added to the index.
numVectors = 5
vectors = np.random.rand(numVectors, dimension).astype('float32')

In [None]:
# Store the sample vectors in the flat index and report how many it now holds.
index.add(vectors)
print(f"Number of vectors in index: {index.ntotal}")

In [None]:
# Build an IndexIVFFlat that uses a separate flat L2 index as its quantizer.
nlist = 5 # number of centroids
quantizer = faiss.IndexFlatL2(dimension)
indexIVF = faiss.IndexIVFFlat(quantizer, dimension, nlist)
# Train only when the index reports that it has not been trained yet.
if not indexIVF.is_trained:
    indexIVF.train(vectors)
indexIVF.add(vectors)
# NOTE(review): `indexIVF` is not searched in any of the visible cells, and nlist equals
# numVectors here (one training sample per centroid) — presumably just a smoke test; confirm.

In [None]:
# Build one random query vector and fetch its k nearest neighbours from the flat index.
# The query is cast to float32 so that its dtype matches the vectors stored in `index`
# (a bare 'float' would produce float64).
queryVector = np.random.rand(1, dimension).astype('float32')
k = 5
distances, indices = index.search(queryVector, k)
print(f"Distances: {distances}")
print(f"Indices of nearest neighbors: {indices}")

In [None]:
# Show the query vector so the distances above can be checked by hand.
print(f"Query: {queryVector}")

In [None]:
# Cross-check the distances reported by FAISS against a direct NumPy computation.
print("Reference vectors")
print(indices)
# `distances[0]` is ordered by rank and runs parallel to `indices[0]`, so the two are paired
# by position; indexing `distances[0]` with the vector id would pick the wrong entry.
for ind, faiss_distance in zip(indices[0], distances[0]):
    referenceVector = vectors[ind]
    l2_distance = np.linalg.norm(referenceVector - queryVector)
    # IndexFlatL2 reports squared L2 distances, so compare against the squared norm.
    squared_distance = l2_distance ** 2
    print(f"{referenceVector}: {squared_distance} compared to {faiss_distance}")

## Utilizing PyMuPDF to extract text from PDF for vector encoding

In [None]:
import fitz

In [None]:
import re

In [None]:
import nltk
# Download the 'punkt_tab' resource; presumably the data that nltk.tokenize.sent_tokenize
# (used in parse_sentences below) relies on — TODO confirm.
nltk.download('punkt_tab')

In [None]:
def dilate_array(arr, kernel_size=1):
    """Binary dilation of a 1-D sequence of 0/1 values.

    An output element is 1 when any input element within `kernel_size` positions of it
    (on either side) equals 1; the window is truncated at the array boundaries.

    Args:
        arr: 1-D array-like of 0/1 values (NumPy array or plain sequence).
        kernel_size: half-width of the window; the full window spans 2 * kernel_size + 1.

    Returns:
        A new NumPy array with the same length and dtype as `np.asarray(arr)`.
    """
    # Coerce first: comparing a plain list slice with `== 1` yields a single False rather than
    # an element-wise result, which would make every output element 0.
    arr = np.asarray(arr)
    n = len(arr)
    result = np.zeros_like(arr)
    for i in range(n):
        start = max(0, i - kernel_size)
        end = min(n, i + kernel_size + 1)
        result[i] = 1.0 if np.any(arr[start:end] == 1) else 0.0
    return result

def erode_array(arr, kernel_size=1):
    """Binary erosion of a 1-D sequence of 0/1 values.

    An output element is 1 only when every input element within `kernel_size` positions of it
    (on either side) equals 1; the window is truncated at the array boundaries, so edge
    elements are judged on their in-bounds neighbours only.

    Args:
        arr: 1-D array-like of 0/1 values (NumPy array or plain sequence).
        kernel_size: half-width of the window; the full window spans 2 * kernel_size + 1.

    Returns:
        A new NumPy array with the same length and dtype as `np.asarray(arr)`.
    """
    # Coerce first: comparing a plain list slice with `== 1` yields a single False rather than
    # an element-wise result, which would make every output element 0.
    arr = np.asarray(arr)
    n = len(arr)
    result = np.zeros_like(arr)
    for i in range(n):
        start = max(0, i - kernel_size)
        end = min(n, i + kernel_size + 1)
        result[i] = 1.0 if np.all(arr[start:end] == 1) else 0.0
    return result

In [None]:
def GetTableOfContentsEstimator(linksPerPage, pageCount, lookback=5):
    """Estimate the index of the last table-of-contents / introduction page.

    Every page from index `lookback` onwards is scored by how sharply its link count falls
    relative to the mean of the `lookback` pages before it. Scores of 0.5 or more become
    candidate hits, short runs of hits are removed, and the last surviving hit is reported.

    Empirically, this seems to work across a few different types of PDF files.
    TODO: with more investigation, try turning this into a DP problem where up to some set
    percentage (maybe 15%) CAN be classified as an "Introduction" section, thus allowing each
    page to bid for their slot and disincentivizing other pages.

    Args:
        linksPerPage: number of links found on each page, in page order.
        pageCount: total number of pages in the document.
        lookback: number of preceding pages averaged for the comparison.

    Returns:
        The page index (int) of the last candidate hit, or None when there is no hit or the
        document has `lookback` pages or fewer.
    """
    if pageCount <= lookback:
        # Too short to fill a single look-back window. Report "nothing found" exactly like the
        # fall-through at the bottom does, so callers can rely on an int-or-None result
        # (an array here would make a plain truthiness test on the result raise).
        return None

    tableOfContentsEstimator = np.zeros(pageCount - lookback)
    averageLinks = np.mean(linksPerPage)
    for i in range(lookback, len(linksPerPage)):
        pastAverage = np.mean(linksPerPage[(i - lookback):i])
        current = linksPerPage[i]
        if current > pastAverage:
            continue  # link count went up: the score stays at its initial 0.0
        dropRatio = max((pastAverage - current), 1) / (pastAverage + 1e-6)
        magnitude = pastAverage / (pastAverage + averageLinks + 1e-6)
        # exponential decay with page index: later pages get a weight closer to zero
        frontBias = np.exp(-(5.0 / pageCount) * (i - lookback))
        tableOfContentsEstimator[i - lookback] = dropRatio * magnitude * frontBias

    binaryResult = np.where(tableOfContentsEstimator >= 0.5, 1, 0)
    # Morphological opening with k=1 (erosion followed by dilation): removes short runs of hits.
    binaryResult = dilate_array(erode_array(binaryResult, 1), 1)

    # The last position where binaryResult == 1 is likely the final page of the
    # introduction / table of contents.
    hits = np.flatnonzero(binaryResult == 1)
    if hits.size == 0:
        return None
    return int(hits[-1]) + lookback

In [None]:
def GetDocumentMetadata(document):
    """Collect page-level statistics used to decide which pages hold real content.

    Makes a single pass over the document, counting links and text blocks per page and
    tallying how often each page is the target of a link.

    Args:
        document: an opened document exposing `page_count` and `load_page(index)`, whose
            pages provide `get_text("blocks")` and `get_links()`.

    Returns:
        A 3-tuple `(lastTableOfContentsPage, likelyContentPages, averageBlocksPerPage)`:
        the result of GetTableOfContentsEstimator, an int array with 1 for every page between
        the first and last linked-to page inclusive (all ones when no page is linked to), and
        the mean number of blocks per page (0.0 for a document without pages).
    """
    pageCount = document.page_count
    linksPerPage = []
    blocksPerPage = []
    linksToPage = {}
    # once-over to compute metadata before storing any paragraph info
    for pageIndex in range(pageCount):
        page = document.load_page(pageIndex)
        links = page.get_links()
        linksPerPage.append(len(links))
        blocksPerPage.append(len(page.get_text("blocks")))
        for link in links:
            linkTo = link.get('page')
            # Ignore links without a page target and targets outside this document's page range
            # (a negative target would otherwise be read as an index counted from the end).
            # NOTE(review): presumably unresolved links carry such out-of-range values — confirm.
            if linkTo is None or not 0 <= linkTo < pageCount:
                continue
            linksToPage[linkTo] = linksToPage.get(linkTo, 0) + 1

    # approximate which pages may be part of the introduction / table of contents
    lastTableOfContentsPage = GetTableOfContentsEstimator(linksPerPage, pageCount, 5)

    # estimate which pages are the most likely content pages based on internal links
    if linksToPage:
        startingContentPage = min(linksToPage)
        endingContentPage = max(linksToPage)
        likelyContentPages = np.zeros(pageCount, dtype=int)
        # +1 so that the last linked-to page is itself marked as content
        likelyContentPages[startingContentPage:endingContentPage + 1] = 1
    else:
        likelyContentPages = np.ones(pageCount, dtype=int)

    # np.mean of an empty list is NaN (with a warning); report 0.0 for an empty document
    averageBlocksPerPage = np.mean(blocksPerPage) if blocksPerPage else 0.0
    return (lastTableOfContentsPage, likelyContentPages, averageBlocksPerPage)

In [None]:
def ExtractSentencesFromDocument(document, metadata):
    """Return the paragraph texts of the pages judged to hold real content.

    Args:
        document: an opened document exposing `page_count` and `load_page(index)`, whose
            pages provide `get_text("blocks")`; the text of a block is read from position 4.
        metadata: the tuple produced by GetDocumentMetadata:
            `(lastTableOfContentsPage, likelyContentPages, averageBlocksPerPage)`.

    Returns:
        A list of strings, one per retained block, with newlines replaced by spaces.
    """
    lastTableOfContentsPage, likelyContentPages, averageBlocksPerPage = metadata[:3]
    all_sentences = []
    for pageIndex in range(document.page_count):
        # Pages before the estimated end of the table of contents are skipped on their index
        # alone, so decide that before paying for loading and parsing the page.
        if lastTableOfContentsPage and pageIndex < lastTableOfContentsPage:
            continue
        blocks = document.load_page(pageIndex).get_text("blocks")
        # Pages outside the linked-to range are kept only when they are reasonably dense.
        if not likelyContentPages[pageIndex]:
            if not blocks or len(blocks) < averageBlocksPerPage / 2:
                continue
        for block in blocks:
            paragraph_text = block[4].strip().replace("\n", " ")
            # hacky, but remove all very-short phrases as they're likely not substantial content
            if paragraph_text.isnumeric() or len(paragraph_text) < 10:
                continue
            all_sentences.append(paragraph_text)
    return all_sentences

In [None]:
# Extract the paragraph texts from the sample PDF.
# The context manager closes the document even when metadata collection or extraction raises.
with fitz.open("./../data/Observability-Engineering.pdf") as doc:
    metadata = GetDocumentMetadata(doc)
    print(metadata)
    all_sentences = ExtractSentencesFromDocument(doc, metadata)

In [None]:
# Dump the extracted paragraph texts for manual inspection.
print(all_sentences)

In [None]:
# group adjacent sentences depending on punctuation
def sentence_groups(lines):
    """Yield lists of text fragments, one list per sentence.

    Each line is stripped and split on '.', '?' and '!'. Fragments accumulate across lines
    until a terminator is seen, at which point the accumulated group is yielded. Numeric
    fragments and fragments shorter than 10 characters are dropped. A terminator that
    arrives while the group is empty yields nothing.

    Args:
        lines: iterable of strings.

    Yields:
        Non-empty lists of fragment strings.
    """
    terminators = {'.', '?', '!'}
    group = []
    for w in lines:
        for part in re.split(r'([.?!])', w.strip()):
            if part in terminators:
                # Only flush when something was collected; the emptiness check has to apply
                # to all three terminators, not just '!'.
                if group:
                    yield group
                    group = []
                continue
            if not part.isnumeric() and len(part) >= 10:
                group.append(part)
    if group:
        yield group

In [None]:
def parse_sentences(text_list):
    """Join the given texts with spaces, split the result into sentences with NLTK, and
    return the stripped sentences that are longer than 10 characters."""
    joined = ' '.join(text_list)
    trimmed = (candidate.strip() for candidate in nltk.tokenize.sent_tokenize(joined))
    return [candidate for candidate in trimmed if len(candidate) > 10]

In [None]:
# Split the extracted paragraphs into sentences and print each one followed by a separator line.
sentencesProcessed = parse_sentences(all_sentences)
for sentence in sentencesProcessed:
    print(sentence)
    print('----')

In [None]:
#simplifiedSentences = [' '.join(group) for group in sentence_groups(all_sentences)]
#for sentence in simplifiedSentences:
#    print(sentence)
#    print('---')

## Utilizing SBERT library to perform semantic encoding of chunks

In [None]:
from sentence_transformers import SentenceTransformer

In [None]:
import torch

In [None]:
# Load the "all-MiniLM-L6-v2" SentenceTransformer model used for the notebook experiments.
model = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Embed every processed sentence; convert_to_tensor=True asks for a tensor result, which is
# what the similarity / torch.topk cell below consumes.
corpus_embeddings = model.encode_document(sentencesProcessed, convert_to_tensor=True)

In [None]:
# Sample queries about Medicare home-health coverage.
# NOTE(review): `queries` is reassigned in the following cell, so when the cells run in order
# this list is never used — presumably kept for a different source document; confirm.
queries = [
    "What are the eligibility requirements for Medicare home health services?",
    "What qualifies a patient as homebound under CMS guidelines?",
    "What skilled nursing services are considered reasonable and necessary?",
    "What conditions must be met for physical therapy to be covered?",
    "What must be included in the physician’s plan of care?",
    "What are the requirements for the physician face-to-face encounter?",
    "What documentation is needed to prove medical necessity for skilled services?",
    "How should changes to the plan of care be documented during the episode?",
    "Under what conditions are home health aide services covered?",
    "What are the supervision requirements for home health aides?"
]

In [None]:
# Sample queries about monitoring, test-driven development and observability; these replace
# any earlier value of `queries`.
queries = [
    "What types of monitoring are needed for companies that run a large portion of their own systems on low-level hardware?",
    "Describe some advantages of test-driven development regarding upkeep of a software product.",
    "How can observability be coupled with a development effort to prevent rolling back deployments?"
]

In [None]:
# For each query, print the best-scoring corpus sentences together with their neighbouring
# sentences for context.
top_k = 5
# torch.topk raises when k exceeds the number of scores, so never ask for more hits than
# there are sentences.
effective_k = min(top_k, len(sentencesProcessed))
for query in queries:
    query_embedding = model.encode_query(query, convert_to_tensor=True)

    # We use cosine-similarity and torch.topk to find the highest scores
    similarity_scores = model.similarity(query_embedding, corpus_embeddings)[0]
    scores, indices = torch.topk(similarity_scores, k=effective_k)

    print("\nQuery:", query)
    print(f"Top {effective_k} most similar sentences in corpus:")

    for score, idx in zip(scores, indices):
        idx = int(idx)
        # Clamp the start: for idx < 2 a negative start would be read as an offset from the
        # end of the list and produce an empty or unrelated context.
        context = sentencesProcessed[max(0, idx - 2):idx + 2]
        print(f"(Score: {score:.4f})", context)


## Set up in-memory file storage

In [None]:
import redis
import base64

In [None]:
# Redis client for database 0 on localhost:6379; the API endpoints below use it to hold
# uploaded PDFs and their derived sentences / embeddings.
redis_client = redis.Redis(host='localhost', port=6379, db=0)

## Setting up API endpoints for processing

In [None]:
from flask import Flask, jsonify, request, abort
import uuid
import io
import pickle

In [None]:
# Model instance used by the API endpoints.
# NOTE(review): this loads the same model name as `model` above — presumably a second copy
# in memory; confirm whether the two could be shared.
apiModel = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Flask application object that the route decorators below register their endpoints on.
app = Flask(__name__)

# POST endpoint to upload pdf
@app.route('/upload-pdf', methods=['POST'])
def upload_pdf():
    """Store an uploaded PDF in Redis for one hour and return the key it was stored under.

    Expects a multipart upload with the file under the 'pdf' field. Responds with 400 when
    the field is missing or the file object is falsy, otherwise with
    `{"document_key": <uuid4 string>}`.
    """
    uploaded = request.files.get('pdf')
    if uploaded is None:
        return jsonify({'error': 'Bad Request', 'message': 'pdf key not found in request.files'}), 400
    if not uploaded:
        return jsonify({'error': 'Bad Request', 'message': 'file not found in request'}), 400

    # A fresh UUID keeps uploads from overwriting each other.
    document_key = str(uuid.uuid4())
    # The payload is kept base64-encoded; the encode endpoint decodes it again.
    encoded_pdf = base64.b64encode(uploaded.read()).decode('utf-8')
    one_hour = 3600
    redis_client.setex(document_key, one_hour, encoded_pdf)
    return jsonify({"document_key": document_key})

# POST endpoint to build document model
@app.route('/encode-document', methods=['POST'])
def receive_document():
    """Extract sentences from a previously uploaded PDF, embed them, and cache both in Redis.

    Expects a JSON body `{"document_key": ...}` naming a PDF stored by the upload endpoint.
    On success the processed sentences are stored under `<key>_sentences` and their
    embeddings under `<key>_embeddings`, each pickled and kept for one hour.

    Returns:
        400 when the body is not a JSON object containing `document_key`, 404 when no PDF is
        stored under that key, 500 when processing or storage fails, otherwise
        `{"status": "success"}`.
    """
    # silent=True yields None for a missing or malformed JSON body instead of raising.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'document_key' not in data:
        return jsonify({'error': 'Bad Request', 'message': 'document_key not found in request json'}), 400
    document_key = data.get('document_key')
    document = redis_client.get(document_key)
    if not document:
        return jsonify({'error': 'Not Found', 'message': 'document not found in memory storage'}), 404
    try:
        pdf_data = base64.b64decode(document)
        # The context manager closes the document even when extraction raises.
        with fitz.open(stream=io.BytesIO(pdf_data), filetype="pdf") as doc:
            metadata = GetDocumentMetadata(doc)
            apiSentences = ExtractSentencesFromDocument(doc, metadata)
        apiSentencesProcessed = parse_sentences(apiSentences)
        apiCorpusEmbeddings = apiModel.encode_document(apiSentencesProcessed)
    except Exception as e:
        return jsonify({'error': f'PDF processing error: {str(e)}'}), 500
    try:
        # Both artefacts are written only after encoding succeeded, so a failed run does not
        # leave sentences behind without matching embeddings.
        # NOTE(security): these values are later read back with pickle.loads; that is only
        # safe while nothing untrusted can write to this Redis instance.
        redis_client.setex(f"{document_key}_sentences", 3600, pickle.dumps(apiSentencesProcessed))
        redis_client.setex(f"{document_key}_embeddings", 3600, pickle.dumps(apiCorpusEmbeddings))
    except Exception as e:
        return jsonify({'error': f'Storage error: {str(e)}'}), 500
    return jsonify({"status": "success"})

# POST endpoint to query model
@app.route('/query', methods=['POST'])
def hello_world():
    """Answer a free-text query against the cached embeddings of an encoded document.

    Expects a JSON body `{"query": ..., "document_key": ...}`. The query is embedded, the
    (up to) five most similar sentences are selected, and each hit is reported together
    with its neighbouring sentences as context.

    Returns:
        400 when the body is not a JSON object with both fields, 404 when the sentences or
        embeddings for the key are not in Redis, otherwise
        `{"status": "success", "output": <concatenated "(Score: x)context" fragments>}`.
    """
    # silent=True yields None for a missing or malformed JSON body instead of raising.
    data = request.get_json(silent=True)
    if not isinstance(data, dict) or 'query' not in data or 'document_key' not in data:
        return jsonify({'error': 'Bad Request', 'message': 'query and document_key are required in request json'}), 400
    query = data.get('query')
    document_key = data.get('document_key')
    document_embeddings = redis_client.get(f"{document_key}_embeddings")
    document_sentences = redis_client.get(f"{document_key}_sentences")
    if not document_embeddings or not document_sentences:
        return jsonify({'error': 'Not Found', 'message': 'document embeddings not found in memory storage'}), 404
    # NOTE(security): pickle.loads can execute arbitrary code; this is only safe while
    # nothing untrusted can write to this Redis instance.
    apiCorpusEmbeddings = pickle.loads(document_embeddings)
    apiSentencesProcessed = pickle.loads(document_sentences)

    # torch.topk raises when k exceeds the number of scores, so cap k at the corpus size.
    top_k = min(5, len(apiSentencesProcessed))
    if top_k == 0:
        return jsonify({"status": "success", "output": ""})
    apiQueryEmbedding = apiModel.encode_query(query, convert_to_tensor=True)

    # use cosine-similarity and torch.topk to find the highest scores
    cosineSimilarities = apiModel.similarity(apiQueryEmbedding, apiCorpusEmbeddings)[0]
    topScores, topIndices = torch.topk(cosineSimilarities, k=top_k)

    print("\nQuery:", query)
    print(f"Top {top_k} most similar sentences in corpus:")
    fragments = []
    for score, idx in zip(topScores, topIndices):
        # Index 0 is a legitimate hit (the first sentence), so it must not be filtered out;
        # the clamped slice start already protects against a negative offset.
        idx = int(idx)
        context = apiSentencesProcessed[max(0, idx - 2):idx + 2]
        fragments.append(f"(Score: {score:.4f})" + ' '.join(context))
        print(f"(Score: {score:.4f})", context)
    return jsonify({"status": "success", "output": ''.join(fragments)})

# Start the Flask server on port 5000 with debug mode off when this runs as the main module.
if __name__ == '__main__':
    app.run(debug=False, port=5000)