In [1]:
import os
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from datasets import load_dataset
from qdrant_client import QdrantClient
from qdrant_client.http import models

load_dotenv()

True

In [2]:
data = load_dataset("llamafactory/PubMedQA", split='train')
data = data.to_pandas()
data.head()

Unnamed: 0,instruction,input,output
0,Answer the question based on the following con...,Question: Is naturopathy as effective as conve...,Naturopathy appears to be an effective alterna...
1,Answer the question based on the following con...,Question: Can randomised trials rely on existi...,Routine data have the potential to support hea...
2,Answer the question based on the following con...,Question: Is laparoscopic radical prostatectom...,The results of our non-randomized study show t...
3,Answer the question based on the following con...,Question: Does bacterial gastroenteritis predi...,Symptoms consistent with IBS and functional di...
4,Answer the question based on the following con...,Question: Is early colonoscopy after admission...,No significant association is apparent between...


In [3]:
MAX_ROWS = 1000
OUTPUT="output"
subset_data = data.head(MAX_ROWS)

In [4]:
client = QdrantClient(
    os.getenv("QDRANT_HOST"),
    api_key=os.getenv("QDRANT_API_KEY")
)

chunks = subset_data[OUTPUT].to_list()

client.add(
    collection_name="pubmedqa",
    documents=chunks
)

['bd9349e7ffd54705a248771b5d7f1bc5',
 'cd24841d6421490d804b35d1f894730d',
 'db279d28e91041f8abe660dfebfc86ed',
 'ddc0985fe0ec47c7a9ef0f2dea4c856b',
 'f50239e431554b3ca7dd931ed81c4177',
 'afd638458e46427fb1f2e30b06f771ee',
 '5e885a126f5a45e2b1bf65838d79fa9b',
 '0a1283c5034e4befa92208d97d0884b3',
 '341e0c150d52403daafdbd3cecefe728',
 'f3638ed3985e4cfca1c0886cae82ec7e',
 '261218d43a6e40ad8d5b95bdd4d46998',
 'db01bcd7b0e646e3a1cae51301e0dac5',
 'f8fb42c4557d4ee68353965f8f4b350a',
 'acc5a063d6034862a3049e779d525bf4',
 '77084b3fcc7d4f2381172c3e2277d81f',
 '0f4e83faf5c3429fa529df4fd04ecb34',
 'f5ab9a34e3ae483d8cae767382adec8f',
 'aea02c905f594724a66ba046f4380285',
 'f1fb86d275474d1f9c6e03ecc227e70e',
 '19c11f3878ee49c28981b41be482e679',
 '9fc9a145d98d450592b8377e0cc75a4e',
 '1eca2d35847d496e800e441dbc0cef5e',
 '348479ed49004465b7e34dcbc2e2d4a3',
 '2523ce97d01c4d81b211a3b95113878b',
 'e247a7fcbb8d434898e039031516a1bb',
 'bf6d296abb844f4487c442f1a208bf16',
 'c99c0783324442a2b0a08a9b006e043a',
 

In [5]:
import uuid
import time
from typing import List
from fastembed import TextEmbedding
from qdrant_client import QdrantClient
from qdrant_client.http.models import PointStruct, SearchParams

class SemanticCache:
    def __init__(self, threshold=0.35):
        self.encoder = TextEmbedding(model_name="BAAI/bge-small-en-v1.5")
        self.cache_client = QdrantClient(":memory:")
        self.cache_collection_name = "cache"

        self.cache_client.create_collection(
            collection_name=self.cache_collection_name,
            vectors_config=models.VectorParams(
                size=384,
                distance='Euclid'
            )
        )

        # Initialize Qdrant Client for external database
        self.db_client = QdrantClient(
            os.getenv("QDRANT_HOST"),
            api_key=os.getenv("QDRANT_API_KEY")
        )
        self.db_collection_name = "pubmedqa"
        
        self.euclidean_threshold = threshold

    def get_embedding(self, question):
        embedding = list(self.encoder.embed(question))[0]
        return embedding

    def search_cache(self, embedding):
        search_result = self.cache_client.search(
            collection_name=self.cache_collection_name,
            query_vector=embedding,
            limit=1
        )
        return search_result

    def add_to_cache(self, question, response_text):
        # Create a unique ID for the new point
        point_id = str(uuid.uuid4())
        vector = self.get_embedding(question)
        # Create the point with payload
        point = PointStruct(id=point_id, vector=vector, payload={"response_text": response_text})
        # Upload the point to the cache
        self.cache_client.upload_points(
            collection_name=self.cache_collection_name,
            points=[point]
        )
        
    def query_database(self, query_text):
        results = self.db_client.query(
            query_text=query_text,
            limit=3,
            collection_name=self.db_collection_name
        )
        return results

    def ask(self, question):
        start_time = time.time()
        vector = self.get_embedding(question)
        search_result = self.search_cache(vector)
        
        if search_result:
            for s in search_result:
                if s.score <= self.euclidean_threshold:
                    print('Informação recuperada do Cache')
                    print(f'Found cache with score {s.score:.3f}')
                    elapsed_time = time.time() - start_time
                    print(f"Time taken: {elapsed_time:.3f} seconds")
                    return s.payload['response_text']

        db_results = self.query_database(question)
        if db_results:
            response_text = db_results[0].document
            self.add_to_cache(question, response_text)
            print('Informação adicionada ao Cache')
            elapsed_time = time.time() - start_time
            print(f"Time taken: {elapsed_time:.3f} seconds")
            return response_text

        # Fallback if no response is found
        print('No answer found in Cache or Database.')
        elapsed_time = time.time() - start_time
        print(f"Time taken: {elapsed_time:.3f} seconds")
        return "No answer available."

In [6]:
cache = SemanticCache()
question_1 = "Does bacterial gastroenteritis predispose people to functional gastrointestinal disorders?"
question_2 = "Can bacterial gastroenteritis lead to functional gastrointestinal disorders?"

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

In [7]:
cache.ask(question_1)

Informação adicionada ao Cache
Time taken: 0.684 seconds


'Symptoms consistent with IBS and functional diarrhea occur more frequently in people after bacterial gastroenteritis compared with controls, even after careful exclusion of people with pre-existing FGIDs. The frequency is similar at 3 and 6 months. Our findings support the existence of postinfectious IBS and give an accurate estimate of its frequency.'

In [8]:
cache.ask(question_2)

Informação recuperada do Cache
Found cache with score 0.329
Time taken: 0.062 seconds


'Symptoms consistent with IBS and functional diarrhea occur more frequently in people after bacterial gastroenteritis compared with controls, even after careful exclusion of people with pre-existing FGIDs. The frequency is similar at 3 and 6 months. Our findings support the existence of postinfectious IBS and give an accurate estimate of its frequency.'