In [2]:
import os
import json
import time
import requests
import random
import base64
from collections import OrderedDict
import urllib.request
# from tqdm import tqdm
import openai
import langchain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma, FAISS
from langchain import OpenAI, VectorDBQA
from langchain.chat_models import AzureChatOpenAI
from langchain.chat_models import ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.docstore.document import Document
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain
from IPython.display import Markdown, HTML, display
from dotenv import load_dotenv
from document_chunking import DocumentChunker
from azure.storage.blob import BlobServiceClient
from openai.embeddings_utils import get_embedding, cosine_similarity
from typing import List

# Load settings from a .env file (python-dotenv) before any environment lookup below.
load_dotenv()

# Configure the openai client for an Azure-hosted endpoint.
openai.api_type = "azure"
openai.api_version = "2023-03-15-preview"
openai.api_base = os.environ.get("OPENAI_API_BASE")
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Deployment-related settings; each one is None when the variable is unset.
DEPLOYMENT_NAME, MODEL_NAME, ENVIRONMENT = (
    os.environ.get(var_name)
    for var_name in ("DEPLOYMENT_NAME", "MODEL_NAME", "ENVIRONMENT")
)

# Embedding client used later to turn a question into a query vector.
embedder = OpenAIEmbeddings(deployment="text-embedding-ada-002")


In [10]:
def get_search_results(query: str, indexes: list,
                       k: int = 5,
                       reranker_threshold: int = 1,
                       sas_token: str = "",
                       vector_search: bool = False,
                       similarity_k: int = 3,
                       query_vector: list = None) -> OrderedDict:
    """Query one or more search indexes and return the best hits, ranked by score.

    A semantic query is POSTed to ``<AZURE_SEARCH_ENDPOINT>/indexes/<index>/docs/search``
    for every name in ``indexes``. Hits whose ``@search.rerankerScore`` is not
    strictly greater than ``reranker_threshold`` are dropped; the rest are
    merged (keyed by document ``id``), sorted by score in descending order and
    truncated.

    Args:
        query: Free-text search string.
        indexes: Names of the indexes to query.
        k: Value sent as ``top`` (and as the vector ``k``) for each index.
        reranker_threshold: Minimum (exclusive) reranker score a hit must have.
        sas_token: Suffix appended to every non-empty ``location`` value.
        vector_search: When True, ``query_vector`` is sent against the
            ``contentVector`` field and hits are scored by the reranker score;
            otherwise hits are scored by ``@search.score``.
        similarity_k: Maximum number of results returned when ``vector_search``
            is True. Without vector search the limit is ``k * len(indexes)``.
        query_vector: Embedding of ``query``; only used when ``vector_search``
            is True. ``None`` is treated as an empty list.

    Returns:
        An ``OrderedDict`` mapping document id to a dict with the keys
        ``title``, ``name``, ``location``, ``caption``, ``index``, ``content``
        and ``score``, ordered from highest to lowest score.

    Raises:
        KeyError: If ``AZURE_SEARCH_KEY``, ``AZURE_SEARCH_API_VERSION`` or
            ``AZURE_SEARCH_ENDPOINT`` is missing from the environment.
        requests.HTTPError: If the search service answers with an error status.
    """
    # Avoid a shared mutable default; None means "no vector supplied".
    if query_vector is None:
        query_vector = []

    headers = {'Content-Type': 'application/json', 'api-key': os.environ["AZURE_SEARCH_KEY"]}
    params = {'api-version': os.environ['AZURE_SEARCH_API_VERSION']}

    agg_search_results = dict()

    for index in indexes:
        search_payload = {
            "search": query,
            "queryType": "semantic",
            "semanticConfiguration": "my-semantic-config",
            "count": "true",
            "speller": "lexicon",
            "queryLanguage": "en-us",
            "captions": "extractive",
            "answers": "extractive",
            "top": k
        }
        if vector_search:
            search_payload["vectors"] = [{"value": query_vector, "fields": "contentVector", "k": k}]
        # The same fields are selected for both search modes.
        search_payload["select"] = "id, title, content, name, location"

        resp = requests.post(os.environ['AZURE_SEARCH_ENDPOINT'] + "/indexes/" + index + "/docs/search",
                             data=json.dumps(search_payload), headers=headers, params=params)
        # Fail here with the HTTP error instead of a later KeyError on 'value'.
        resp.raise_for_status()

        agg_search_results[index] = resp.json()

    # Vector searches rank by the reranker score, plain searches by @search.score.
    score_field = '@search.rerankerScore' if vector_search else '@search.score'

    content = dict()
    for index, search_results in agg_search_results.items():
        for result in search_results['value']:
            # Keep only hits above the threshold (original note: max possible score is 4).
            if result['@search.rerankerScore'] > reranker_threshold:
                # The captions list can be missing or empty; fall back to "".
                captions = result.get('@search.captions') or []
                content[result['id']] = {
                    "title": result['title'],
                    "name": result['name'],
                    "location": result['location'] + sas_token if result['location'] else "",
                    "caption": captions[0]['text'] if captions else "",
                    "index": index,
                    "content": result['content'],
                    "score": result[score_field],
                }

    # After filtering, keep at most `topk` results in descending score order.
    topk = similarity_k if vector_search else k * len(indexes)
    ranked_ids = sorted(content, key=lambda doc_id: content[doc_id]["score"], reverse=True)

    ordered_content = OrderedDict()
    # max(..., 0) so a zero or negative limit yields no results at all.
    for doc_id in ranked_ids[:max(topk, 0)]:
        ordered_content[doc_id] = content[doc_id]

    return ordered_content

In [16]:
# Sample questions for the search cells below; uncomment exactly one to switch.
# QUESTION = "Low vision devices coverage?"
QUESTION = "Formulary preferred generic drugs for prescription drugs at retail pharmacy coverage in-network providers?"
# QUESTION = "Can my benefit cover the device cost for Therapeutic?"


In [17]:
# Semantic search without a query vector against a single index, asking for 3 hits.
ordered_results = get_search_results(QUESTION, indexes=["demo_index_0"], k=3)

ordered_results

OrderedDict([('UHJlZmVycmVkX0dvbGRfRVBPXzE1MDBfQmVuZWZpdF8yMDIyX2luX1dhc2hpbmd0b25fMzM=',
              {'title': '49831WA194  (01-2022) 28 Preferred Gold   If not recognized by one of the standard reference compendia cited above, then recognized by the majority of ',
               'name': 'Preferred_Gold_EPO_1500_Benefit_2022_in_Washington_33',
               'location': 'https://openaiembedding.blob.core.windows.net/document-chunks/Preferred_Gold_EPO_1500_Benefit_2022_in_Washington_33.txt',
               'caption': 'We will review your request and let you or your provider know within 72 hours in writing if it is approved . If \r approved, your cost will be as shown on the Summary of Your Costs  for formulary generic and brand name \r drugs and will be covered for the duration of the prescription . If your request is not approved, the drug will not be \r covered.',
               'index': 'demo_index_0',
               'content': "49831WA194  (01-2022) 28 Preferred Gold   If not rec

In [19]:
# Same question against the vector index: the question is embedded and sent as
# the query vector, and up to 5 results are kept after ranking.
ordered_results = get_search_results(
    QUESTION,
    indexes=["demo_index_vector"],
    k=3,
    reranker_threshold=1,
    vector_search=True,
    similarity_k=5,
    query_vector=embedder.embed_query(QUESTION),
)
ordered_results

OrderedDict([('UHJlZmVycmVkX0dvbGRfRVBPXzE1MDBfQmVuZWZpdF8yMDIyX2luX1dhc2hpbmd0b25fMTE=',
              {'title': '49831WA1940001  (01-2022) 6 Preferred Gold   YOUR COSTS OF THE ALLOWED AMOUNT  ',
               'name': 'Preferred_Gold_EPO_1500_Benefit_2022_in_Washington_11',
               'location': 'https://openaiembedding.blob.core.windows.net/document-chunks/Preferred_Gold_EPO_1500_Benefit_2022_in_Washington_11.txt',
               'caption': 'inpatient (limited to 30 days per calendar  year)  deductible, then 30%  coinsurance  not covered    outpatient (limited to 25 visits per calendar  year)  deductible, then 30%  coinsurance  not covered   skilled nursing facility care      skilled nursing facility care is limited to 60  days per calendar year  deductible, then 30%  coinsurance …',
               'index': 'demo_index_vector',
               'content': "49831WA1940001  (01-2022) 6 Preferred Gold   YOUR COSTS OF THE ALLOWED AMOUNT  \r\nIN-NETWORK PROVIDERS  OUT -OF-NETWORK PROV