In [41]:
from langchain import PromptTemplate
from langchain.chains import RetrievalQA
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import pinecone, Pinecone, Qdrant
import pinecone
from langchain.document_loaders import PyPDFLoader, DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.prompts import PromptTemplate
from langchain.llms import CTransformers
from qdrant_client import QdrantClient

### Extract the data

In [8]:
def load_pdf(data):
    loader = DirectoryLoader(data,glob="*.pdf",loader_cls=PyPDFLoader) #if we send a directory only pdf files are loaded and each loader class is pypdfloader
    documents = loader.load()
    return documents

In [9]:
extracted_data = load_pdf("data/")

### create chunks to give to the llama model

In [10]:
def chunker(extracted_data):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap = 20)
    text_chunk = text_splitter.split_documents(extracted_data)

    return text_chunk

In [11]:
text_chunks = chunker(extracted_data)

In [12]:
print("total documents are divided into",len(text_chunks),"chunks")

total documents are divided into 6983 chunks


### convert these into embeddings

In [13]:
def download_hugging_face_embeddings():
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    return embeddings

In [14]:
embeddings=download_hugging_face_embeddings()



In [15]:
embeddings

HuggingFaceEmbeddings(client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False})
  (2): Normalize()
), model_name='sentence-transformers/all-MiniLM-L6-v2', cache_folder=None, model_kwargs={}, encode_kwargs={})

##### lets convert our words into embeddings using hugging face embeddings

In [16]:
embedding_example = embeddings.embed_query("hi everyone")
len(embedding_example)

384

### coverting all our data into embeddings

##### using pinecone

In [19]:
# from langchain.vectorstores import Pinecone as p2
# from pinecone import Pinecone
# PINECONE_API_KEY = "8af77b04-8a9a-4a21-9a71-b1027067fa63"
# PINECONE_API_INDEX = "us-east-1"
# pc = Pinecone(api_key=PINECONE_API_KEY)
# index= pc.Index("airamatrix-chatbot")

# docsearch = p2.from_texts([t.page_content for t in text_chunks], embeddings, index_name = index)

##### using qdrant

In [32]:
client = QdrantClient(api_key="wRvbWrE136bxNLjPBGlTTt27VfNtrWxpWk9NYm3eYMjvVcsj8v-ryQ", url="https://a3e797dc-0ae6-4e96-8392-4bff15d64ae6.europe-west3-0.gcp.cloud.qdrant.io:6333")

# Step 3: Create a collection in Qdrant
embedding_size = 384  # Adjust this based on the Hugging Face model output size (384 for all-MiniLM-L6-v2)
client.create_collection(
    collection_name="text_embeddings",
    vectors_config={
        "size": embedding_size,  # Dimensionality of the embeddings
        "distance": "Cosine"  # You can choose "Cosine", "Euclidean", or "Dot"
    }
)

True

In [None]:
embeddings_all = []
for index,val in enumerate(text_chunks):
    print(f"Index: {index}", end='\r')
    embeddings_all.append(embeddings.embed_query(val.page_content))

In [33]:
points = [
        {"id": idx, "vector": embedding, "payload": {"text": chunk}}
        for idx, (chunk, embedding) in enumerate(zip(text_chunks, embeddings_all))
    ]

def batch_upsert(client, collection_name, points, batch_size=100):
    for i in range(0, len(points), batch_size):
        print(f"{i}",end='\r')
        batch = points[i:i + batch_size]
        client.upsert(
            collection_name=collection_name,
            points=batch
        )   

# Step 5: Insert the embeddings in batches
batch_upsert(client, "text_embeddings", points, 100)

6900

#### searching a query's similar parts in qdrant

In [57]:
example_query_chunk = "what are allergies"  
example_query_embedding = embeddings.embed_query(example_query_chunk)

results = client.search(
    collection_name="text_embeddings",
    query_vector=example_query_embedding,
    limit=3
)

for result in results:
    print(f"ID: {result.id}, Score: {result.score}, Text: {result.payload['text']}")


ID: 6028, Score: 0.766492, Text: {'page_content': 'ORGANIZATIONS\nAmerican Academy of Ophthalmology. 655 Beach Street, PO\nBox 7424, San Francisco, CA 94120-7424. <http://www.\neyenet.org>.KEY TERMS\nAllergen —A substance capable of inducing an\nallergic response.\nAllergic reaction —An immune system reaction to\na substance in the environment; symptoms\ninclude rash, inflammation, sneezing, itchy watery\neyes, and runny nose.\nConjunctiva —The mucous membrane that covers\nthe white part of the eyes and lines the eyelids.', 'metadata': {'source': 'data\\The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf', 'page': 659}}
ID: 3965, Score: 0.6300445, Text: {'page_content': 'Although environmental medicine is gaining more\nrespect within conventional medicine, detoxificationKEY TERMS\nAllergen —A foreign substance, such as mites in\nhouse dust or animal dander, that when\ninhaled,causes the airways to narrow and pro-\nduces symptoms of asthma.\nAntibody —A protein, also called immunoglobu-\nlin, 

Note that the above ranked result looks similar to the question but the answer is not readable and somewhat random, so we use our llm to give the correct result

### LLM part

In [37]:
prompt_template = """
Use the following pieces of information to answer the user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.

Context: {context}
Question: {question}

Only return the helpful answer below and nothing else.
Helpful answer:
"""

In [38]:
PROMPT = PromptTemplate(template=prompt_template, input_variables=["context","question"])
chain_type_kwargs = {"prompt":PROMPT}

In [40]:
llm = CTransformers(model="llama-2-7b-chat.ggmlv3.q4_0.bin",
                    model_type="llama",
                    config={'max_new_tokens':512,
                    'temperature':0.8})

In [63]:
## this is the qdrant database entity
qdrant = Qdrant(client=client, collection_name="text_embeddings", embeddings=embeddings)

In [64]:
# qa = RetrievalQA.from_chain_type(llm=llm,
#                                 chain_type="stuff",
#                                 retriever=qdrant.as_retriever (search_kwargs={'k': 2}),
#                                 return_source_documents=True,
#                                 chain_type_kwargs=chain_type_kwargs)

In [65]:
# qa({"query":"what is an medicine"})

ValidationError: 1 validation error for Document
page_content
  none is not an allowed value (type=type_error.none.not_allowed)

In [71]:
results = client.search(collection_name='text_embeddings', query_vector=[-0.09936315,0.027253166,0.016290115,-0.06584516,0.0026523904,0.010003437,0.04731847,0.064257585,0.020044582,-0.006465578,0.03688363,0.0022618358,0.029348977,-0.038033847,-0.08519824,0.01185296,0.028494356,0.00101958,-0.02157071,-0.027277663,-0.05679168,0.104634926,-0.018505424,0.10594168,-0.009211808,-0.022340616,-0.020207766,0.022616636,0.0049929176,-0.062917106,-0.027551604,0.06748536,0.116646454,0.025340166,-0.00027055418,0.06465472,0.00627242,0.028733863,0.034282636,-0.047342155,-0.02968264,-0.1375389,-0.059761725,-0.022104857,0.005853779,-0.004589746,0.0040081786,-0.00084610324,-0.05258,0.06526106,-0.08504582,-0.078862734,0.0026854905,-0.04362668,0.02568419,-0.027480764,-0.00969977,0.025741128,-0.09002936,0.012319594,-0.04848561,0.014403512,-0.04199979,0.053765725,0.028031569,-0.03340566,0.04795561,0.03511659,-0.054726206,-0.0341945,0.030487625,-0.05787501,-0.04245497,0.11668653,0.0019945477,0.013436567,0.048314672,-0.019335123,-0.017273467,-0.00096989766,-0.01909259,-0.028856175,0.047625866,0.06307406,-0.04280542,-0.012010276,0.0109640965,0.041824345,0.027585812,-0.052495375,0.0137157,-0.042887244,0.104532145,-0.0055549904,-0.07430501,-0.025115734,-0.13064004,-0.0452433,0.033812735,0.061831854,-0.05972457,0.050162923,-0.014383476,-0.0155252395,-0.026058862,-0.10729751,0.075774856,0.006790568,0.030540029,0.009301123,0.03540338,0.022003451,-0.05778112,-0.12022722,0.011911775,-0.03325556,-0.055778477,-0.026655626,0.11680712,-0.082031906,-0.015039398,-0.02495332,-0.06899492,-0.002497299,0.015141857,-0.07117554,0.023742089,7.139217e-33,-0.011343036,0.059683997,0.10610434,0.08368759,0.07129261,0.05737959,0.0060169767,-0.006130615,-0.005148727,-0.06858818,0.0024346109,-0.007994475,-0.008935968,-0.04279309,-0.022286149,-0.0018285991,-0.05244106,0.13636069,0.03427796,0.04702207,-0.0067098485,-0.0137336,-0.055037923,0.072831444,-0.009110314,0.040960174,-0.04284731,-0.0772926,0.049301304,0.018508531,0.043941356,0.003294207,0.03475551,-0.08575012,-0.008862936,0.05805665,-0.04547787,-0.0060103093,0.011360141,-0.0049437704,-0.017243514,0.0018986247,0.063407406,-0.021331646,-0.069662504,0.027831422,-0.023941837,-0.014502769,0.05657056,0.052836835,-0.04150654,0.013006938,0.030213423,0.014347492,0.022308772,-0.0270985,-0.052540507,-0.04022274,0.080827385,0.043524664,-0.050604653,0.15031278,-0.015544732,0.050047223,0.055257063,-0.0248857,-0.01660731,-0.1290537,0.071852274,-0.00227808,-0.0783643,0.008754057,0.027092787,-0.0032363636,-0.0454612,0.018945867,-0.09321763,-0.019936047,0.0483978,-0.041456077,-0.011089634,-0.047203936,0.024113491,0.0054624756,-0.0223571,-0.027957313,-0.017341383,0.036395688,-0.061787035,0.08659641,0.036848973,0.009364445,-0.06924029,0.07825453,-0.051732585,-7.476423e-33,0.0065238154,-0.054341357,0.009067737,0.019065695,0.036946554,0.0052025067,-0.03932375,0.038125154,-0.016341604,-0.064711556,0.097840406,-0.019697921,-0.057045765,0.012534895,-0.08913203,0.08416152,0.004972375,-0.030158456,-0.072740875,-0.022043798,-0.00054798694,0.016812857,-0.015029434,0.09134139,-0.0072841654,0.07021818,0.07465535,0.027852466,0.009751024,0.031381976,-0.026237456,-0.054850433,-0.07964397,-0.032076746,-0.055904005,-0.07853458,0.029597541,0.10054411,-0.012512837,-0.023013422,0.09065469,0.03392247,-0.035214785,0.06838911,0.02405618,-0.10051962,0.022738473,-0.04215522,0.10045284,0.016282685,-0.0029047155,-0.08452051,0.059909113,-0.054032005,0.00024363752,0.020498103,-0.002800752,-0.039312758,-0.009737797,0.00013561366,0.06430831,0.058606207,-0.08065936,0.010076022,0.028983952,-0.021660788,0.05873717,0.07695506,-0.03409168,-0.026596395,-0.010531089,0.039438985,0.01572373,-0.13566533,0.038025945,-0.045544654,0.021889549,-0.052433953,-0.08224805,-0.012906284,-0.010547632,-0.024927316,0.020821242,0.07123105,0.011760243,0.012705059,-0.012629896,-0.014167557,-0.04111902,-0.04707522,-0.055658266,0.05335863,-0.057033233,0.011704885,-0.09349584,-5.273776e-8,0.02229678,-0.0065247756,-0.0039439257,-0.05613098,0.008905925,-0.012458816,0.05326692,-0.0021884209,-0.0022493931,-0.0040118108,0.0036101323,0.014603288,-0.022723256,-0.008615941,0.09783056,-0.04650174,-0.018236972,0.06102667,-0.07421959,-0.05166894,0.03053105,-0.019958636,0.04878974,-0.10440465,-0.0018717324,0.03587195,-0.03716304,0.006407874,0.023204384,0.047745716,-0.019264163,0.1376471,0.040627945,0.029062305,0.058445506,-0.10899015,0.072846554,-0.047288984,-0.048688054,0.038972687,-0.07822044,-0.045934793,-0.01716378,0.06044081,0.08129946,-0.018590871,0.07931559,-0.038107753,0.011006298,-0.049219728,0.055462167,0.024282917,0.090294965,0.0809727,-0.046198852,0.0019319723,0.059580427,-0.00075013563,-0.002071954,-0.005340233,0.079135485,-0.08694187,0.11837327,0.025860742], limit=5)

# Print the retrieved results
for result in results:
    print(result)  # Look at the structure of the result


id=4 version=0 score=1.0 payload={'text': {'page_content': 'Mary Beth Trimper, Manager, Composition and\nElectronic Prepress\nEvi Seoud, Assistant Manager, Composition Purchasing\nand Electronic Prepress\nDorothy Maki, Manufacturing Manager\nWendy Blurton, Senior Manufacturing SpecialistThe GALE\nENCYCLOPEDIA\nof MEDICINE\nSECOND EDITIONSince this page cannot legibly accommodate all copyright notices, the\nacknowledgments constitute an extension of the copyright notice.\nWhile every effort has been made to ensure the reliability of the infor-', 'metadata': {'source': 'data\\The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf', 'page': 2}}} vector=None shard_key=None order_value=None
id=52 version=0 score=0.39725316 payload={'text': {'page_content': 'Belleville, MI\nAnna Rovid Spickler, D.V .M.,\nPh.D.\nMedical Writer\nMoorehead, KY\nGALE ENCYCLOPEDIA OF MEDICINE 2 XVIContributors', 'metadata': {'source': 'data\\The_GALE_ENCYCLOPEDIA_of_MEDICINE_SECOND.pdf', 'page': 11}}} vector=None shard_key

In [106]:
def query_database(query):
    query_embedding = embeddings.embed_query(query)

    # Step 2: Search Qdrant
    results = client.search(collection_name='text_embeddings', query_vector=query_embedding,limit=3)
    # print(results[:].payload['text']['page_content'])

    stringed_results = [i.payload['text']['page_content'] for i in results]
    combined_results = " ".join(stringed_results)
    # print(combined_results)
    
    prompt = f"""
    for this query: "{query}"

    refer to this information:
    {combined_results}

    Give a meaningful answer to the query by reading given information. Must be meaningful sentence.
    """
    return llm(prompt)   
    # if results:
    #     combined_results = " ".join(stringed_results)  # Adjust formatting as necessary
    #     final_answer = llm.generate(stringed_results)  # Adjust this to match your LLM API
    #     return final_answer
    # else:
    #     return "No relevant results found."

user_query = "What is cochlear implant?"
answer = query_database(user_query)
print("Answer:", answer)

Answer:  The Cochlear Implant is a device that helps people with severe hearing loss or deafness to hear sound again. It has two parts: an external microphone and a receiver/stimulator inside the skilled that converts sound processor that stimulator implantimer that converts sound processor that converts sound, which are too convert speech sounds are placed inverter that converts sound is located underneu
     imulator that stimulator that is placed on the inner ear that converts sound processor. 
    Implant that converts sound processor that converts sound;;;mimulator that stimulator implantemerator inside the person to convert sound processor that converts sound that is inside the microchimulator implantibulator that stimulator that stimulator inside the enderilimulator that stimulator that stimulator, and itmorabilizer, which are placed inside the internal device that converts sound processor.
     When itmotivmulater/ imulator implantamulator that is located underneimulator that s