Install dependencies first

In [8]:
# Installs only qdrant-client. Later cells also import datasets, sentence_transformers,
# torch and transformers, which must already be available in the kernel's environment.
%pip install qdrant-client

Note: you may need to restart the kernel to use updated packages.


Set up the vector DB with Qdrant
- [github repo](https://github.com/qdrant/qdrant)
- [API reference](https://api.qdrant.tech/api-reference)

In [6]:
from qdrant_client import QdrantClient
# Local-mode Qdrant (no server) whose data is stored on disk under vec_db/qdrant_demo.
# NOTE: this is not an in-memory instance; QdrantClient(":memory:") is the in-memory variant.
qdrant = QdrantClient(path="vec_db/qdrant_demo")

In [7]:
# Name of the Qdrant collection that the upsert and search cells operate on.
DEFAULT_COLLECTION = 'my_collection'

In [52]:
# NOTE(review): duplicate of the DEFAULT_COLLECTION assignment in the previous cell
# (same value); one of the two cells can be dropped.
DEFAULT_COLLECTION = 'my_collection'

In [13]:
from qdrant_client import models

# Create the collection that stores the sentence embeddings.
# Uses DEFAULT_COLLECTION so the name stays in sync with the upsert/search cells.
# size=384 matches the embedding width produced by all-MiniLM-L6-v2 in this notebook.
qdrant.create_collection(
    collection_name=DEFAULT_COLLECTION,
    vectors_config=models.VectorParams(size=384, distance=models.Distance.COSINE)
)

True

In [2]:
# List the collections known to this Qdrant instance to confirm the collection exists.
qdrant.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='my_collection')])

Load the dataset from Hugging Face

In [9]:
from datasets import load_dataset

# Load the WebMD question/answer dataset by its Hugging Face repository id.
dataset = load_dataset("NagendraHarish/webmd")
# Bare expression: displays the available splits, features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['instruction', 'output'],
        num_rows: 1000
    })
})

Example: sentence embeddings with MiniLM

In [31]:
from sentence_transformers import SentenceTransformer

# Two toy sentences to demonstrate the embedding model.
sentences = ["This is an example sentence", "Each sentence is converted"]
# NOTE: `model` is reused by later cells to embed the dataset and the user question,
# so this cell must run before them.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='cpu')
# One embedding vector per input sentence.
embeddings = model.encode(sentences)
print(embeddings)




[[ 6.76568747e-02  6.34959713e-02  4.87131178e-02  7.93049708e-02
   3.74480747e-02  2.65281973e-03  3.93749960e-02 -7.09845219e-03
   5.93613535e-02  3.15370485e-02  6.00980595e-02 -5.29052056e-02
   4.06067446e-02 -2.59308629e-02  2.98428200e-02  1.12690695e-03
   7.35148489e-02 -5.03818318e-02 -1.22386657e-01  2.37028580e-02
   2.97265369e-02  4.24768478e-02  2.56337859e-02  1.99512066e-03
  -5.69190606e-02 -2.71597914e-02 -3.29035483e-02  6.60248324e-02
   1.19007185e-01 -4.58791144e-02 -7.26214498e-02 -3.25839706e-02
   5.23413457e-02  4.50553186e-02  8.25298484e-03  3.67024355e-02
  -1.39415739e-02  6.53918087e-02 -2.64272038e-02  2.06389377e-04
  -1.36643564e-02 -3.62810530e-02 -1.95043888e-02 -2.89737973e-02
   3.94270085e-02 -8.84090886e-02  2.62424443e-03  1.36713777e-02
   4.83062193e-02 -3.11566442e-02 -1.17329180e-01 -5.11690341e-02
  -8.85287598e-02 -2.18963176e-02  1.42986597e-02  4.44167666e-02
  -1.34815862e-02  7.43392333e-02  2.66382825e-02 -1.98762845e-02
   1.79191

In [10]:
# Embed every question ('instruction') and every answer ('output') of the train split
# with the MiniLM model; each call yields one embedding row per text.
train_rows = dataset['train']

questions = [row['instruction'] for row in train_rows]
q_emb = model.encode(questions)
print(q_emb.shape)

answers = [row['output'] for row in train_rows]
a_emb = model.encode(answers)
print(a_emb.shape)

(1000, 384)
(1000, 384)


In [47]:
# Build two points per dataset row: one indexed by the question embedding and one by
# the answer embedding. Both carry the same question/answer payload, so a hit on
# either vector returns the full Q&A pair.
train_rows = dataset['train']

# Question ids are 1..N and answer ids are N+1..2N. The offset is derived from the
# dataset size (instead of a hard-coded 1001) so the two id ranges can never collide
# if the dataset grows; for the 1000-row dataset the ids are unchanged.
answer_id_offset = len(train_rows)

points = []
for i, item in enumerate(train_rows):
    id_and_vector = (
        (i + 1, q_emb[i]),
        (i + 1 + answer_id_offset, a_emb[i]),
    )
    for point_id, vector in id_and_vector:
        points.append(
            models.PointStruct(
                id=point_id,
                # A fresh dict per point: the two points never share a payload object.
                payload={
                    'question': item['instruction'],
                    'answer': item['output'],
                },
                vector=vector.tolist()
            )
        )

In [49]:
# Write all question and answer points into the collection in a single call.
qdrant.upsert(
    collection_name=DEFAULT_COLLECTION,
    points=points,
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [32]:
# Embed the user question and retrieve the two nearest stored points.
question = 'what to do if my throat is painful?'
emb = model.encode(question)
resp = qdrant.search(
    collection_name=DEFAULT_COLLECTION,
    query_vector=emb,
    limit=2
)

# Print each hit and accumulate the hits into `doc_str`, the document section that is
# later inserted into the prompt template.
doc_str = ""
for i, p in enumerate(resp):
    # Pull the payload fields into locals: reusing the same quote character inside an
    # f-string replacement field is a SyntaxError before Python 3.12.
    related_q = p.payload['question']
    related_a = p.payload['answer']
    print(f'score {p.score}, related question: \n {related_q} \n related answer {related_a}')
    doc_str += f"""
    <DOC {i+1}>
    {related_q}
    {related_a}
    <End of DOC {i+1}>
    """

score 0.6044077552532906, related question: 
 [Q]i am 4 weeks pregnant and my throat is very sore. my tonsils feel swollen but i donât have a cold. what can i take? 
 related answer [A]keep your throat moist and drink plenty of fluids specifically those that will not dehydrate you (less or no caffeine). if you have regular issues with allergies partner with your pharmacist and physician to make sure it is right for you concerning the timing or current trimester of your pregnancy. keep gargling with hot water with salt added. you could also keep drinking lemon juice mixed to warm water and add honey insead of sugar. you could have green tea with honey. keep having warm soup and water. take care.
score 0.542681190723476, related question: 
 [Q]how can i cure acid reflux? . i ate a large meal and went to bed right after and 4 hours later woke up with a burning throat. gargled some cold water and saw some blood got worried gargled couple of more times but the bleeding got worse. rushed t

In [33]:
# Prompt template. The {{document}} and {{question}} placeholders are filled in by
# replace_with_mapping using param_dict.
prompt = """
=== Task
Answer the question using given documents. Each document will contains a question starting with '[Q]'
and answer starting with [A].
==== Instruction
Try to keep your answer ground in the facts of the given documents. If documents are not relevant, response with
'given documents not relevant'.
### documents
{{document}}
### question
{{question}}
"""
# Maps each placeholder in the template to the text substituted for it.
param_dict = {
    '{{document}}': doc_str,
    '{{question}}': question
}

def replace_with_mapping(text, mapping):
    """Replace every occurrence of each key of `mapping` in `text` with its value.

    All substitutions happen in a single pass over the original text, so a
    replacement value that itself contains another placeholder (for example a
    retrieved document that contains the text '{{question}}') is inserted
    verbatim and is not expanded again.

    Args:
        text: template string containing the placeholders.
        mapping: dict of placeholder -> replacement string.

    Returns:
        The text with all placeholders substituted; `text` unchanged when
        `mapping` is empty.
    """
    import re

    if not mapping:
        return text

    # Longest placeholder first so a key that is a prefix of another cannot shadow it.
    keys = sorted(mapping, key=len, reverse=True)
    pattern = re.compile('|'.join(re.escape(key) for key in keys))
    return pattern.sub(lambda match: mapping[match.group(0)], text)

# Fill the template with the retrieved documents and the user question,
# then show the final prompt that is sent to the language model.
request = replace_with_mapping(prompt, param_dict)
print(request)


=== Task
Answer the question using given documents. Each document will contains a question starting with '[Q]'
and answer starting with [A].
==== Instruction
Try to keep your answer ground in the facts of the given documents. If documents are not relevant, response with
'given documents not relevant'.
### documents

    <DOC 1>
    [Q]i am 4 weeks pregnant and my throat is very sore. my tonsils feel swollen but i donât have a cold. what can i take?
    [A]keep your throat moist and drink plenty of fluids specifically those that will not dehydrate you (less or no caffeine). if you have regular issues with allergies partner with your pharmacist and physician to make sure it is right for you concerning the timing or current trimester of your pregnancy. keep gargling with hot water with salt added. you could also keep drinking lemon juice mixed to warm water and add honey insead of sugar. you could have green tea with honey. keep having warm soup and water. take care.
    <End of DOC 1>

In [24]:
import torch
import transformers

# Prefer the GPU when one is available; `device` is the default device of generate_builtin.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# GPT-2 tokenizer and causal-LM weights, used as the text generator in the cells below.
tokenizer = transformers.AutoTokenizer.from_pretrained('gpt2')
amateur_llm = transformers.AutoModelForCausalLM.from_pretrained('gpt2')



In [25]:
def generate_builtin(model, prompt, max_len=100, temperature=0.8, device=device):
    """Sample a continuation of `prompt` from `model` and return it as text.

    Args:
        model: causal language model exposing `.to()` and `.generate()`.
        prompt: text to continue; tokenized with the notebook-level `tokenizer`.
        max_len: forwarded to `generate` as `max_length`, i.e. the cap on the
            total sequence length (prompt tokens plus generated tokens).
        temperature: sampling temperature.
        device: device the inputs and the model are moved to; defaults to the
            notebook-level `device`.

    Returns:
        The first generated sequence decoded to a string. The prompt is part
        of the returned text and special tokens are not stripped.
    """
    model_inputs = tokenizer(prompt, return_tensors="pt").to(device)
    model.to(device)

    # Pass the pad token explicitly. When the tokenizer defines none (as with GPT-2),
    # fall back to EOS, which is what `generate` would pick on its own while logging
    # "Setting `pad_token_id` to `eos_token_id`" on every call.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    gen_tokens = model.generate(
        **model_inputs,
        do_sample=True,
        temperature=temperature,
        max_length=max_len,
        pad_token_id=pad_token_id,
    )
    return tokenizer.batch_decode(gen_tokens)[0]

# Baseline: generate from the bare question, without any retrieved documents.
generate_builtin(amateur_llm, question, max_len=500, device='cpu')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


'what to do if my throat is painful? Do I sit on the bed and watch the camera? Yes I do! Is there a way to hold my nose when you\'re making me feel so cold?\n\nSleeping in a room with a hot tub and steam running down your throat.\n\nI\'ll try that.\n\nI\'m sorry for having to say it. But what about you? I think I\'d rather be in a room with a hot tub with a lot of hot steam running down your throat.\n\nYou can\'t sit still because of you making me feel like I\'m having a "hot" tub for my own entertainment.\n\nI\'d better stop making you feel like you\'re having a "hot" tub.\n\nI\'ll be right back.\n\nSo, how do you feel?\n\nYeah, I\'m going to go downstairs with you.\n\nYou\'re going to watch the video of your hot tub and wait for it to start cooking.\n\nBecause you can\'t wait to take a seat, and you can\'t wait to watch my hot tub cooking.\n\nWait, wait, wait, wait, wait.\n\nThis is going to take me a lot longer than it will take you to do. My throat is going to go cold for you.\n\nS

In [34]:
# Retrieval-augmented run: generate from the filled prompt that embeds the retrieved documents.
generate_builtin(amateur_llm, request, max_len=1024, device='cpu')

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


"\n=== Task\nAnswer the question using given documents. Each document will contains a question starting with '[Q]'\nand answer starting with [A].\n==== Instruction\nTry to keep your answer ground in the facts of the given documents. If documents are not relevant, response with\n'given documents not relevant'.\n### documents\n\n    <DOC 1>\n    [Q]i am 4 weeks pregnant and my throat is very sore. my tonsils feel swollen but i donâ\x80\x99t have a cold. what can i take?\n    [A]keep your throat moist and drink plenty of fluids specifically those that will not dehydrate you (less or no caffeine). if you have regular issues with allergies partner with your pharmacist and physician to make sure it is right for you concerning the timing or current trimester of your pregnancy. keep gargling with hot water with salt added. you could also keep drinking lemon juice mixed to warm water and add honey insead of sugar. you could have green tea with honey. keep having warm soup and water. take care.\