In [28]:
#!pip install pymilvus==2.3.0
#!pip install transformers==4.35.2
#!pip install pandas==2.1.3
#!pip install mistralai
!pip install rank_bm25

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2


### Loading a Test CSV

In [16]:
import pandas as pd
ref_content = pd.read_csv("rag-reference.csv")
ref_content.head(10)

Unnamed: 0,Rating,Content
0,10,Python Python Python Python Python
1,3,C Java Javascript Python Pascal
2,6,Python HR Development Sales Marketing
3,9,Snake Viper Reptile cobra anaconda
4,2,Brady Messi Ronaldo James Becker
5,5,Lion Tiger Elephant Whale Crocodile
6,7,Google Amazon Microsoft Facebook Apple
7,8,Apple Orange Grape Melon Berry


### Setup Milvus

In [17]:
#Creating a Milvus connection

from pymilvus import connections

connections.add_connection(
    vowel={
        "host": "localhost",
        "port": "19530",
        "username" : "username",
        "password" : "password"
    })

conn_name = "vowel"

connections.connect(conn_name)


In [18]:
#Create a database in Milvus

from pymilvus import db

current_dbs=db.list_database(using=conn_name)
print("Current databases: ", current_dbs)

db_name="rag_demo"

if ( db_name not in current_dbs):
    print("Creating database :", db_name)
    resume_db = db.create_database(db_name, using=conn_name) 
    
db.using_database(db_name, using=conn_name)

Current databases:  ['default', 'rag_demo']


In [20]:
# Create a collection

from pymilvus import CollectionSchema, FieldSchema, DataType, Collection
import json

chunk_id = FieldSchema(
    name="chunk_id",
    dtype=DataType.INT64,
    is_primary=True,
    auto_id=True, #Generate IDs automatically
    max_length=32)

rating = FieldSchema(
    name="rating",
    dtype=DataType.INT64,
    max_length=32)

content_text= FieldSchema(
    name="content_text",
    dtype=DataType.VARCHAR,
    max_length=2048)

content_embedding = FieldSchema(
    name="content_embedding",
    dtype=DataType.FLOAT_VECTOR,
    dim=1024   #Mistral embedding size
)

content_schema=CollectionSchema(
    fields=[chunk_id, rating, content_text, content_embedding],
    description="RAG demonstration collection",
    enable_dynamic_field=True
)

collection_name="rag_demo_coll"

resume_collection=Collection(
    name=collection_name,
    schema=content_schema,
    using=conn_name,
    shard_num=2
)

from pymilvus import utility
print(utility.list_collections(using=conn_name))

rag_collection=Collection(collection_name, using=conn_name)
print(rag_collection.schema)


['rag_demo_coll']
{'auto_id': True, 'description': 'RAG demonstration collection', 'fields': [{'name': 'chunk_id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'rating', 'description': '', 'type': <DataType.INT64: 5>}, {'name': 'content_text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2048}}, {'name': 'content_embedding', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 1024}}], 'enable_dynamic_field': True}


### Create embeddings

In [21]:
from mistralai.client import MistralClient
from sentence_transformers import SentenceTransformer


mistral_client = MistralClient(api_key="jnZL0bXjwnZiMdd10C6ooBNXY8vvuQEV")

i_rating = ref_content["Rating"].tolist()

i_content_text = ref_content["Content"].tolist()

embeddings_response = mistral_client.embeddings(
      model="mistral-embed",
      input=i_content_text,
  )

i_content_embedding=[i.embedding
                     for i in embeddings_response.data]

insert_data=[i_rating,i_content_text, i_content_embedding]


### Insert and Index

In [22]:
mr=rag_collection.insert(insert_data)
print("Inserted data. Now flushing")

rag_collection.flush(timeout=180)


Inserted data. Now flushing


In [23]:
index_params = {
    "metric_type":"L2",
    "index_type":"IVF_FLAT",
    "params" :{"nlist":1024}
}

rag_collection.create_index(
    field_name="content_embedding",
    index_params=index_params
)

utility.index_building_progress(collection_name,using=conn_name)

{'total_rows': 8, 'indexed_rows': 8, 'pending_index_rows': 0}

### Run Scalar queries

In [24]:
#Load the collection
rag_collection.load()


q_result= rag_collection.query(
    expr = "rating > 5",
    output_fields = ["chunk_id","rating","content_text"]
)
for i in q_result:
    print(i.get("chunk_id"), "\t", i.get("rating"), "\t", i.get("content_text"))

446976531358641952 	 10 	 Python Python Python Python Python
446976531358641954 	 6 	 Python HR Development Sales Marketing
446976531358641955 	 9 	 Snake Viper Reptile cobra anaconda
446976531358641958 	 7 	 Google Amazon Microsoft Facebook Apple
446976531358641959 	 8 	 Apple Orange Grape Melon Berry


### Run Vector Search

In [80]:
from rank_bm25 import BM25Okapi
from scipy.stats import rankdata

def run_search(search_string):
    
    search_params = {
        "metric_type": "L2", 
        "offset": 0, 
        "ignore_growing": False, 
        "params": {"nprobe": 10,  "radius":0.9}
    }
    
    #Get embedding for search string
    search_embed=mistral_client.embeddings(
          model="mistral-embed",
          input=[search_string],
      ).data[0].embedding
    
    q_results=rag_collection.search(
        data=[search_embed],
        anns_field="content_embedding",
        param=search_params,
        limit=5, #Total number of returned results
        expr=None,
        output_fields=["content_text"],
        consistency_level="Strong"
    )

    #Rank using BM25
    #Collect content text in an array
    hit_list = [i.entity.get("content_text") for i in q_results[0]]

    tokenized_hit_list = [doc.split(" ") for doc in hit_list]

    bm25 = BM25Okapi(tokenized_hit_list)
    
    tokenized_query = search_string.split(" ")
    
    doc_scores = bm25.get_scores(tokenized_query)
    doc_ranks = rankdata([1 - i for i in doc_scores], method='ordinal')

    #Ranking complete
    
    print("-----------------------------------------------------------------------------------")
    print("Search string : ", search_string)
    print("-----------------------------------------------------------------------------------")
    for i in range(len(q_results[0])):
        hit=q_results[0][i]
        print( "Dist=", str(round(hit.distance,2)),"\t",hit.entity.get("content_text"),
              "\t BM25 Rank=", doc_ranks[i], " BM25 Score=", str(round(doc_scores[i],2)))

    return q_results[0],doc_ranks
    

In [71]:
run_search("Python")
run_search("PHP")
run_search("Snake")
run_search("molecule")

-----------------------------------------------------------------------------------
Search string :  Python
-----------------------------------------------------------------------------------
Dist= 0.26 	 Python Python Python Python Python 	 BM25 Rank= 1  BM25 Score= 0.49
Dist= 0.48 	 Python HR Development Sales Marketing 	 BM25 Rank= 2  BM25 Score= 0.26
Dist= 0.49 	 C Java Javascript Python Pascal 	 BM25 Rank= 3  BM25 Score= 0.26
Dist= 0.52 	 Snake Viper Reptile cobra anaconda 	 BM25 Rank= 4  BM25 Score= 0.0
Dist= 0.65 	 Lion Tiger Elephant Whale Crocodile 	 BM25 Rank= 5  BM25 Score= 0.0
-----------------------------------------------------------------------------------
Search string :  PHP
-----------------------------------------------------------------------------------
Dist= 0.48 	 C Java Javascript Python Pascal 	 BM25 Rank= 1  BM25 Score= 0.0
Dist= 0.56 	 Python Python Python Python Python 	 BM25 Rank= 2  BM25 Score= 0.0
Dist= 0.61 	 Python HR Development Sales Marketing 	 BM25 

<pymilvus.orm.search.Hits at 0x7fd382fe55e0>

### Generate using LLM

In [104]:
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

search_string="Where is Python used?"

model = "mistral-tiny"

#Generate without Retrieval support
messages = [
    ChatMessage(role="user", content=search_string)
]

chat_response = mistral_client.chat(
    model=model,
    messages=messages
)
print("Response without Retrieval : \n----------------------------\n", 
      chat_response.choices[0].message.content)


Response without Retrieval : 
----------------------------
 Python is a versatile and widely-used programming language that can be applied to various fields and industries, making it a popular choice for developers and data scientists. Here are some common areas where Python is used:

1. Web Development: Python is used to build web applications through frameworks like Django and Flask. These frameworks help in creating dynamic websites, handling databases, and managing user authentication.

2. Data Science and Machine Learning: Python is the go-to language for data science and machine learning due to its simplicity and the availability of powerful libraries like NumPy, Pandas, Matplotlib, and Scikit-learn. These libraries make it easy to process, analyze, and visualize complex data.

3. Scientific Computing: Python is used extensively in scientific computing due to its ease of use, flexibility, and the availability of scientific libraries like NumPy, SciPy, and SciPy. These libraries p

In [105]:
#Generate with retreival support
retreived_results, doc_ranks = run_search(search_string)

-----------------------------------------------------------------------------------
Search string :  Where is Python used?
-----------------------------------------------------------------------------------
Dist= 0.39 	 Python Python Python Python Python 	 BM25 Rank= 1  BM25 Score= 0.49
Dist= 0.51 	 C Java Javascript Python Pascal 	 BM25 Rank= 2  BM25 Score= 0.26
Dist= 0.57 	 Python HR Development Sales Marketing 	 BM25 Rank= 3  BM25 Score= 0.26
Dist= 0.65 	 Snake Viper Reptile cobra anaconda 	 BM25 Rank= 4  BM25 Score= 0.0
Dist= 0.72 	 Google Amazon Microsoft Facebook Apple 	 BM25 Rank= 5  BM25 Score= 0.0


In [106]:
#Pick only the top 3 results and form the prompt

context_list = [i.entity.get("content_text") 
                for i in retreived_results]

#Sort the results based on doc ranks
sorted_list = [x for _, x in sorted(zip(doc_ranks, context_list))]

context_str=". ".join(sorted_list[0:3])

prompt="Context information is below.\n" + \
"---------------------\n" + \
context_str +"\n" + \
"---------------------\n" + \
"Given the context information and not any prior knowledge, answer the query. \
Answer should use only information in the context\n" +  \
"Query: " + search_string

print(prompt)

Context information is below.
---------------------
Python Python Python Python Python. C Java Javascript Python Pascal. Python HR Development Sales Marketing
---------------------
Given the context information and not any prior knowledge, answer the query. Answer should use only information in the context
Query: Where is Python used?


In [107]:
#Generate with Retrieval support
messages = [
    ChatMessage(role="user", content=prompt)
]

chat_response = mistral_client.chat(
    model=model,
    messages=messages
)
print("Response with Retrieval : \n----------------------------\n", 
      chat_response.choices[0].message.content)

Response with Retrieval : 
----------------------------
 Based on the context information provided, Python is mentioned in close proximity to various fields such as HR Development, Sales, and Marketing. While it does not explicitly state that Python is used in these fields, the presence of the keyword in this context suggests that Python might be used for various tasks or applications within these domains. Additionally, Python is also mentioned in the same line as other programming languages like Java, Javascript, and Pascal, implying its use in software development or related areas. Overall, the context suggests that Python may be used in HR Development, Sales, Marketing, and software development roles.
