In [17]:
from milvus import default_server
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection, utility, connections

In [18]:
# Start the local server object exposed by the `milvus` package; the
# connection cell that follows reads its `listen_port`.
default_server.start()

In [19]:
# Point pymilvus at the locally started server (loopback address, port taken
# from the server object rather than hard-coded).
connections.connect(host="127.0.0.1", port=default_server.listen_port)

In [20]:
# Uncomment the two lines below to reset: this drops the collection and deletes all of its data.

# collection = Collection(name="Profspective")
# collection.drop()

In [21]:
# Size of the vectors stored in the "embedding" field.
# NOTE(review): presumably has to equal the output size of the embedding model
# used later in the notebook — confirm.
DIMENSION = 384
# Upper bound (max_length) applied to both VARCHAR fields.
MAX_STRING_LENGTH = 500

# Explicit columns of the collection. Any extra keys on inserted rows rely on
# the dynamic-field option enabled on the schema below.
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=DIMENSION),
    FieldSchema(name="professor", dtype=DataType.VARCHAR, max_length=MAX_STRING_LENGTH),
    FieldSchema(name="review", dtype=DataType.VARCHAR, max_length=MAX_STRING_LENGTH),
    FieldSchema(name="stars", dtype=DataType.INT8),
]

schema = CollectionSchema(fields=fields, enable_dynamic_field=True)

In [22]:
# Reuse the existing collection when there is one (so its stored schema is not
# touched); otherwise create it from `schema`.
collection = (
    Collection(name="Profspective")
    if utility.has_collection("Profspective")
    else Collection(name="Profspective", schema=schema)
)

In [23]:
# Index configuration for the "embedding" vector field.
index_params = dict(
    index_type="IVF_FLAT",
    metric_type="L2",
    params=dict(nlist=50),
)

# If create_index fails with "cannot have more than 1 distinct index per field"
# (happens after index_params has been changed), uncomment these two lines:
# collection.release()
# collection.drop_index()

collection.create_index(field_name="embedding", index_params=index_params)
collection.load()

In [24]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; `embedder.encode(...)` is called on each review's
# text in the ingestion cell.
# NOTE(review): the model's output size presumably must equal DIMENSION — confirm.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

In [25]:
def _escapeFilterString(value):
    """Return ``value`` as text that is safe inside a double-quoted filter literal."""
    # Backslashes first, so the backslashes added for quotes are not doubled.
    return str(value).replace("\\", "\\\\").replace('"', '\\"')


def reviewExists(professor, review, subject, stars):
    """Report whether an identical review is already stored in ``collection``.

    Builds an equality filter over ``professor``, ``review``, ``subject`` and
    ``stars`` and runs it through ``collection.query``.

    Args:
        professor: Professor name to match.
        review: Review text to match.
        subject: Subject to match.
        stars: Star rating to match; interpolated unquoted, as a number.

    Returns:
        True when the query returns at least one row, otherwise False.
    """
    # String values are escaped so that quotes or backslashes inside a review
    # cannot terminate the literal early and corrupt (or alter) the filter.
    query = (
        f'professor == "{_escapeFilterString(professor)}"'
        f' && review == "{_escapeFilterString(review)}"'
        f' && subject == "{_escapeFilterString(subject)}"'
        f' && stars == {stars}'
    )
    results = collection.query(query, output_fields=["id"])

    return len(results) > 0

In [26]:
def validateProfData(profData):
    """Ensure a review record carries every required key.

    Args:
        profData: Mapping describing one review.

    Raises:
        Exception: Naming the first of "professor", "review", "subject",
            "stars" (checked in that order) that is absent from ``profData``.
    """
    firstMissing = next(
        (
            name
            for name in ("professor", "review", "subject", "stars")
            if name not in profData
        ),
        None,
    )
    if firstMissing is not None:
        raise Exception(f'Missing Field in Prof Data "{firstMissing}"')

In [27]:
import json

# Read the review records. The context manager closes the file handle (the
# bare open() call leaked it) and the explicit encoding keeps parsing
# independent of the platform's default text encoding.
with open("../review.json", encoding="utf-8") as reviewFile:
    jsonData = json.load(reviewFile)
reviews = jsonData["reviews"]

# Rows that are not yet stored and therefore need to be inserted.
insertDb = []

for profData in reviews:
    # Fail fast on records that lack a required key.
    validateProfData(profData)

    # Skip reviews that are already stored so re-running the cell does not
    # insert them again.
    if reviewExists(profData["professor"], profData["review"], profData["subject"], profData["stars"]):
        print("Review Already Exists in Vector Database")
        continue

    # Copy so the parsed JSON record is left untouched, then attach the vector
    # computed from the review text.
    entry = profData.copy()
    reviewEmbedding = embedder.encode(profData["review"])
    entry["embedding"] = reviewEmbedding

    insertDb.append(entry)

# Insert everything in a single call; nothing is sent when there are no new rows.
if insertDb:
    output = collection.insert(insertDb)
    print(output)
else:
    print("Nothing to Add")

Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already Exists in Vector Database
Review Already E

In [28]:
# NOTE(review): presumably makes the rows inserted above durable so that the
# entity count read in the next cell includes them — confirm against pymilvus docs.
collection.flush()

In [29]:
# Report how many entities the collection currently holds.
vector_count = collection.num_entities
print("Vector Count = {}".format(vector_count))

Vector Count = 27
