In [3]:
from sentence_transformers import SentenceTransformer
from pymilvus import connections,utility,Collection,CollectionSchema, FieldSchema,DataType
from langchain.vectorstores import Milvus
from langchain.embeddings import HuggingFaceEmbeddings, SentenceTransformerEmbeddings

In [6]:
## Open the default connection to the Milvus server on localhost:19530.
connections.connect(alias="default", host="localhost", port="19530")
## Load the sentence-transformer that embeds both the passages and the query.
model = SentenceTransformer("paraphrase-MiniLM-L6-v2")

Downloading (…)001fa/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)3bbb8001fa/README.md:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading (…)bb8001fa/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)001fa/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)3bbb8001fa/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)b8001fa/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [56]:
## Fields of the "clinical_trials" collection, one FieldSchema each.

## Primary key: 64-bit integer with auto_id, so no id column is supplied on insert.
item_id = FieldSchema(name="id", dtype=DataType.INT64, is_primary=True, auto_id=True)

## Passage text; a VARCHAR field has to declare its max_length.
text = FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=50000)

## Embedding vector; dim must equal the length of the vectors that get inserted (384 here).
embeddings = FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=384)

## Combine the fields into the collection schema (dynamic fields enabled).
schema = CollectionSchema(
    fields=[item_id, text, embeddings],
    enable_dynamic_field=True,
    description="Search clinical trial docs",
)

## Bind the collection on the 'default' connection using the schema above.
collection = Collection(name="clinical_trials", schema=schema, using='default')


In [58]:
## Source of the passages: fda.gov/drugs/investigational-new-drug-ind-application/ind-applications-clinical-investigations-clinical-protocols
## Each passage is assembled from adjacent string literals inside parentheses.
## A backslash continuation *inside* a string literal keeps the next line's
## indentation as part of the text, which left stray runs of spaces in the
## passages that get embedded and stored.
texts = [
    (
        "Early developmental protocols should specify in detail all "
        "the elements of the study that are critical to safety. Such elements "
        "may include all clinical safety assessments, toxicity monitoring, "
        "description of toxicity-based stopping rules, dose adjustment rules "
        "for individual patients and the overall trial, and adverse event recording and reporting"
    ),
    (
        "Study enrollment criteria should be written with consideration of the following: (1) "
        "background risks associated with the disease or condition studied, (2) previous "
        "knowledge of toxicities of the investigational drug observed in animal studies "
        "or with human experience, (3) warnings and precautions described in the product’s label "
        "(when approved products are investigated for other than approved uses"
    ),
    (
        "It is preferable that toxicity is assessed and graded according to a standardized grading "
        "scale relevant to the studied population and that adverse events are collected, "
        "recorded, and reported in a consistent manner."
    ),
]
## Encode all passages in one call and turn each vector into a plain list for insertion.
embeds = [list(embed) for embed in model.encode(texts)]

In [59]:
len(embeds)  ## number of embeddings: one per entry in `texts`

3

In [60]:
len(embeds[0])  ## dimension of each embedding; must equal `dim` of the "embeddings" field

384

In [61]:
## Column-wise insert in schema order; the auto_id primary key column is omitted.
collection.insert(data=[texts, embeds])

(insert count: 3, delete count: 0, upsert count: 0, timestamp: {self._timestamp}, success count: {self.succ_count}, err count: {self.err_count})

In [62]:
## Build an IVF_FLAT index on the vector field, scored by inner product (IP).
collection.create_index(
    field_name="embeddings",
    index_params={
        "metric_type": "IP",
        "index_type": "IVF_FLAT",
        "params": {"nlist": 16384},
    },
)



Status(code=0, message=)

In [69]:
## Encode the query as a one-element batch; each vector becomes a plain list.
query = "adjusting the doses"
query_encode = list(map(list, model.encode([query])))

In [70]:
## Re-bind the collection by name and load it so it can be searched.
collection = Collection('clinical_trials')
collection.load()
## The search-param key is "metric_type" (not "metric"); it has to match the
## metric the index on "embeddings" was built with (IP). Index-specific knobs
## such as nprobe go under "params".
documents = collection.search(
    data=query_encode,
    anns_field="embeddings",
    param={"metric_type": "IP", "offset": 0, "params": {"nprobe": 10}},
    limit=1,
    output_fields=["text"],
)


In [71]:
## `documents` holds one group of hits per query vector.
for hits in documents:
    for hit in hits:
        ## Read the returned field through the public accessor instead of the
        ## private `_row_data` dict.
        print(hit.entity.get("text"), hit.distance, hit.id)
## Release the loaded collection once the results have been printed.
collection.release()



In [54]:
## Dropping is irreversible: everything stored in the collection is lost.
utility.drop_collection(collection_name="clinical_trials")

## Using LangChain

In [84]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

## Load the PDF and split it into LangChain documents.
loader = PyPDFLoader(file_path="clinical_trial_conduct.pdf")
pages = loader.load_and_split()

In [77]:
# PyPDFLoader needs the `pypdf` package; if it is missing, run: %pip install pypdf

In [85]:
len(pages)  ## number of documents returned by loader.load_and_split()

130

## Re-chunk the loaded pages with a character splitter: chunk size 1000, no overlap.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
docs = text_splitter.split_documents(documents=pages)

In [100]:
## Embedding function handed to Milvus.from_documents in the cells that follow.
## Note: this rebinds `embeddings`, which earlier held the "embeddings" FieldSchema.
embeddings = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)e9125/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading (…)9125/train_script.py:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading (…)7e55de9125/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)5de9125/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [101]:
## Embed every chunk and store it in Milvus. No collection name is passed,
## so LangChain's default collection name is used.
vector_db = Milvus.from_documents(
    documents=docs,
    embedding=embeddings,
    connection_args=dict(host="127.0.0.1", port="19530"),
)

In [137]:
query = "what is the investigator’s responsibility for the medical care of research subjects"
docs = vector_db.similarity_search(query)

In [138]:
len(docs)

4

In [None]:
## But wait, what about the Milvus collection schema and the collection itself?

In [140]:
## Show what LangChain chose when nothing was configured: collection name,
## search params and index params (HNSW), one per line.
print(
    f"Default collection name - {vector_db.collection_name}",
    f"Default search params - {vector_db.search_params}",
    f"Default index params - {vector_db.index_params}",
    sep="\n",
)

Default collection name - LangChainCollection
Default search params - {'metric_type': 'L2', 'params': {'ef': 10}}
Default index params - {'metric_type': 'L2', 'index_type': 'HNSW', 'params': {'M': 8, 'efConstruction': 64}}


In [123]:
## Second vector store: custom collection name, indexed AND searched with inner product.
## The search-param key is "metric_type" (not "metric"), and the search metric has to
## match the index metric, so the index params are given explicitly instead of relying
## on LangChain's L2/HNSW default. M, efConstruction and ef mirror those defaults.
## NOTE(review): if "clinical_trial_langchain" already exists with an L2 index from an
## earlier run, drop it first so the IP index is actually created.
vector_db_custom = Milvus.from_documents(
    docs,
    embeddings,
    connection_args={"host": "127.0.0.1", "port": "19530"},
    collection_name="clinical_trial_langchain",  ## custom collection name
    index_params={
        "metric_type": "IP",
        "index_type": "HNSW",
        "params": {"M": 8, "efConstruction": 64},
    },  ## index params
    search_params={"metric_type": "IP", "offset": 0, "params": {"ef": 10}},  ## search params
)

In [125]:
vector_db_custom.collection_name  ## confirms the custom collection name was applied

'clinical_trial_langchain'

In [126]:
query = "what is the investigator’s responsibility for the medical care of research subjects"
## Query the custom store; searching `vector_db` here would just repeat the
## default-collection search and never exercise the custom search params.
docs_new_metric = vector_db_custom.similarity_search(query)

In [132]:
docs_new_metric[2]  ## third hit: a Document with page_content and source/page metadata

Document(page_content='Investigator Selection (ICH E6, Section 5.6)\nAllocation of Duties and Functions (ICH E6, Section 5.7)\nFor regulatory authorities , refer to\nConducting the Inspection (A Guide to Clinical Investigator Inspec-\ntions, PAHO, Annex 4, Section 2)\nSee also:Discussion of the WHO Principles of GCP\nGCP Principle 9: Investigator Qualiﬁ  cations\nDeﬁ nitions for:\nInvestigator (ICH E6, 1.34)Subinvestigator (ICH E6, 1.56)Well-being (of the trial subjects) (ICH E6, 1.62)\nPRINCIPLE 10: STAFF QUALIFICATIONS | 91', metadata={'source': 'clinical_trial_conduct.pdf', 'page': 95})

In [None]:
## Milvus collection details:

In [136]:
vector_db.col  ## underlying collection object of the default store; its repr shows the schema

<Collection>:
-------------
<name>: LangChainCollection
<partitions>: [{"name": "_default", "collection_name": "LangChainCollection", "description": ""}]
<description>: 
<schema>: {
  auto_id: True
  description: 
  fields: [{
    name: source
    description: 
    type: 21
    params: {'max_length': 65535}
  }, {
    name: page
    description: 
    type: 5
  }, {
    name: text
    description: 
    type: 21
    params: {'max_length': 65535}
  }, {
    name: pk
    description: 
    type: 5
    is_primary: True
    auto_id: True
  }, {
    name: vector
    description: 
    type: 101
    params: {'dim': 384}
  }]
}

In [135]:
vector_db_custom.col  ## underlying collection object of the custom-named store; its repr shows the schema

<Collection>:
-------------
<name>: clinical_trial_langchain
<partitions>: [{"name": "_default", "collection_name": "clinical_trial_langchain", "description": ""}]
<description>: 
<schema>: {
  auto_id: True
  description: 
  fields: [{
    name: source
    description: 
    type: 21
    params: {'max_length': 65535}
  }, {
    name: page
    description: 
    type: 5
  }, {
    name: text
    description: 
    type: 21
    params: {'max_length': 65535}
  }, {
    name: pk
    description: 
    type: 5
    is_primary: True
    auto_id: True
  }, {
    name: vector
    description: 
    type: 101
    params: {'dim': 384}
  }]
}