In [1]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the Kaggle medicine catalogue (CSV expected in the working directory)
df = pd.read_csv(filepath_or_buffer='Medicine_Details.csv')

In [3]:
# Preview the first five rows of the raw table
df.head(n=5)

Unnamed: 0,Medicine Name,Composition,Uses,Side_effects,Image URL,Manufacturer,Excellent Review %,Average Review %,Poor Review %
0,Avastin 400mg Injection,Bevacizumab (400mg),Cancer of colon and rectum Non-small cell lun...,Rectal bleeding Taste change Headache Noseblee...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Roche Products India Pvt Ltd,22,56,22
1,Augmentin 625 Duo Tablet,Amoxycillin (500mg) + Clavulanic Acid (125mg),Treatment of Bacterial infections,Vomiting Nausea Diarrhea Mucocutaneous candidi...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Glaxo SmithKline Pharmaceuticals Ltd,47,35,18
2,Azithral 500 Tablet,Azithromycin (500mg),Treatment of Bacterial infections,Nausea Abdominal pain Diarrhea,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Alembic Pharmaceuticals Ltd,39,40,21
3,Ascoril LS Syrup,Ambroxol (30mg/5ml) + Levosalbutamol (1mg/5ml)...,Treatment of Cough with mucus,Nausea Vomiting Diarrhea Upset stomach Stomach...,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Glenmark Pharmaceuticals Ltd,24,41,35
4,Aciloc 150 Tablet,Ranitidine (150mg),Treatment of Gastroesophageal reflux disease (...,Headache Diarrhea Gastrointestinal disturbance,"https://onemg.gumlet.io/l_watermark_346,w_480,...",Cadila Pharmaceuticals Ltd,34,37,29


In [4]:
# Drop irrelevant columns (image link and review percentages are not used for retrieval).
# errors='ignore' keeps this cell idempotent: re-running it after the columns are
# already gone no longer raises KeyError.
# NOTE(review): 'Unnamed: 0' looks like a leftover index column from the CSV export;
# consider dropping it as well once nothing downstream is confirmed to read it.
columns_to_drop = ['Image URL', 'Excellent Review %', 'Average Review %', 'Poor Review %']
df = df.drop(columns=columns_to_drop, errors='ignore')

In [5]:
# Count missing values per column (isna is the canonical name of isnull)
df.isna().sum()

Unnamed: 0,0
Medicine Name,0
Composition,0
Uses,0
Side_effects,0
Manufacturer,0


In [6]:
# Persist the trimmed table next to the notebook, without writing the row index
df.to_csv(path_or_buf='medicine_df.csv', index=False)

In [7]:
%%capture
!pip install langchain pypdf langchain_experimental faiss-cpu tiktoken langchain-huggingface sentence-transformers

In [8]:
from langchain_core.documents import Document

In [9]:
# Turn each dataset row into a Document: the text body is what gets embedded,
# while the metadata tags let a retrieved hit be traced back to the dataset.
def _row_to_document(row):
    """Build one product-sheet Document from a single medicine row."""
    sheet_text = (
        f"Medicine Name: {row['Medicine Name']}\n"
        f"Composition: {row['Composition']}\n"
        f"Uses: {row['Uses']}\n"
        f"Side Effects: {row['Side_effects']}\n"
        f"Manufacturer: {row['Manufacturer']}"
    )
    sheet_tags = {
        "source": "Kaggle_Dataset",
        "type": "Product_Sheet",
        "medicine_name": row['Medicine Name']
    }
    return Document(page_content=sheet_text, metadata=sheet_tags)


medicine_docs = [_row_to_document(record) for _, record in df.iterrows()]

In [10]:
# Spot-check one generated Document (index 10) to confirm its text and metadata layout
medicine_docs[10]

Document(metadata={'source': 'Kaggle_Dataset', 'type': 'Product_Sheet', 'medicine_name': 'Azee 500 Tablet'}, page_content='Medicine Name: Azee 500 Tablet\nComposition: Azithromycin (500mg)\nUses: Treatment of Bacterial infections\nSide Effects: Nausea Abdominal pain Diarrhea\nManufacturer: Cipla Ltd')

In [11]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [12]:
# PDF Chunking: Split text into segments with overlap to maintain semantic context
def process_book(path, name, chunk_size=1000, chunk_overlap=150):
    """Load a PDF and split it into overlapping, tagged text chunks.

    Args:
        path: Location of the PDF file handed to ``PyPDFLoader``.
        name: Short label stored in each chunk's ``metadata["name"]``.
        chunk_size: Maximum chunk length as measured by ``len`` (defaults to
            1000, the value previously hard-coded here).
        chunk_overlap: Length shared between consecutive chunks (defaults to
            150, the value previously hard-coded here).

    Returns:
        The chunks produced by ``RecursiveCharacterTextSplitter.split_documents``,
        each with ``name`` and ``source_type`` added to its metadata.
    """
    loader = PyPDFLoader(path)
    docs = loader.load()

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    chunks = text_splitter.split_documents(docs)

    # Tag every chunk so retrieval results can be attributed to a book and
    # told apart from the dataset-derived product sheets.
    for chunk in chunks:
        chunk.metadata["name"] = name
        chunk.metadata["source_type"] = "Medical_Literature"

    return chunks

In [14]:
# Chunk both pharmacology textbooks, then pool their fragments.
katzung_chunks = process_book(
    path="Katzung - Basic and Clinical Pharmacology 12th Edition (2012).pdf",
    name="Katzung_Clinical",
)
principles_chunks = process_book(
    path="GENERAL PRINCIPLES OF PHARMACOLOGY.pdf",
    name="Gen_Principles",
)
all_book_chunks = [*katzung_chunks, *principles_chunks]

# Report the fragment counts: combined total first, then per book.
print(f"Libros procesados: {len(all_book_chunks)} fragmentos generados.")
print(f"katzung: {len(katzung_chunks)} fragmentos generados.")
print(f"principles: {len(principles_chunks)} fragmentos generados.")



Libros procesados: 11324 fragmentos generados.
katzung: 7260 fragmentos generados.
principles: 4064 fragmentos generados.


In [15]:
# Show the raw text of the first Katzung chunk to sanity-check PDF extraction and splitting
print(katzung_chunks[0].page_content)

SCHEDULE OF CONTROLLED DRUGS 1
SCHEDULE I
(All nonresearch use illegal under federal law.)
Flunitrazepam (Rohypnol)
Narcotics:
Heroin and many nonmarketed synthetic narcotics
Hallucinogens:
LSD
MDA, STP , DMT, DET, mescaline, peyote, bufotenine, ibogaine, 
psilocybin, phencyclidine (PCP; veterinary drug only)
Marijuana
Methaqualone
SCHEDULE II
(No telephone prescriptions, no refills.)2
Opioids:
Opium
Opium alkaloids and derived phenanthrene alkaloids: codeine, 
morphine, (Avinza, Kadian, MSContin, Roxanol), hydromorphone 
(Dilaudid ), oxymorphone (, Exalgo), oxycodone (dihydroxycodei-
none, a component of Oxycontin, Percodan, Percocet, Roxicodone, 
Tylox)
Designated synthetic drugs: meperidine (Demerol), methadone, 
levorphanol (Levo-Dromoran), fentanyl (Duragesic, Actiq, 
Fentora), alfentanil (Alfenta), sufentanil (Sufenta), remifentanil 
(Ultiva), tapentadol (Nycynta)
Stimulants:
Coca leaves and cocaine
Amphetamine
Amphetamine complex (Biphetamine)
Amphetamine salts (Adderall)


In [16]:
%%capture
!pip install langchain-huggingface sentence-transformers faiss-cpu

In [17]:
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

In [18]:
# Merge the product sheets and the chunks of both books into one corpus to index
docs = [*medicine_docs, *katzung_chunks, *principles_chunks]
print(f"Total documents to index: {len(docs)}")

Total documents to index: 23149


In [19]:
# Initialize the embedding model, preferring the GPU (CUDA) when one is available
import torch  # local import: only needed here to probe for a CUDA device

model_name = "sentence-transformers/all-MiniLM-L6-v2"
# Fall back to CPU so this cell also runs on runtimes without a CUDA device
# instead of failing when the model is moved to 'cuda'.
model_kwargs = {'device': 'cuda' if torch.cuda.is_available() else 'cpu'}
# Embeddings are left un-normalized, as before.
encode_kwargs = {'normalize_embeddings': False}

embeddings = HuggingFaceEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [20]:
# Embed the whole corpus into a FAISS index, then write the index to the
# "faiss_pharmacy" folder so it persists on disk.
vectorstore = FAISS.from_documents(documents=docs, embedding=embeddings)
vectorstore.save_local(folder_path="faiss_pharmacy")

In [21]:
# Perform a similarity search to verify information retrieval from both the dataset and medical literature
query = "What is the mechanism of action and side effects of Azithromycin?"

print(f"Query: '{query}'\n")
results = vectorstore.similarity_search(query, k=5)

for i, doc in enumerate(results):

    # Product sheets carry 'source'/'type'. Book chunks carry the 'name' and
    # 'source_type' tags written by process_book; 'name' is the fallback label
    # when a chunk has no 'source' entry.
    source = doc.metadata.get('source', doc.metadata.get('name', 'Unknown'))
    source_type = doc.metadata.get('type', doc.metadata.get('source_type', 'N/A'))

    print(f" RESULT {i+1}")
    print(f" SOURCE: {source} ({source_type})")
    print(f" CONTENT: {doc.page_content[:300]}...\n")
    print("-" * 50,"\n")

sources_found = [doc.metadata.get('source', doc.metadata.get('name')) for doc in results]

# Classify hits by their metadata tags instead of by substrings of the source
# label: the previous case-sensitive 'Principles' test could never match
# "GENERAL PRINCIPLES OF PHARMACOLOGY.pdf", so hits from that book were not
# counted as literature.
has_dataset_hit = any(doc.metadata.get('type') == 'Product_Sheet' for doc in results)
has_literature_hit = any(doc.metadata.get('source_type') == 'Medical_Literature' for doc in results)

if has_dataset_hit and has_literature_hit:
    print("SUCCESS: The system has retrieved information from both the Dataset and the Literature.")
else:
    print("WARNING: Information was only retrieved from a single source. Consider adjusting the 'k' value or the query.")

Query: 'What is the mechanism of action and side effects of Azithromycin?'

 RESULT 1
 SOURCE: Katzung - Basic and Clinical Pharmacology 12th Edition (2012).pdf (Medical_Literature)
 CONTENT: AZITHROMYCIN 
 Azithromycin, a 15-atom lactone macrolide ring compound, is 
derived from erythromycin by addition of a methylated nitrogen 
into the lactone ring. Its spectrum of activity, mechanism of action, 
and clinical uses are similar to those of clarithromycin. Azithromycin 
is active against...

-------------------------------------------------- 

 RESULT 2
 SOURCE: Kaggle_Dataset (Product_Sheet)
 CONTENT: Medicine Name: Azithral 200 Liquid
Composition: Azithromycin (200mg/5ml)
Uses: Treatment of Bacterial infections
Side Effects: Nausea Abdominal pain Diarrhea
Manufacturer: Alembic Pharmaceuticals Ltd...

-------------------------------------------------- 

 RESULT 3
 SOURCE: Kaggle_Dataset (Product_Sheet)
 CONTENT: Medicine Name: Azithral XL 200 Liquid
Composition: Azithromycin (200mg/5m