In [9]:
import numpy as np
import pandas as pd
import pickle

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import nltk
from nltk.stem import WordNetLemmatizer


In [10]:
# Load the raw catalogue and keep only rows that have every field the
# recommender relies on; the index is rebuilt so that row labels match
# row positions after the drop.
medicine = (
    pd.read_csv("medicine.csv")
    .dropna(subset=["Drug_Name", "Description", "Reason"])
    .reset_index(drop=True)
)

print("Dataset shape:", medicine.shape)


Dataset shape: (9720, 4)


In [11]:
# The WordNet corpus has to be available locally before the lemmatizer
# is first used.
nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()


def preprocess_text(text):
    """Lower-case ``text`` and lemmatize each whitespace-separated token.

    Parameters
    ----------
    text : object
        Value to normalise; it is converted with ``str()`` first.

    Returns
    -------
    str
        The lemmatized tokens joined by single spaces.
    """
    tokens = str(text).lower().split()
    lemmas = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(lemmas)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# "tags" is the text that gets vectorised: description and reason joined
# by a space, then normalised with preprocess_text.
medicine["tags"] = (
    medicine["Description"] + " " + medicine["Reason"]
).apply(preprocess_text)


In [13]:
# Unigrams and bigrams, English stop words removed, vocabulary capped at
# 5000 terms.
tfidf = TfidfVectorizer(stop_words="english", max_features=5000, ngram_range=(1, 2))

# One TF-IDF row per medicine, in the same order as the rows of `medicine`.
vectors = tfidf.fit_transform(medicine["tags"])

print("TF-IDF vector shape:", vectors.shape)


TF-IDF vector shape: (9720, 2414)


In [14]:
def recommend(drug_name, top_n=5):
    """Return the medicines whose tags are most similar to ``drug_name``.

    Similarity is the cosine similarity between the TF-IDF row of the
    queried medicine and every row of ``vectors``.

    Parameters
    ----------
    drug_name : str
        Exact value to look up in ``medicine["Drug_Name"]``. If several
        rows carry this name, the first one is used as the query.
    top_n : int, default 5
        Maximum number of recommendations to return. Values below zero
        are treated as zero and yield an empty frame.

    Returns
    -------
    pandas.DataFrame
        Columns ``Drug_Name`` and ``Reason`` of the best-scoring rows,
        highest score first, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If ``drug_name`` does not occur in ``medicine["Drug_Name"]``.
    """
    # Resolve the query by row *position*: both `vectors[...]` and `.iloc`
    # are positional, so this stays correct whatever the frame's index is.
    matches = np.flatnonzero((medicine["Drug_Name"] == drug_name).to_numpy())
    if matches.size == 0:
        raise ValueError("Medicine not found in database.")
    idx = matches[0]

    scores = cosine_similarity(vectors[idx], vectors).flatten()
    ranked = np.argsort(scores)[::-1]

    # Drop the query row explicitly instead of assuming it is ranked first.
    # Rows that tie with it (e.g. identical tags), or an all-zero query
    # vector, can place it anywhere in the ordering; slicing off position 0
    # would then return the query itself and discard a genuine neighbour.
    ranked = ranked[ranked != idx]
    top_indices = ranked[:max(top_n, 0)]

    return medicine.iloc[top_indices][
        ["Drug_Name", "Reason"]
    ].reset_index(drop=True)


In [15]:
# Smoke test: five nearest neighbours of one known product.
results = recommend(drug_name="ACGEL CL NANO Gel 15gm", top_n=5)
print(results)


                                Drug_Name Reason
0                 Entice Natura Soap 75gm   Acne
1                         Alight Gel 15gm   Acne
2             Acnetor AD 1% Ointment 15gm   Acne
3                    Alight Plus Gel 15gm   Acne
4  Acnetor AD Cream 15Acnetor AD Gel 15gm   Acne


In [16]:
# Persist the cleaned frame, the fitted vectorizer and the TF-IDF matrix.
# Each file is opened in a `with` block so its handle is flushed and closed
# as soon as the dump finishes, rather than being left open for the
# lifetime of the kernel.
with open("medicine_df.pkl", "wb") as artifact_file:
    pickle.dump(medicine, artifact_file)

with open("tfidf_vectorizer.pkl", "wb") as artifact_file:
    pickle.dump(tfidf, artifact_file)

with open("tfidf_vectors.pkl", "wb") as artifact_file:
    pickle.dump(vectors, artifact_file)

print("Artifacts saved successfully.")


Artifacts saved successfully.
