In [1]:
import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
import os
import glob
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
import scipy.sparse as sp


In [2]:
# Read the medicine catalogue from disk.
# low_memory=False is passed so that pandas does not emit the DtypeWarning
# it otherwise raises for this file.
file_path = 'medicine_dataset.csv'
medicine_data = pd.read_csv(filepath_or_buffer=file_path, low_memory=False)

# Preview the first five rows (the default row count of head()).
medicine_data.head(n=5)


Unnamed: 0,id,name,substitute0,substitute1,substitute2,substitute3,substitute4,sideEffect0,sideEffect1,sideEffect2,...,sideEffect41,use0,use1,use2,use3,use4,Chemical Class,Habit Forming,Therapeutic Class,Action Class
0,1,augmentin 625 duo tablet,Penciclav 500 mg/125 mg Tablet,Moxikind-CV 625 Tablet,Moxiforce-CV 625 Tablet,Fightox 625 Tablet,Novamox CV 625mg Tablet,Vomiting,Nausea,Diarrhea,...,,Treatment of Bacterial infections,,,,,,No,ANTI INFECTIVES,
1,2,azithral 500 tablet,Zithrocare 500mg Tablet,Azax 500 Tablet,Zady 500 Tablet,Cazithro 500mg Tablet,Trulimax 500mg Tablet,Vomiting,Nausea,Abdominal pain,...,,Treatment of Bacterial infections,,,,,Macrolides,No,ANTI INFECTIVES,Macrolides
2,3,ascoril ls syrup,Solvin LS Syrup,Ambrodil-LX Syrup,Zerotuss XP Syrup,Capex LS Syrup,Broxum LS Syrup,Nausea,Vomiting,Diarrhea,...,,Treatment of Cough with mucus,,,,,,No,RESPIRATORY,
3,4,allegra 120mg tablet,Lcfex Tablet,Etofex 120mg Tablet,Nexofex 120mg Tablet,Fexise 120mg Tablet,Histafree 120 Tablet,Headache,Drowsiness,Dizziness,...,,Treatment of Sneezing and runny nose due to al...,Treatment of Allergic conditions,,,,Diphenylmethane Derivative,No,RESPIRATORY,H1 Antihistaminics (second Generation)
4,5,avil 25 tablet,Eralet 25mg Tablet,,,,,Sleepiness,Dryness in mouth,,...,,Treatment of Allergic conditions,,,,,Pyridines Derivatives,No,RESPIRATORY,H1 Antihistaminics (First Generation)


Step 1: Data Preparation

In [3]:
# Replace every missing value (NaN) in medicine_data with an empty string ('')
# so the text columns can be handled with plain string methods afterwards.
# The result is rebound to the same name instead of using inplace=True, which
# keeps the statement explicit and safe to re-run.
medicine_data = medicine_data.fillna('')

# Clean text data
def clean_text(text):
    """Normalise a single text value before it is vectorised.

    The value is lower-cased and every underscore is replaced by a space.

    Parameters
    ----------
    text : str or scalar
        The raw cell value. Strings are processed as-is. Missing values
        (None / NaN) are treated as the empty string, and any other
        non-string scalar is converted with str() first, so a stray numeric
        cell no longer raises AttributeError.

    Returns
    -------
    str
        The normalised text.
    """
    if not isinstance(text, str):
        # Mirror the fillna('') convention used for the dataset: missing
        # values become empty text, everything else is stringified.
        text = '' if pd.isna(text) else str(text)
    return text.lower().replace('_', ' ')

# Columns whose text is normalised and merged into one document per medicine.
text_columns = [
    'name',
    'substitute0', 'substitute1', 'substitute2', 'substitute3', 'substitute4',
    'sideEffect0', 'sideEffect1', 'sideEffect2',
    'use0', 'use1',
    'Chemical Class', 'Therapeutic Class', 'Action Class',
]

# Normalise every selected column element by element with clean_text.
for col in text_columns:
    medicine_data[col] = medicine_data[col].map(clean_text)

# Build one space-separated document per row from the selected columns.
medicine_data['combined_text'] = (
    medicine_data[text_columns]
    .astype(str)
    .apply(' '.join, axis=1)
)

# TF-IDF features over the combined documents, limited to 1000 vocabulary terms.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(medicine_data['combined_text'])

# The prediction target is the medicine name.
# NOTE(review): 'name' is also one of the text_columns merged into the
# features above, so the label text is part of the model input - confirm
# this is intended.
y = medicine_data['name']

# Sample a subset of the data for initial testing without stratification.
sample_size = 1000  # Adjust this based on available resources
if X.shape[0] > sample_size:
    X_sample, _, y_sample, _ = train_test_split(X, y, train_size=sample_size, random_state=42)
else:
    # An integer train_size must be strictly smaller than the number of rows,
    # otherwise train_test_split raises ValueError. A dataset that is not
    # larger than sample_size is therefore used in full.
    X_sample, y_sample = X, y

# Split the sample data into training (80%) and testing (20%) sets.
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.2, random_state=42)

# Fit the LabelEncoder on the labels of both splits so that every test label
# has an integer code, even when it never occurs in the training split.
all_labels = pd.concat([y_train, y_test])
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)

# Encode the target labels as integers.
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Report the range of the encoded labels in each split.
# NOTE(review): label codes close to the sample size suggest that almost every
# sampled name is its own class, in which case most test labels never appear
# in the training split - confirm this is intended.
print(f"Training labels range: {y_train_encoded.min()} to {y_train_encoded.max()}")
print(f"Testing labels range: {y_test_encoded.min()} to {y_test_encoded.max()}")

Training labels range: 1 to 996
Testing labels range: 0 to 979


Step 2: Define the Patient Questionnaire Function

In [4]:
def collect_patient_data():
    """Interactively ask the patient questionnaire and return the answers.

    Each question is shown with input(), in the order listed below, and the
    typed reply is stored under the matching field name.

    Returns
    -------
    dict
        Maps each questionnaire field name to the raw string the user typed.
    """
    questionnaire = (
        ('primary_reason', "What is your primary reason for seeking medication? "),
        ('allergies', "Do you have any known allergies or sensitivities to medications? "),
        ('current_medications', "Are you currently taking any other medications (prescription, over-the-counter, supplements)? "),
        ('adverse_reactions', "Have you had any adverse reactions to medications in the past? If so, please describe. "),
        ('chronic_conditions', "Do you have any chronic medical conditions (e.g., diabetes, hypertension, asthma)? "),
        ('symptoms', "Can you describe your symptoms in detail? When did they start? "),
        ('symptom_severity', "How severe are your symptoms? Have they been getting better, worse, or staying the same? "),
    )

    answers = {}
    for field, prompt in questionnaire:
        answers[field] = input(prompt)
    return answers

Step 3: Collect Patient Data and Feature Engineering

In [5]:
# Collect patient data interactively (one prompt per questionnaire field).
patient_data = collect_patient_data()

# Convert patient data to a one-row DataFrame for easy manipulation.
patient_df = pd.DataFrame([patient_data])

# Questionnaire fields that are merged into a single text document.
text_fields = [
    'primary_reason', 'allergies', 'current_medications', 'adverse_reactions', 'chronic_conditions',
    'symptoms', 'symptom_severity'
]

# Combine the text fields into a single feature for vectorization, then run
# it through clean_text: the medicine documents the vectorizer was fitted on
# were normalised with clean_text, so the patient text must be too, otherwise
# tokens such as "runny_nose" would not line up with the fitted vocabulary.
patient_df['combined_text'] = (
    patient_df[text_fields]
    .apply(lambda row: ' '.join(row.values.astype(str)), axis=1)
    .apply(clean_text)
)

# Vectorize the combined text with the already-fitted vectorizer (transform
# only, so the vocabulary and IDF weights stay those of the medicine data).
patient_vector = vectorizer.transform(patient_df['combined_text'])

# Show the shape of the patient vector.
print(f"Patient vector shape: {patient_vector.shape}")

Patient vector shape: (1, 1000)


Step 4: Model Training

In [6]:
# Stack the patient's feature row underneath the training matrix and give it
# the placeholder label 'patient'.
# NOTE(review): this trains the classifier to answer 'patient' for the very
# vector that is later used as the query - confirm this is intended.
X_combined = sp.vstack([X_train, patient_vector])
y_combined = pd.concat([y_train, pd.Series(['patient'])])

# Register the placeholder label as an additional class of the fitted encoder
# (appended after the existing classes) so it can be transformed below.
if 'patient' not in label_encoder.classes_:
    label_encoder.classes_ = np.append(label_encoder.classes_, 'patient')

# Integer-encode the combined labels and show their range.
y_combined_encoded = label_encoder.transform(y_combined)
print(f"Combined labels range: {y_combined_encoded.min()} to {y_combined_encoded.max()}")

# Feed-forward classifier: TF-IDF input -> 128 -> 64 -> one softmax unit per
# encoder class.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax'),
])

# Integer labels are used directly, hence the sparse categorical loss.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Train on dense copies of the sparse matrices; the held-out split is used
# for validation after every epoch.
model.fit(
    X_combined.toarray(),
    y_combined_encoded,
    validation_data=(X_test.toarray(), y_test_encoded),
    batch_size=16,
    epochs=5,
)

Combined labels range: 1 to 997
Epoch 1/5
51/51 ━━━━━━━━━━━━━━━━━━━━ 9s 40ms/step - accuracy: 0.0000e+00 - loss: 6.9099 - val_accuracy: 0.0000e+00 - val_loss: 6.9311
Epoch 2/5
51/51 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - accuracy: 0.0468 - loss: 6.8816 - val_accuracy: 0.0000e+00 - val_loss: 7.1153
Epoch 3/5
51/51 ━━━━━━━━━━━━━━━━━━━━ 1s 16ms/step - accuracy: 0.0540 - loss: 6.7208 - val_accuracy: 0.0000e+00 - val_loss: 7.7123
Epoch 4/5
51/51 ━━━━━━━━━━━━━━━━━━━━ 1s 14ms/step - accuracy: 0.1029 - loss: 6.3052 - val_accuracy: 0.0000e+00 - val_loss: 9.3978
Epoch 5/5
51/51 ━━━━━━━━━━━━━━━━━━━━ 1s 21ms/step - accuracy: 0.1292 - loss: 5.5857 - val_accuracy: 0.0000e+00 - val_loss: 11.8026


<keras.src.callbacks.history.History at 0x1cb07f8a530>

Step 5: Recommendation System

In [7]:
# Predict class probabilities for the patient (one row, one column per
# encoder class).
patient_prediction = model.predict(patient_vector.toarray())

# The label encoder may contain the placeholder class 'patient', which is not
# a medicine and must never be returned as a recommendation. Mask its score on
# a copy (patient_prediction itself is left untouched) before picking the
# best class. When the placeholder is absent the mask is a no-op.
medicine_scores = np.array(patient_prediction, dtype=float)
placeholder_columns = np.flatnonzero(label_encoder.classes_ == 'patient')
medicine_scores[:, placeholder_columns] = -np.inf

# Highest-scoring remaining class, mapped back to its medicine name.
predicted_medicine_index = np.argmax(medicine_scores, axis=1)
recommended_medicine = label_encoder.inverse_transform(predicted_medicine_index)

print(f"Recommended Medicine: {recommended_medicine[0]}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 269ms/step
Recommended Medicine: duloxee m tablet
