1. Load and Clean the Dataset

In [10]:
import pandas as pd

# Read the medicine CSV in a single pass (low_memory=False avoids the
# DtypeWarning) and replace every missing value with an empty string so the
# text-processing steps below only ever see non-null cells.
file_path = 'cleaned_medicine_dataset.csv'
medicine_data = pd.read_csv(file_path, low_memory=False).fillna('')

# Clean text data
def clean_text(text):
    """Normalize a single cell value for text processing.

    The value is lower-cased and every underscore is replaced by a space.

    Args:
        text: The cell value. Usually a string, but any object is accepted;
            non-string values are converted with ``str()`` first.

    Returns:
        str: The normalized text. ``None`` and float NaN yield ``''``.
    """
    # Treat missing values as empty text. NaN is the only float that is not
    # equal to itself, so this check needs no extra imports.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # str() guards against numeric or otherwise non-string cells, which would
    # fail on .lower() with an AttributeError.
    return str(text).lower().replace('_', ' ')

# Columns that feed the text pipeline.
text_columns = ['name', 'use0']

# Normalize each text column element by element.
for col in text_columns:
    medicine_data[col] = medicine_data[col].map(clean_text)

# Join the normalized columns, row by row, into one space-separated string.
# NOTE(review): 'name' is part of this text and is also used as the prediction
# target further down, so the features contain the label - confirm intended.
medicine_data['combined_text'] = [
    ' '.join(str(part) for part in parts)
    for parts in zip(*(medicine_data[col] for col in text_columns))
]

2. Vectorize Text Data Using TF-IDF

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Target: the (cleaned) medicine name of every row.
y = medicine_data['name']

# Features: TF-IDF weights of the combined text, capped at 1000 vocabulary terms.
# The fitted vectorizer is reused later to encode the patient's answers.
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(raw_documents=medicine_data['combined_text'])

3. Sample and Split the Data

In [12]:
from sklearn.model_selection import train_test_split

# Draw a random (non-stratified) subset of rows for quick experimentation.
# train_test_split is used purely as a sampler here; the remainder is discarded.
sample_size = 1000
X_sample, _, y_sample, _ = train_test_split(
    X, y,
    train_size=sample_size,
    random_state=42,
)

# Hold out 20% of the sampled rows for evaluation.
(X_train, X_test,
 y_train, y_test) = train_test_split(X_sample, y_sample, random_state=42, test_size=0.2)

4. Encode the Labels

In [13]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the sampled labels (y_sample). y_train and y_test are both
# drawn from y_sample, so transform() below never meets an unseen label.
label_encoder = LabelEncoder().fit(y_sample)

# Integer-encode the targets of both splits.
y_train_encoded, y_test_encoded = (
    label_encoder.transform(labels) for labels in (y_train, y_test)
)

5. Collect Patient Data

In [14]:
def collect_patient_data():
    """Ask the patient intake questions interactively and return the answers.

    The questions are asked one at a time via ``input()``, in the order they
    are listed below.

    Returns:
        dict: Maps each field name to the raw answer string typed by the user.
    """
    questions = (
        ('primary_reason', "What is your primary reason for seeking medication? "),
        ('allergies', "Do you have any known allergies or sensitivities to medications? "),
        ('current_medications', "Are you currently taking any other medications (prescription, over-the-counter, supplements)? "),
        ('adverse_reactions', "Have you had any adverse reactions to medications in the past? If so, please describe. "),
        ('chronic_conditions', "Do you have any chronic medical conditions (e.g., diabetes, hypertension, asthma)? "),
        ('symptoms', "Can you describe your symptoms in detail? When did they start? "),
    )
    answers = {}
    for field, prompt in questions:
        answers[field] = input(prompt)
    return answers

# Collect patient data
patient_data = collect_patient_data()

# Convert patient data to a one-row DataFrame for easy manipulation
patient_df = pd.DataFrame([patient_data])

# Text fields to merge into a single feature for vectorization
text_fields = [
    'primary_reason', 'allergies', 'current_medications', 'adverse_reactions', 'chronic_conditions',
    'symptoms', 'symptom_severity'
]

# Use only the fields that were actually collected. 'symptom_severity' is not
# returned by collect_patient_data(), and selecting a missing column would
# raise a KeyError on a fresh kernel.
available_fields = [field for field in text_fields if field in patient_df.columns]

# Join the answers into one space-separated string per patient
patient_df['combined_text'] = patient_df[available_fields].apply(
    lambda row: ' '.join(row.values.astype(str)), axis=1
)

# Vectorize the combined text with the vectorizer already fitted on the medicine data
patient_vector = vectorizer.transform(patient_df['combined_text'])

6. Build and Train the Model

In [15]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

# Feed-forward classifier: TF-IDF features -> 128 -> 64 -> softmax over every
# class known to the label encoder.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax'),
])

# Sparse categorical cross-entropy matches the integer-encoded targets.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Short training run (5 epochs, batch size 16). The sparse TF-IDF matrices are
# densified with .toarray() before being passed to Keras.
# NOTE(review): the recorded run shows val_accuracy staying at 0 while val_loss
# rises; presumably most held-out medicine names never occur in the training
# split - confirm and reconsider the choice of target.
model.fit(
    X_train.toarray(),
    y_train_encoded,
    epochs=5,
    batch_size=16,
    validation_data=(X_test.toarray(), y_test_encoded),
)

Epoch 1/5
50/50 ━━━━━━━━━━━━━━━━━━━━ 9s 43ms/step - accuracy: 0.0000e+00 - loss: 6.9122 - val_accuracy: 0.0000e+00 - val_loss: 6.9334
Epoch 2/5
50/50 ━━━━━━━━━━━━━━━━━━━━ 1s 17ms/step - accuracy: 0.0352 - loss: 6.8862 - val_accuracy: 0.0000e+00 - val_loss: 7.1035
Epoch 3/5
50/50 ━━━━━━━━━━━━━━━━━━━━ 1s 15ms/step - accuracy: 0.0254 - loss: 6.7544 - val_accuracy: 0.0000e+00 - val_loss: 7.6140
Epoch 4/5
50/50 ━━━━━━━━━━━━━━━━━━━━ 1s 21ms/step - accuracy: 0.0742 - loss: 6.4177 - val_accuracy: 0.0000e+00 - val_loss: 9.1214
Epoch 5/5
50/50 ━━━━━━━━━━━━━━━━━━━━ 1s 18ms/step - accuracy: 0.0599 - loss: 5.9156 - val_accuracy: 0.0000e+00 - val_loss: 11.4094


<keras.src.callbacks.history.History at 0x1988249b3a0>

7. Predict Medication for the Patient

In [16]:
import numpy as np

# Score the patient's vector against every class, take the highest-probability
# class per row, and map its index back to the medicine name.
patient_prediction = model.predict(patient_vector.toarray())
predicted_medicine_index = patient_prediction.argmax(axis=1)
recommended_medicine = label_encoder.inverse_transform(predicted_medicine_index)

# There is a single patient row, so report the first (only) recommendation.
print(f"Recommended Medicine: {recommended_medicine[0]}")

1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 296ms/step
Recommended Medicine: kenadol plus oral suspension
