In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import spacy
from keras.models import Sequential
from keras.layers import Dense, Input
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
import scipy.sparse as sp
from tqdm import tqdm
import multiprocessing as mp
import warnings
import speech_recognition as sr  # Using speech_recognition instead of whisper

# Silence every FutureWarning process-wide (the filter is keyed on the
# warning category only, not on a particular library or message).
warnings.simplefilter(action='ignore', category=FutureWarning)

# Load the small English spaCy pipeline once at module level so that
# preprocess_text() can reuse it. The parser and NER components are
# disabled; preprocess_text() only reads lemmas and the stop-word /
# punctuation flags of each token.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

def preprocess_text(text):
    """Lemmatize *text* with spaCy, dropping stop words and punctuation.

    Returns the lemmas of the remaining tokens joined by single spaces.
    """
    lemmas = []
    for token in nlp(text):
        if token.is_stop or token.is_punct:
            continue
        lemmas.append(token.lemma_)
    return ' '.join(lemmas)

# Function to preprocess and vectorize user input
def preprocess_user_input(user_input, vectorizer):
    """Turn a dict of free-text answers into one vectorized document.

    Every value of *user_input* is run through ``preprocess_text``; the
    results are joined with spaces (in dict iteration order) and handed to
    ``vectorizer.transform`` as a single-document batch, whose result is
    returned.
    """
    cleaned_answers = map(preprocess_text, user_input.values())
    document = ' '.join(cleaned_answers)
    return vectorizer.transform([document])

# Load the dataset, parsing only the columns the pipeline actually uses
file_path = 'medicine_dataset.csv'
columns_to_keep = ['name', 'use0', 'use1']
data = pd.read_csv(file_path, usecols=columns_to_keep, low_memory=False)

# Keep only the specified columns. read_csv's usecols ignores element
# order, so re-select to pin the column order to columns_to_keep.
data = data[columns_to_keep]

# Display basic information about the dataset
print("Dataset Info:")
# DataFrame.info() prints its report itself and returns None, so it must
# not be wrapped in print() (that emits a stray "None" line).
data.info()
print("\nFirst few rows of the dataset:")
print(data.head())

# Handle missing values: replace NaN with empty strings. Reassign instead
# of mutating in place, because `data` is derived from another frame.
data = data.fillna('')

# Clean text data
def clean_text(text):
    """Return *text* as a lower-cased string with underscores as spaces.

    Non-string values are converted with ``str()`` first.
    """
    lowered = str(text).lower()
    return ' '.join(lowered.split('_'))

# Normalise the text of every retained column
for col in columns_to_keep:
    if col in data.columns:
        data[col] = data[col].apply(clean_text)

# Remove duplicates
data = data.drop_duplicates()

# Save cleaned dataset
cleaned_file_path = 'cleaned_medicine_dataset.csv'
data.to_csv(cleaned_file_path, index=False)

print("Dataset cleaned and saved successfully.")

# Load cleaned data. Empty strings are written as empty CSV fields, which
# read_csv would by default parse back as NaN and so undo the earlier
# missing-value handling. keep_default_na=False keeps them as '', and
# dtype=str keeps every column textual.
data = pd.read_csv(
    cleaned_file_path,
    low_memory=False,
    dtype=str,
    keep_default_na=False,
)

# Combine relevant columns into a single 'text' column for processing
def combine_text(row):
    """Join the ``use0`` and ``use1`` fields of *row* into one string.

    Args:
        row: Mapping-like object (dict or pandas Series) with ``use0`` and
            ``use1`` entries.

    Returns:
        The non-missing, non-empty fields converted to ``str`` and joined
        by a single space; ``''`` when both are missing.
    """
    parts = []
    for column in ('use0', 'use1'):
        value = row[column]
        # str(NaN) would inject the literal token "nan" into the text
        if pd.isna(value):
            continue
        text = str(value)
        if text:
            parts.append(text)
    return ' '.join(parts)

# Build the 'combined_text' column by calling combine_text on every row
# (axis=1 passes each row to the function as a Series).
data['combined_text'] = data.apply(combine_text, axis=1)

# Preprocess the combined text in parallel
def parallelize_dataframe(df, func, num_cores=None):
    """Apply *func* to row-wise chunks of *df*, using worker processes.

    Args:
        df: DataFrame to process.
        func: Callable that takes a DataFrame chunk and returns a
            DataFrame. It must be picklable whenever more than one chunk
            is processed, because it is sent to worker processes.
        num_cores: Maximum number of worker processes. Defaults to
            ``mp.cpu_count()``.

    Returns:
        The per-chunk results concatenated in the original row order.
    """
    if num_cores is None:
        num_cores = mp.cpu_count()
    # Never start more workers than there are rows, and always at least one
    num_chunks = max(1, min(num_cores, len(df)))

    # Split row *positions* rather than the frame itself: np.array_split on
    # a DataFrame goes through the deprecated DataFrame.swapaxes.
    row_positions = np.array_split(np.arange(len(df)), num_chunks)
    chunks = [df.iloc[positions] for positions in row_positions]

    if num_chunks == 1:
        # A single chunk gains nothing from a worker process
        results = [func(chunks[0])]
    else:
        # The context manager releases the pool even if func raises
        with mp.Pool(num_chunks) as pool:
            results = pool.map(func, chunks)
    return pd.concat(results)

def preprocess_texts(df):
    """Replace ``df['combined_text']`` with its preprocessed form.

    Each value of the column is passed through ``preprocess_text``; the
    column is overwritten on *df* itself and the same frame is returned.
    """
    cleaned_column = df['combined_text'].apply(preprocess_text)
    df['combined_text'] = cleaned_column
    return df

# Run preprocess_texts over the whole frame via parallelize_dataframe
data = parallelize_dataframe(data, preprocess_texts)

# Vectorize text using TF-IDF (vocabulary limited to at most 5000 terms).
# The fitted vectorizer is kept because user input is transformed with it
# later in the script.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['combined_text'])
# The medicine name is the prediction target
y = data['name']

# Save the processed data: the feature matrix in SciPy's sparse .npz
# format, the labels as a single-column CSV without the index.
X_file_path = 'X_tfidf_vectors.npz'
y_file_path = 'y_labels.csv'
sp.save_npz(X_file_path, X)
y.to_csv(y_file_path, index=False)

print("Text data preprocessed and saved successfully.")

# Load processed data back from the files written by the previous step
X = sp.load_npz(X_file_path)
y = pd.read_csv(y_file_path)

# Encode labels as integer class ids. `y` comes back from read_csv as a
# DataFrame, so its values are flattened to 1-D before fitting.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y.values.ravel())

# Split data: 20% held out for evaluation, fixed seed for repeatability
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Build and compile model: a feed-forward classifier whose input width is
# the number of TF-IDF features, followed by two ReLU hidden layers and a
# softmax output with one unit per distinct label seen by the encoder.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(256, activation='relu'),
    Dense(128, activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')
])

# The sparse categorical loss matches the integer class ids in y_encoded
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model with reduced sample size for quick testing
sample_size = 5000  # Adjust this for quicker runs
if sample_size < X_train.shape[0]:
    # Draw a reproducible random subset of exactly sample_size rows
    X_train_sample, _, y_train_sample, _ = train_test_split(
        X_train, y_train, train_size=sample_size, random_state=42
    )
else:
    # train_test_split raises ValueError when an integer train_size is not
    # smaller than the number of rows, so a training set that is already
    # small enough is used in full.
    X_train_sample, y_train_sample = X_train, y_train

# Train model, reporting loss/accuracy on the held-out split each epoch
model.fit(X_train_sample, y_train_sample, epochs=5, batch_size=32, validation_data=(X_test, y_test))

# Predict and evaluate: take the highest-scoring class index for each
# held-out row.
y_pred = model.predict(X_test)
y_pred_classes = y_pred.argmax(axis=1)

# Use unique classes in y_test for the classification report, so that
# `labels` and `target_names` have the same length and order.
unique_test_labels = np.unique(y_test)
target_names = label_encoder.inverse_transform(unique_test_labels)

# zero_division=0 reports 0 (without a warning) for metrics whose
# denominator is zero.
print(classification_report(y_test, y_pred_classes, labels=unique_test_labels, target_names=target_names, zero_division=0))

# Function to convert speech to text using SpeechRecognition
def speech_to_text(audio_path):
    """Transcribe the audio file at *audio_path* via ``recognize_google``.

    Returns the transcript on success. If the audio cannot be understood
    or the recognition request fails, a human-readable error message is
    returned as a string instead of raising.
    """
    recognizer = sr.Recognizer()
    with sr.AudioFile(audio_path) as source:
        recording = recognizer.record(source)

    try:
        transcript = recognizer.recognize_google(recording)
    except sr.UnknownValueError:
        return "Speech Recognition could not understand audio"
    except sr.RequestError as error:
        return f"Could not request results from Google Speech Recognition service; {error}"
    return transcript

# Get user input via text: five free-text answers read from stdin.
# preprocess_user_input() later joins all of the values into a single
# document, so every answer contributes to the feature vector.
user_input = {
    'primary_reason': input("What is your primary reason for seeking medication? "),
    'allergies': input("Do you have any known allergies or sensitivities to medications? "),
    'current_medications': input("Are you currently taking any other medications (prescription, over-the-counter, supplements)? "),
    'chronic_conditions': input("Do you have any chronic medical conditions (e.g., diabetes, hypertension, asthma)? "),
    'symptoms': input("Can you describe your symptoms in detail? When did they start? ")
}

# For audio input (uncomment and specify audio file path)
# NOTE(review): speech_to_text() returns its error message as a string on
# failure, which would then be used as the symptoms text.
# audio_path = 'path_to_audio_file.wav'
# speech_text = speech_to_text(audio_path)
# user_input['symptoms'] = speech_text

# Preprocess and vectorize user input with the vectorizer fitted on the
# training text.
user_vector = preprocess_user_input(user_input, vectorizer)

# Predict medication: pick the highest-scoring class index and map it
# back to the original label via the label encoder.
user_prediction = model.predict(user_vector)
predicted_medicine_index = user_prediction.argmax(axis=1)
recommended_medicine = label_encoder.inverse_transform(predicted_medicine_index)

print(f"Recommended Medicine: {recommended_medicine[0]}")

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 248218 entries, 0 to 248217
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   name    248218 non-null  object
 1   use0    248218 non-null  object
 2   use1    73365 non-null   object
dtypes: object(3)
memory usage: 5.7+ MB
None

First few rows of the dataset:
                       name  \
0  augmentin 625 duo tablet   
1       azithral 500 tablet   
2          ascoril ls syrup   
3      allegra 120mg tablet   
4            avil 25 tablet   

                                                use0  \
0                  Treatment of Bacterial infections   
1                  Treatment of Bacterial infections   
2                      Treatment of Cough with mucus   
3  Treatment of Sneezing and runny nose due to al...   
4                   Treatment of Allergic conditions   

                               use1  
0                               NaN  
1             