In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
import spacy
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Load the data
data = pd.read_csv('C://Users//Harish//Downloads//symptomssingle.csv')

# Check for any missing values and remove them
data = data.dropna()

# Define a function to separate symptoms and diseases from the text

'''This function takes a text input, extracts symptoms using regular expressions,
removes patterns representing symptoms from the original text, and returns a
tuple containing the joined symptoms and the processed disease text.'''

def separate_symptoms_and_diseases(text):
    symptoms = re.findall(r'{"symptoms":"(.*?)"}', text)
    disease = re.sub(r'(?:{"symptoms":".*?"},?)+', '', text).strip()
    disease = disease.replace('],', '').strip()  # Remove '],' from the disease name
    return ' '.join(symptoms), disease  # Join symptoms into a single string

# Apply the function to the data
data['symptoms_and_diseases'] = data['data'].apply(separate_symptoms_and_diseases)
data[['symptoms', 'disease']] = pd.DataFrame(data['symptoms_and_diseases'].tolist(), index=data.index)
data = data.drop(columns=['data', 'symptoms_and_diseases'])

# Load the spaCy model
nlp = spacy.load('en_core_web_sm')

# Preprocessing function
def preprocess(symptoms):
    processed_symptoms = []
    for symptom in symptoms:
        doc = nlp(symptom)
        processed_symptom = ' '.join(token.lemma_.lower() for token in doc if not token.is_stop and token.is_alpha)
        processed_symptoms.append(processed_symptom)
    return ' '.join(processed_symptoms)

# Preprocess the symptoms column
data['symptoms_preprocessed'] = data['symptoms'].apply(preprocess)


# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data['symptoms'], data['disease'], test_size=0.2, random_state=42)

# Create a pipeline for text classification
pipeline = Pipeline([
    ('count_vectorizer', CountVectorizer()),
    ('classifier', MultinomialNB())
])

# Train the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Save the trained model
joblib.dump(pipeline, 'DiseasePredictionBasedonSymptoms.joblib')

# Load the saved model
loaded_pipeline = joblib.load('DiseasePredictionBasedonSymptoms.joblib')

# Function to predict diseases based on symptoms and additional questions
def predict_diseases(symptoms, top_n=3):
    if not symptoms or all(symptom.strip() == '' for symptom in symptoms):
        return "Please provide your symptoms."

    processed_symptoms = preprocess(symptoms)
    input_data = processed_symptoms
    probabilities = loaded_pipeline.predict_proba([input_data])[0]
    top_n_indices = probabilities.argsort()[-top_n:][::-1]  # Get indices of top N probabilities
    top_n_diseases = loaded_pipeline.classes_[top_n_indices]  # Get corresponding disease labels
    return top_n_diseases



Accuracy:  0.0
Classification Report:
                                                 precision    recall  f1-score   support

                                     Achalasia       0.00      0.00      0.00       0.0
                              Acute bronchitis       0.00      0.00      0.00       1.0
                            Acute bronchospasm       0.00      0.00      0.00       1.0
         Acute fatty liver of pregnancy (AFLP)       0.00      0.00      0.00       0.0
                                Acute glaucoma       0.00      0.00      0.00       1.0
                         Acute stress reaction       0.00      0.00      0.00       1.0
                           Adjustment reaction       0.00      0.00      0.00       1.0
                               Adrenal adenoma       0.00      0.00      0.00       0.0
                                  Air embolism       0.00      0.00      0.00       0.0
                                 Alcohol abuse       0.00      0.00      0.00   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [2]:
def predict_diseases(symptoms, top_n=3):
    if not symptoms or all(symptom.strip() == '' for symptom in symptoms):
        return "Please provide your symptoms."

    processed_symptoms = preprocess(symptoms)

    # Check if processed symptoms are empty or contain only non-informative text
    if not processed_symptoms or all(token.isspace() for token in processed_symptoms.split()):
        return "Please provide meaningful symptoms."

    input_data = processed_symptoms
    probabilities = loaded_pipeline.predict_proba([input_data])[0]
    top_n_indices = probabilities.argsort()[-top_n:][::-1]  # Get indices of top N probabilities
    top_n_diseases = loaded_pipeline.classes_[top_n_indices]  # Get corresponding disease labels
    return top_n_diseases


In [4]:
# Example usage with user input
user_input_symptoms = input("Enter your symptoms (comma-separated): ")
user_symptoms = user_input_symptoms.split(',')
predicted_diseases = predict_diseases(user_symptoms)
print("Predicted diseases:", predicted_diseases)

In [1]:
import os

def get_file_paths(directory):
    file_paths = []
    for root, directories, files in os.walk(directory):
        for filename in files:
            file_paths.append(os.path.join(root, filename))
    return file_paths

directory_path = "C://Users//Harish//OneDrive - marken.com//Desktop//Natural_Language_Processing"
file_paths = get_file_paths(directory_path)

print(file_paths)


['C://Users//Harish//OneDrive - marken.com//Desktop//Natural_Language_Processing\\bag_of_words.py', 'C://Users//Harish//OneDrive - marken.com//Desktop//Natural_Language_Processing\\lemmatization.py', 'C://Users//Harish//OneDrive - marken.com//Desktop//Natural_Language_Processing\\SMSSpamCollection.txt', 'C://Users//Harish//OneDrive - marken.com//Desktop//Natural_Language_Processing\\spam_classifier.py', 'C://Users//Harish//OneDrive - marken.com//Desktop//Natural_Language_Processing\\stemming.py', 'C://Users//Harish//OneDrive - marken.com//Desktop//Natural_Language_Processing\\tf-idf.py']
