<a href="https://colab.research.google.com/github/isaashka/NLPMedicalAnalyzer/blob/main/NLPMedicalAnalyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Medical Analyzer

# Imports

In [1]:
import nltk
import ssl

# Work around SSL certificate errors when fetching NLTK data.
# NOTE(review): the `else` branch swaps the default HTTPS context factory for the
# unverified one, so certificate checks are skipped for later HTTPS requests that
# rely on that default in this kernel — confirm this is acceptable.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # ssl has no _create_unverified_context here; leave the defaults untouched.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Called with no arguments; the recorded output below shows it loading the NLTK package index.
nltk.download()



showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
# General imports
import os
import numpy as np
import nltk
import pandas as pd
from nltk.probability import FreqDist
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /Users/sasha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sasha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Model specific imports
from decimal import Decimal
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score
from nltk.util import ngrams
import math

from sklearn.naive_bayes import MultinomialNB

import matplotlib.pyplot as plt

In [4]:
# additional classifiers (only LogisticRegression is used further down)
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Data Preprocessing
### Goal: convert user input --> symptoms

In [43]:
# load the dataset
symptoms_dataset = pd.read_csv("Dataset/dis_sym_dataset_comb.csv")

# extract features to use as a symptom array
# (every column after the first one)
X = symptoms_dataset.iloc[:, 1:]
# the feature column names are the symptom vocabulary read by the matching functions below
dataset_symptoms = list(X.columns)
# dataset_symptoms

### Normalizing User Input

In [44]:
import contractions
import re
from spellchecker import SpellChecker

In [92]:
def user_input():
    """Prompt on stdin for a free-text symptom description and return it normalized.

    Returns:
        The value produced by ``normalize_user_input`` for the line the user typed.
    """
    print("Enter symptoms: ")
    raw_text = input()
    normalized_text = normalize_user_input(raw_text)
    return normalized_text

def correct_spelling(user_input):
    """Spell-correct every whitespace-separated word of ``user_input``.

    Args:
        user_input: text to correct; it is split on whitespace.

    Returns:
        The corrected words joined by single spaces. A word for which the
        spell checker offers no correction is kept unchanged.
    """
    spell = SpellChecker()
    corrected_words = []
    for word in user_input.split():
        suggestion = spell.correction(word)
        # correction() can return None when it has no candidate (documented for
        # recent pyspellchecker releases); joining None would raise TypeError,
        # so fall back to the word as typed.
        corrected_words.append(suggestion if suggestion else word)
    return ' '.join(corrected_words)

def normalize_user_input(user_input):
    """Normalize raw user text before symptom extraction.

    Steps, in order: lowercase, expand contractions with ``contractions.fix``,
    delete every character that is neither a word character nor whitespace,
    then run ``correct_spelling`` on the result.

    Args:
        user_input: the raw text typed by the user.

    Returns:
        The normalized string returned by ``correct_spelling``.
    """
    text = contractions.fix(user_input.lower())
    # strip punctuation: anything that is not \w or \s is removed outright
    text = re.sub(r'[^\w\s]', '', text)
    return correct_spelling(text)

In [93]:
# Prompt for symptoms; the bare name on the last line displays the normalized text.
user_symptoms = user_input()
user_symptoms

Enter symptoms: 


 I have been having migraines and headaches. I can't sleep. My whole body is shaking and shivering. I feel dizzy sometimes


'i have been having migraines and headaches i cannot sleep my whole body is shaking and shivering i feel dizzy sometimes'

### Ngram approach

Test input: I have been having migraines and headaches. I can't sleep. My whole body is shaking and shivering. I feel dizzy sometimes

In [7]:
def find_symptom_phrases(tokens, symptom_keywords, body_parts, n=2):
    """Return the n-grams of ``tokens`` that mention a keyword and a body part.

    Args:
        tokens: sequence of token strings.
        symptom_keywords: iterable of keyword strings to look for.
        body_parts: iterable of body-part strings to look for.
        n: n-gram size (default 2).

    Returns:
        A list with one space-joined phrase per n-gram occurrence that contains
        at least one keyword and at least one body part, in text order.
    """
    keyword_set = set(symptom_keywords)
    body_part_set = set(body_parts)

    symptom_phrases = []
    for ngram in ngrams(tokens, n):
        # One append per n-gram: looping over every (keyword, body_part) pair
        # would add the same phrase once per matching pair.
        if keyword_set.intersection(ngram) and body_part_set.intersection(ngram):
            symptom_phrases.append(' '.join(ngram))

    return symptom_phrases

# Lemmatize input
def lemmatize_tokens(tokens):
    """Lemmatize each token with NLTK's ``WordNetLemmatizer``.

    Args:
        tokens: iterable of token strings.

    Returns:
        A new list holding the lemmatized form of every token, in input order.
    """
    wordnet_lemmatizer = nltk.WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(tok) for tok in tokens]

def tokenize_user_symptoms():
    """Prompt for symptoms and return the symptom phrases found via n-grams.

    The typed line is tokenized with ``word_tokenize``, lemmatized with
    ``lemmatize_tokens`` and passed to ``find_symptom_phrases``.

    Returns:
        The list of phrases returned by ``find_symptom_phrases``.
    """
    print("Enter symptoms: ")
    raw_symptoms = input()

    lemmas = lemmatize_tokens(word_tokenize(raw_symptoms))

    # dataset_symptoms (the symptoms retrieved from the dataset) is passed as
    # both the keyword list and the body-part list
    return find_symptom_phrases(lemmas, dataset_symptoms, dataset_symptoms)

In [8]:
# Run the n-gram extraction on a typed-in description and show the matched phrases.
extracted_symptoms = tokenize_user_symptoms()
print()
print('Here\'s your list of symptoms: ')
print(extracted_symptoms)

Enter symptoms: 


 I have been having migraines and headaches. I can't sleep. My whole body is shaking and shivering. I feel dizzy



Here's your list of symptoms: 
['and headache', 'headache .', 'is shaking', 'shaking and', 'and shivering', 'shivering .']


### NER / BERT approach --- WIP

In [38]:
# Install the necessary packages
!pip install transformers pandas nltk



In [46]:
# Import the necessary libraries
from transformers import pipeline

In [49]:
# Load a pre-trained NER model that's specialized for biomedical text
# (extract_symptoms below reads this `ner` object as a global)
ner = pipeline("ner", model="d4data/biomedical-ner-all")

# Load the symptoms dataset
# NOTE(review): same CSV path as the load in "Data Preprocessing" — looks redundant; confirm.
symptoms_dataset = pd.read_csv("Dataset/dis_sym_dataset_comb.csv")

# Extract the list of symptoms from the dataset
# Assuming the symptoms are in a column named 'symptoms'
# symptoms_list = symptoms_dataset['symptom'].str.lower().tolist()

def extract_symptoms(text):
    """Collect symptom tokens from ``text`` using the NER model plus dataset lookup.

    First keeps the ``word`` of every NER entity whose ``entity`` label contains
    "symptom" (case-insensitive). Then tokenizes the lowercased text and adds
    each token that is listed in ``dataset_symptoms`` and not collected yet.

    Args:
        text: the (normalized) user text.

    Returns:
        A list of strings: NER words first, then the dataset matches. NER words
        are returned as produced by the pipeline and are not merged here.
    """
    detected = []
    for ent in ner(text):
        if 'symptom' in ent['entity'].lower():
            detected.append(ent['word'])

    # manual matching against the dataset vocabulary
    for tok in word_tokenize(text.lower()):
        if tok not in dataset_symptoms:
            continue
        if tok in detected:
            continue
        detected.append(tok)

    return detected

# Function to clean and join tokens into keywords
def clean_keywords(keywords):
    """Merge sub-word pieces marked with a leading "##" into the preceding token.

    Example: ``['mig', '##raine', 'headache']`` -> ``['migraine', 'headache']``.

    Args:
        keywords: iterable of token strings.

    Returns:
        A new list of merged tokens. A "##" piece with no preceding token (for
        example at the very start of the list) is kept as its own token with
        the marker stripped instead of raising ``IndexError``.
    """
    clean_tokens = []
    for token in keywords:
        if token.startswith("##"):
            piece = token[2:]
            if clean_tokens:
                clean_tokens[-1] += piece
            else:
                # nothing to attach to yet
                clean_tokens.append(piece)
        else:
            clean_tokens.append(token)
    return clean_tokens

In [51]:
# Example user input
# Stored under its own name so the `user_input()` function defined earlier is not
# overwritten by a string (MedicalAnalyzer() calls that function later on).
example_user_input = "I have been having migraines and headaches. I can't sleep. My whole body is shaking and shivering. I feel dizzy sometimes. I have a fever."
new_user_input = normalize_user_input(example_user_input)

# Extract symptoms
symptoms = extract_symptoms(new_user_input)

# Clean and join the tokens into keywords
keywords = clean_keywords(symptoms)

# Display the symptoms
print("Extracted Symptoms:", symptoms)
print("Keywords:", keywords)

Extracted Symptoms: ['mig', '##raine', 'headache', 'cannot', 'sleep', 'shaking', 'shivering', 'dizzy', 'fever']
Keywords: ['migraine', 'headache', 'cannot', 'sleep', 'shaking', 'shivering', 'dizzy', 'fever']


#### Expand list of symptoms by finding synonyms

In [52]:
# find synonyms for extracted keywords
nltk.download('wordnet')
from nltk.corpus import wordnet

[nltk_data] Downloading package wordnet to /Users/sasha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [53]:
# Get synonyms of a list of words
def get_synonyms(words):
    """Collect the WordNet lemma names of every synset of every word.

    Args:
        words: iterable of words to look up.

    Returns:
        A set of lemma-name strings. A word without any synset contributes
        nothing to the set.
    """
    return {
        lemma.name()
        for word in words
        for synset in wordnet.synsets(word)
        for lemma in synset.lemmas()
    }

In [74]:
# Expand the cleaned keywords with their WordNet synonyms.
synonyms = get_synonyms(keywords)
print("Synonym set: ")
print(synonyms)
print()

print("List of all symptoms: ")
# Same content as the set, converted to a list.
final_symptoms = list(synonyms)
print(final_symptoms)

Synonym set: 
{'throw_off', 'migraine', 'kip', 'shake_off', 'judder', 'lightheaded', 'headache', 'hemicrania', 'sleep', 'sick_headache', 'sway', 'head_ache', 'light-headed', 'megrim', 'didder', 'quivering', 'dizzy', 'quiver', 'fever', 'cephalalgia', 'escape_from', 'silly', 'throb', 'woozy', 'palpitation', 'vexation', "catch_some_Z's", 'trembling', 'rest', 'eternal_rest', 'shake_up', 'febrility', 'chill', 'empty-headed', 'stimulate', 'vertiginous', 'eternal_sleep', 'shakiness', 'nap', 'featherbrained', 'pyrexia', 'shaking', 'giddy', 'shudder', 'rock', 'quietus', 'shivering', 'airheaded', 'stir', 'thrill', 'sopor', 'febricity', 'concern', 'agitate', 'shaky', 'excite', 'shiver', "log_Z's", 'worry', 'vibration', 'slumber', 'shake', 'feverishness'}

List of all symptoms: 
['throw_off', 'migraine', 'kip', 'shake_off', 'judder', 'lightheaded', 'headache', 'hemicrania', 'sleep', 'sick_headache', 'sway', 'head_ache', 'light-headed', 'megrim', 'didder', 'quivering', 'dizzy', 'quiver', 'fever', '

In [76]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.25.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.25.1 (from python-Levenshtein)
  Downloading Levenshtein-0.25.1-cp310-cp310-macosx_10_9_x86_64.whl.metadata (3.3 kB)
Collecting rapidfuzz<4.0.0,>=3.8.0 (from Levenshtein==0.25.1->python-Levenshtein)
  Downloading rapidfuzz-3.9.3-cp310-cp310-macosx_10_9_x86_64.whl.metadata (12 kB)
Downloading python_Levenshtein-0.25.1-py3-none-any.whl (9.4 kB)
Downloading Levenshtein-0.25.1-cp310-cp310-macosx_10_9_x86_64.whl (135 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m135.4/135.4 kB[0m [31m566.8 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hDownloading rapidfuzz-3.9.3-cp310-cp310-macosx_10_9_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully inst

In [112]:
def extract_relevant_symptoms(final_symptoms):
    """Keep only the candidate symptoms that exist in ``dataset_symptoms``.

    Each candidate is tried as-is and, failing that, with underscores replaced
    by spaces: WordNet lemma names join multi-word entries with "_"
    (e.g. 'sick_headache'), whereas the dataset's multi-word symptoms appear to
    be written with spaces — NOTE(review): confirm against the CSV header.

    Args:
        final_symptoms: iterable of candidate symptom strings.

    Returns:
        A list of symptom names exactly as spelled in ``dataset_symptoms``,
        in first-seen order and without duplicates.
    """
    known_symptoms = set(dataset_symptoms)  # O(1) membership instead of a list scan
    syms_in_dataset = []
    already_added = set()
    for sym in final_symptoms:
        for candidate in (sym, sym.replace('_', ' ')):
            if candidate in known_symptoms:
                if candidate not in already_added:
                    already_added.add(candidate)
                    syms_in_dataset.append(candidate)
                break
    return syms_in_dataset
    
# Show which of the expanded symptoms are present in dataset_symptoms.
extract_relevant_symptoms(final_symptoms)

['headache', 'fever', 'chill', 'shakiness', 'shaking', 'shivering']

## Possible future implementations for Preprocessing:

Look for symptom phrases to infer a more descriptive symptom. Some keywords may not mean much on their own, but the words around them add meaning. For example, for the keyword "sleep", the complete phrase may be "I can't sleep" or "I have trouble sleeping." To get the full picture, we can look at the words preceding and following the symptom keyword.

_ _ _ sleep

sleep _ _ _


#### Getting different forms of the symptom keywords

We can increase the chances of finding the right symptoms just by changing the form of the word used, e.g. "sleepy" to "sleepiness". However, there are currently no libraries that can do this accurately, and coding this up would take a lot of time, which is beyond the scope of this project.

In [75]:
# Get all forms of the extracted symptoms
# i.e. 'sleep' --> 'sleepy', 'sleepiness', 'sleep'
#
# !pip install reversestem

from reversestem import unstem

# Experiment only: list the candidate word forms generated for one keyword (see the output below).
unstem('sleep')

['sleepe',
 'sleeped',
 'sleepes',
 'sleeping',
 'sleepful',
 'sleepness',
 'sleepfulness']

# Models

Classification models used to match an input list of symptoms to the disease.

Following code inspired by this github repo: https://github.com/rahul15197/Disease-Detection-based-on-Symptoms?source=post_page-----54e6be60a3d1--------------------------------

We used the dataset from the github repo as well as some general structure of code. Most of the code was written by us to suit our overall structure.


In [None]:
# lists used for performance plots

# Filled by evaluate_model(): one entry per evaluated model, appended together so
# the three lists stay index-aligned. Re-running this cell empties them again.
model_list = []
f1_scores = []
accuracies = []

### Dataset

In [None]:
# load the dataset
symptoms_dataset = pd.read_csv("Dataset/dis_sym_dataset_comb.csv")

# separate the features and labels
# (first column = labels, all remaining columns = features)
X = symptoms_dataset.iloc[:, 1:]
Y = symptoms_dataset.iloc[:, 0:1]

# convert Y (labels) to a 1D array 
# (Y is a one-column DataFrame at this point, so `.values` is the branch taken)
y_array = Y.to_numpy() if isinstance(Y, pd.Series) else Y.values
Y = y_array.ravel()

In [None]:
# split data into train and test sets
# random_state is fixed so the split, and every metric computed from it, is the
# same on each Restart & Run All
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.10, random_state=42)

### Evaluate the Model

In [None]:
# fit model according to its type and make predictions
def fit_and_predict(classifier, model_type):
    """Train ``classifier`` on the training split and report test-split metrics.

    Reads the module-level ``x_train``, ``y_train``, ``x_test`` and ``y_test``.
    The metrics are computed (and recorded for the plots) by ``evaluate_model``
    and printed by ``print_evaluation_metrics``.

    Args:
        classifier: an estimator exposing ``fit(X, y)`` and ``predict(X)``.
        model_type: label under which the metrics are recorded.
    """
    # Fit on the training split only: fitting on the full X/Y would let the
    # model see the very rows it is scored on below.
    classifier = classifier.fit(x_train, y_train)

    # Predict on the test set
    y_pred = classifier.predict(x_test)

    precision, recall, f1, accuracy = evaluate_model(y_test, y_pred, model_type)
    print_evaluation_metrics(precision, recall, f1, accuracy)

# Evaluate the model on different metrics
def evaluate_model(y_test, y_pred, model_type):
    """Compute macro precision/recall/F1 and accuracy as rounded percentages.

    Side effect: appends ``model_type``, the F1 value and the accuracy value to
    the module-level ``model_list``, ``f1_scores`` and ``accuracies`` lists
    that the comparison plots read.

    Args:
        y_test: true labels.
        y_pred: predicted labels.
        model_type: label stored in ``model_list`` for this evaluation.

    Returns:
        ``(precision, recall, f1, accuracy)``, each a ``Decimal`` percentage
        rounded to two places.
    """
    def as_percent(score):
        # metric value * 100, rounded to two decimal places
        return round(Decimal(score * 100), 2)

    precision = as_percent(precision_score(y_test, y_pred, average='macro', zero_division=1))
    recall = as_percent(recall_score(y_test, y_pred, average='macro', zero_division=1))
    f1 = as_percent(f1_score(y_test, y_pred, average='macro'))
    accuracy = as_percent(accuracy_score(y_test, y_pred))

    # record the values used later for graphing
    for history, value in ((model_list, model_type), (f1_scores, f1), (accuracies, accuracy)):
        history.append(value)

    return precision, recall, f1, accuracy
    
def print_evaluation_metrics(precision, recall, f1, accuracy):
    """Print the four metrics, one per line, each followed by a percent sign.

    Args:
        precision: macro-average precision value to show.
        recall: macro-average recall value to show.
        f1: macro-average F1 value to show.
        accuracy: accuracy value to show.
    """
    report_lines = (
        f'Macro-average Precision: {precision}%',
        f'Macro-average Recall: {recall}%',
        f'Macro-average F1-score: {f1}%',
        f'Accuracy: {accuracy}%',
    )
    for line in report_lines:
        print(line)


In [None]:
# prints all model names and their f1 score and accuracy
def print_all_metrics():
    """Print one line per recorded model with its F1 score and accuracy.

    Reads the module-level ``model_list``, ``f1_scores`` and ``accuracies``,
    pairing entries by position.
    """
    for position, model_name in enumerate(model_list):
        print(f'Model: {model_name}, f1 = {f1_scores[position]} accuracy = {accuracies[position]}')

### Multinomial Naive Bayes

In [None]:
# Train the Naive Bayes classifier
# (fit_and_predict also predicts on x_test and records the metrics used by the plots)
mnb = MultinomialNB()
fit_and_predict(mnb, 'MultinomialNaiveBayes')

### Support Vector Machine

In [None]:
# Train the Support Vector Machine classifier
# (SVC is created with its default settings)
svm = SVC()
fit_and_predict(svm, 'SupportVectorMachine')

### Logistic Regression

In [None]:
# Train the Logistic Regression classifier
lr = LogisticRegression()
fit_and_predict(lr, 'LogisticRegression')

In [None]:
# Summary of every model evaluated so far (one line per entry in model_list).
print_all_metrics()

In [None]:
# Metric lists for reference
# model_list = []
# f1_scores = []
# accuracies = []

# Comparison plot for all classifiers with their accuracy
plt.style.use('_classic_test_patch')
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot()
ax.set_title("Model Vs Accuracy", color='black', pad=30)
ax.set_xlabel('Classifier', color='black')
ax.set_ylabel('Accuracy (%)', color='black')
ax.bar(model_list, accuracies, color='lightblue')
# write each bar's value slightly above and to the left of the bar centre
for bar_index, accuracy in enumerate(accuracies):
    ax.text(float(bar_index) - 0.15, float(accuracy) + 0.7, str(accuracy), color='blue')
plt.show()

In [None]:
# comparison plot for all classifiers with their F1-score
plt.style.use('_classic_test_patch')
fig = plt.figure(figsize=(10, 10))
ax = fig.add_subplot()
ax.set_title("Model Vs F1-score", color='black', pad=30)
ax.set_xlabel('Classifier', color='black')
ax.set_ylabel('F1-score (%)', color='black')
ax.bar(model_list, f1_scores, color='lightblue')
# write each bar's value slightly above and to the left of the bar centre
for bar_index, f1_value in enumerate(f1_scores):
    ax.text(float(bar_index) - 0.15, float(f1_value) + 0.7, str(f1_value), color='blue')
plt.show()

# MedicalAnalyzer for Disease Output

This is the main interactive section, where the user talks to the model; all of the code above needs to be run for the following to work. It is a minimal, to-the-point interaction between the user and our model.

### Logistic Regression Model

The best performing model was Logistic Regression, so we'll use that in our evaluations of user input.

This code is almost completely adapted from: https://github.com/rahul15197/Disease-Detection-based-on-Symptoms/blob/master/SymptomSuggestion.ipynb


In [81]:
# load the dataset
# df_comb is what the classifier below is fitted on; df_norm is used later for
# the symptom vocabulary and the per-disease symptom rows
df_comb = pd.read_csv("Dataset/dis_sym_dataset_comb.csv")
df_norm = pd.read_csv("Dataset/dis_sym_dataset_norm.csv")

# separate the features and labels
X = df_comb.iloc[:, 1:]
Y = df_comb.iloc[:, 0:1]

# convert Y (labels) to a 1D array 
# (Y is a one-column DataFrame at this point, so `.values` is the branch taken)
y_array = Y.to_numpy() if isinstance(Y, pd.Series) else Y.values
Y = y_array.ravel()

In [82]:
# Fit the final classifier on the full df_comb data; predict_diseases uses this `lr`.
lr = LogisticRegression()
lr = lr.fit(X, Y)
# 5-fold cross-validation scores; output_top_k_diseases multiplies its scores by their mean
scores = cross_val_score(lr, X, Y, cv=5)



In [83]:
# Rebind X and Y to df_norm: from here on they are no longer the data `lr` was
# fitted on. Y is left as a one-column DataFrame (no ravel this time).
X = df_norm.iloc[:, 1:]
Y = df_norm.iloc[:, 0:1]

In [84]:
# List of symptoms
# (feature column names; an entry's position here is its position in the symptom vector)
dataset_symptoms = list(X.columns)

### Input Symptoms --> Disease

This is where we use the model to tell us what diseases the user may have based on the symptoms extracted from their input.

In [114]:
# Create vector from processed user symptoms to be used by the model

# example symptom list
# sym_list = ["yellowish skin","wheezing","abdominal cramp","back","feeling cold"]

def create_symptom_vector(sym_list):
    """Build a 0/1 vector over ``dataset_symptoms`` marking the given symptoms.

    Args:
        sym_list: iterable of symptom names; each must occur in
            ``dataset_symptoms``.

    Returns:
        A list of ints with the same length as ``dataset_symptoms``, holding 1
        at the position of every symptom in ``sym_list`` and 0 elsewhere.

    Raises:
        ValueError: if a symptom is not in ``dataset_symptoms``.
    """
    sym_vector = [0] * len(dataset_symptoms)
    for symptom in sym_list:
        position = dataset_symptoms.index(symptom)
        sym_vector[position] = 1
    return sym_vector

In [116]:
# Predict the likelihood for each disease
def predict_diseases(sym_vector, k=10):
    """Return the indices of the ``k`` most probable classes for a symptom vector.

    Args:
        sym_vector: 0/1 list as built by ``create_symptom_vector``.
        k: how many classes to return (default 10, the value that used to be
            hard-coded).

    Returns:
        An array of column indices into ``lr.predict_proba``'s output, ordered
        from most to least probable.
    """
    class_probabilities = lr.predict_proba([sym_vector])[0]
    # argsort is ascending: reverse it, then keep the first k entries. (The
    # unused local disease list was dropped; it read the global Y, which is
    # rebound several times in this notebook and is not always subscriptable
    # by column name.)
    top_k = class_probabilities.argsort()[::-1][:k]
    return top_k

In [119]:
def output_top_k_diseases(top_k, sym_list=None, k=None):
    """Print the predicted diseases, ranked by overlap with the user's symptoms.

    For every class index in ``top_k`` the disease's row in ``df_norm`` is
    looked up, its non-zero symptom columns are collected, and a score
    ``(matches + 1) / (user symptoms + 1) * mean(scores)`` is computed. The
    diseases are then printed in descending score order.

    Args:
        top_k: class indices as returned by ``predict_diseases``.
        sym_list: the user's matched symptoms. When omitted, the module-level
            ``sym_list`` is used, which is where this function used to read it.
        k: number shown in the header line; defaults to ``len(top_k)``.

    Raises:
        NameError: if ``sym_list`` is neither passed nor defined at module level.
    """
    if sym_list is None:
        try:
            sym_list = globals()['sym_list']
        except KeyError:
            raise NameError("sym_list was not passed and is not defined at module level") from None
    if k is None:
        k = len(top_k)

    # predict_proba columns are ordered like lr.classes_, so the indices in
    # top_k index into that array.
    diseases = list(lr.classes_)
    reported = set(sym_list)

    print(f"\nTop {k} diseases predicted based on symptoms")
    topk_dict = {}
    for t in top_k:
        rows = df_norm.loc[df_norm['label_dis'] == diseases[t]].values.tolist()
        if not rows:
            # disease has no row in df_norm; nothing to compare against
            continue
        indicator_row = rows[0][1:]  # drop the leading label column
        match_sym = {dataset_symptoms[pos] for pos, val in enumerate(indicator_row) if val != 0}
        prob = (len(match_sym & reported) + 1) / (len(reported) + 1)
        prob *= np.mean(scores)
        topk_dict[t] = prob

    ranked = sorted(topk_dict.items(), key=lambda kv: kv[1], reverse=True)
    for rank, (class_index, prob) in enumerate(ranked, start=1):
        print(str(rank) + " Disease name:", diseases[class_index], "\tProbability:", str(round(prob * 100, 2)) + "%")


# User-interactable Interface

Note: In actual implementation the user wouldn't get the list of diseases shown to them, it would instead be saved to a file and shown to the doctor. This implementation is created to show all of the output in one place to see how the model works.

In [121]:
# Example user input
# user_input = "I have been having migraines and headaches. I can't sleep. My whole body is shaking and shivering. I feel dizzy sometimes. I have a fever."
# new_user_input = normalize_user_input(user_input)
def MedicalAnalyzer():
    """Run the whole pipeline: prompt for symptoms and print the likely diseases.

    Steps: read and normalize the user's text, extract symptom tokens, merge
    sub-word pieces into keywords, expand them with synonyms, keep the ones
    present in the dataset, build the symptom vector, predict the top classes
    and print them.
    """
    # output_top_k_diseases reads `sym_list` from the notebook namespace, so the
    # matched symptoms are published there as well as used locally.
    global sym_list

    new_user_input = user_input()
    print()

    # Extract symptoms
    symptoms = extract_symptoms(new_user_input)

    # Clean and join the tokens into keywords
    keywords = clean_keywords(symptoms)

    # Display the symptoms
    print("Key symptom words: ", keywords)
    print()

    # Display all symptoms including synonyms
    synonyms = get_synonyms(keywords)
    final_symptoms = list(synonyms)
    print("Final symptoms list: ", final_symptoms)

    # Get relevant symptoms (previously this result was discarded and an
    # undefined `sym_list` was used on the next step)
    sym_list = extract_relevant_symptoms(final_symptoms)

    # Create a vector to be used by the model
    sym_vector = create_symptom_vector(sym_list)

    # Get top diseases
    topk_diseases = predict_diseases(sym_vector)

    output_top_k_diseases(topk_diseases)

In [122]:
# Run the full pipeline once; it prompts for a symptom description on stdin.
MedicalAnalyzer()

Enter symptoms: 


 I have been having migraines and headaches. I can't sleep. My whole body is shaking and shivering. I feel dizzy sometimes. I have a fever.



Key symptom words:  ['migraine', 'headache', 'cannot', 'sleep', 'shaking', 'shivering', 'dizzy', 'fever']

Final symptoms list:  ['throw_off', 'migraine', 'kip', 'shake_off', 'judder', 'lightheaded', 'headache', 'hemicrania', 'sleep', 'sick_headache', 'sway', 'head_ache', 'light-headed', 'megrim', 'didder', 'quivering', 'dizzy', 'quiver', 'fever', 'cephalalgia', 'escape_from', 'silly', 'throb', 'woozy', 'palpitation', 'vexation', "catch_some_Z's", 'trembling', 'rest', 'eternal_rest', 'shake_up', 'febrility', 'chill', 'empty-headed', 'stimulate', 'vertiginous', 'eternal_sleep', 'shakiness', 'nap', 'featherbrained', 'pyrexia', 'shaking', 'giddy', 'shudder', 'rock', 'quietus', 'shivering', 'airheaded', 'stir', 'thrill', 'sopor', 'febricity', 'concern', 'agitate', 'shaky', 'excite', 'shiver', "log_Z's", 'worry', 'vibration', 'slumber', 'shake', 'feverishness']

Top 10 diseases predicted based on symptoms
1 Disease name: Yellow Fever 	Probability: 44.6%
2 Disease name: Parkinson's Disease 

