<a href="https://colab.research.google.com/github/isaashka/NLPMedicalAnalyzer/blob/main/NLPMedicalAnalyzer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Medical Analyzer

# Imports

In [2]:
import nltk
import ssl

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

nltk.download()



showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [3]:
# General imports
import os
import numpy as np
import nltk
import pandas as pd
from nltk.probability import FreqDist
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /Users/sasha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /Users/sasha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Model specific imports
from decimal import Decimal
import pandas as pd
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import precision_score, recall_score, f1_score
from nltk.util import ngrams

from sklearn.naive_bayes import MultinomialNB

import matplotlib.pyplot as plt

In [5]:
# unused
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Data Preprocessing
### Convert: user input --> symptoms

In [6]:
# load the dataset
symptoms_dataset = pd.read_csv("Dataset/dis_sym_dataset_comb.csv")

# extract features to use as a symptom array
X = symptoms_dataset.iloc[:, 1:]
dataset_symptoms = list(X.columns)
dataset_symptoms

['abdominal cramp',
 'abdominal distention',
 'abnormal behavior',
 'abnormal bleeding',
 'abnormal sensation',
 'abnormally frequent',
 'abscess',
 'aching',
 'acne',
 'acquiring drinking alcohol taking lot time',
 'affected part turning white',
 'anemia',
 'anxiety',
 'arm',
 'attack pain',
 'back',
 'bacterial infection',
 'bad breath',
 'bad smelling thin vaginal discharge',
 'bad smelling vaginal discharge',
 'barky cough',
 'belching',
 'better sitting worse lying',
 'birth baby younger week gestational age',
 'bleeding gum',
 'bleeding skin',
 'blindness',
 'blindness one eye',
 'blister sunlight',
 'bloating',
 'blood stool',
 'blood urine',
 'bloody diarrhea',
 'blue',
 'bluish skin coloration',
 'blurred vision',
 'blurry vision',
 'body tremor',
 'bone pain',
 'bowed leg',
 'breakdown skeletal muscle',
 'breathing problem',
 'bruising',
 'burning',
 'burning redness eye',
 'burning stabbing pain',
 'burning urination',
 'certain thought repeatedly',
 'change bowel movement',

In [7]:
# Tokenizer class
class Tokenizer:
  def __init__(self, lowercase=False):
    self.lowercase = lowercase  # If this is True, convert text to lowercase while tokenizing.
    self.vocab = []

  def tokenize(self, string):
    tokens = word_tokenize(string)
    self.vocab += [w for w in set(tokens) if w not in self.vocab]

    return tokens

In [8]:
def find_symptom_phrases(tokens, symptom_keywords, body_parts, n=2):
    symptom_phrases = []
    ngrams_list = list(ngrams(tokens, n))

    for ngram in ngrams_list:
        for keyword in symptom_keywords:
            for body_part in body_parts:
                if keyword in ngram and body_part in ngram:
                    symptom_phrases.append(' '.join(ngram))

    return symptom_phrases

# Lemmatize input
def lemmatize_tokens(tokens):
    lemmatizer = nltk.WordNetLemmatizer()
    lemmatized_tokens = []  
    for token in tokens:
        lemmatized_tokens.append(lemmatizer.lemmatize(token))
    return lemmatized_tokens

def tokenize_user_symptoms():
    print("Enter symptoms: ")
    symptoms = input()
    # tokenize input
    symptom_tokens = word_tokenize(symptoms)

    # lemmatize input
    symptom_tokens_lemmatized = lemmatize_tokens(symptom_tokens)

    # find symptom phrases using n-grams
    symptom_phrases = find_symptom_phrases(symptom_tokens_lemmatized, \
                                           dataset_symptoms, dataset_symptoms) # dataset_symptoms = symptoms retrieved from the dataset
    
    return symptom_phrases

In [9]:
# I have been having migraines and headaches. I can't sleep. My whole body is shaking and shivering. I feel dizzy sometimes
print(tokenize_user_symptoms())

Enter symptoms: 


 I have been having migraines and headaches. I can't sleep. My whole body is shaking and shivering. I feel dizzy sometimes


['and headache', 'headache .', 'is shaking', 'shaking and', 'and shivering', 'shivering .']


# Models

In [10]:
# lists used for accuracy plots

model_list = []
f1_scores = []
accuracies = []

### Dataset

In [11]:
# load the dataset
symptoms_dataset = pd.read_csv("Dataset/dis_sym_dataset_comb.csv")

# separate the features and labels
X = symptoms_dataset.iloc[:, 1:]
Y = symptoms_dataset.iloc[:, 0:1]

# convert Y (labels) to a 1D array 
y_array = Y.to_numpy() if isinstance(Y, pd.Series) else Y.values
Y = y_array.ravel()

In [12]:
# split data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size = 0.10)     

### Evaluate the Model

In [13]:
# fit model according to its type and make predictions
def fit_and_predict(classifier, model_type):
    classifier = classifier.fit(X, Y)

    # Predict on the test set
    y_pred = classifier.predict(x_test)
    
    precision, recall, f1, accuracy = evaluate_model(y_test, y_pred, model_type)
    print_evaluation_metrics(precision, recall, f1, accuracy)

# Evaluate the model on different metrics
def evaluate_model(y_test, y_pred, model_type):
    # Calculate precision, recall, and F1-score for each class and then compute the macro average
    precision = round(Decimal(precision_score(y_test, y_pred, average='macro', zero_division=1) * 100), 2)
    recall= round(Decimal(recall_score(y_test, y_pred, average='macro', zero_division=1) * 100), 2)
    f1= round(Decimal(f1_score(y_test, y_pred, average='macro') * 100), 2)
    
    # calculate accuracy
    accuracy = round(Decimal(accuracy_score(y_test, y_pred) * 100), 2)
    
    # Add metrics to overall arrays used later for graphing
    model_list.append(model_type)
    f1_scores.append(f1)
    accuracies.append(accuracy)

    return precision, recall, f1, accuracy
    
def print_evaluation_metrics(precision, recall, f1, accuracy):
    print(f'Macro-average Precision: {precision}%')
    print(f'Macro-average Recall: {recall}%')
    print(f'Macro-average F1-score: {f1}%')
    print(f'Accuracy: {accuracy}%')


In [14]:
# prints all model names and their f1 score and accuracy
def print_all_metrics():
    for i in range(len(model_list)):
        print(f'Model: {model_list[i]}, f1 = {f1_scores[i]} accuracy = {accuracies[i]}')
        # print(model_list[i], f1_scores[i], accuracies[i])

### Multinomial Naive Bayes

In [15]:
# Train the Naive Bayes classifier
mnb = MultinomialNB()
fit_and_predict(mnb, 'MultinomialNaiveBayes')

Macro-average Precision: 94.03%
Macro-average Recall: 57.59%
Macro-average F1-score: 57.23%
Accuracy: 85.07%


### Support Vector Machine

In [16]:
# Train the Support Vector Machine classifier
svm = SVC()
fit_and_predict(svm, 'SupportVectorMachine')

Macro-average Precision: 93.10%
Macro-average Recall: 69.63%
Macro-average F1-score: 68.76%
Accuracy: 89.25%


### Logistic Regression

In [17]:
# Train the Support Vector Machine classifier
lr = LogisticRegression()
fit_and_predict(lr, 'LogisticRegression')

Macro-average Precision: 92.46%
Macro-average Recall: 71.30%
Macro-average F1-score: 69.28%
Accuracy: 89.14%


In [18]:
print_all_metrics()

Model: MultinomialNaiveBayes, f1 = 57.23 accuracy = 85.07
Model: SupportVectorMachine, f1 = 68.76 accuracy = 89.25
Model: LogisticRegression, f1 = 69.28 accuracy = 89.14
