In [21]:
# New technology used
#  %%capture
# SentencePiece as a supplement to NLTK

Our application allows for patients to input some symptoms into a symptom tracker.  This information is then associated to some possible diagnoses.

Some new technologies we used (we did not cover in our boot camp) are:
1. **SentencePiece**, which is a supplement to our NLTK.  This supplement is needed to assist in translating medical terms or more complex words.
2.**%%capture** which is unique to Google Colab.  This allows for the !pip installs to run without generating all the responses, which clutter up the application.
3. **sqlite** which is a lightweight database management system.  Given that we are dealing with large dataset(s) for our model, sqlite allows our application to store and retreive data using SQL (structured query language.)  We are using this for efficiency and speed of use.

In [43]:
# Our pip installs needed to run our application.  Note the %%capture being used is for google colab only.
# Remove if you are going to run this in VSCode.

%%capture

!pip install gradio
!pip intall nltk
!pip install transformers
!pip install torch
!pip install sentencepiece
!pip install tensorflow
!pip install tensorflow_hub
!pip install tensorflow_text
!pip install pandas
!pip install numpy
!pip install matplotlib
!pip install seaborn
!pip install sqlite3
!pip install extract_named_entities_nltk




In [44]:
# Imports needed for this application
import gradio as gr
import torch
import sentencepiece
import tensorflow
import tensorflow_hub
import tensorflow_text
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sqlite3




In [45]:
from google.colab import files
uploaded = files.upload()

Saving symbipredict.csv to symbipredict (2).csv


In [46]:
#  Read the .csv using pandas
import pandas as pd

# Load the data
disease_data = pd.read_csv('symbipredict.csv')

# Display the first few rows of the dataset
print(disease_data.head())

            Disease  Symptom_1             Symptom_2             Symptom_3  \
0  Fungal Infection    itching             skin_rash  nodal_skin_eruptions   
1  Fungal Infection  skin_rash  nodal_skin_eruptions   dischromic _patches   
2  Fungal Infection    itching  nodal_skin_eruptions   dischromic _patches   
3  Fungal Infection    itching             skin_rash   dischromic _patches   
4  Fungal Infection    itching             skin_rash  nodal_skin_eruptions   

             Symptom_4 Symptom_5 Symptom_6 Symptom_7 Symptom_8 Symptom_9  \
0  dischromic _patches       NaN       NaN       NaN       NaN       NaN   
1                  NaN       NaN       NaN       NaN       NaN       NaN   
2                  NaN       NaN       NaN       NaN       NaN       NaN   
3                  NaN       NaN       NaN       NaN       NaN       NaN   
4                  NaN       NaN       NaN       NaN       NaN       NaN   

  Symptom_1.1 Symptom_11 Symptom_12 Symptom_13 Symptom_14 Symptom_15  \
0 

In [47]:
# After loading the data, it is necessary to combine the symptom_columns into a single column
symptom_columns = [col for col in disease_data.columns if col != 'Disease']
disease_data['Processed_Symptoms'] = disease_data[symptom_columns].apply(lambda x: ' '.join(x.astype(str)), axis=1)

In the section below, we import the necessary libraries and dictionaries in order to build our NLTK model.

In [48]:
# Import necessary libraries starting with nltk
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import gradio as gr
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
nltk.download('maxent_ne_chunker')
nltk.download('words')

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


In the section below, we are defining how we want to use our dataset(s).  We want patients to input their symptoms, then we associate them to key words from our dataset(s).  This is our preprocessing of the model.

In [49]:
# Define the prepocessing of the data
def preprocess_symptoms(symptom_text):
    tokens = word_tokenize(symptom_text.lower())
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [w for w in tokens if not w in stop_words and w.isalnum()]
    return ' '.join(filtered_tokens)

# Example synonym dictionary (this can be expanded)
synonym_dict = {
    'fever': ['fever', 'pyrexia'],
    'headache': ['headache', 'migraine', 'cephalalgia'],
    'nausea': ['nausea', 'queasiness', 'sickness'],
    'vomiting': ['vomiting', 'throwing up', 'emesis'],
    'sore throat': ['sore throat', 'pharyngitis', 'throat pain']
}

def expand_keywords(keywords):
    expanded_keywords = set()
    for keyword in keywords:
        if keyword in synonym_dict:
            expanded_keywords.update(synonym_dict[keyword])
        else:
            expanded_keywords.add(keyword)
    return list(expanded_keywords)

def extract_keywords(patient_feedback):
    tokens = word_tokenize(patient_feedback.lower())
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [w for w in tokens if not w in stop_words and w.isalnum()]
    pos_tags = nltk.pos_tag(filtered_tokens)
    keywords = [word for word, pos in pos_tags if pos.startswith('NN') or pos.startswith('JJ') or pos.startswith('VB')]
    return keywords






In the section below, the application will read the patient input and suggest diagnosis.  This is our vectorizing process.  Once we vectorize, we run the gradio app, which generates an input cell for patient data, and an output cell for possible diagnoses.

NOTE:  We are allowing for a possible 5 diagnoses.  Many symptoms cross over many diagnoses.  For now, we are merely suggesting some possible diagnoses.  Our future model will be much more precise.  Much more data is needed to establish that kind of precision.  Given these challenges and the short runway of time we had to develop this application, we decided to put in a patient feedback loop in our gradio application.  This allows the patient to tell us if the proposed diagnoses are "Correct", "Incorrect", "Needs Improvement".

In [52]:
import os
import gradio as gr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Use the preloaded disease_data
def extract_keywords(feedback):
    # Placeholder for actual keyword extraction logic
    return feedback.split()

def expand_keywords(keywords):
    # Placeholder for actual keyword expansion logic
    return keywords

def suggest_diagnosis_tfidf(patient_feedback):
    keywords = extract_keywords(patient_feedback)
    expanded_keywords = expand_keywords(keywords)
    print("Expanded Keywords:", expanded_keywords)

    processed_feedback = ' '.join(expanded_keywords)
    vectorizer = TfidfVectorizer()
    symptom_matrix = vectorizer.fit_transform(disease_data['Processed_Symptoms'])
    feedback_vector = vectorizer.transform([processed_feedback])
    similarities = cosine_similarity(feedback_vector, symptom_matrix)
    sorted_indices = similarities.argsort()[0][::-1]

    possible_diagnoses = []
    added_diseases = set()  # To track added diagnoses and avoid duplicates
    for idx in sorted_indices:
        disease_name = disease_data.loc[idx, 'Disease']
        if disease_name not in added_diseases:
            possible_diagnoses.append(disease_name)
            added_diseases.add(disease_name)

    if not possible_diagnoses:
        possible_diagnoses = ["Unable to determine a diagnosis based on the provided information."]

    return possible_diagnoses[:5]  # Return top 5 possible diagnoses

# Ensure the custom flagged directory exists
custom_flagged_dir = 'custom_flagged_data'
if not os.path.exists(custom_flagged_dir):
    os.makedirs(custom_flagged_dir)
print(f"Flagged data should be saved in: {os.path.abspath(custom_flagged_dir)}")

# Gradio Interface
iface = gr.Interface(
    fn=suggest_diagnosis_tfidf,
    inputs=gr.Textbox(lines=5, placeholder="Describe your symptoms..."),
    outputs="text",
    title="Symptom Checker",
    description="Enter your symptoms, and we'll suggest possible diagnoses.",
    flagging_mode="manual",
    flagging_options=["Correct", "Incorrect", "Needs Improvement"],
    flagging_dir=custom_flagged_dir
)

iface.launch(share=True)



Flagged data should be saved in: /content/custom_flagged_data
Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://37af0a923dd5e7610a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




Now that we have established the symptom to diagnosis part of the application, our next piece is to make some suggested remedies for the suggested diagnosis.

**NOTE:  Usha below is where your piece of coding comes into play.**

In [30]:
import os
print(os.getcwd())

/content


In [31]:
os.listdir('/')

['tmp',
 'libx32',
 'bin',
 'lib64',
 'sys',
 'srv',
 'dev',
 'opt',
 'home',
 'mnt',
 'run',
 'proc',
 'root',
 'usr',
 'lib',
 'boot',
 'media',
 'sbin',
 'var',
 'lib32',
 'etc',
 'content',
 '.dockerenv',
 'tools',
 'datalab',
 'python-apt',
 'python-apt.tar.xz',
 'NGC-DL-CONTAINER-LICENSE',
 'cuda-keyring_1.1-1_all.deb']

In [32]:
flagged_dir = 'flagged'
for root, dirs, files in os.walk('/'):
    if flagged_dir in dirs:
        print(f'Flagged directory found at: {os.path.join(root, flagged_dir)}')
        break


Flagged directory found at: /content/.gradio/flagged


In [33]:
custom_flagged_dir = 'custom_flagged_data'  # or the name you used
if os.path.exists(custom_flagged_dir):
    print(f'Custom flagged directory found at: {custom_flagged_dir}')
    print(os.listdir(custom_flagged_dir))
else:
    print(f'Custom flagged directory {custom_flagged_dir} not found.')


Custom flagged directory found at: custom_flagged_data
[]


In [34]:
#  Load and Access Treatment Database
treatment_database = pd.read_csv('treatment_database.csv')
treatment_database.head()

FileNotFoundError: [Errno 2] No such file or directory: 'treatment_database.csv'

In [None]:
# File import
import sqlite3
        conn = sqlite3.connect('treatment_database.db')
        cursor = conn.cursor()
        cursor.

In [None]:
# Link the disease to a possible treatment
def get_treatments(diagnosed_disease):
        treatments = treatment_data[treatment_data['disease'] == diagnosed_disease]['treatment'].tolist()
        if treatments:
            return treatments
        else:
            return "No treatments found for this disease."