# Disease Prediction Using Symptoms

In [44]:
import csv
import pandas as pd
import numpy as np
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt

In [70]:
# Read the disease/symptom workbook (expected next to the notebook) into a DataFrame
data = pd.read_excel("DSDataset.xlsx")
data

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,hypertensive disease,3363.0,pain chest
1,hypertensive disease,,shortness of breath
2,hypertensive disease,,dizziness
3,hypertensive disease,,asthenia
4,hypertensive disease,,fall
...,...,...,...
1861,affect labile,,bedridden
1862,affect labile,,prostatism
1863,decubitus ulcer,42.0,systolic murmur
1864,decubitus ulcer,,frail


In [71]:
# Drop the occurrence-count column; only Disease and Symptom are kept.
# errors="ignore" keeps the frame unchanged if the column is absent.
df = data.drop(columns='Count of Disease Occurrence', errors='ignore')
df

Unnamed: 0,Disease,Symptom
0,hypertensive disease,pain chest
1,hypertensive disease,shortness of breath
2,hypertensive disease,dizziness
3,hypertensive disease,asthenia
4,hypertensive disease,fall
...,...,...
1861,affect labile,bedridden
1862,affect labile,prostatism
1863,decubitus ulcer,systolic murmur
1864,decubitus ulcer,frail


In [73]:
# Available data types
# Lists the dtype pandas inferred for each column of the reduced frame.
df.dtypes

Disease    object
Symptom    object
dtype: object

In [80]:
# Preprocessing
import spacy

# Loaded spaCy pipelines, keyed by model name, so each model is read from disk only once.
_NLP_PIPELINES = {}


def _get_nlp(model_name="en_core_web_sm"):
    """Return the spaCy pipeline for ``model_name``, loading it on first use only."""
    if model_name not in _NLP_PIPELINES:
        _NLP_PIPELINES[model_name] = spacy.load(model_name)
    return _NLP_PIPELINES[model_name]


def preprocess(text):
    """Return the lemmas of the alphabetic, non-stop-word tokens in ``text``.

    The text is run through the ``en_core_web_sm`` spaCy pipeline. Tokens that
    spaCy flags as stop words, or that are not purely alphabetic, are dropped;
    the lemma of every remaining token is returned in document order.

    Parameters
    ----------
    text : str
        Raw phrase to tokenise, e.g. a symptom description.

    Returns
    -------
    list of str
        Lemmas of the retained tokens.
    """
    # Reuse the cached pipeline: loading the model on every call is by far the
    # most expensive step and would dominate when applied to a whole column.
    doc = _get_nlp()(text)
    return [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]


# Smoke check on a single symptom phrase.
preprocess('shortness of breath')

['shortness', 'breath']

In [87]:
# Vectorize the data
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vocabulary on the Disease column and transform it in one step,
# with scikit-learn's built-in English stop-word list removed. No custom
# tokenizer is passed, so the vectorizer's default tokenisation is used.
# NOTE(review): the notebook title says diseases are predicted from symptoms,
# yet the text vectorised here is df['Disease'] — confirm this is the intended
# feature column before changing anything that consumes `vector`.
vectorizer = TfidfVectorizer(stop_words='english')
vector = vectorizer.fit_transform(df['Disease'])

In [89]:
# Encode the Labels
from sklearn.preprocessing import OneHotEncoder  # not used in this cell
from sklearn.preprocessing import LabelEncoder


# Map every Symptom string to an integer class id; the fitted encoder is kept
# so the ids can be mapped back to symptom names later.
# NOTE(review): Symptom is encoded as the label here, while the notebook title
# suggests Disease is the prediction target — confirm which column is the label.
label_encoder = LabelEncoder()
integer_encode = label_encoder.fit_transform(df['Symptom'])
print(integer_encode)

[252 323  84 ... 355 127 119]


In [94]:
# Distinct symptom strings as a NumPy array, in order of first appearance
symptom_column = df['Symptom']
unique_sym = np.asarray(symptom_column.unique())
unique_sym

array(['pain chest', 'shortness of breath', 'dizziness', 'asthenia',
       'fall', 'syncope', 'vertigo', 'sweating increased', 'palpitation',
       'nausea', 'angina pectoris', 'pressure chest', 'polyuria',
       'polydypsia', 'orthopnea', 'rale', 'unresponsiveness',
       'mental status changes', 'vomiting', 'labored breathing',
       'feeling suicidal', 'suicidal', 'hallucinations auditory',
       'feeling hopeless', 'weepiness', 'sleeplessness',
       'motor retardation', 'irritable mood', 'blackout',
       'mood depressed', 'hallucinations visual', 'worry', 'agitation',
       'tremor', 'intoxication', 'verbal auditory hallucinations',
       'energy increased', 'difficulty', 'nightmare',
       'unable to concentrate', 'homelessness', 'hypokinesia',
       'dyspnea on exertion', 'chest tightness', 'cough', 'fever',
       'decreased translucency', 'productive cough', 'pleuritic pain',
       'yellow sputum', 'breath sounds decreased', 'chill', 'rhonchus',
       'green spu