<a href="https://colab.research.google.com/github/iamdsc/disease_diagnoser/blob/master/DiseaseDiagnosis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Diagnosis of Disease from Symptoms [Part-1]**

### **Dataset Used:**
##### The data used for this project is taken from a study conducted at Columbia University. [[Link to data]](http://people.dbmi.columbia.edu/~friedma/Projects/DiseaseSymptomKB/index.html)
##### The table used is a knowledge database of disease-symptom associations generated by an automated method based on information in textual discharge summaries of patients at New York Presbyterian Hospital admitted during 2004.

In [0]:
import csv
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
from sklearn.naive_bayes import MultinomialNB

In [4]:
# read in the raw scrapped data
data = pd.read_excel('raw_data.xlsx')
data.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall


### Preprocessing the data

In [5]:
data = data.fillna(method='ffill')
data.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0392680_shortness of breath
2,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0012833_dizziness
3,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0004093_asthenia
4,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0085639_fall


In [6]:
list(data)

['Disease', 'Count of Disease Occurrence', 'Symptom']

In [0]:
# Process Disease and Symptom Names
def process_name(data):
  data_list = []
  data_name = data.replace('^','_').split('_')
  n = 1
  for names in data_name:
    if n%2==0:
      data_list.append(names)
    n+=1
  return data_list

In [0]:
disease_list = []
disease_symptom_dict = defaultdict(list)
disease_symptom_count = {}
count = 0

for idx, row in data.iterrows():
    
    # Get the Disease Names
    if (row['Disease'] !="\xc2\xa0") and (row['Disease'] != ""):
        disease = row['Disease']
        disease_list = process_name(data=disease)
        count = row['Count of Disease Occurrence']

    # Get the Symptoms Corresponding to Diseases
    if (row['Symptom'] !="\xc2\xa0") and (row['Symptom'] != ""):
        symptom = row['Symptom']
        symptom_list = process_name(data=symptom)
        for d in disease_list:
            for s in symptom_list:
                disease_symptom_dict[d].append(s)
            disease_symptom_count[d] = count

In [0]:
# Saving the cleaned data
with open('dataset_clean.csv','w') as csvfile:
  writer = csv.writer(csvfile)
  for key, value in disease_symptom_dict.items():
    for v in value:
      key = str.encode(key).decode('utf-8')
      writer.writerow([key,v,disease_symptom_count[key]])

In [10]:
columns = ['Source', 'Target', 'Weight']
data = pd.read_csv('dataset_clean.csv', names=columns, encoding='ISO-8859-1')
data.head()

Unnamed: 0,Source,Target,Weight
0,hypertensive disease,pain chest,3363.0
1,hypertensive disease,shortness of breath,3363.0
2,hypertensive disease,dizziness,3363.0
3,hypertensive disease,asthenia,3363.0
4,hypertensive disease,fall,3363.0


In [0]:
data.to_csv('dataset_clean.csv',index=False)

### Analyzing our cleaned data

In [12]:
unique_diseases = data['Source'].unique()
print('No. of diseases:', len(unique_diseases))
print('Disease:')
for disease in unique_diseases:
  print(disease)

No. of diseases: 149
Disease:
hypertensive disease
diabetes
depression mental
depressive disorder
coronary arteriosclerosis
coronary heart disease
pneumonia
failure heart congestive
accidentÂ cerebrovascular
asthma
myocardial infarction
hypercholesterolemia
infection
infection urinary tract
anemia
chronic obstructive airway disease
dementia
insufficiency renal
confusion
degenerativeÂ polyarthritis
hypothyroidism
anxiety state
malignant neoplasms
primary malignant neoplasm
acquiredÂ immuno-deficiency syndrome
HIV
hiv infections
cellulitis
gastroesophageal reflux disease
septicemia
systemic infection
sepsis (invertebrate)
deep vein thrombosis
dehydration
neoplasm
embolism pulmonary
epilepsy
cardiomyopathy
chronic kidney failure
carcinoma
hepatitis C
peripheral vascular disease
psychotic disorder
hyperlipidemia
bipolar disorder
obesity
ischemia
cirrhosis
exanthema
benign prostatic hypertrophy
kidney failure acute
mitral valve insufficiency
arthritis
bronchitis
hemiparesis
osteoporosis
tra

In [13]:
unique_symptoms = data['Target'].unique()
print('No. of symptoms',len(unique_symptoms))
print('Symptoms:')
for symptom in unique_symptoms:
  print(symptom)

No. of symptoms 405
Symptoms:
pain chest
shortness of breath
dizziness
asthenia
fall
syncope
vertigo
sweat
sweating increased
palpitation
nausea
angina pectoris
pressure chest
polyuria
polydypsia
orthopnea
rale
unresponsiveness
mental status changes
vomiting
labored breathing
feeling suicidal
suicidal
hallucinations auditory
feeling hopeless
weepiness
sleeplessness
motor retardation
irritable mood
blackout
mood depressed
hallucinations visual
worry
agitation
tremor
intoxication
verbal auditory hallucinations
energy increased
difficulty
nightmare
unable to concentrate
homelessness
hypokinesia
dyspnea on exertion
chest tightness
cough
fever
decreased translucency
productive cough
pleuritic pain
yellow sputum
breath sounds decreased
chill
rhonchus
green sputum
non-productive cough
wheezing
haemoptysis
distress respiratory
tachypnea
malaise
night sweat
jugular venous distention
dyspnea
dysarthria
speech slurred
facial paresis
hemiplegia
seizure
numbness
symptom aggravating factors
st segme

### Transforming Data

In [14]:
df_1 = pd.get_dummies(data.Target)
df_1.head()

Unnamed: 0,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abortion,abscess bacterial,absences finding,achalasia,ache,adverse effect,adverse reaction,agitation,air fluid level,alcohol binge episode,alcoholic withdrawal symptoms,ambidexterity,angina pectoris,anorexia,anosmia,aphagia,apyrexial,arthralgia,ascites,asterixis,asthenia,asymptomatic,ataxia,atypia,aura,awakening early,barking cough,bedridden,behavior hyperactive,behavior showing increased motor activity,blackout,blanch,...,tenesmus,terrify,thicken,throat sore,throbbing sensation quality,tinnitus,tired,titubation,todd paralysis,tonic seizures,transaminitis,transsexual,tremor,tremor resting,tumor cell invasion,unable to concentrate,unconscious state,uncoordination,underweight,unhappy,unresponsiveness,unsteady gait,unwell,urge incontinence,urgency ofÂ micturition,urinary hesitation,urinoma,verbal auditory hallucinations,verbally abusive behavior,vertigo,vision blurred,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
df_s = data['Source']
df_pivoted = pd.concat([df_s, df_1], axis=1)
df_pivoted.drop_duplicates(keep='first',inplace=True)
df_pivoted = df_pivoted.groupby('Source',sort=False).sum()
df_pivoted = df_pivoted.reset_index()
df_pivoted.head()

Unnamed: 0,Source,Heberden's node,Murphy's sign,Stahli's line,abdomen acute,abdominal bloating,abdominal tenderness,abnormal sensation,abnormally hard consistency,abortion,abscess bacterial,absences finding,achalasia,ache,adverse effect,adverse reaction,agitation,air fluid level,alcohol binge episode,alcoholic withdrawal symptoms,ambidexterity,angina pectoris,anorexia,anosmia,aphagia,apyrexial,arthralgia,ascites,asterixis,asthenia,asymptomatic,ataxia,atypia,aura,awakening early,barking cough,bedridden,behavior hyperactive,behavior showing increased motor activity,blackout,...,tenesmus,terrify,thicken,throat sore,throbbing sensation quality,tinnitus,tired,titubation,todd paralysis,tonic seizures,transaminitis,transsexual,tremor,tremor resting,tumor cell invasion,unable to concentrate,unconscious state,uncoordination,underweight,unhappy,unresponsiveness,unsteady gait,unwell,urge incontinence,urgency ofÂ micturition,urinary hesitation,urinoma,verbal auditory hallucinations,verbally abusive behavior,vertigo,vision blurred,vomiting,weepiness,weight gain,welt,wheelchair bound,wheezing,withdraw,worry,yellow sputum
0,hypertensive disease,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,diabetes,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0
2,depression mental,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0
3,depressive disorder,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0
4,coronary arteriosclerosis,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [16]:
len(df_pivoted)

149

In [0]:
# Save the transformed data
df_pivoted.to_csv('df_pivoted.csv')

In [18]:
x = df_pivoted[df_pivoted.columns[1:]]
y = df_pivoted['Source']
print(x[:5])
print(y[:5])

   Heberden's node  Murphy's sign  ...  worry  yellow sputum
0                0              0  ...      0              0
1                0              0  ...      0              0
2                0              0  ...      1              0
3                0              0  ...      1              0
4                0              0  ...      0              0

[5 rows x 404 columns]
0         hypertensive disease
1                     diabetes
2            depression mental
3          depressive disorder
4    coronary arteriosclerosis
Name: Source, dtype: object


### Training Multinomial Naive Bayes Classifier

In [19]:
# Computing prior probabilities of classes from weights
weights = np.fromiter(disease_symptom_count.values(), dtype=float)
total=sum(weights)
prob = weights/total
print(prob)

[0.08658822 0.03658694 0.03442416 0.03442416 0.03305955 0.03305955
 0.02649399 0.02479467 0.02278637 0.02149901 0.01954221 0.01763691
 0.01622081 0.01537115 0.01400654 0.01349159 0.01297665 0.01145756
 0.0105049  0.01042766 0.01024743 0.01004145 0.00911455 0.00911455
 0.00901156 0.00901156 0.00901156 0.00877983 0.00836788 0.00800742
 0.00800742 0.00800742 0.00798167 0.00764695 0.00764695 0.00756971
 0.00746672 0.00728649 0.00720925 0.00692603 0.00692603 0.00690028
 0.00687453 0.00635959 0.0062051  0.00587039 0.00581889 0.00561291
 0.00535544 0.00494348 0.004789   0.004789   0.00460877 0.00442854
 0.00440279 0.0043513  0.00432555 0.00427405 0.00424831 0.00424831
 0.00424831 0.00424831 0.00422256 0.00419681 0.00419681 0.00414532
 0.00411957 0.00406808 0.00391359 0.00391359 0.00378486 0.00373336
 0.00370761 0.00368187 0.00365612 0.00360462 0.00365612 0.00360462
 0.00360462 0.00355313 0.00347589 0.00342439 0.00329566 0.00324416
 0.00319267 0.00316692 0.00314117 0.00306393 0.00293519 0.0029

In [0]:
mnb_tot = MultinomialNB()
mnb_tot = mnb_tot.fit(x, y)

In [21]:
mnb_tot.score(x, y)

0.8993288590604027

In [0]:
# finding where the model fails
disease_pred = mnb_tot.predict(x)

In [0]:
disease_real = y.values

In [24]:
for i in range(0, len(disease_real)):
  if disease_pred[i]!=disease_real[i]:
    print('Pred:',disease_pred[i])
    print('Actual:',disease_real[i])
    print('##########################')

Pred: depression mental
Actual: depressive disorder
##########################
Pred: coronary arteriosclerosis
Actual: coronary heart disease
##########################
Pred: malignant neoplasms
Actual: primary malignant neoplasm
##########################
Pred: HIV
Actual: acquiredÂ immuno-deficiency syndrome
##########################
Pred: HIV
Actual: hiv infections
##########################
Pred: sepsis (invertebrate)
Actual: septicemia
##########################
Pred: sepsis (invertebrate)
Actual: systemic infection
##########################
Pred: carcinoma prostate
Actual: malignant neoplasm of prostate
##########################
Pred: carcinoma breast
Actual: malignant neoplasm of breast
##########################
Pred: candidiasis
Actual: oralcandidiasis
##########################
Pred: carcinoma colon
Actual: malignant tumor of colon
##########################
Pred: tonic-clonic epilepsy
Actual: tonic-clonic seizures
##########################
Pred: carcinoma of lung
Actual:

In [0]:
# Using class prior prob
mnb_prob = MultinomialNB(class_prior=prob)
mnb_prob = mnb_prob.fit(x, y)

In [26]:
mnb_prob.score(x, y)

0.8791946308724832

In [0]:
disease_pred = mnb_prob.predict(x)

In [28]:
for i in range(0, len(disease_real)):
  if disease_pred[i]!=disease_real[i]:
    print('Pred:',disease_pred[i])
    print('Actual:',disease_real[i])
    print('##########################')

Pred: depression mental
Actual: depressive disorder
##########################
Pred: coronary arteriosclerosis
Actual: coronary heart disease
##########################
Pred: malignant neoplasms
Actual: primary malignant neoplasm
##########################
Pred: HIV
Actual: acquiredÂ immuno-deficiency syndrome
##########################
Pred: HIV
Actual: hiv infections
##########################
Pred: sepsis (invertebrate)
Actual: septicemia
##########################
Pred: sepsis (invertebrate)
Actual: systemic infection
##########################
Pred: biliary calculus
Actual: pancreatitis
##########################
Pred: carcinoma prostate
Actual: malignant neoplasm of prostate
##########################
Pred: carcinoma breast
Actual: malignant neoplasm of breast
##########################
Pred: candidiasis
Actual: oralcandidiasis
##########################
Pred: Alzheimer's disease
Actual: kidney disease
##########################
Pred: carcinoma colon
Actual: malignant tumor of co

In [0]:
## Saving the Naive Bayes Model
filename = 'NB_model.sav'
pickle.dump(mnb_tot, open(filename, 'wb'))

In [0]:
# Load model and predict
model = pickle.load(open(filename,'rb'))
# model.predict([100*[1]+100*[0]+204*[0]])

In [31]:
symptoms = df_pivoted.columns[1:].values
print(symptoms)

["Heberden's node" "Murphy's sign" "Stahli's line" 'abdomen acute'
 'abdominal bloating' 'abdominal tenderness' 'abnormal sensation'
 'abnormally hard consistency' 'abortion' 'abscess bacterial'
 'absences finding' 'achalasia' 'ache' 'adverse effect' 'adverse reaction'
 'agitation' 'air fluid level' 'alcohol binge episode'
 'alcoholic withdrawal symptoms' 'ambidexterity' 'angina pectoris'
 'anorexia' 'anosmia' 'aphagia' 'apyrexial' 'arthralgia' 'ascites'
 'asterixis' 'asthenia' 'asymptomatic' 'ataxia' 'atypia' 'aura'
 'awakening early' 'barking cough' 'bedridden' 'behavior hyperactive'
 'behavior showing increased motor activity' 'blackout' 'blanch'
 'bleeding of vagina' 'bowel sounds decreased' 'bradycardia'
 'bradykinesia' 'breakthrough pain' 'breath sounds decreased'
 'breath-holding spell' 'breech presentation' 'bruit' 'burning sensation'
 'cachexia' 'cardiomegaly' 'cardiovascular event' 'cardiovascular finding'
 'catatonia' 'catching breath' 'charleyhorse' 'chest discomfort'
 'che

In [43]:
test_input = [0]*404
user_symptoms = list(input().split(','))
for symptom in user_symptoms:
  test_input[np.where(symptoms==symptom)[0][0]] = 1
print('Most probable disease:',model.predict([test_input]))

shortness of breath,sweating increased,feeling hopeless
Most probable disease: ['coronary arteriosclerosis']


In [0]:
# For reference some diseases with their symptoms
"""
hypertensive disease: pain chest,shortness of breath,dizziness,fall,vertigo,sweating increased,nausea,pressure chest
diabetes: polyuria,polydypsia,unresponsiveness,labored breathing
depression mental: feeling suicidal,feeling hopeless,mood depressed,homelessness,unable to concentrate
pneumonia: cough,fever,distress respiratory,night sweat
"""