## Disease Prediction from Symptoms

For this project, we'll be using the dataset from here: http://people.dbmi.columbia.edu/~friedma/Projects/DiseaseSymptomKB/index.html

Copy the data from all the columns and paste it into an excel sheet -> `raw_data.xlsx`

In [26]:
# Import Dependencies
import csv
import pandas as pd
import numpy as np
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [27]:
# Read Raw Dataset
df = pd.read_excel('raw_data.xlsx')

In [28]:
df

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall
...,...,...,...
1861,,,UMLS:C0425251_bedridden^UMLS:C0741453_bedridden
1862,,,UMLS:C0242453_prostatism
1863,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0232257_systolic murmur
1864,,,UMLS:C0871754_frail


In [29]:
# Fill all NaN with the values above
data = df.fillna(method='ffill')

In [30]:
data

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0392680_shortness of breath
2,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0012833_dizziness
3,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0004093_asthenia
4,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0085639_fall
...,...,...,...
1861,UMLS:C0233472_affect labile,45.0,UMLS:C0425251_bedridden^UMLS:C0741453_bedridden
1862,UMLS:C0233472_affect labile,45.0,UMLS:C0242453_prostatism
1863,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0232257_systolic murmur
1864,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0871754_frail


In [31]:
# Process Disease and Symptom Names
def process_data(data):
    data_list = []
    data_name = data.replace('^','_').split('_')
    print(data_name)
    n = 1
    for names in data_name:
        if (n % 2 == 0):
            data_list.append(names)
        n += 1
    print(f"data_list=={data_list}")
    return data_list

In [32]:
# Data Cleanup
disease_list = []
disease_symptom_dict = defaultdict(list)
disease_symptom_count = {}
count = 0

for idx, row in data.iterrows():
    # Get the Disease Names
    if (row['Disease'] !="\xc2\xa0") and (row['Disease'] != ""):
        disease = row['Disease']
        disease_list = process_data(data=disease)
        count = row['Count of Disease Occurrence']

    # Get the Symptoms Corresponding to Diseases
    
    if (row['Symptom'] !="\xc2\xa0") and (row['Symptom'] != ""):
        symptom = row['Symptom']
        symptom_list = process_data(data=symptom)
        for d in disease_list:
            for s in symptom_list:
                disease_symptom_dict[d].append(s)
            disease_symptom_count[d] = count

['UMLS:C0020538', 'hypertensive disease']
data_list==['hypertensive disease']
['UMLS:C0008031', 'pain chest']
data_list==['pain chest']
['UMLS:C0020538', 'hypertensive disease']
data_list==['hypertensive disease']
['UMLS:C0392680', 'shortness of breath']
data_list==['shortness of breath']
['UMLS:C0020538', 'hypertensive disease']
data_list==['hypertensive disease']
['UMLS:C0012833', 'dizziness']
data_list==['dizziness']
['UMLS:C0020538', 'hypertensive disease']
data_list==['hypertensive disease']
['UMLS:C0004093', 'asthenia']
data_list==['asthenia']
['UMLS:C0020538', 'hypertensive disease']
data_list==['hypertensive disease']
['UMLS:C0085639', 'fall']
data_list==['fall']
['UMLS:C0020538', 'hypertensive disease']
data_list==['hypertensive disease']
['UMLS:C0039070', 'syncope']
data_list==['syncope']
['UMLS:C0020538', 'hypertensive disease']
data_list==['hypertensive disease']
['UMLS:C0042571', 'vertigo']
data_list==['vertigo']
['UMLS:C0020538', 'hypertensive disease']
data_list==['hyper

['UMLS:C0036690', 'septicemia', 'UMLS:C0243026', 'systemic infection', 'UMLS:C1090821', 'sepsis (invertebrate)']
data_list==['septicemia', 'systemic infection', 'sepsis (invertebrate)']
['UMLS:C0231835', 'tachypnea']
data_list==['tachypnea']
['UMLS:C0036690', 'septicemia', 'UMLS:C0243026', 'systemic infection', 'UMLS:C1090821', 'sepsis (invertebrate)']
data_list==['septicemia', 'systemic infection', 'sepsis (invertebrate)']
['UMLS:C0085593', 'chill']
data_list==['chill']
['UMLS:C0036690', 'septicemia', 'UMLS:C0243026', 'systemic infection', 'UMLS:C1090821', 'sepsis (invertebrate)']
data_list==['septicemia', 'systemic infection', 'sepsis (invertebrate)']
['UMLS:C0023380', 'lethargy']
data_list==['lethargy']
['UMLS:C0036690', 'septicemia', 'UMLS:C0243026', 'systemic infection', 'UMLS:C1090821', 'sepsis (invertebrate)']
data_list==['septicemia', 'systemic infection', 'sepsis (invertebrate)']
['UMLS:C0428977', 'bradycardia']
data_list==['bradycardia']
['UMLS:C0036690', 'septicemia', 'UMLS:

['UMLS:C1456784', 'paranoia']
data_list==['paranoia']
['UMLS:C0233762', 'hallucinations auditory']
data_list==['hallucinations auditory']
['UMLS:C1456784', 'paranoia']
data_list==['paranoia']
['UMLS:C0233763', 'hallucinations visual']
data_list==['hallucinations visual']
['UMLS:C1456784', 'paranoia']
data_list==['paranoia']
['UMLS:C0085631', 'agitation']
data_list==['agitation']
['UMLS:C1456784', 'paranoia']
data_list==['paranoia']
['UMLS:C0022107', 'irritable mood']
data_list==['irritable mood']
['UMLS:C1456784', 'paranoia']
data_list==['paranoia']
['UMLS:C0424068', 'verbal auditory hallucinations']
data_list==['verbal auditory hallucinations']
['UMLS:C1456784', 'paranoia']
data_list==['paranoia']
['UMLS:C0424000', 'feeling suicidal']
data_list==['feeling suicidal']
['UMLS:C1456784', 'paranoia']
data_list==['paranoia']
['UMLS:C0438696', 'suicidal']
data_list==['suicidal']
['UMLS:C1456784', 'paranoia']
data_list==['paranoia']
['UMLS:C0558261', 'terrify']
data_list==['terrify']
['UMLS:C

data_list==['diverticulosis']
['UMLS:C0424000', 'feeling suicidal']
data_list==['feeling suicidal']
['UMLS:C1510475', 'diverticulosis']
data_list==['diverticulosis']
['UMLS:C0150041', 'feeling hopeless']
data_list==['feeling hopeless']
['UMLS:C0038663', 'suicide attempt']
data_list==['suicide attempt']
['UMLS:C0233762', 'hallucinations auditory']
data_list==['hallucinations auditory']
['UMLS:C0038663', 'suicide attempt']
data_list==['suicide attempt']
['UMLS:C0917801', 'sleeplessness']
data_list==['sleeplessness']
['UMLS:C0038663', 'suicide attempt']
data_list==['suicide attempt']
['UMLS:C0438696', 'suicidal']
data_list==['suicidal']
['UMLS:C0038663', 'suicide attempt']
data_list==['suicide attempt']
['UMLS:C0424230', 'motor retardation']
data_list==['motor retardation']
['UMLS:C0038663', 'suicide attempt']
data_list==['suicide attempt']
['UMLS:C0424109', 'weepiness']
data_list==['weepiness']
['UMLS:C0038663', 'suicide attempt']
data_list==['suicide attempt']
['UMLS:C0235198', 'unable 

['UMLS:C0024713', 'manic disorder']
data_list==['manic disorder']
['UMLS:C0558089', 'verbally abusive behavior']
data_list==['verbally abusive behavior']
['UMLS:C0024713', 'manic disorder']
data_list==['manic disorder']
['UMLS:C0438696', 'suicidal']
data_list==['suicidal']
['UMLS:C0024713', 'manic disorder']
data_list==['manic disorder']
['UMLS:C0424000', 'feeling suicidal']
data_list==['feeling suicidal']
['UMLS:C0031212', 'personality disorder']
data_list==['personality disorder']
['UMLS:C0028084', 'nightmare']
data_list==['nightmare']
['UMLS:C0031212', 'personality disorder']
data_list==['personality disorder']
['UMLS:C0150041', 'feeling hopeless']
data_list==['feeling hopeless']
['UMLS:C0031212', 'personality disorder']
data_list==['personality disorder']
['UMLS:C0558141', 'transsexual']
data_list==['transsexual']
['UMLS:C0031212', 'personality disorder']
data_list==['personality disorder']
['UMLS:C0233762', 'hallucinations auditory']
data_list==['hallucinations auditory']
['UMLS:C

['UMLS:C0520888', 't wave inverted']
data_list==['t wave inverted']
['UMLS:C0039239', 'tachycardia sinus']
data_list==['tachycardia sinus']
['UMLS:C0042963', 'vomiting']
data_list==['vomiting']
['UMLS:C0039239', 'tachycardia sinus']
data_list==['tachycardia sinus']
['UMLS:C0000737', 'pain abdominal']
data_list==['pain abdominal']
['UMLS:C1258215', 'ileus']
data_list==['ileus']
['UMLS:C0549483', 'abscess bacterial']
data_list==['abscess bacterial']
['UMLS:C1258215', 'ileus']
data_list==['ileus']
['UMLS:C0000727', 'abdomen acute']
data_list==['abdomen acute']
['UMLS:C1258215', 'ileus']
data_list==['ileus']
['UMLS:C0740844', 'air fluid level']
data_list==['air fluid level']
['UMLS:C1258215', 'ileus']
data_list==['ileus']
['UMLS:C0425491', 'catching breath']
data_list==['catching breath']
['UMLS:C1258215', 'ileus']
data_list==['ileus']
['UMLS:C0232498', 'abdominal tenderness']
data_list==['abdominal tenderness']
['UMLS:C1258215', 'ileus']
data_list==['ileus']
['UMLS:C0027497', 'nausea']
da

In [33]:
# See that the data is Processed Correctly
disease_symptom_dict

defaultdict(list,
            {'hypertensive disease': ['pain chest',
              'shortness of breath',
              'dizziness',
              'asthenia',
              'fall',
              'syncope',
              'vertigo',
              'sweat',
              'sweating increased',
              'palpitation',
              'nausea',
              'angina pectoris',
              'pressure chest'],
             'diabetes': ['polyuria',
              'polydypsia',
              'shortness of breath',
              'pain chest',
              'asthenia',
              'nausea',
              'orthopnea',
              'rale',
              'sweat',
              'sweating increased',
              'unresponsiveness',
              'mental status changes',
              'vertigo',
              'vomiting',
              'labored breathing'],
             'depression mental': ['feeling suicidal',
              'suicidal',
              'hallucinations auditory',
              'feel

In [34]:
# Count of Disease Occurence w.r.t each Disease
disease_symptom_count

{'hypertensive disease': 3363.0,
 'diabetes': 1421.0,
 'depression mental': 1337.0,
 'depressive disorder': 1337.0,
 'coronary arteriosclerosis': 1284.0,
 'coronary heart disease': 1284.0,
 'pneumonia': 1029.0,
 'failure heart congestive': 963.0,
 'accident\xa0cerebrovascular': 885.0,
 'asthma': 835.0,
 'myocardial infarction': 759.0,
 'hypercholesterolemia': 685.0,
 'infection': 630.0,
 'infection urinary tract': 597.0,
 'anemia': 544.0,
 'chronic obstructive airway disease': 524.0,
 'dementia': 504.0,
 'insufficiency renal': 445.0,
 'confusion': 408.0,
 'degenerative\xa0polyarthritis': 405.0,
 'hypothyroidism': 398.0,
 'anxiety state': 390.0,
 'malignant neoplasms': 354.0,
 'primary malignant neoplasm': 354.0,
 'acquired\xa0immuno-deficiency syndrome': 350.0,
 'HIV': 350.0,
 'hiv infections': 350.0,
 'cellulitis': 341.0,
 'gastroesophageal reflux disease': 325.0,
 'septicemia': 311.0,
 'systemic infection': 311.0,
 'sepsis (invertebrate)': 311.0,
 'deep vein thrombosis': 310.0,
 'deh

In [35]:
# Save cleaned data as CSV
f = open('cleaned_data.csv', 'w')

with f:
    writer = csv.writer(f)
    for key, val in disease_symptom_dict.items():
        for i in range(len(val)):
            writer.writerow([key, val[i], disease_symptom_count[key]])

In [36]:
# Read Cleaned Data as DF
df = pd.read_csv('cleaned_data.csv',encoding="ISO-8859-1")
df.columns = ['disease', 'symptom', 'occurence_count']
df.head()

Unnamed: 0,disease,symptom,occurence_count
0,hypertensive disease,shortness of breath,3363.0
1,hypertensive disease,dizziness,3363.0
2,hypertensive disease,asthenia,3363.0
3,hypertensive disease,fall,3363.0
4,hypertensive disease,syncope,3363.0


In [37]:
# Remove any rows with empty values
df.replace(float('nan'), np.nan, inplace=True)
df.dropna(inplace=True)

In [38]:
from sklearn import preprocessing

In [39]:
n_unique = len(df['symptom'].unique())
n_unique

404

In [40]:
df.dtypes

disease             object
symptom             object
occurence_count    float64
dtype: object

In [41]:
# Encode the Labels
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(df['symptom'])
print(integer_encoded)

[328  87  28 ... 361 130 122]


In [42]:
print(integer_encoded)

[328  87  28 ... 361 130 122]


In [43]:
# One Hot Encode the Labels
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [44]:
onehot_encoded[10]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [45]:
len(onehot_encoded[0])

404

In [46]:
cols = np.asarray(df['symptom'].unique())
cols

array(['shortness of breath', 'dizziness', 'asthenia', 'fall', 'syncope',
       'vertigo', 'sweat', 'sweating increased', 'palpitation', 'nausea',
       'angina pectoris', 'pressure chest', 'polyuria', 'polydypsia',
       'pain chest', 'orthopnea', 'rale', 'unresponsiveness',
       'mental status changes', 'vomiting', 'labored breathing',
       'feeling suicidal', 'suicidal', 'hallucinations auditory',
       'feeling hopeless', 'weepiness', 'sleeplessness',
       'motor retardation', 'irritable mood', 'blackout',
       'mood depressed', 'hallucinations visual', 'worry', 'agitation',
       'tremor', 'intoxication', 'verbal auditory hallucinations',
       'energy increased', 'difficulty', 'nightmare',
       'unable to concentrate', 'homelessness', 'hypokinesia',
       'dyspnea on exertion', 'chest tightness', 'cough', 'fever',
       'decreased translucency', 'productive cough', 'pleuritic pain',
       'yellow sputum', 'breath sounds decreased', 'chill', 'rhonchus',
       '

In [47]:
# Create a new dataframe to save OHE labels
df_ohe = pd.DataFrame(columns = cols)
df_ohe.head()

Unnamed: 0,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,nausea,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts


In [48]:
for i in range(len(onehot_encoded)):
    df_ohe.loc[i] = onehot_encoded[i]

In [49]:
df_ohe.head

<bound method NDFrame.head of       shortness of breath  dizziness  asthenia  fall  syncope  vertigo  sweat  \
0                     0.0        0.0       0.0   0.0      0.0      0.0    0.0   
1                     0.0        0.0       0.0   0.0      0.0      0.0    0.0   
2                     0.0        0.0       0.0   0.0      0.0      0.0    0.0   
3                     0.0        0.0       0.0   0.0      0.0      0.0    0.0   
4                     0.0        0.0       0.0   0.0      0.0      0.0    0.0   
...                   ...        ...       ...   ...      ...      ...    ...   
2121                  0.0        0.0       0.0   0.0      0.0      0.0    0.0   
2122                  0.0        0.0       0.0   0.0      0.0      0.0    0.0   
2123                  0.0        0.0       0.0   0.0      0.0      0.0    0.0   
2124                  0.0        0.0       0.0   0.0      0.0      0.0    0.0   
2125                  0.0        0.0       0.0   0.0      0.0      0.0    0.0  

In [50]:
len(df_ohe)

2126

In [51]:
# Disease Dataframe
df_disease = df['disease']
df_disease.head()

0    hypertensive disease
1    hypertensive disease
2    hypertensive disease
3    hypertensive disease
4    hypertensive disease
Name: disease, dtype: object

In [52]:
# Concatenate OHE Labels with the Disease Column
df_concat = pd.concat([df_disease,df_ohe], axis=1)
df_concat.head()

Unnamed: 0,disease,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [53]:
df_concat.drop_duplicates(keep='last',inplace=True)

In [54]:
df_concat.head()

Unnamed: 0,disease,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [55]:
len(df_concat)

2097

In [56]:
cols = df_concat.columns
cols

Index(['disease', 'shortness of breath', 'dizziness', 'asthenia', 'fall',
       'syncope', 'vertigo', 'sweat', 'sweating increased', 'palpitation',
       ...
       'feces in rectum', 'prodrome', 'hypoproteinemia',
       'alcohol binge episode', 'abdomen acute', 'air fluid level',
       'catching breath', 'large-for-dates fetus', 'immobile',
       'homicidal thoughts'],
      dtype='object', length=405)

In [57]:
cols = cols[1:]

In [58]:
# Since, every disease has multiple symptoms, combine all symptoms per disease per row
df_concat = df_concat.groupby('disease').sum()
df_concat = df_concat.reset_index()
df_concat[:100]

Unnamed: 0,disease,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,Alzheimer's disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,HIV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Pneumocystis carinii pneumonia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,accident cerebrovascular,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,acquired immuno-deficiency syndrome,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,malignant neoplasm of breast,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,malignant neoplasm of lung,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,malignant neoplasm of prostate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,malignant neoplasms,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
len(df_concat)

149

In [60]:
df_concat.to_csv("training_dataset.csv", index=False)

In [61]:
# One Hot Encoded Features
X = df_concat[cols]

# Labels
y = df_concat['disease']

## Model Training

In [62]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [63]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [64]:
len(X_train), len(y_train)

(119, 119)

In [65]:
len(X_test), len(y_test)

(30, 30)

In [67]:
dt = DecisionTreeClassifier()
clf_dt=dt.fit(X_train, y_train)

# Model saving

In [45]:
import pickle

In [73]:
with open('arist_demo','wb') as f:
    pickle.dump(clf_dt,f)

In [74]:
import joblib

In [75]:
joblib.dump(clf_dt,'Arista_symtoms_model')

['Arista_symtoms_model']

In [68]:
clf_dt.score(X, y)

0.0

In [77]:
export_graphviz(dt, 
                out_file='./tree.dot', 
                feature_names=cols)

In [78]:
!pip install graphviz



In [79]:
# from graphviz import Source
# from sklearn import tree

# graph = Source(export_graphviz(dt, 
#                 out_file=None, 
#                 feature_names=cols))

# png_bytes = graph.pipe(format='png')

# with open('tree.png','wb') as f:
#     f.write(png_bytes)

In [82]:
# from IPython.display import Image
# Image(png_bytes)

In [83]:
symptoms=open('symptoms.txt','w',encoding='UTF-8')
l=[]
for i in X.columns:
    l.append(i+' \n')
symptoms.writelines(l)
symptoms.close()

In [84]:
disease_pred = clf_dt.predict(X)

In [91]:
disease_pred


array(["Alzheimer's disease", 'HIV',
       'Pneumocystis\xa0carinii\xa0pneumonia',
       'accident\xa0cerebrovascular',
       'acquired\xa0immuno-deficiency syndrome', 'adenocarcinoma',
       'adhesion', 'affect labile', 'anemia', 'anxiety state', 'aphasia',
       'arthritis', 'asthma', 'bacteremia',
       'benign prostatic hypertrophy', 'biliary calculus',
       'bipolar disorder', 'bronchitis', 'candidiasis', 'carcinoma',
       'carcinoma breast', 'carcinoma colon', 'carcinoma of lung',
       'carcinoma prostate', 'cardiomyopathy', 'cellulitis',
       'cholecystitis', 'cholelithiasis',
       'chronic alcoholic intoxication', 'chronic kidney failure',
       'chronic obstructive airway disease', 'cirrhosis', 'colitis',
       'confusion', 'coronary arteriosclerosis',
       'coronary arteriosclerosis', 'decubitus ulcer',
       'deep vein thrombosis', 'degenerative\xa0polyarthritis',
       'deglutition disorder', 'dehydration', 'delirium', 'delusion',
       'dementia', 'd

In [85]:
li=[]
for i in range(1,405):
    if i==15 or i==20 or i==11 or i==100 or i== 29 or (i>=95 and i<=100):
        li.append(1)
    else:
        li.append(0)
    

In [69]:
p=[0]*404

In [70]:
len(X)

149

In [87]:
clf_dt.predict([p])[0]

'decubitus ulcer'

In [88]:
disease_real = y.values

In [90]:
for i in range(0, len(disease_real)):
    if disease_pred[i]!=disease_real[i]:
        print ('Pred: {0}\nActual: {1}\n'.format(disease_pred[i], disease_real[i]))
    else:
        print ('Pred: {0}\nActual: {1}\n'.format(disease_pred[i], disease_real[i]))

Pred: Alzheimer's disease
Actual: Alzheimer's disease

Pred: HIV
Actual: HIV

Pred: Pneumocystis carinii pneumonia
Actual: Pneumocystis carinii pneumonia

Pred: accident cerebrovascular
Actual: accident cerebrovascular

Pred: acquired immuno-deficiency syndrome
Actual: acquired immuno-deficiency syndrome

Pred: adenocarcinoma
Actual: adenocarcinoma

Pred: adhesion
Actual: adhesion

Pred: affect labile
Actual: affect labile

Pred: anemia
Actual: anemia

Pred: anxiety state
Actual: anxiety state

Pred: aphasia
Actual: aphasia

Pred: arthritis
Actual: arthritis

Pred: asthma
Actual: asthma

Pred: bacteremia
Actual: bacteremia

Pred: benign prostatic hypertrophy
Actual: benign prostatic hypertrophy

Pred: biliary calculus
Actual: biliary calculus

Pred: bipolar disorder
Actual: bipolar disorder

Pred: bronchitis
Actual: bronchitis

Pred: candidiasis
Actual: candidiasis

Pred: carcinoma
Actual: carcinoma

Pred: carcinoma breast
Actual: carcinoma breast

Pred: carcinoma colon
Actual: carcino