## Disease Prediction from Symptoms

For this project, we'll be using the dataset from here: http://people.dbmi.columbia.edu/~friedma/Projects/DiseaseSymptomKB/index.html

Copy the data from all the columns and paste it into an excel sheet -> `raw_data.xlsx`

In [1]:
# Import Dependencies
import csv
import pandas as pd
import numpy as np
from collections import defaultdict
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Read Raw Dataset
df = pd.read_excel('raw_data.xlsx')

In [5]:
df

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall
...,...,...,...
1861,,,UMLS:C0425251_bedridden^UMLS:C0741453_bedridden
1862,,,UMLS:C0242453_prostatism
1863,UMLS:C0011127_decubitus ulcer,42.0,UMLS:C0232257_systolic murmur
1864,,,UMLS:C0871754_frail


In [4]:
# Fill all NaN with the values above
data = df.fillna(method='ffill')

In [10]:
data

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0392680_shortness of breath
2,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0012833_dizziness
3,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0004093_asthenia
4,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0085639_fall


In [10]:
# Process Disease and Symptom Names
def process_data(data):
    data_list = []
    data_name = data.replace('^','_').split('_')
    print(data_name)
    n = 1
    for names in data_name:
        if (n % 2 == 0):
            data_list.append(names)
        n += 1
    print(f"data_list=={data_list}")
    return data_list

In [13]:
# Data Cleanup
disease_list = []
disease_symptom_dict = defaultdict(list)
disease_symptom_count = {}
count = 0

for idx, row in data.iterrows():
    # Get the Disease Names
    if (row['Disease'] !="\xc2\xa0") and (row['Disease'] != ""):
        disease = row['Disease']
        disease_list = process_data(data=disease)
        count = row['Count of Disease Occurrence']

    # Get the Symptoms Corresponding to Diseases
    if (row['Symptom'] !="\xc2\xa0") and (row['Symptom'] != ""):
        symptom = row['Symptom']
        symptom_list = process_data(data=symptom)
        for d in disease_list:
            for s in symptom_list:
                disease_symptom_dict[d].append(s)
            disease_symptom_count[d] = count

idx==0 and row==<class 'pandas.core.series.Series'>
['UMLS:C0020538', 'hypertensive disease']
data_list==['hypertensive disease']
['UMLS:C0008031', 'pain chest']
data_list==['pain chest']
idx==1 and row==<class 'pandas.core.series.Series'>
['UMLS:C0020538', 'hypertensive disease']
data_list==['hypertensive disease']
['UMLS:C0392680', 'shortness of breath']
data_list==['shortness of breath']
idx==2 and row==<class 'pandas.core.series.Series'>
['UMLS:C0020538', 'hypertensive disease']
data_list==['hypertensive disease']
['UMLS:C0012833', 'dizziness']
data_list==['dizziness']
idx==3 and row==<class 'pandas.core.series.Series'>
['UMLS:C0020538', 'hypertensive disease']
data_list==['hypertensive disease']
['UMLS:C0004093', 'asthenia']
data_list==['asthenia']
idx==4 and row==<class 'pandas.core.series.Series'>
['UMLS:C0020538', 'hypertensive disease']
data_list==['hypertensive disease']
['UMLS:C0085639', 'fall']
data_list==['fall']
idx==5 and row==<class 'pandas.core.series.Series'>
['UMLS:C

idx==410 and row==<class 'pandas.core.series.Series'>
['UMLS:C0034065', 'embolism pulmonary']
data_list==['embolism pulmonary']
['UMLS:C0744727', 'hematocrit decreased']
data_list==['hematocrit decreased']
idx==411 and row==<class 'pandas.core.series.Series'>
['UMLS:C0034065', 'embolism pulmonary']
data_list==['embolism pulmonary']
['UMLS:C0008031', 'pain chest']
data_list==['pain chest']
idx==412 and row==<class 'pandas.core.series.Series'>
['UMLS:C0034065', 'embolism pulmonary']
data_list==['embolism pulmonary']
['UMLS:C0013404', 'dyspnea']
data_list==['dyspnea']
idx==413 and row==<class 'pandas.core.series.Series'>
['UMLS:C0034065', 'embolism pulmonary']
data_list==['embolism pulmonary']
['UMLS:C0008033', 'pleuritic pain']
data_list==['pleuritic pain']
idx==414 and row==<class 'pandas.core.series.Series'>
['UMLS:C0034065', 'embolism pulmonary']
data_list==['embolism pulmonary']
['UMLS:C0151315', 'neck stiffness']
data_list==['neck stiffness']
idx==415 and row==<class 'pandas.core.se

data_list==['posturing']
idx==810 and row==<class 'pandas.core.series.Series'>
['UMLS:C0019270', 'hernia']
data_list==['hernia']
['UMLS:C0018991', 'hemiplegia']
data_list==['hemiplegia']
idx==811 and row==<class 'pandas.core.series.Series'>
['UMLS:C0019270', 'hernia']
data_list==['hernia']
['UMLS:C0234233', 'sore to touch']
data_list==['sore to touch']
idx==812 and row==<class 'pandas.core.series.Series'>
['UMLS:C0019270', 'hernia']
data_list==['hernia']
['UMLS:C0019080', 'haemorrhage']
data_list==['haemorrhage']
idx==813 and row==<class 'pandas.core.series.Series'>
['UMLS:C0019270', 'hernia']
data_list==['hernia']
['UMLS:C0277797', 'apyrexial']
data_list==['apyrexial']
idx==814 and row==<class 'pandas.core.series.Series'>
['UMLS:C0019270', 'hernia']
data_list==['hernia']
['UMLS:C0149696', 'food intolerance']
data_list==['food intolerance']
idx==815 and row==<class 'pandas.core.series.Series'>
['UMLS:C0019270', 'hernia']
data_list==['hernia']
['UMLS:C0277899', 'pulse absent']
data_list

data_list==['pain abdominal']
idx==1210 and row==<class 'pandas.core.series.Series'>
['UMLS:C0019163', 'hepatitis B']
data_list==['hepatitis B']
['UMLS:C0232498', 'abdominal tenderness']
data_list==['abdominal tenderness']
idx==1211 and row==<class 'pandas.core.series.Series'>
['UMLS:C0019163', 'hepatitis B']
data_list==['hepatitis B']
['UMLS:C0558195', 'wheelchair bound']
data_list==['wheelchair bound']
idx==1212 and row==<class 'pandas.core.series.Series'>
['UMLS:C0019163', 'hepatitis B']
data_list==['hepatitis B']
['UMLS:C0859032', 'moan']
data_list==['moan']
idx==1213 and row==<class 'pandas.core.series.Series'>
['UMLS:C0030567', 'parkinson disease']
data_list==['parkinson disease']
['UMLS:C1321756', 'achalasia']
data_list==['achalasia']
idx==1214 and row==<class 'pandas.core.series.Series'>
['UMLS:C0030567', 'parkinson disease']
data_list==['parkinson disease']
['UMLS:C0085639', 'fall']
data_list==['fall']
idx==1215 and row==<class 'pandas.core.series.Series'>
['UMLS:C0030567', 'p

['UMLS:C0205400', 'thicken']
data_list==['thicken']
idx==1610 and row==<class 'pandas.core.series.Series'>
['UMLS:C0014118', 'endocarditis']
data_list==['endocarditis']
['UMLS:C0231528', 'myalgia']
data_list==['myalgia']
idx==1611 and row==<class 'pandas.core.series.Series'>
['UMLS:C0014118', 'endocarditis']
data_list==['endocarditis']
['UMLS:C0277797', 'apyrexial']
data_list==['apyrexial']
idx==1612 and row==<class 'pandas.core.series.Series'>
['UMLS:C0014118', 'endocarditis']
data_list==['endocarditis']
['UMLS:C0028081', 'night sweat']
data_list==['night sweat']
idx==1613 and row==<class 'pandas.core.series.Series'>
['UMLS:C0014118', 'endocarditis']
data_list==['endocarditis']
['UMLS:C1517205', 'flare']
data_list==['flare']
idx==1614 and row==<class 'pandas.core.series.Series'>
['UMLS:C0014118', 'endocarditis']
data_list==['endocarditis']
['UMLS:C0392680', 'shortness of breath']
data_list==['shortness of breath']
idx==1615 and row==<class 'pandas.core.series.Series'>
['UMLS:C0014118'

In [13]:
# See that the data is Processed Correctly
disease_symptom_dict

defaultdict(list,
            {'hypertensive disease': ['pain chest',
              'shortness of breath',
              'dizziness',
              'asthenia',
              'fall',
              'syncope',
              'vertigo',
              'sweat',
              'sweating increased',
              'palpitation',
              'nausea',
              'angina pectoris',
              'pressure chest'],
             'diabetes': ['polyuria',
              'polydypsia',
              'shortness of breath',
              'pain chest',
              'asthenia',
              'nausea',
              'orthopnea',
              'rale',
              'sweat',
              'sweating increased',
              'unresponsiveness',
              'mental status changes',
              'vertigo',
              'vomiting',
              'labored breathing'],
             'depression mental': ['feeling suicidal',
              'suicidal',
              'hallucinations auditory',
              'feel

In [14]:
# Count of Disease Occurence w.r.t each Disease
disease_symptom_count

{'hypertensive disease': 3363.0,
 'diabetes': 1421.0,
 'depression mental': 1337.0,
 'depressive disorder': 1337.0,
 'coronary arteriosclerosis': 1284.0,
 'coronary heart disease': 1284.0,
 'pneumonia': 1029.0,
 'failure heart congestive': 963.0,
 'accident\xa0cerebrovascular': 885.0,
 'asthma': 835.0,
 'myocardial infarction': 759.0,
 'hypercholesterolemia': 685.0,
 'infection': 630.0,
 'infection urinary tract': 597.0,
 'anemia': 544.0,
 'chronic obstructive airway disease': 524.0,
 'dementia': 504.0,
 'insufficiency renal': 445.0,
 'confusion': 408.0,
 'degenerative\xa0polyarthritis': 405.0,
 'hypothyroidism': 398.0,
 'anxiety state': 390.0,
 'malignant neoplasms': 354.0,
 'primary malignant neoplasm': 354.0,
 'acquired\xa0immuno-deficiency syndrome': 350.0,
 'HIV': 350.0,
 'hiv infections': 350.0,
 'cellulitis': 341.0,
 'gastroesophageal reflux disease': 325.0,
 'septicemia': 311.0,
 'systemic infection': 311.0,
 'sepsis (invertebrate)': 311.0,
 'deep vein thrombosis': 310.0,
 'deh

In [15]:
# Save cleaned data as CSV
f = open('cleaned_data.csv', 'w')

with f:
    writer = csv.writer(f)
    for key, val in disease_symptom_dict.items():
        for i in range(len(val)):
            writer.writerow([key, val[i], disease_symptom_count[key]])

In [15]:
# Read Cleaned Data as DF
df = pd.read_csv('cleaned_data.csv',encoding="ISO-8859-1")
df.columns = ['disease', 'symptom', 'occurence_count']
df.head()

Unnamed: 0,disease,symptom,occurence_count
0,hypertensive disease,shortness of breath,3363.0
1,hypertensive disease,dizziness,3363.0
2,hypertensive disease,asthenia,3363.0
3,hypertensive disease,fall,3363.0
4,hypertensive disease,syncope,3363.0


In [17]:
# Remove any rows with empty values
df.replace(float('nan'), np.nan, inplace=True)
df.dropna(inplace=True)

In [18]:
from sklearn import preprocessing

In [19]:
n_unique = len(df['symptom'].unique())
n_unique

404

In [20]:
df.dtypes

disease             object
symptom             object
occurence_count    float64
dtype: object

In [21]:
# Encode the Labels
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(df['symptom'])
print(integer_encoded)

[328  87  28 ... 361 130 122]


In [22]:
print(integer_encoded)

[328  87  28 ... 361 130 122]


In [23]:
# One Hot Encode the Labels
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [24]:
onehot_encoded[1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [25]:
len(onehot_encoded[0])

404

In [26]:
cols = np.asarray(df['symptom'].unique())
cols

array(['shortness of breath', 'dizziness', 'asthenia', 'fall', 'syncope',
       'vertigo', 'sweat', 'sweating increased', 'palpitation', 'nausea',
       'angina pectoris', 'pressure chest', 'polyuria', 'polydypsia',
       'pain chest', 'orthopnea', 'rale', 'unresponsiveness',
       'mental status changes', 'vomiting', 'labored breathing',
       'feeling suicidal', 'suicidal', 'hallucinations auditory',
       'feeling hopeless', 'weepiness', 'sleeplessness',
       'motor retardation', 'irritable mood', 'blackout',
       'mood depressed', 'hallucinations visual', 'worry', 'agitation',
       'tremor', 'intoxication', 'verbal auditory hallucinations',
       'energy increased', 'difficulty', 'nightmare',
       'unable to concentrate', 'homelessness', 'hypokinesia',
       'dyspnea on exertion', 'chest tightness', 'cough', 'fever',
       'decreased translucency', 'productive cough', 'pleuritic pain',
       'yellow sputum', 'breath sounds decreased', 'chill', 'rhonchus',
       '

In [27]:
# Create a new dataframe to save OHE labels
df_ohe = pd.DataFrame(columns = cols)
df_ohe.head()

Unnamed: 0,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,nausea,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts


In [28]:
for i in range(len(onehot_encoded)):
    df_ohe.loc[i] = onehot_encoded[i]

In [29]:
df_ohe.head()

Unnamed: 0,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,nausea,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
len(df_ohe)

2126

In [31]:
# Disease Dataframe
df_disease = df['disease']
df_disease.head()

0    hypertensive disease
1    hypertensive disease
2    hypertensive disease
3    hypertensive disease
4    hypertensive disease
Name: disease, dtype: object

In [37]:
# Concatenate OHE Labels with the Disease Column
df_concat = pd.concat([df_disease,df_ohe], axis=1)
df_concat.head()

Unnamed: 0,disease,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [38]:
df_concat.drop_duplicates(keep='first',inplace=True)

In [39]:
df_concat.head()

Unnamed: 0,disease,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
len(df_concat)

2097

In [41]:
cols = df_concat.columns
cols

Index(['disease', 'shortness of breath', 'dizziness', 'asthenia', 'fall',
       'syncope', 'vertigo', 'sweat', 'sweating increased', 'palpitation',
       ...
       'feces in rectum', 'prodrome', 'hypoproteinemia',
       'alcohol binge episode', 'abdomen acute', 'air fluid level',
       'catching breath', 'large-for-dates fetus', 'immobile',
       'homicidal thoughts'],
      dtype='object', length=405)

In [42]:
cols = cols[1:]

In [43]:
# Since, every disease has multiple symptoms, combine all symptoms per disease per row
df_concat = df_concat.groupby('disease').sum()
df_concat = df_concat.reset_index()
df_concat[:100]

Unnamed: 0,disease,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,Alzheimer's disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,HIV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Pneumocystis carinii pneumonia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,accident cerebrovascular,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,acquired immuno-deficiency syndrome,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,malignant neoplasm of breast,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
96,malignant neoplasm of lung,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
97,malignant neoplasm of prostate,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
98,malignant neoplasms,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [44]:
len(df_concat)

149

In [45]:
df_concat.to_csv("training_dataset.csv", index=False)

In [46]:
# One Hot Encoded Features
X = df_concat[cols]

# Labels
y = df_concat['disease']

## Model Training

In [47]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier, export_graphviz

In [48]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

In [49]:
len(X_train), len(y_train)

(119, 119)

In [50]:
len(X_test), len(y_test)

(30, 30)

In [51]:
dt = DecisionTreeClassifier()
clf_dt=dt.fit(X, y)

# Model saving

In [52]:
import pickle

In [53]:
with open('arist_demo','wb') as f:
    pickle.dump(clf_dt,f)

In [55]:
import joblib

In [56]:
joblib.dump(clf_dt,'Arista_symtoms_model')

['Arista_symtoms_model']

In [57]:
clf_dt.score(X, y)

0.9731543624161074

In [68]:
export_graphviz(dt, 
                out_file='./tree.dot', 
                feature_names=cols)

In [69]:
!pip install graphviz



In [72]:
# from graphviz import Source
# from sklearn import tree

# graph = Source(export_graphviz(dt, 
#                 out_file=None, 
#                 feature_names=cols))

# png_bytes = graph.pipe(format='png')

# with open('tree.png','wb') as f:
#     f.write(png_bytes)

In [None]:
from IPython.display import Image
Image(png_bytes)

In [96]:
symptoms=open('symptoms.txt','w',encoding='UTF-8')
l=[]
for i in X.columns:
    l.append(i+' \n')
symptoms.writelines(l)
symptoms.close()

In [74]:
disease_pred = clf_dt.predict(X)

In [75]:
li=[]
for i in range(1,405):
    if i==15 or i==20 or i==11 or i==100 or i== 29 or (i>=95 and i<=100):
        li.append(1)
    else:
        li.append(0)
    

In [76]:
p=[0]*404

In [77]:
clf_dt.predict([p])[0]

'decubitus ulcer'

In [78]:
disease_real = y.values

In [79]:
for i in range(0, len(disease_real)):
    if disease_pred[i]!=disease_real[i]:
        print ('Pred: {0}\nActual: {1}\n'.format(disease_pred[i], disease_real[i]))

Pred: coronary arteriosclerosis
Actual: coronary heart disease

Pred: depression mental
Actual: depressive disorder

Pred: malignant neoplasms
Actual: primary malignant neoplasm

Pred: septicemia
Actual: systemic infection

