In [1]:
import csv
import pandas as pd
import numpy as np
from collections import defaultdict

In [2]:
# Read Raw Dataset
df = pd.read_csv("./raw_data.csv")
df.head()

Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,,,UMLS:C0392680_shortness of breath
2,,,UMLS:C0012833_dizziness
3,,,UMLS:C0004093_asthenia
4,,,UMLS:C0085639_fall


In [3]:
# Fill all NaN with the values above
data = df.fillna(method='ffill')  #forward fill
data.head()

  data = df.fillna(method='ffill')  #forward fill


Unnamed: 0,Disease,Count of Disease Occurrence,Symptom
0,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0008031_pain chest
1,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0392680_shortness of breath
2,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0012833_dizziness
3,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0004093_asthenia
4,UMLS:C0020538_hypertensive disease,3363.0,UMLS:C0085639_fall


In [4]:
# Process Disease and Symptom Names
def process_data(data):
    data_list = []
    data_name = data.replace('^','_').split('_')
    n = 1
    for names in data_name:
        if (n % 2 == 0):
            data_list.append(names) # storing only name 
        n += 1
    return data_list 

In [5]:
# Data Cleanup
disease_list = []
disease_symptom_dict = defaultdict(list)
disease_symptom_count = {}
count = 0

for idx, row in data.iterrows():
    
    # Get the Disease Names
    if (row['Disease'] !="\xc2\xa0") and (row['Disease'] != ""):
        disease = row['Disease']
        disease_list = process_data(data=disease)
        count = row['Count of Disease Occurrence']

    # Get the Symptoms Corresponding to Diseases
    if (row['Symptom'] !="\xc2\xa0") and (row['Symptom'] != ""):
        symptom = row['Symptom']
        symptom_list = process_data(data=symptom)
        for d in disease_list:
            for s in symptom_list:
                disease_symptom_dict[d].append(s)
            disease_symptom_count[d] = count

In [6]:
# See that the data is Processed Correctly
disease_symptom_dict

defaultdict(list,
            {'hypertensive disease': ['pain chest',
              'shortness of breath',
              'dizziness',
              'asthenia',
              'fall',
              'syncope',
              'vertigo',
              'sweat',
              'sweating increased',
              'palpitation',
              'nausea',
              'angina pectoris',
              'pressure chest'],
             'diabetes': ['polyuria',
              'polydypsia',
              'shortness of breath',
              'pain chest',
              'asthenia',
              'nausea',
              'orthopnea',
              'rale',
              'sweat',
              'sweating increased',
              'unresponsiveness',
              'mental status changes',
              'vertigo',
              'vomiting',
              'labored breathing'],
             'depression mental': ['feeling suicidal',
              'suicidal',
              'hallucinations auditory',
              'feel

In [7]:
# Count of Disease Occurence w.r.t each Disease
disease_symptom_count

{'hypertensive disease': 3363.0,
 'diabetes': 1421.0,
 'depression mental': 1337.0,
 'depressive disorder': 1337.0,
 'coronary arteriosclerosis': 1284.0,
 'coronary heart disease': 1284.0,
 'pneumonia': 1029.0,
 'failure heart congestive': 963.0,
 'accident\xa0cerebrovascular': 885.0,
 'asthma': 835.0,
 'myocardial infarction': 759.0,
 'hypercholesterolemia': 685.0,
 'infection': 630.0,
 'infection urinary tract': 597.0,
 'anemia': 544.0,
 'chronic obstructive airway disease': 524.0,
 'dementia': 504.0,
 'insufficiency renal': 445.0,
 'confusion': 408.0,
 'degenerative\xa0polyarthritis': 405.0,
 'hypothyroidism': 398.0,
 'anxiety state': 390.0,
 'malignant neoplasms': 354.0,
 'primary malignant neoplasm': 354.0,
 'acquired\xa0immuno-deficiency syndrome': 350.0,
 'HIV': 350.0,
 'hiv infections': 350.0,
 'cellulitis': 341.0,
 'gastroesophageal reflux disease': 325.0,
 'septicemia': 311.0,
 'systemic infection': 311.0,
 'sepsis (invertebrate)': 311.0,
 'deep vein thrombosis': 310.0,
 'deh

In [8]:
# Save cleaned data as CSV
f = open('./cleaned_data.csv', 'w')

with f:
    writer = csv.writer(f)
    for key, val in disease_symptom_dict.items():
        for i in range(len(val)):
            writer.writerow([key, val[i], disease_symptom_count[key]])

In [9]:
# Read Cleaned Data as DF
df = pd.read_csv('./cleaned_data.csv')
df.columns = ['disease', 'symptom', 'occurence_count']
df.head()

Unnamed: 0,disease,symptom,occurence_count
0,hypertensive disease,shortness of breath,3363.0
1,hypertensive disease,dizziness,3363.0
2,hypertensive disease,asthenia,3363.0
3,hypertensive disease,fall,3363.0
4,hypertensive disease,syncope,3363.0


In [10]:
# Remove any rows with empty values
df.replace(float('nan'), np.nan, inplace=True)
df.dropna(inplace=True)

# Data Preprocessing

In [11]:
from sklearn import preprocessing

In [12]:
n_unique = len(df['symptom'].unique())
n_unique

404

In [13]:
df.dtypes

disease             object
symptom             object
occurence_count    float64
dtype: object

In [14]:
# Encode the Labels
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(df['symptom'])
print(integer_encoded)

[328  87  28 ... 130 122 122]


In [15]:
# One Hot Encode the Labels
onehot_encoder = OneHotEncoder(sparse=False)
integer_encoded = integer_encoded.reshape(len(integer_encoded), 1)
onehot_encoded = onehot_encoder.fit_transform(integer_encoded)
print(onehot_encoded)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]




In [16]:
onehot_encoded[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [17]:
len(onehot_encoded[0])

404

In [18]:
cols = np.asarray(df['symptom'].unique())
cols

array(['shortness of breath', 'dizziness', 'asthenia', 'fall', 'syncope',
       'vertigo', 'sweat', 'sweating increased', 'palpitation', 'nausea',
       'angina pectoris', 'pressure chest', 'polyuria', 'polydypsia',
       'pain chest', 'orthopnea', 'rale', 'unresponsiveness',
       'mental status changes', 'vomiting', 'labored breathing',
       'feeling suicidal', 'suicidal', 'hallucinations auditory',
       'feeling hopeless', 'weepiness', 'sleeplessness',
       'motor retardation', 'irritable mood', 'blackout',
       'mood depressed', 'hallucinations visual', 'worry', 'agitation',
       'tremor', 'intoxication', 'verbal auditory hallucinations',
       'energy increased', 'difficulty', 'nightmare',
       'unable to concentrate', 'homelessness', 'hypokinesia',
       'dyspnea on exertion', 'chest tightness', 'cough', 'fever',
       'decreased translucency', 'productive cough', 'pleuritic pain',
       'yellow sputum', 'breath sounds decreased', 'chill', 'rhonchus',
       '

In [19]:
# Create a new dataframe to save OHE labels
df_ohe = pd.DataFrame(columns = cols)
df_ohe.head()

Unnamed: 0,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,nausea,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts


In [20]:
# loading the data
for i in range(len(onehot_encoded)):
    df_ohe.loc[i] = onehot_encoded[i]

In [21]:
df_ohe.head()

Unnamed: 0,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,nausea,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
len(df_ohe)

2127

In [23]:
# Disease Dataframe
df_disease = df['disease']
df_disease.head()

0    hypertensive disease
1    hypertensive disease
2    hypertensive disease
3    hypertensive disease
4    hypertensive disease
Name: disease, dtype: object

In [24]:
# Concatenate OHE Labels with the Disease Column
df_concat = pd.concat([df_disease,df_ohe], axis=1)
df_concat.head()

Unnamed: 0,disease,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
df_concat.drop_duplicates(keep='first',inplace=True)

In [26]:
df_concat.head()

Unnamed: 0,disease,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,hypertensive disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
len(df_concat)

2098

In [28]:
cols = df_concat.columns
cols

Index(['disease', 'shortness of breath', 'dizziness', 'asthenia', 'fall',
       'syncope', 'vertigo', 'sweat', 'sweating increased', 'palpitation',
       ...
       'feces in rectum', 'prodrome', 'hypoproteinemia',
       'alcohol binge episode', 'abdomen acute', 'air fluid level',
       'catching breath', 'large-for-dates fetus', 'immobile',
       'homicidal thoughts'],
      dtype='object', length=405)

In [29]:
cols = cols[1:]

In [30]:
# Since, every disease has multiple symptoms, combine all symptoms per disease per row
df_concat = df_concat.groupby('disease').sum()
df_concat = df_concat.reset_index()
df_concat.head()

Unnamed: 0,disease,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,Alzheimer's disease,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,HIV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Pneumocystis carinii pneumonia,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,accident cerebrovascular,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,acquired immuno-deficiency syndrome,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
len(df_concat)

149

In [32]:
df_concat.to_csv("./training_dataset.csv", index=False)

## Model Training

In [33]:
# One Hot Encoded Features
X = df_concat[cols]

# Labels
y = df_concat['disease']

In [34]:
X

Unnamed: 0,shortness of breath,dizziness,asthenia,fall,syncope,vertigo,sweat,sweating increased,palpitation,nausea,...,feces in rectum,prodrome,hypoproteinemia,alcohol binge episode,abdomen acute,air fluid level,catching breath,large-for-dates fetus,immobile,homicidal thoughts
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
145,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
146,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
147,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [35]:
y

0                      Alzheimer's disease
1                                      HIV
2           Pneumocystis carinii pneumonia
3                 accident cerebrovascular
4      acquired immuno-deficiency syndrome
                      ...                 
144                  tonic-clonic seizures
145              transient ischemic attack
146          tricuspid valve insufficiency
147                           ulcer peptic
148            upper respiratory infection
Name: disease, Length: 149, dtype: object

## DecisionTreeClassifier

In [36]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [37]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [70]:
len(X_train), len(y_train) , len(X_test), len(y_test)

(119, 119, 30, 30)

In [71]:
dt = DecisionTreeClassifier()
clf_dt=dt.fit(X, y) # Training the model

In [72]:
disease_pred = clf_dt.predict(X) # Predicting the values

In [73]:
disease_real = y.values # Real values
disease_real

array(["Alzheimer's disease", 'HIV',
       'Pneumocystis\xa0carinii\xa0pneumonia',
       'accident\xa0cerebrovascular',
       'acquired\xa0immuno-deficiency syndrome', 'adenocarcinoma',
       'adhesion', 'affect labile', 'anemia', 'anxiety state', 'aphasia',
       'arthritis', 'asthma', 'bacteremia',
       'benign prostatic hypertrophy', 'biliary calculus',
       'bipolar disorder', 'bronchitis', 'candidiasis', 'carcinoma',
       'carcinoma breast', 'carcinoma colon', 'carcinoma of lung',
       'carcinoma prostate', 'cardiomyopathy', 'cellulitis',
       'cholecystitis', 'cholelithiasis',
       'chronic alcoholic intoxication', 'chronic kidney failure',
       'chronic obstructive airway disease', 'cirrhosis', 'colitis',
       'confusion', 'coronary arteriosclerosis', 'coronary heart disease',
       'decubitus ulcer', 'deep vein thrombosis',
       'degenerative\xa0polyarthritis', 'deglutition disorder',
       'dehydration', 'delirium', 'delusion', 'dementia', 'dependence'

In [74]:
crct_count = 0
for i in range(0, len(disease_real)):
    if disease_pred[i]==disease_real[i]:
        print ('Pred: {0}\nActual: {1}\n'.format(disease_pred[i], disease_real[i]))
        crct_count += 1

Pred: Alzheimer's disease
Actual: Alzheimer's disease

Pred: HIV
Actual: HIV

Pred: Pneumocystis carinii pneumonia
Actual: Pneumocystis carinii pneumonia

Pred: accident cerebrovascular
Actual: accident cerebrovascular

Pred: acquired immuno-deficiency syndrome
Actual: acquired immuno-deficiency syndrome

Pred: adenocarcinoma
Actual: adenocarcinoma

Pred: adhesion
Actual: adhesion

Pred: affect labile
Actual: affect labile

Pred: anemia
Actual: anemia

Pred: anxiety state
Actual: anxiety state

Pred: aphasia
Actual: aphasia

Pred: arthritis
Actual: arthritis

Pred: asthma
Actual: asthma

Pred: bacteremia
Actual: bacteremia

Pred: benign prostatic hypertrophy
Actual: benign prostatic hypertrophy

Pred: biliary calculus
Actual: biliary calculus

Pred: bipolar disorder
Actual: bipolar disorder

Pred: bronchitis
Actual: bronchitis

Pred: candidiasis
Actual: candidiasis

Pred: carcinoma
Actual: carcinoma

Pred: carcinoma breast
Actual: carcinoma breast

Pred: carcinoma colon
Actual: carcino

In [75]:
w_count = 0
for i in range(0, len(disease_real)):
    if disease_pred[i]!=disease_real[i]:
        print ('Pred: {0}\nActual: {1}\n'.format(disease_pred[i], disease_real[i]))
        w_count += 1

Pred: coronary arteriosclerosis
Actual: coronary heart disease

Pred: depression mental
Actual: depressive disorder

Pred: malignant neoplasms
Actual: primary malignant neoplasm

Pred: septicemia
Actual: systemic infection



In [76]:
print('DecisionTreeClassifier  :',clf_dt.score(X, y)*100,'%')

DecisionTreeClassifier  : 97.31543624161074 %


# ML Model

In [77]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

## Train Test Split

In [78]:
# Train Test Split
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

In [79]:
xtrain.shape, xtest.shape, ytrain.shape, ytest.shape

((119, 404), (30, 404), (119,), (30,))

In [80]:
ytrain

22                carcinoma of lung
15                 biliary calculus
65                         glaucoma
11                        arthritis
42                         delusion
                   ...             
71                      hepatitis C
106                        neoplasm
14     benign prostatic hypertrophy
92             kidney failure acute
102                        melanoma
Name: disease, Length: 119, dtype: object

In [81]:
ytrain = np.array(ytrain)
ytrain

array(['carcinoma of lung', 'biliary calculus', 'glaucoma', 'arthritis',
       'delusion', 'myocardial infarction', 'effusion pericardial',
       'cholelithiasis', 'acquired\xa0immuno-deficiency syndrome',
       'colitis', 'paroxysmal\xa0dyspnea', 'infection',
       'infection urinary tract', 'bipolar disorder', 'aphasia',
       'hypoglycemia', 'schizophrenia', 'thrombus', 'hyperbilirubinemia',
       'neuropathy', 'malignant neoplasm of breast',
       'mitral valve insufficiency', 'gout', "Alzheimer's disease",
       'thrombocytopaenia', 'hemiparesis',
       'chronic alcoholic intoxication', 'dehydration', 'dependence',
       'failure kidney', 'peripheral vascular disease', 'cardiomyopathy',
       'cellulitis', 'carcinoma prostate', 'lymphatic diseases',
       'deglutition disorder', 'lymphoma', 'suicide attempt', 'diabetes',
       'malignant neoplasm of lung', 'oral\xa0candidiasis', 'confusion',
       'malignant neoplasms', 'transient ischemic attack', 'gastritis',
     

## Classification

In [82]:
logreg = LogisticRegression()
svc_classifier = SVC()
dt_classifier = DecisionTreeClassifier()
knn_classifier = KNeighborsClassifier(5)
rf_classifier = RandomForestClassifier(n_estimators=1000, criterion = 'entropy', random_state = 0 )

In [83]:
logreg.fit(xtrain, ytrain)
svc_classifier.fit(xtrain, ytrain)
dt_classifier.fit(xtrain, ytrain)
knn_classifier.fit(xtrain, ytrain)
rf_classifier.fit(xtrain, ytrain)

In [84]:
logreg_ypred = logreg.predict(xtest)
svc_classifier_ypred = svc_classifier.predict(xtest)
dt_classifier_ypred = dt_classifier.predict(xtest)
knn_classifier_ypred = knn_classifier.predict(xtest)
rf_classifier_ypred = rf_classifier.predict(xtest)

In [85]:
# finding accuracy

logreg_acc = accuracy_score(ytest, logreg_ypred)
svc_classifier_acc = accuracy_score(ytest, svc_classifier_ypred)
dt_classifier_acc = accuracy_score(ytest, dt_classifier_ypred)
knn_classifier_acc = accuracy_score(ytest, knn_classifier_ypred)
rf_classifier_acc = accuracy_score(ytest, rf_classifier_ypred)

In [86]:
print ("Logistic Regression : ", round(logreg_acc*100, 2))
print ("Support Vector      : ", round(svc_classifier_acc*100, 2))
print ("Decision Tree       : ", round(dt_classifier_acc*100, 2))
print ("K-NN Classifier     : ", round(knn_classifier_acc*100, 2))
print ("Random Forest       : ", round(rf_classifier_acc*100, 2))

Logistic Regression :  0.0
Support Vector      :  0.0
Decision Tree       :  0.0
K-NN Classifier     :  0.0
Random Forest       :  0.0


In [55]:
logreg_ypred

array(['gastritis', 'oral\xa0candidiasis', 'cholelithiasis',
       'coronary heart disease', 'myocardial infarction', 'thrombus',
       'hypertensive disease', 'thrombus', 'failure heart congestive',
       'pneumothorax', 'systemic infection', 'failure heart congestive',
       'kidney disease', 'depressive disorder', 'ileus',
       'tonic-clonic epilepsy', 'obesity morbid', 'pyelonephritis',
       'failure heart congestive', 'edema pulmonary',
       'insufficiency renal', 'delusion', 'ileus', 'kidney failure acute',
       'malignant neoplasms', 'tonic-clonic epilepsy',
       'stenosis aortic valve', 'depressive disorder',
       'depressive disorder', 'failure heart congestive'], dtype=object)

In [56]:
svc_classifier_ypred

array(['pyelonephritis', 'oral\xa0candidiasis', 'cholelithiasis',
       'coronary heart disease', 'myocardial infarction',
       'kidney disease', 'hypertensive disease', 'thrombus',
       'failure heart congestive', 'pneumothorax', 'systemic infection',
       'failure heart congestive', 'kidney disease',
       'depressive disorder', 'ileus', 'failure heart congestive',
       'kidney disease', 'pyelonephritis', 'failure heart congestive',
       'failure heart congestive', 'pyelonephritis', 'delusion',
       'melanoma', 'kidney failure acute', 'malignant neoplasms',
       'tonic-clonic epilepsy', 'stenosis aortic valve', 'affect labile',
       'depressive disorder', 'failure heart congestive'], dtype=object)

In [57]:
dt_classifier_ypred

array(['neoplasm', 'exanthema', 'cholelithiasis', 'adhesion',
       'biliary calculus', 'lymphoma', 'coronary arteriosclerosis',
       'cholelithiasis', 'glaucoma', 'overload fluid', 'lymphoma',
       'hyperbilirubinemia', 'lymphoma', 'depressive disorder',
       'adhesion', 'tonic-clonic epilepsy', 'coronary arteriosclerosis',
       'osteomyelitis', 'failure kidney', 'gout', 'sepsis (invertebrate)',
       'carcinoma breast', 'lymphoma', 'kidney failure acute',
       'malignant neoplasms', 'tonic-clonic epilepsy',
       'tonic-clonic epilepsy', 'adhesion', 'depressive disorder',
       'coronary arteriosclerosis'], dtype=object)

In [58]:
knn_classifier_ypred

array(['gastritis', 'HIV', 'cholelithiasis',
       'accident\xa0cerebrovascular', 'coronary arteriosclerosis',
       'gastroenteritis', 'coronary heart disease',
       'coronary heart disease', 'arthritis',
       'accident\xa0cerebrovascular', 'benign prostatic hypertrophy',
       'arthritis', 'affect labile', 'bipolar disorder', 'cholelithiasis',
       'accident\xa0cerebrovascular', 'accident\xa0cerebrovascular',
       'failure heart congestive', 'benign prostatic hypertrophy',
       'arthritis', 'failure heart congestive', 'delusion',
       'cholelithiasis', 'arthritis', 'biliary calculus',
       'accident\xa0cerebrovascular', 'coronary arteriosclerosis',
       'affect labile', 'bipolar disorder', 'arthritis'], dtype=object)

In [59]:
np.array(ytest)

array(['hernia\xa0hiatal', 'candidiasis', 'pancytopenia',
       'hyperlipidemia', 'hypercholesterolemia', 'cirrhosis',
       'gastroesophageal reflux disease', 'tachycardia sinus',
       'hemorrhoids', 'hypothyroidism', 'neutropenia', 'asthma',
       'decubitus ulcer', 'anxiety state', 'carcinoma', 'epilepsy',
       'spasm bronchial', 'hepatitis', 'endocarditis',
       'respiratory failure', 'chronic kidney failure',
       'personality disorder', 'cholecystitis',
       'primary carcinoma of the liver cells',
       'primary malignant neoplasm', 'tonic-clonic seizures', 'obesity',
       'manic disorder', 'depression mental',
       'chronic obstructive airway disease'], dtype=object)

ValueError: could not convert string to float: 'hernia\xa0hiatal'