In [1]:
# Base Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data Analysis Libraries
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.preprocessing import MinMaxScaler,StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score,roc_auc_score
from sklearn.model_selection import train_test_split, cross_val_score,GridSearchCV
from sklearn.metrics import classification_report,confusion_matrix

# Machine Learning Libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

# Ignore Warnings
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the comma-separated patient symptom file and preview its first rows.
patients = pd.read_csv('patient_symptom_profiles.csv')
patients.head()

Unnamed: 0,Disease,Fever,Cough,Fatigue,DifficultyBreathing,Age,Gender,BloodPressure,CholesterolLevel,OutcomeVariable
0,Influenza,Yes,No,Yes,Yes,19,Female,Low,Normal,Positive
1,Common Cold,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
2,Eczema,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
3,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
4,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive


In [3]:
# Work on an independent deep copy so the loaded frame is left untouched.
patients2 = patients.copy(deep=True)

In [4]:
# Integer-encode every categorical column of the working copy.
# One encoder is refit for each column in turn, so after the loop `le` only
# remembers the classes of the last column ('OutcomeVariable').
# NOTE(review): LabelEncoder numbers labels in sorted order, so the codes for
# 'BloodPressure' and 'CholesterolLevel' are not ordinal (e.g. Low -> 1,
# Normal -> 2 in the preview below) -- confirm this is acceptable for the models.
categorical_columns = [
    'Fever',
    'Cough',
    'Fatigue',
    'DifficultyBreathing',
    'Gender',
    'BloodPressure',
    'CholesterolLevel',
    'OutcomeVariable',
]

le = LabelEncoder()
for column in categorical_columns:
    patients2[column] = le.fit_transform(patients2[column])

In [5]:
# Preview the encoded frame (head() shows five rows by default).
patients2.head()

Unnamed: 0,Disease,Fever,Cough,Fatigue,DifficultyBreathing,Age,Gender,BloodPressure,CholesterolLevel,OutcomeVariable
0,Influenza,1,0,1,1,19,0,1,2,1
1,Common Cold,0,1,1,0,25,0,2,2,0
2,Eczema,0,1,1,0,25,0,2,2,0
3,Asthma,1,1,0,1,25,1,2,2,1
4,Asthma,1,1,0,1,25,1,2,2,1


In [6]:
# Target is the encoded outcome; features are every remaining column except
# the 'Disease' label.
y = patients2['OutcomeVariable']
X = patients2.drop(columns=['OutcomeVariable', 'Disease'])
X.head()

Unnamed: 0,Fever,Cough,Fatigue,DifficultyBreathing,Age,Gender,BloodPressure,CholesterolLevel
0,1,0,1,1,19,0,1,2
1,0,1,1,0,25,0,2,2
2,0,1,1,0,25,0,2,2
3,1,1,0,1,25,1,2,2
4,1,1,0,1,25,1,2,2


In [7]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Candidate classifiers as (short label, estimator) pairs.
# - The tree-based estimators are randomized, so they get a fixed seed to make
#   the comparison reproducible from run to run.
# - LogisticRegression gets a higher iteration cap: the features are unscaled
#   and warnings are silenced globally in this notebook, so a solver that
#   stops before converging at the default cap would otherwise go unnoticed.
models = [('LR', LogisticRegression(max_iter=1000)),
          ('KNN', KNeighborsClassifier()),
          ('CART', DecisionTreeClassifier(random_state=42)),
          ('RF', RandomForestClassifier(random_state=42))]

In [9]:
# Score each candidate with 5-fold cross-validation on the training split,
# collecting [label, mean accuracy] pairs and printing each result.
acclist = []
for name, model in models:
    fold_scores = cross_val_score(model, X_train, y_train, scoring="accuracy", cv=5)
    acc = fold_scores.mean()
    acclist.append([name, acc])
    print(f"ACC: {round(acc, 4)} ({name}) ")

ACC: 0.6489 (LR) 
ACC: 0.699 (KNN) 
ACC: 0.7168 (CART) 
ACC: 0.742 (RF) 


In [10]:
# Fit a random forest on the training split and score one hand-built patient.
rf_model = RandomForestClassifier(random_state=1)
rf_model.fit(X_train, y_train)

# A single patient, with values listed in the column order of X_train:
# Fever, Cough, Fatigue, DifficultyBreathing, Age, Gender, BloodPressure,
# CholesterolLevel. Per the encoded preview earlier in the notebook:
# Yes -> 1 / No -> 0, Female -> 0, BloodPressure Low -> 1,
# CholesterolLevel Normal -> 2.
# Built as a one-row DataFrame carrying the training column names so the
# feature order is explicit and matches what the model was fitted on (a bare
# nested list relies on positional order and triggers scikit-learn's
# feature-name warning, which this notebook suppresses).
test1 = pd.DataFrame([[0, 1, 0, 0, 25, 0, 1, 2]], columns=X_train.columns)

# Predicted class label for the sample.
pred_1_m1 = rf_model.predict(test1)
print(pred_1_m1)

# Class probabilities for the sample, rounded to four decimals.
prob_1_m1 = rf_model.predict_proba(test1)
print(np.round(prob_1_m1, 4))

[0]
[[0.92 0.08]]


The model above predicts the likelihood of *any* major illness in the dataset being present (a positive outcome), as opposed to determining which one.


The possible diseases are listed below:

In [13]:
# Print each distinct disease name from the original (unencoded) frame.
print(patients['Disease'].unique())

['Influenza' 'Common Cold' 'Eczema' 'Asthma' 'Hyperthyroidism'
 'Allergic Rhinitis' 'Anxiety Disorders' 'Diabetes' 'Gastroenteritis'
 'Pancreatitis' 'Rheumatoid Arthritis' 'Depression' 'Liver Cancer'
 'Stroke' 'Urinary Tract Infection' 'Dengue Fever' 'Hepatitis'
 'Kidney Cancer' 'Migraine' 'Muscular Dystrophy' 'Sinusitis'
 'Ulcerative Colitis' 'Bipolar Disorder' 'Bronchitis' 'Cerebral Palsy'
 'Colorectal Cancer' 'Hypertensive Heart Disease' 'Multiple Sclerosis'
 'Myocardial Infarction (Heart...' 'Urinary Tract Infection (UTI)'
 'Osteoporosis' 'Pneumonia' 'Atherosclerosis'
 'Chronic Obstructive Pulmonary...' 'Epilepsy' 'Hypertension'
 'Obsessive-Compulsive Disorde...' 'Psoriasis' 'Rubella' 'Cirrhosis'
 'Conjunctivitis (Pink Eye)' 'Liver Disease' 'Malaria' 'Spina Bifida'
 'Kidney Disease' 'Osteoarthritis' 'Klinefelter Syndrome' 'Acne'
 'Brain Tumor' 'Cystic Fibrosis' 'Glaucoma' 'Rabies' 'Chickenpox'
 'Coronary Artery Disease' 'Eating Disorders (Anorexia,...' 'Fibromyalgia'
 'Hemophilia' 

### It's likely best to use a model to predict a single disease when using such a small dataset. 