In [1]:
# libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from scipy.stats import mode
import json
import pickle

In [2]:
# dataset creation
train_data = pd.read_csv("../datasets/archive/Training.csv")
test_data = pd.read_csv("../datasets/archive/Testing.csv")

# features
x_train = train_data.drop(columns=["prognosis"]).dropna(axis=1, how="any", thresh=None, subset=None, inplace=False)

# labels
y_train = train_data["prognosis"]

x_test = test_data.drop(columns=["prognosis"])
y_test = test_data["prognosis"]

In [3]:
symptomes = list(x_train.columns)
diseases = list(set(y_train))

In [4]:
# symptomes list on which the disease is to be predicted
def df_sym(syms):
    di = {}
    for e in symptomes:
        di[e] = 0
    for e in syms:
        if e in di:
            di[e] = 1
    test = pd.DataFrame(di, index=[0])
    return test

In [5]:
test1 = df_sym(list(( 'headache', 'vomiting', 'high_fever', 'back_pain', 'joint_pain', 'nausea', 'loss_of_appetite', 'malaise' ,'muscle_pain')))
test2 = df_sym(list(('itching','skin_rash','nodal_skin_eruptions')))

In [7]:
# model training (skip if already done)

In [23]:
# support vector classifier
model_svm = SVC(probability=True)
model_svm.fit(x_train, y_train)
with open("../models/model_svm.pkl", "wb") as file:
    pickle.dump(model_svm, file)

# Gaussian naive bayes
model_gnb = GaussianNB()
model_gnb.fit(x_train, y_train)
with open("../models/model_gnb.pkl", "wb") as file:
    pickle.dump(model_gnb, file)

# Multinomial naive bayes
model_mnb = MultinomialNB()
model_mnb.fit(x_train, y_train)
with open("../models/model_mnb.pkl", "wb") as file:
    pickle.dump(model_mnb, file)

# decision tree classifier
model_dtc = DecisionTreeClassifier()
model_dtc.fit(x_train, y_train)
with open("../models/model_dtc.pkl", "wb") as file:
    pickle.dump(model_dtc, file)

# random forest classifier
model_rfc = RandomForestClassifier()
model_rfc.fit(x_train, y_train)
with open("../models/model_rfc.pkl", "wb") as file:
    pickle.dump(model_rfc, file)
    
# gradient boosting classifier
model_gbc = GradientBoostingClassifier()
model_gbc.fit(x_train, y_train)
with open("../models/model_gbc.pkl", "wb") as file:
    pickle.dump(model_gbc, file)

In [8]:
# loading saved models

In [9]:
with open("../models/model_svm.pkl", "rb") as file:
    model_svm = pickle.load(file)

with open("../models/model_gnb.pkl", "rb") as file:
    model_gnb = pickle.load(file)
    
with open("../models/model_mnb.pkl", "rb") as file:
    model_mnb = pickle.load(file)
    
with open("../models/model_rfc.pkl", "rb") as file:
    model_rfc = pickle.load(file)
    
with open("../models/model_dtc.pkl", "rb") as file:
    model_dtc = pickle.load(file)
    
with open("../models/model_gbc.pkl", "rb") as file:
    model_gbc = pickle.load(file)

In [10]:
# predtion results on test dataset

In [11]:
prediction_res_svm = model_svm.predict(x_test)
acc_score = accuracy_score(y_test, prediction_res_svm)*100
print("accuray score of support vector classifier : ",acc_score)

prediction_res_gnb = model_gnb.predict(x_test)
acc_score = accuracy_score(y_test, prediction_res_gnb)*100
print("accuray score of gaussian naive bayes classifier : ",acc_score)

prediction_res_mnb = model_mnb.predict(x_test)
acc_score = accuracy_score(y_test, prediction_res_mnb)*100
print("accuray score of multinomial naive bayes classifier : ",acc_score)

prediction_res_rfc = model_rfc.predict(x_test)
acc_score = accuracy_score(y_test, prediction_res_rfc)*100
print("accuray score of random forest classifier : ",acc_score)

prediction_res_dtc = model_dtc.predict(x_test)
acc_score = accuracy_score(y_test, prediction_res_dtc)*100
print("accuray score of decision tree classifier : ",acc_score)

prediction_res_gbc = model_gbc.predict(x_test)
acc_score = accuracy_score(y_test, prediction_res_gbc)*100
print("accuray score of gradient boosting classifier : ",acc_score)

accuray score of support vector classifier :  100.0
accuray score of gaussian naive bayes classifier :  100.0
accuray score of multinomial naive bayes classifier :  100.0
accuray score of random forest classifier :  97.61904761904762
accuray score of decision tree classifier :  97.61904761904762
accuray score of gradient boosting classifier :  97.61904761904762


In [12]:
# loading medicines dataset

In [23]:
with open("../datasets/medicines.json") as file:
    m_di = json.load(file)

In [14]:
# sample input tests

In [15]:
test1 = df_sym(list(( 'headache', 'vomiting', 'high_fever', 'back_pain', 'joint_pain', 'nausea', 'loss_of_appetite', 'malaise' ,'muscle_pain')))
test2 = df_sym(list(('itching','skin_rash','nodal_skin_eruptions')))

In [16]:
def complete_prediction(model, test):
    pred_prob = model.predict_proba(test)
    classes = list(model.classes_)
    results = {}
    for i in range(len(classes)):
        results[classes[i]] = pred_prob[0][i]*100
    top3 = sorted(results, key=results.get, reverse=True)[:3]
    final_prediction = []
    for e in top3:
        disease_name = e
        confidence_ratio = results[e]
#         taking only 3 medicines atmost
        medicines = m_di[e][:3]
        final_prediction.append({"disease_name":disease_name,
                                "confidence_ratio":confidence_ratio,
                                "medicines":medicines})
    return final_prediction

In [17]:
# testing on sample test1

In [21]:
print("svc results : ")
svc_results = complete_prediction(model_svm, test1)
for e in svc_results:
    print(e)

svc results : 
{'disease_name': 'Dengue', 'confidence_ratio': 16.262309588521823, 'medicines': ['Dengvaxia', 'Corticosteroids', 'corticosteroid']}
{'disease_name': 'Malaria', 'confidence_ratio': 7.8459822040306655, 'medicines': ['Doxycycline', 'Lariam', 'Doxy 100']}
{'disease_name': 'Paralysis (brain hemorrhage)', 'confidence_ratio': 4.154518400124923, 'medicines': ['Dichlorphenamide', 'thiopental', 'GABA']}


In [24]:
print("gnb results : ")
gnb_results = complete_prediction(model_gnb, test1)
for e in gnb_results:
    print(e)

gnb results : 
{'disease_name': 'Dengue', 'confidence_ratio': 100.0, 'medicines': ['Dengvaxia', 'Corticosteroids', 'corticosteroid']}
{'disease_name': '(vertigo) Paroymsal  Positional Vertigo', 'confidence_ratio': 0.0, 'medicines': ['NA']}
{'disease_name': 'AIDS', 'confidence_ratio': 0.0, 'medicines': ['Marinol', 'Somatropin', 'Dronabinol']}


In [25]:
print("mnb results : ")
mnb_results = complete_prediction(model_mnb, test1)
for e in mnb_results:
    print(e)

mnb results : 
{'disease_name': 'Dengue', 'confidence_ratio': 99.9999416572663, 'medicines': ['Dengvaxia', 'Corticosteroids', 'corticosteroid']}
{'disease_name': 'Malaria', 'confidence_ratio': 5.3433195082607805e-05, 'medicines': ['Atovaquone / proguanil', 'Doxycycline', 'Coartem']}
{'disease_name': 'hepatitis A', 'confidence_ratio': 3.7137414231450943e-06, 'medicines': ['Hepatitis b adult vaccine', 'Primavax', 'paracetamol']}


In [27]:
print("dtc results : ")
dtc_results = complete_prediction(model_dtc, test1)
for e in dtc_results:
    print(e)

dtc results : 
{'disease_name': 'Malaria', 'confidence_ratio': 100.0, 'medicines': ['Atovaquone / proguanil', 'Doxycycline', 'Coartem']}
{'disease_name': '(vertigo) Paroymsal  Positional Vertigo', 'confidence_ratio': 0.0, 'medicines': ['NA']}
{'disease_name': 'AIDS', 'confidence_ratio': 0.0, 'medicines': ['Marinol', 'Somatropin', 'Dronabinol']}


In [28]:
print("rfc results : ")
rfc_results = complete_prediction(model_rfc, test1)
for e in rfc_results:
    print(e)

rfc results : 
{'disease_name': 'Dengue', 'confidence_ratio': 42.0, 'medicines': ['Dengvaxia', 'Corticosteroids', 'corticosteroid']}
{'disease_name': 'Malaria', 'confidence_ratio': 24.0, 'medicines': ['Atovaquone / proguanil', 'Doxycycline', 'Coartem']}
{'disease_name': 'hepatitis A', 'confidence_ratio': 9.0, 'medicines': ['Hepatitis b adult vaccine', 'Primavax', 'paracetamol']}


In [29]:
print("gbc results : ")
gbc_results = complete_prediction(model_gbc, test1)
for e in gbc_results:
    print(e)

gbc results : 
{'disease_name': 'Hepatitis D', 'confidence_ratio': 55.54184134604032, 'medicines': ['Hepatitis b adult vaccine', 'Primavax', 'paracetamol']}
{'disease_name': 'Cervical spondylosis', 'confidence_ratio': 11.346819086615815, 'medicines': ['Botox', 'OnabotulinumtoxinA', 'Gardasil']}
{'disease_name': 'Malaria', 'confidence_ratio': 6.119281795973552, 'medicines': ['Atovaquone / proguanil', 'Doxycycline', 'Coartem']}


In [49]:
def get_final_result(test1):
    svc_results = complete_prediction(model_svm, test1)
    gnb_results = complete_prediction(model_gnb, test1)
    mnb_results = complete_prediction(model_mnb, test1)
    dtc_results = complete_prediction(model_dtc, test1)
    rfc_results = complete_prediction(model_rfc, test1)
    gbc_results = complete_prediction(model_gbc, test1)
#     calculation of scores 
    scores = {}
    total = 0
    for i in range(3):
        scores[svc_results[i]['disease_name']] = scores.get(svc_results[i]['disease_name'], 0) + (3-i)*100
        scores[gnb_results[i]['disease_name']] = scores.get(gnb_results[i]['disease_name'], 0) + (3-i)*100
        scores[mnb_results[i]['disease_name']] = scores.get(mnb_results[i]['disease_name'], 0) + (3-i)*100
        scores[dtc_results[i]['disease_name']] = scores.get(dtc_results[i]['disease_name'], 0) + (3-i)*100
        scores[rfc_results[i]['disease_name']] = scores.get(rfc_results[i]['disease_name'], 0) + (3-i)*100
        scores[gbc_results[i]['disease_name']] = scores.get(gbc_results[i]['disease_name'], 0) + (3-i)*100
        total += scores[svc_results[i]['disease_name']]
        total += scores[gnb_results[i]['disease_name']]
        total += scores[mnb_results[i]['disease_name']]
        total += scores[dtc_results[i]['disease_name']]
        total += scores[rfc_results[i]['disease_name']]
        total += scores[gbc_results[i]['disease_name']]
    top3 = sorted(scores, key = scores.get, reverse=True)[:3]
    final_results = []
    for e in top3:
        disease_name = e
        medicines = m_di[e][:3]
        final_results.append({"disease_name":disease_name,
                                "medicines":medicines})
    return final_results
        

In [51]:
print("final results : ")
print("top 3 possibilities : ")
res = get_final_result(test1)
for e in res:
    print(e)

final results : 
top 3 possibilities : 
{'disease_name': 'Dengue', 'medicines': ['Dengvaxia', 'Corticosteroids', 'corticosteroid']}
{'disease_name': 'Malaria', 'medicines': ['Atovaquone / proguanil', 'Doxycycline', 'Coartem']}
{'disease_name': '(vertigo) Paroymsal  Positional Vertigo', 'medicines': ['NA']}
