# Patient's condition prediction using NLP
### Aayush Dip Giri
### Ayush Upreti
### Shiva Raj Bhatt

# Task
## Project Goal
The goal of this project is to analyze patient symptoms using Natural Language Processing (NLP). The project will be evaluated using the accuracy of its predictions. There will be predictions for conditions such as Anxiety, Birth Control, Depression, Diabetes, Type 2, etc. The study will be completed in a few steps, as described below. This will explain how the words extracted from a symptom description can be used to predict the medical condition of a patient. Our group plans to use several types of analysis to help predict the medical condition based on symptoms: TF-IDF vectorization and different classification algorithms.

# Dataset
We compiled the dataset ourselves, since we were not able to find proper disease-symptom data on the internet. It consists of two columns containing each disease and its symptoms. There are a total of 414 records.

In [1]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Module-level TF-IDF vectorizer shared by the functions below: detect_dis()
# fits it on the training text and get_prediction() reuses the fitted instance
# to transform the query.
tfidf = TfidfVectorizer()

In [3]:
def clean(dis):
    """Normalise raw text rows into space-joined, stopword-free strings.

    Each element of ``dis`` is indexed with ``[0]`` to obtain its text. That
    text is lower-cased, the punctuation characters ``-,./!@#$%^`` are replaced
    by spaces, the result is tokenised with ``word_tokenize`` and English
    stopwords are dropped. The remaining tokens are re-joined with single
    spaces.

    The full list of cleaned strings is printed and then returned, in the same
    order as the input rows.
    """
    english_stopwords = set(stopwords.words('english'))

    def _normalise(row):
        # Punctuation becomes a space (not an empty string) so adjacent words
        # stay separated before tokenising.
        text = re.sub('[-,./!@#$%^]', " ", row[0].lower())
        kept = (token for token in word_tokenize(text)
                if token not in english_stopwords)
        return ' '.join(kept)

    cleaned = [_normalise(row) for row in dis]
    print(cleaned)
    return cleaned

In [4]:
def testing_algorithm(x_train,y_train):
    """Fit five classifiers and print the score each one achieves.

    The classifiers are an RBF SVC, k-nearest neighbours, a decision tree, a
    random forest and AdaBoost. Every model is scored on the same
    ``x_train``/``y_train`` it was fitted on, so the printed "Accuracy"
    figures are training-set scores. Nothing is returned.
    """
    # (printed label, estimator) pairs, listed in the order they are fitted
    # and reported.
    candidates = [
        ("Accuracy of SVC=> ",
         SVC(kernel='rbf', degree=2)),
        ("Accuracy of KNeighborsClassifier=>",
         KNeighborsClassifier(n_neighbors=50, leaf_size=50)),
        ("Accuracy of DecisionTreeClassifier=>",
         DecisionTreeClassifier(max_depth=50, min_samples_split=5)),
        ("Accuracy of RandomForestClassifier=>",
         RandomForestClassifier(max_depth=50, min_samples_split=5)),
        ("Accuracy of AdaBoostClassifier=>",
         AdaBoostClassifier(n_estimators=80, learning_rate=1)),
    ]

    # Fit and score everything first; the report is printed only once all
    # five models have been evaluated.
    report = []
    for label, model in candidates:
        model.fit(x_train, y_train)
        report.append((label, str(model.score(x_train, y_train))))

    for label, score in report:
        print(label, score)

In [5]:
def _describe_prediction(model_label, predicted_code, y_dict):
    """Render one classifier's verdict as a message block ending in a blank line.

    ``y_dict`` maps a condition name to its integer code; the first name whose
    code equals ``predicted_code`` is reported.
    """
    for condition, code in y_dict.items():
        if code == predicted_code:
            return ("According to " + model_label + '\n'
                    + "You may have " + str(condition) + '\n\n')
    # No name maps to this code. The trailing blank line keeps this block
    # separated from the next classifier's block in the combined message.
    return "System cannot understand your symptoms\n\n"


def get_prediction(x_data_tfidf,y_data,y_dict,query):
    """Fit five classifiers and report what each predicts for ``query``.

    ``testing_algorithm`` is called first, which prints a score per classifier
    type. Fresh instances of the same five classifiers are then fitted on
    ``x_data_tfidf``/``y_data`` and asked to classify the query.

    Args:
        x_data_tfidf: Feature matrix the classifiers are fitted on.
        y_data: Integer label for each row of ``x_data_tfidf``.
        y_dict: Mapping of condition name -> integer label, used to turn a
            predicted label back into a name.
        query: Text to classify. It is transformed with the module-level
            ``tfidf`` vectorizer, which must already be fitted.

    Returns:
        One string made of five message blocks, in the order SVC, k-nearest
        neighbours, decision tree, random forest, AdaBoost.

    NOTE(review): the query is vectorised as given; it is not passed through
    clean() the way the training text is. Confirm whether that is intended.
    """
    testing_algorithm(x_data_tfidf, y_data)

    # (name used in the message, estimator). The names are part of the returned
    # text and are kept exactly as they were, including the "Decison" spelling.
    models = [
        ("SVC", SVC(kernel='rbf', degree=2)),
        ("K Nearest Neighbour",
         KNeighborsClassifier(n_neighbors=50, leaf_size=50)),
        ("Decison Tree",
         DecisionTreeClassifier(max_depth=50, min_samples_split=5)),
        ("Random Forest Classifier",
         RandomForestClassifier(max_depth=50, min_samples_split=5)),
        ("ADB Classifier",
         AdaBoostClassifier(n_estimators=80, learning_rate=1)),
    ]
    for _, model in models:
        model.fit(x_data_tfidf, y_data)

    # Wrap the query in an array and flatten it so the vectorizer receives a
    # 1-D sequence of documents; only the first prediction is reported.
    msg = tfidf.transform(np.array([query]).ravel())

    return "".join(
        _describe_prediction(label, model.predict(msg)[0], y_dict)
        for label, model in models
    )

In [6]:
def detect_dis(disease):
    """Train on ``clean_data2.csv`` and return every classifier's prediction.

    The CSV is re-read and the module-level ``tfidf`` vectorizer is re-fitted
    on every call.

    Args:
        disease: The query text handed to ``get_prediction`` for
            classification (presumably a symptom description, despite the
            parameter name).

    Returns:
        The combined message string produced by ``get_prediction``.
    """
    data = pd.read_csv("clean_data2.csv")

    # Column 1 is the text that gets cleaned and vectorised; column 2 supplies
    # the names reported back to the user as the predicted condition.
    text_rows = data.iloc[:, 1:2].values  # kept 2-D: clean() indexes each row with [0]
    condition_names = data.iloc[:, 2].values

    x_data = clean(text_rows)
    x_data_tfidf = tfidf.fit_transform(x_data)

    # One integer code per row, numbered in order of first appearance. When
    # every name is distinct this is 0..n-1, as before; when a name repeats,
    # its rows share a code, so the label vector still has one entry per row
    # instead of coming out shorter than the feature matrix.
    y_data, unique_names = pd.factorize(condition_names)
    y_dict = {name: code for code, name in enumerate(unique_names)}

    return get_prediction(x_data_tfidf, y_data, y_dict, disease)

In [7]:
# Read the query text from symptom.txt as a single line. Line breaks are
# turned into spaces rather than removed, so the last word of one line is not
# fused with the first word of the next.
with open('symptom.txt', 'r') as file:
    symptoms_asstring = file.read().replace('\n', ' ')


["pulsating feeling stomach ( abdomen ) usually near belly button 's usually noticeable touch persistent back pain persistent abdominal pain", 'mall deep holes surface skin look like skin punctured sharp object boxcar scars – round oval depressions craters skin', 'rapid weight loss recurring fever profuse night sweats extreme unexplained tiredness prolonged swelling lymph glands armpits groin neck diarrhea lasts week sores mouth anus genitals pneumonia red brown pink purplish blotches skin inside mouth nose eyelids memory loss depression neurologic disorders', 'abdominal pain high temperature ( fever ) nausea vomiting sweating loss appetite yellowing skin whites eyes ( jaundice )', 'pale skin feeling tired breathless repeated infections short space time unusual frequent bleeding bleeding gums nose bleeds high temperature ( fever ) 38c ( 100 4f ) night sweats bone joint pain easily bruised skin swollen lymph nodes ( glands ) abdominal pain – caused swollen liver spleen unexplained weigh

Accuracy of SVC=>  0.9685230024213075
Accuracy of KNeighborsClassifier=> 0.014527845036319613
Accuracy of DecisionTreeClassifier=> 0.27602905569007263
Accuracy of RandomForestClassifier=> 0.9685230024213075
Accuracy of AdaBoostClassifier=> 0.16707021791767554
According to SVC
You may have Breast cancer (female)

According to K Nearest Neighbour
You may have Acne

According to Decison Tree
You may have Allergic rhinitis

According to Random Forest Classifier
You may have Cowpox

According to ADB Classifier
You may have Congenital syphilis


