In [153]:
# import nltk
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')

In [3]:
# !python -m spacy download en_core_web_sm

# Imports

In [130]:
import csv
import json
import string
import re
import itertools
import inflect # convert number into words

import pandas as pd
import numpy as np

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.wsd import lesk
from nltk.corpus import wordnet

from spacy.lang.en.stop_words import STOP_WORDS
from sentence_transformers import SentenceTransformer, util
import tensorflow as tf
import transformers

import spacy
nlp = spacy.load('en_core_web_sm')

In [2]:
model = SentenceTransformer('stsb-roberta-large')

In [71]:
sentences1 = "chest ache"
sentences2 = "chest"

# encode list of sentences to get their embeddings
embedding1 = model.encode(sentences1, convert_to_tensor=True)
embedding2 = model.encode(sentences2, convert_to_tensor=True)

# compute similarity scores of two embeddings
cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
print("Sentence 1:", sentences1)
print("Sentence 2:", sentences2)
print("Similarity Score:", cosine_scores.item())
print()

Sentence 1: chest ache
Sentence 2: chest
Similarity Score: 0.6829466223716736



In [73]:
DATASETS_DIR = "public/datasets/disease and symptoms/"

In [74]:
df_tr = pd.read_csv(DATASETS_DIR + 'Training.csv').dropna(how='all', axis=1)

In [75]:
df_tr.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection


In [76]:
df_tr.shape

(4920, 133)

In [77]:
df_tr.iloc[-1]

itching                        0
skin_rash                      1
nodal_skin_eruptions           0
continuous_sneezing            0
shivering                      0
                          ...   
inflammatory_nails             0
blister                        1
red_sore_around_nose           1
yellow_crust_ooze              1
prognosis               Impetigo
Name: 4919, Length: 133, dtype: object

In [78]:
df_tt = pd.read_csv(DATASETS_DIR + 'Testing.csv').dropna(how='all', axis=1)

In [79]:
df_tt.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal infection
1,0,0,0,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Allergy
2,0,0,0,0,0,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,GERD
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Chronic cholestasis
4,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,Drug Reaction


# 1 preprocess text

1. Converting to lowercase
2. Converting digits to words
3. Remove punctuation an whitespace
4. Removing default stopwords
5. Lemmatization

## 1.1 Converting to lowercase
Happy > happy

In [144]:
def text_lowercase(text):
    return text.lower()

## 1.2 Converting digits to words
3 > three

In [145]:
def convert_number(text):
    p = inflect.engine()
    temp_str = text.split()

    new_string = []

    for word in temp_str:
        if word.isdigit():
            temp = p.number_to_words(word)
            new_string.append(temp)

        else:
            new_string.append(word)

    temp_str = ' '.join(new_string)
    return temp_str

## 1.3 Remove punctuation and whitespace
itching   ! > itching

In [146]:
def remove_punctuation(text):
    text = text.replace('_', ' ')
    translator = str.maketrans('', '', string.punctuation)
    return text.translate(translator)

In [147]:
def remove_whitespace(text):
    return  " ".join(text.split())

## 1.4 Removing default stopwords

In [160]:
def remove_stopwords(text):
    stop_words = set(stopwords.words("english"))
    word_tokens = word_tokenize(text)
    filtered_text = [word for word in word_tokens if word not in stop_words]
    return " ".join(filtered_text)

## 1.5 Lemmatization
itching > itch

In [149]:
def lemmatize_word(text):
    lemmatizer = WordNetLemmatizer()
    word_tokens = word_tokenize(text)

    lemmas = [lemmatizer.lemmatize(word, pos ='v') for word in word_tokens]
    return " ".join(lemmas)

In [165]:
def preprocess_sym(text):
    """
    Combining all preprocessing steps.
    """
    text = text_lowercase(text)
    text = convert_number(text)
    text = remove_punctuation(text)
    text = remove_whitespace(text)
    text = remove_stopwords(text)
    text = lemmatize_word(text)

    return text

## 1.6 Preprocessing all symptoms

In [171]:
symp = []
disease = []
for i in range(len(df_tr)):
    symp.append(df_tr.columns[df_tr.iloc[i] == 1].to_list())
    disease.append(df_tr.iloc[i, -1])

In [172]:
# preprocessing all symptoms
all_symp = []
for sis in symp:
    temp = []
    for s in sis:
        temp.append(preprocess_sym(s))
    all_symp.append(temp)

In [173]:
all_symp

[['itch', 'skin rash', 'nodal skin eruptions', 'dischromic patch'],
 ['skin rash', 'nodal skin eruptions', 'dischromic patch'],
 ['itch', 'nodal skin eruptions', 'dischromic patch'],
 ['itch', 'skin rash', 'dischromic patch'],
 ['itch', 'skin rash', 'nodal skin eruptions'],
 ['skin rash', 'nodal skin eruptions', 'dischromic patch'],
 ['itch', 'nodal skin eruptions', 'dischromic patch'],
 ['itch', 'skin rash', 'dischromic patch'],
 ['itch', 'skin rash', 'nodal skin eruptions'],
 ['itch', 'skin rash', 'nodal skin eruptions', 'dischromic patch'],
 ['continuous sneeze', 'shiver', 'chill', 'water eye'],
 ['shiver', 'chill', 'water eye'],
 ['continuous sneeze', 'chill', 'water eye'],
 ['continuous sneeze', 'shiver', 'water eye'],
 ['continuous sneeze', 'shiver', 'chill'],
 ['shiver', 'chill', 'water eye'],
 ['continuous sneeze', 'chill', 'water eye'],
 ['continuous sneeze', 'shiver', 'water eye'],
 ['continuous sneeze', 'shiver', 'chill'],
 ['continuous sneeze', 'shiver', 'chill', 'water eye

In [174]:
# associates each preprocessed symp with the disease
col_dict = dict(zip(disease, all_symp))

In [175]:
col_dict

{'Fungal infection': ['itch',
  'skin rash',
  'nodal skin eruptions',
  'dischromic patch'],
 'Allergy': ['continuous sneeze', 'shiver', 'chill', 'water eye'],
 'GERD': ['stomach pain',
  'acidity',
  'ulcers tongue',
  'vomit',
  'cough',
  'chest pain'],
 'Chronic cholestasis': ['itch',
  'vomit',
  'yellowish skin',
  'nausea',
  'loss appetite',
  'abdominal pain',
  'yellow eye'],
 'Drug Reaction': ['itch',
  'skin rash',
  'stomach pain',
  'burn micturition',
  'spot urination'],
 'Peptic ulcer diseae': ['vomit',
  'indigestion',
  'loss appetite',
  'abdominal pain',
  'passage gas',
  'internal itch'],
 'AIDS': ['muscle waste',
  'patch throat',
  'high fever',
  'extra marital contact'],
 'Diabetes ': ['fatigue',
  'weight loss',
  'restlessness',
  'lethargy',
  'irregular sugar level',
  'blur distort vision',
  'obesity',
  'excessive hunger',
  'increase appetite',
  'polyuria'],
 'Gastroenteritis': ['vomit', 'sink eye', 'dehydration', 'diarrhoea'],
 'Bronchial Asthma': 

# 3 - syntactic similarity

In [30]:
def pattern_set(str1, str2):
    list1 = str1.split(' ')
    list2 = str2.split(' ')
    intersection = len(list(set(list1).intersection(list2)))
    union = (len(list1) + len(list2)) - intersection
    return float(intersection) / union

In [31]:
# syn similarity with the corpus
def syntactic_similarity(symp_t, corpus):
    most_sim = []
    poss_sym = []

    for symp in corpus:
        d = pattern_set(symp_t, symp)
        most_sim.append(d)

    order = np.argsort(most_sim)[::-1].tolist()

    for i in order:
        if DoesExist(symp_t):
            return 1, [corpus[i]]

        if corpus[i] not in poss_sym and most_sim[i] != 0:
            poss_sym.append(corpus[i])

    if len(poss_sym):
        return 1, poss_sym


    else:
        return 0, None

In [32]:
# Returns all the subsets of this set. This is a generator.
def powerset(seq):
    if len(seq) <= 1:
        yield seq
        yield []
    else:
        for item in powerset(seq[1:]):
            yield [seq[0]]+item
            yield item

In [33]:
# Sort list based on length
def sort(a):
    for i in range(len(a)):
        for j in range(i+1,len(a)):
            if len(a[j])>len(a[i]):
                a[i],a[j]=a[j],a[i]
    a.pop()
    return a

In [34]:
# find all permutations of a list
def permutations(s):
    permutations = list(itertools.permutations(s))
    return([' '.join(permutation) for permutation in permutations])

In [35]:
def DoesExist(txt):
    txt=txt.split(' ')
    combinations = [x for x in powerset(txt)]
    sort(combinations)

    for comb in combinations :
        # print(permutations(comb))
        for sym in permutations(comb):
            if sym in all_symp_pr:
                # print(sym)
                return sym
    return False

In [36]:
DoesExist('worried')

False

In [37]:
preprocess_sym('really worried')

'worried'

In [38]:
syntactic_similarity(preprocess_sym('nervous'), all_symp_pr)

(0, None)

In [39]:
def check_pattern(inp,dis_list):
    import re
    pred_list=[]
    ptr=0
    patt = "^" + inp + "$"
    regexp = re.compile(inp)
    for item in dis_list:
        if regexp.search(item):
            pred_list.append(item)
    if(len(pred_list)>0):
        return 1,pred_list
    else:
        return ptr,None

In [40]:
check_pattern('nail', all_symp_pr)

(1, ['brittle nail', 'small dent nail', 'inflammatory nail'])

# 4 - semantic similarity

In [61]:
model = SentenceTransformer('stsb-roberta-large')

In [67]:
def sentence_similarity(sentence1, sentence2):
    # encode list of sentences to get their embeddings
    embedding1 = model.encode(sentences1, convert_to_tensor=True)
    embedding2 = model.encode(sentences2, convert_to_tensor=True)

    # compute similarity scores of two embeddings
    cosine_scores = util.pytorch_cos_sim(embedding1, embedding2)
    return cosine_scores

In [69]:
sentence_similarity('tensed', 'nervous')

tensor([[0.9028]])

In [63]:
def semantic_similarity(symp_t, corpus):
    sims = {}
    for symp in corpus:
        d = sentence_similarity(symp_t, symp)
        sims[symp] = d

    return sims

In [64]:
all_symp_pr.sort()
semantic_similarity('tensed', all_symp_pr)

{'abdominal pain': 0.9028000831604004,
 'abnormal menstruation': 0.9028000831604004,
 'acidity': 0.9028000831604004,
 'acute liver failure': 0.9028000831604004,
 'altered sensorium': 0.9028000831604004,
 'anxiety': 0.9028000831604004,
 'belly pain': 0.9028000831604004,
 'blackhead': 0.9028000831604004,
 'bladder discomfort': 0.9028000831604004,
 'blister': 0.9028000831604004,
 'blood sputum': 0.9028000831604004,
 'bloody stool': 0.9028000831604004,
 'blur distorted vision': 0.9028000831604004,
 'breathlessness': 0.9028000831604004,
 'brittle nail': 0.9028000831604004,
 'bruise': 0.9028000831604004,
 'burn micturition': 0.9028000831604004,
 'chest pain': 0.9028000831604004,
 'chill': 0.9028000831604004,
 'cold hand feet': 0.9028000831604004,
 'coma': 0.9028000831604004,
 'congestion': 0.9028000831604004,
 'constipation': 0.9028000831604004,
 'continuous feel urine': 0.9028000831604004,
 'continuous sneeze': 0.9028000831604004,
 'cough': 0.9028000831604004,
 'cramp': 0.9028000831604004,


In [56]:
all_symp_pr.sort()

In [53]:
WSD('season', 'apply spices to the chicken to season it').definition()

'a recurrent time marked by major holidays'

In [57]:
all_symp_pr

['abdominal pain',
 'abnormal menstruation',
 'acidity',
 'acute liver failure',
 'altered sensorium',
 'anxiety',
 'belly pain',
 'blackhead',
 'bladder discomfort',
 'blister',
 'blood sputum',
 'bloody stool',
 'blur distorted vision',
 'breathlessness',
 'brittle nail',
 'bruise',
 'burn micturition',
 'chest pain',
 'chill',
 'cold hand feet',
 'coma',
 'congestion',
 'constipation',
 'continuous feel urine',
 'continuous sneeze',
 'cough',
 'cramp',
 'dark urine',
 'dehydration',
 'depression',
 'diarrhoea',
 'dischromic patch',
 'distention abdomen',
 'dizziness',
 'dry tingling lip',
 'enlarge thyroid',
 'excessive hunger',
 'extra marital contact',
 'family history',
 'fast heart rate',
 'fatigue',
 'fluid overload',
 'fluid overload',
 'foul smell urine',
 'headache',
 'high fever',
 'hip joint pain',
 'history alcohol consumption',
 'increase appetite',
 'indigestion',
 'inflammatory nail',
 'internal itching',
 'irregular sugar level',
 'irritability',
 'irritation anus',
 

In [None]:
def suggest_syn(sym):
    symp = []
    synonyms = wordnet.synsets(sym)
    lemmas=[word.lemma_names() for word in synonyms]
    lemmas = list(set(chain(*lemmas)))
    for e in lemmas:
        res,sym1=semantic_similarity(e,all_symp_pr)
        if res != 0:
            symp.append(sym1)
    return list(set(symp))

In [None]:
suggest_syn('worried')

In [None]:
# recoit client_symptoms et renvoit un dataframe avec 1 pour les symptoms associees
def OHV(cl_sym,all_sym):
    l=np.zeros([1,len(all_sym)])
    for sym in cl_sym:
        l[0,all_sym.index(sym)]=1
    return pd.DataFrame(l, columns =all_symp)

In [None]:
def contains(small, big):
    a=True
    for i in small:
        if i not in big:
            a=False
    return a

In [None]:
def possible_diseases(l):
    poss_dis=[]
    for dis in set(disease):
        if contains(l,symVONdisease(df_tr,dis)):
            poss_dis.append(dis)
    return poss_dis

In [None]:
set(disease)

In [None]:
#recoit une maladie renvoit tous les sympts
def symVONdisease(df,disease):
    ddf=df[df.prognosis==disease]
    m2 = (ddf == 1).any()
    return m2.index[m2].tolist()
    

In [None]:
symVONdisease(df_tr,'Jaundice')

# V- Prediction Model (KNN & DT)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report

In [None]:
X_train=df_tr.iloc[:,:-1]
X_test=df_tt.iloc[:,:-1]
y_train = df_tr.iloc[:,-1]
y_test = df_tt.iloc[:,-1]

In [None]:
knn_clf=KNeighborsClassifier()
knn_clf.fit(X_train,y_train)

In [None]:
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train)

In [None]:
print(classification_report(y_test,knn_clf.predict(X_test)))

In [None]:
print(classification_report(y_test,dt_clf.predict(X_test)))

##  VI- SEVERITY / DESCRIPTION / PRECAUTION

In [None]:
severityDictionary=dict()
description_list = dict()
precautionDictionary=dict()

def getDescription():
    global description_list
    with open(DATASETS_DIR + 'symptom_Description.csv') as csv_file:
        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            _description={row[0]:row[1]}
            description_list.update(_description)




def getSeverityDict():
    global severityDictionary
    with open(DATASETS_DIR + 'symptom_severity.csv') as csv_file:

        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        try:
            for row in csv_reader:
                _diction={row[0]:int(row[1])}
                severityDictionary.update(_diction)
        except:
            pass


def getprecautionDict():
    global precautionDictionary
    with open(DATASETS_DIR + 'symptom_precaution.csv') as csv_file:

        csv_reader = csv.reader(csv_file, delimiter=',')
        line_count = 0
        for row in csv_reader:
            _prec={row[0]:[row[1],row[2],row[3],row[4]]}
            precautionDictionary.update(_prec)


In [None]:
getSeverityDict()
getprecautionDict()
getDescription()

In [None]:
severityDictionary

In [None]:
def calc_condition(exp,days):
    sum=0
    for item in exp:
        if item in severityDictionary.keys():
            sum=sum+severityDictionary[item]
    if((sum*days)/(len(exp))>13):
        return 1
        print("You should take the consultation from doctor. ")
    else:
        return 0
        print("It might not be that bad but you should take precautions.")


# Chat

In [None]:
def getInfo():
    # name=input("Name:")
    print("Your Name \n\t\t\t\t\t\t",end="=>")
    name=input("")
    print("hello ",name)
    return str(name)

In [None]:
def related_sym(psym1):
    if len(psym1)==1:
        return psym1[0]
    print("searches related to input: ")
    for num,it in enumerate(psym1):
        print(num,")",clean_symp(it))
    if num!=0:
        print(f"Select the one you meant (0 - {num}):  ", end="")
        conf_inp = int(input(""))
    else:
        conf_inp=0

    disease_input=psym1[conf_inp]
    return disease_input

In [None]:
def main_sp(name,all_symp_col):
    #main Idea: At least two initial sympts to start with
    
    #get the 1st syp ->> process it ->> check_pattern ->>> get the appropriate one (if check_pattern==1 == similar syntaxic symp found)
    print("Enter the main symptom you are experiencing Mr/Ms "+name+"  \n\t\t\t\t\t\t",end="=>")
    sym1 = input("")
    sym1=preprocess_sym(sym1)
    sim1,psym1=syntactic_similarity(sym1,all_symp_pr)
    if sim1==1:
        psym1=related_sym(psym1)
    
    #get the 2nd syp ->> process it ->> check_pattern ->>> get the appropriate one (if check_pattern==1 == similar syntaxic symp found)

    print("Enter a second symptom you are experiencing Mr/Ms "+name+"  \n\t\t\t\t\t\t",end="=>")
    sym2=input("")
    sym2=preprocess_sym(sym2)
    sim2,psym2=syntactic_similarity(sym2,all_symp_pr)
    if sim2==1:
        psym2=related_sym(psym2)
        
    #if check_pattern==0 no similar syntaxic symp1 or symp2 ->> try semantic similarity
    
    if sim1==0 or sim2==0:
        sim1,psym1=semantic_similarity(sym1,all_symp_pr)
        sim2,psym2=semantic_similarity(sym2,all_symp_pr)
        
        #if semantic sim syp1 ==0 (no symp found) ->> suggest possible data symptoms based on all data and input sym synonymes
        if sim1==0:
            sugg=suggest_syn(sym1)
            print('Are you experiencing any ')
            for res in sugg:
                print(res)
                inp=input('')
                if inp=="yes":
                    psym1=res
                    sim1=1
                    break
                
        #if semantic sim syp2 ==0 (no symp found) ->> suggest possible data symptoms based on all data and input sym synonymes
        if sim2==0:
            sugg=suggest_syn(sym2)
            for res in sugg:
                inp=input('Do you feel '+ res+" ?(yes or no) ")
                if inp=="yes":
                    psym2=res
                    sim2=1
                    break
        #if no syntaxic semantic and suggested sym found return None and ask for clarification

        if sim1==0 and sim2==0:
            return None,None
        else:
            # if at least one sym found ->> duplicate it and proceed
            if sim1==0:
                psym1=psym2
            if sim2==0:
                psym2=psym1
    #create patient symp list
    all_sym=[col_dict[psym1],col_dict[psym2]]
    #predict possible diseases
    diseases=possible_diseases(all_sym)
    stop=False
    print("Are you experiencing any ")
    for dis in diseases:
        print(diseases)
        if stop==False:
            for sym in symVONdisease(df_tr,dis):
                if sym not in all_sym:
                    print(clean_symp(sym)+' ?')
                    while True:
                        inp=input("")
                        if(inp=="yes" or inp=="no"):
                            break
                        else:
                            print("provide proper answers i.e. (yes/no) : ",end="")
                    if inp=="yes":
                        all_sym.append(sym)
                        diseases=possible_diseases(all_sym)
                        if len(diseases)==1:
                            stop=True 
    return knn_clf.predict(OHV(all_sym,all_symp_col)),all_sym

    
    

In [None]:
def chat_sp():
    a=True
    while a:
        name=getInfo()
        result,sym=main_sp(name,all_symp_col)
        if result == None :
            ans3=input("can you specify more what you feel or tap q to stop the conversation")
            if ans3=="q":
                a=False
            else:
                continue

        else:
            print("you may have "+result[0])
            print(description_list[result[0]])
            an=input("how many day do you feel those symptoms ?")
            if calc_condition(sym,int(an))==1:
                print("you should take the consultation from doctor")
            else : 
                print('Take following precautions : ')
                for e in precautionDictionary[result[0]]:
                    print(e)
            print("do you need another medical consultation (yes or no)? ")
            ans=input()
            if ans!="yes":
                a=False
                print("!!!!! thanks for using ower application !!!!!! ")


In [None]:
df_tr.iloc[-1]

In [None]:
# import joblib
# knn_clf = joblib.load('model/knn.pkl')  

In [None]:
symVONdisease(df_tr,"Jaundice")

In [None]:
knn_clf.predict(OHV(['fatigue', 'weight_loss', 'itching','high_fever'],all_symp_col))

In [None]:
d=df_tr[df_tr.iloc[:,-1]=="Fungal infection"].sum(axis=0)

In [None]:
cl=df_tr.columns

In [None]:
pp=d!=0

In [None]:
cl[pp]

In [None]:
d[pp].drop('prognosis')

In [None]:
chat_sp()