# Diagnosis System version 2
This version can recognize vague inputs through synonyms. The main diagnosis process is executed through a weight matrix that is derived from a word-embedding process. The weight matrix will be adaptive, which means it will be updated as diagnoses are made.

In [1]:
# load data
import pandas as pd
import numpy as np
from gensim.models import Word2Vec

# Weight matrix: rows are indexed by disease CUI, columns by symptom CUI
# (diagnosis() looks rows up in umls_dis and columns up in umls_sym).
WM = pd.read_csv('../WeightMatrix/Dis_Sym_30.csv', index_col=0)

# Word2Vec model used by findsynonym() to map free text to a symptom CUI.
model = Word2Vec.load('../WordEmbedding/word2vec_models/word2vec_bmc_30.model')

# Disease/symptom table: column 0 = disease, column 1 = count, column 2 = symptom.
dis2sym = pd.read_csv('../UMLS/dis_symptom.csv', header=None)

# Blank cells inherit the value from the row above.
# (DataFrame.ffill replaces the deprecated fillna(method='ffill').)
dis2sym = dis2sym.ffill()


def _parse_umls_entry(cell):
    """Split a 'UMLS:<CUI>_<name>[^UMLS:...]' cell into (cui, name).

    Only the first '^'-separated entry is used. Returns None when the cell is
    not a string or the entry does not split into exactly two '_' parts.
    """
    if not isinstance(cell, str):
        return None
    first = cell.split('^')[0]
    # Remove the literal prefix. str.strip('UMLS:') would strip any of the
    # characters U, M, L, S, ':' from BOTH ends and so truncate names that
    # happen to end in one of them.
    if first.startswith('UMLS:'):
        first = first[len('UMLS:'):]
    parts = first.split('_')
    if len(parts) != 2:
        return None
    return parts[0], parts[1]


umls_dis = {}   # disease CUI -> disease name
umls_sym = {}   # symptom CUI -> symptom name
dis_num = {}    # disease CUI -> integer from column 1

for disease_cell, count_cell in zip(dis2sym[0], dis2sym[1]):
    parsed = _parse_umls_entry(disease_cell)
    if parsed is None:
        continue
    umls_dis[parsed[0]] = parsed[1]
    dis_num[parsed[0]] = int(count_cell)

for symptom_cell in dis2sym[2]:
    parsed = _parse_umls_entry(symptom_cell)
    if parsed is None:
        continue
    umls_sym[parsed[0]] = parsed[1]

# Reverse lookups: name -> CUI.
rev_sym = {v: k for k, v in umls_sym.items()}
rev_dis = {v: k for k, v in umls_dis.items()}



  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


In [53]:
#dis_num

In [2]:
def initial_input():
    """Interactively collect the patient's gender, age and first symptom.

    The gender and age prompts are repeated until a usable answer is typed,
    instead of raising KeyError / ValueError on a typo.

    Returns:
        tuple: ``(gender, age, sym)`` where ``gender`` is 'Female' or 'Male',
        ``age`` is a non-negative int and ``sym`` is the symptom text with
        surrounding whitespace removed.
    """
    gendermap = {'F': 'Female', 'M': 'Male'}

    # Gender: accept F/M in either case, ignore surrounding whitespace.
    while True:
        print('Please type in the gender for the patient. F for female and M for male')
        g = input().strip().upper()
        if g in gendermap:
            gender = gendermap[g]
            break

    # Age: must parse as a non-negative integer.
    while True:
        print('Please type in the age for the patient in years.')
        try:
            age = int(input())
        except ValueError:
            continue
        if age >= 0:
            break

    print('What symptom do you have?')
    sym = input().strip()

    return gender, age, sym

In [3]:
# Inspect the 20 nearest neighbours of the token 'UMLS_C0010200' in the embedding.
# NOTE(review): neighbours such as 'UMLS_C0010200ing' in the saved output suggest
# this CUI was substituted for the word 'cough' in the training corpus — confirm.
model.wv.most_similar('UMLS_C0010200', topn=20)

[('UMLS_C0037383', 0.7227292656898499),
 ('sneezing', 0.7105668783187866),
 ('UMLS_C0013404', 0.7058680057525635),
 ('throat', 0.6963392496109009),
 ('breathlessness', 0.6704373359680176),
 ('whooping', 0.6695073246955872),
 ('wheeze', 0.6689133644104004),
 ('catarrh', 0.6580345630645752),
 ('UMLS_C0010200UMLS_C0010200', 0.65207839012146),
 ('UMLS_C0010200ing', 0.651138961315155),
 ('expectoration', 0.6495738625526428),
 ('breathing', 0.6430724263191223),
 ('dyspnoea', 0.6369428634643555),
 ('UMLS_C0392680', 0.6353326439857483),
 ('UMLS_C0043144', 0.6294315457344055),
 ('symptom', 0.6250048875808716),
 ('sore', 0.6245930790901184),
 ('UMLS_C0848340', 0.6242419481277466),
 ('stuffiness', 0.6222015023231506),
 ('UMLS_C0232292', 0.6214612722396851)]

In [4]:
import re
def findsynonym(sym):
    """Map a free-text symptom to a symptom CUI.

    An exact match in ``rev_sym`` wins. Otherwise the 20 nearest neighbours of
    ``sym`` in the word-embedding model are scanned in order, and the first
    'UMLS' token whose CUI is a column of the weight matrix ``WM`` is returned.

    Args:
        sym: symptom text typed by the user.

    Returns:
        The CUI string (e.g. 'C0010200'), or None when ``sym`` is not in the
        embedding vocabulary or no neighbour maps to a usable symptom.
    """
    if sym in rev_sym:
        return rev_sym[sym]

    try:
        neighbours = model.wv.most_similar(sym, topn=20)
    except KeyError:
        # The word is not in the embedding vocabulary.
        return None

    for token, _score in neighbours:
        if 'UMLS' not in token:
            continue
        # Require at least one digit so that a bare 'C' is never returned.
        match = re.search(r'C\d+', token)
        # Skip CUIs the weight matrix has no column for (e.g. disease CUIs);
        # returning one would make WM[sym] fail later.
        if match is not None and match.group(0) in WM.columns:
            return match.group(0)

    return None

In [5]:
def SelectedMatrix(sym):
    """Return the part of ``WM`` relevant once symptom ``sym`` is known.

    Keeps the rows whose weight for ``sym`` is non-zero, removes the ``sym``
    column itself, and removes every remaining column whose values sum to 0
    over the kept rows.
    """
    rows_with_symptom = WM.loc[WM[sym] != 0]
    candidates = rows_with_symptom.drop(columns=[sym])
    empty_columns = [
        col for col in candidates.columns if sum(candidates[col]) == 0
    ]
    return candidates.drop(columns=empty_columns)
    

In [6]:
def renorm(dia, top=5):
    """Square the scores and normalise them by the sum of the ``top`` largest.

    Args:
        dia: pandas Series of raw scores.
        top: how many of the highest scores form the normalising sum
            (default 5, the previous hard-coded value).

    Returns:
        A new Series sorted by descending raw score, holding
        ``score**2 / sum(top squared scores)``. When that sum is 0 the result
        is all zeros rather than NaN/inf. ``dia`` itself is not modified.
    """
    # sort_values without inplace so the caller's Series keeps its order.
    squared = dia.sort_values(ascending=False) ** 2

    total = squared.iloc[:top].sum()
    if total == 0:
        # Nothing to normalise by; avoid dividing by zero.
        return squared * 0.0
    return squared / total

In [7]:
def _next_symptom(selected):
    """Choose which column of ``selected`` to ask the patient about next.

    Only symptoms with a positive weight in the first remaining row are
    considered. Among them the one with the most zero entries (i.e. shared by
    the fewest remaining diseases) wins, the earliest column on ties. Falls
    back to the first column when no candidate has any zero entry.
    """
    top_row = selected.iloc[0]
    best, best_zeros = selected.columns[0], 0
    for col in selected.columns:
        if top_row[col] > 0:
            # Count zeros directly: value_counts()[0] raises KeyError when the
            # column contains no zero at all.
            zeros = int((selected[col] == 0).sum())
            if zeros > best_zeros:
                best, best_zeros = col, zeros
    return best


def diagnosis():
    """Run one interactive diagnosis session.

    Asks for gender, age and a first symptom, then keeps asking yes/no
    questions about further symptoms until one disease's renormalised score
    exceeds 0.5 (the top five are printed) or no candidates/questions remain.

    Returns:
        'Diagnosis done' when a result was printed, otherwise 'Diagnosis fail'.
    """
    # gender and age are collected but not used by the scoring below.
    gender, age, sym = initial_input()

    sym = findsynonym(sym)
    if sym is None or sym not in WM.columns:
        # No usable symptom column: stop instead of failing on WM[sym].
        print('Sorry, the symptom could not be recognized.')
        return 'Diagnosis fail'

    selected = SelectedMatrix(sym)

    # The response vector: one entry per symptom column of WM.
    res = pd.Series(0, index=WM.columns)
    res[sym] = 1

    # Diagnosis process
    while True:
        dia = renorm(WM.dot(res))  # sorted by descending score
        if dia.max() > 0.5:
            print('-----------------------------------------------------------')
            print('Diagnosis results:')
            # Positional slice; integer keys on a label-indexed Series are deprecated.
            for disease, score in dia.iloc[:5].items():
                print(umls_dis[disease], ':%2d' % (score * 100), '%')
            print('-----------------------------------------------------------')
            return 'Diagnosis done'
        # Give up when at most one candidate disease or no unasked symptom remains.
        if len(selected) <= 1 or len(selected.columns) == 0:
            return 'Diagnosis fail'

        # Ask about the symptom that is least shared with the other diseases.
        next_i = _next_symptom(selected)

        print('-----------------------------------------------------------')
        print('Do you have the following symptom: (Y for Yes and N for No)')
        print(umls_sym[next_i])

        answer = input()
        while answer != 'Y' and answer != 'N':
            answer = input()

        # The symptom has been asked; never ask it again.
        selected = selected.drop(columns=[next_i])
        if answer == 'Y':
            res[next_i] = 2
        else:
            res[next_i] = 0
            # The first candidate row had this symptom, so discard that row.
            selected = selected.drop(index=selected.index[0])
    

In [8]:
# Run one interactive session; the cell output is the status string returned
# by diagnosis() ('Diagnosis done' or 'Diagnosis fail').
diagnosis()

Please type in the gender for the patient. F for female and M for male
M
Please type in the age for the patient in years.
24
What symptom do you have?
cough




-----------------------------------------------------------
Do you have the following symptom: (Y for Yes and N for No)
yellow sputum
Y
-----------------------------------------------------------
Do you have the following symptom: (Y for Yes and N for No)
green sputum
Y
-----------------------------------------------------------
Do you have the following symptom: (Y for Yes and N for No)
malaise
Y
-----------------------------------------------------------
Diagnosis results:
pneumonia :53 %
asthma :13 %
hepatitis B :11 %
influenza :11 %
colitis :10 %
-----------------------------------------------------------


'Diagnosis done'

In [2]:
import pandas as pd
# NOTE(review): absolute, machine-specific path. The first cell reads
# '../UMLS/dis_symptom.csv' into the same name — presumably the same file; confirm.
# Running this cell rebinds dis2sym to the raw frame, i.e. without the
# forward-fill applied in the first cell.
dis2sym = pd.read_csv('/home/chaozhang/czhang/UMLS/dis_symptom.csv', header=None)


In [46]:
# Scratch check: a lone '%' string literal prints as-is (no formatting is applied).
print('%')

%
