# Model version 1
This is a demo based on UMLS codes, covering 150 common diseases. The demo is interactive and adaptive: given input from the patient or physician, each follow-up question is chosen based on the previous answers, and a diagnosis is made once certain criteria are met.

In [11]:
import numpy as np
import pandas as pd

In [12]:
# Load the raw disease-symptom table. header=None: the first row is treated as
# data, so columns are addressed by integer position (0 and 2 are read below).
dis2sym = pd.read_csv(
    '../UMLS/dis_symptom.csv',
    header=None,
)

### Transform the disease-symptom file into dictionaries for code-term association

In [13]:
UMLS_PREFIX = 'UMLS:'


def parse_umls_term(field):
    """Return ``(code, name)`` for the first term of a UMLS field, or ``None``.

    A field holds one or more ``^``-separated terms; only the first one is used.
    The term is expected to be ``UMLS:<code>_<name>``. Terms that do not split
    into exactly two parts on ``_`` are rejected (``None``).
    """
    first_term = field.split('^')[0]
    # Remove the literal prefix only. str.strip('UMLS:') strips any of the
    # characters U, M, L, S and ':' from BOTH ends, which truncates names that
    # end in one of those letters (e.g. "AIDS" -> "AID").
    if first_term.startswith(UMLS_PREFIX):
        first_term = first_term[len(UMLS_PREFIX):]
    parts = first_term.split('_')
    return tuple(parts) if len(parts) == 2 else None


# Forward-fill blank cells so every row carries a value in each column.
# (fillna(method='ffill') is deprecated in recent pandas; ffill() is equivalent.)
dis2sym = dis2sym.ffill()

umls_dis = {}  # disease code -> disease name   (from column 0)
umls_sym = {}  # symptom code -> symptom name   (from column 2)
for dis_field, sym_field in zip(dis2sym[0], dis2sym[2]):
    parsed_dis = parse_umls_term(dis_field)
    if parsed_dis is not None:
        umls_dis[parsed_dis[0]] = parsed_dis[1]
    parsed_sym = parse_umls_term(sym_field)
    if parsed_sym is not None:
        umls_sym[parsed_sym[0]] = parsed_sym[1]

In [14]:
# Reverse lookup: symptom name -> symptom code. If two codes share a name,
# the one inserted later in umls_sym wins.
rev_sym = dict(zip(umls_sym.values(), umls_sym.keys()))

### Construct a matrix for disease-symptoms association

In [15]:
def _first_umls_code(field):
    """Return the code of the first ``^``-separated term in ``field``, or ``None``.

    The term is expected to look like ``UMLS:<code>_<name>``; anything that does
    not split into exactly two parts on ``_`` yields ``None``.
    """
    prefix = 'UMLS:'
    first_term = field.split('^')[0]
    # Drop the literal prefix; str.strip('UMLS:') would strip a character set
    # from both ends instead of removing the prefix.
    if first_term.startswith(prefix):
        first_term = first_term[len(prefix):]
    parts = first_term.split('_')
    return parts[0] if len(parts) == 2 else None


# Rows are disease codes, columns are symptom codes. Cells start as NaN and are
# set to 1.0 wherever the table lists the symptom for the disease.
M = pd.DataFrame(index=list(umls_dis), columns=list(umls_sym), dtype=float)
for dis_field, sym_field in zip(dis2sym[0], dis2sym[2]):
    dis = _first_umls_code(dis_field)
    sym = _first_umls_code(sym_field)
    if dis is None or sym is None:
        continue
    # Single .loc[row, col] assignment: the chained form M.loc[dis][sym] = ...
    # writes to a temporary and is silently lost under pandas Copy-on-Write.
    M.loc[dis, sym] = 1.0


In [16]:
# Unlisted (disease, symptom) pairs count as 0, then each row is scaled so a
# disease's symptom weights sum to 1.
M = M.fillna(0).astype(float)
row_totals = M.sum(axis=1)
# A disease with no recorded symptom keeps an all-zero row instead of 0/0.
M = M.div(row_totals.where(row_totals != 0, 1.0), axis=0)
M.head()

Unnamed: 0,C0008031,C0392680,C0012833,C0004093,C0085639,C0039070,C0042571,C0038990,C0030252,C0027497,...,C0474505,C0240805,C0020639,C0556346,C0000727,C0740844,C0425491,C0456091,C0231441,C0455204
C0020538,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,0.083333,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C0011847,0.071429,0.071429,0.0,0.071429,0.0,0.0,0.071429,0.071429,0.0,0.071429,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C0011570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C0010054,0.111111,0.111111,0.0,0.0,0.0,0.0,0.0,0.111111,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
C0032285,0.0,0.052632,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
# Persist the normalised disease-symptom matrix next to the notebook.
M.to_csv(path_or_buf='Dis_Sym_Matrix.csv')

## The diagnosis function

In [23]:
def _ask_until_valid(prompt, parse):
    """Print ``prompt`` and read replies from stdin until ``parse`` accepts one.

    ``parse`` signals a bad reply by raising ``KeyError`` or ``ValueError``; the
    prompt is then printed again. Returns whatever ``parse`` returns.
    """
    while True:
        print(prompt)
        try:
            return parse(input())
        except (KeyError, ValueError):
            # Unknown key / non-numeric text: ask again instead of aborting.
            continue


def diagnosis(threshold=0.5, top_n=5):
    """Run an interactive diagnosis session over stdin/stdout.

    Asks for gender, age and one presenting symptom (by name, as stored in
    ``rev_sym``), then repeatedly asks yes/no questions about further symptoms.
    After every answer each disease is scored as ``M.dot(res)``, where ``res``
    holds 1 for the presenting symptom, 2 for every symptom confirmed with "Y"
    and 0 otherwise.

    Parameters
    ----------
    threshold : float, default 0.5
        The session ends successfully as soon as the best score exceeds this.
    top_n : int, default 5
        Number of highest-scoring diseases printed on success.

    Returns
    -------
    str
        ``'Diagnosis done'`` after printing the ranking, or ``'Diagnosis fail'``
        when at most one candidate disease (or no symptom to ask about) is left
        without the threshold having been reached.

    Reads the notebook-level ``M``, ``rev_sym``, ``umls_sym`` and ``umls_dis``.
    """
    # initial input part
    gendermap = {'0': 'Female', '1': 'Male'}
    # gender and age are collected but not used by the scoring below.
    gender = _ask_until_valid(
        'Please type in the gender for the patient. 0 for female and 1 for male',
        gendermap.__getitem__)
    age = _ask_until_valid('Please type in the age for the patient in years.', int)
    sym = _ask_until_valid('What symptom do you have?', rev_sym.__getitem__)

    # Candidate diseases: rows of M with a non-zero weight for the presenting
    # symptom. That symptom is already known, so it is never asked about.
    selected = M.loc[M[sym] != 0].drop(columns=[sym])
    # Keep only symptoms that at least one candidate actually has.
    selected = selected.loc[:, (selected != 0).any(axis=0)]

    # The response vector
    res = pd.Series(0, index=M.columns)
    res[sym] = 1

    while True:
        dia = M.dot(res)
        if dia.max() > threshold:
            ranked = dia.sort_values(ascending=False)
            for code, score in ranked.head(top_n).items():
                print(umls_dis[code], ':', score)
            return 'Diagnosis done'
        # Nothing left to discriminate between, or nothing left to ask.
        if len(selected) <= 1 or selected.shape[1] == 0:
            return 'Diagnosis fail'

        # Choose the next symptom to ask: among the symptoms of the first
        # remaining candidate, the one that the most candidates do NOT have
        # (i.e. the one least shared with the other diseases). Ties go to the
        # earliest column; with no such symptom, fall back to the first column.
        first_candidate = selected.iloc[0]
        absent = (selected == 0).sum(axis=0)[first_candidate > 0]
        if len(absent) and absent.max() > 0:
            next_i = absent.idxmax()
        else:
            next_i = selected.columns[0]

        print('-----------------------------------------------------------')
        print('Do you have the following symptom: (Y for Yes and N for No)')
        print(umls_sym[next_i])

        answer = input().strip().upper()
        while answer not in ('Y', 'N'):
            answer = input().strip().upper()

        selected = selected.drop(columns=[next_i])
        if answer == 'Y':
            res[next_i] = 2
        else:
            res[next_i] = 0
            # The first candidate has this symptom but the patient does not:
            # rule that candidate out.
            selected = selected.drop(index=selected.index[0])
    

In [24]:
# Start an interactive session; prompts are answered in the cell's input box
# and the returned status string is shown as the cell output.
diagnosis()

Please type in the gender for the patient. 0 for female and 1 for male
1
Please type in the age for the patient in years.
24
What symptom do you have?
cough
-----------------------------------------------------------
Do you have the following symptom: (Y for Yes and N for No)
yellow sputum
Y
-----------------------------------------------------------
Do you have the following symptom: (Y for Yes and N for No)
green sputum
N
-----------------------------------------------------------
Do you have the following symptom: (Y for Yes and N for No)
jugular venous distention
N
-----------------------------------------------------------
Do you have the following symptom: (Y for Yes and N for No)
non-productive cough
Y
-----------------------------------------------------------
Do you have the following symptom: (Y for Yes and N for No)
symptom aggravating factors
Y
asthma : 0.555555555556
pneumonia : 0.263157894737
embolism pulmonary : 0.25
Pneumocystis carinii pneumonia : 0.222222222222
hepati

'Diagnosis done'

In [20]:
# Toy 2x2 frame and 2-element series for trying out pandas calls.
a = pd.DataFrame([[1, 2], [2, 3]], index=list('ab'))
b = pd.Series([1, 2])

In [22]:
# value_counts() is indexed by the distinct VALUES, so this looks up the label 2
# (how many times the value 2 occurs in column 0), not the third position.
a[0].value_counts().loc[2]

1

In [None]:
b

In [None]:
# Matrix-vector product of the toy frame and series, rescaled to sum to 1.
c = a.dot(b)
c = c.div(c.sum())

In [None]:
# Scratch replay of the opening prompts of diagnosis() at notebook scope, so
# the intermediate `selected` frame can be inspected in the following cells.
print('Please type in the gender for the patient. 0 for female and 1 for male')
gendermap = dict([('0', 'Female'), ('1', 'Male')])
g = input()
gender = gendermap[g]

print('Please type in the age for the patient in years.')
age = int(input())

print('What symptom do you have?')
sym = input()
# Translate the typed symptom name into its code.
sym = rev_sym[sym]

# Diseases (rows of M) with a non-zero weight for that symptom.
selected = M.loc[M[sym] != 0]

In [None]:
selected.info()

In [None]:
# The presenting symptom is already known; remove its column from the candidates.
selected = selected.drop(sym, axis='columns')

In [None]:
# Keep only the symptom columns that at least one remaining disease has.
# One boolean mask replaces the per-column in-place drops, and no loop variable
# `c` is left behind to overwrite the Series `c` computed in an earlier cell.
selected = selected.loc[:, (selected != 0).any(axis=0)]

In [25]:
import sys; print("Python", sys.version)

Python 3.6.3 |Anaconda custom (64-bit)| (default, Nov  3 2017, 19:19:16) 
[GCC 7.2.0]


In [26]:
import gensim; print("gensim", gensim.__version__)

gensim 3.1.0


In [30]:
import threading
# active_count() is the supported spelling; the camelCase alias activeCount()
# is deprecated since Python 3.10.
threading.active_count()

5

In [35]:
!conda --version

conda 4.5.11


In [36]:
!python --version

Python 3.6.3 :: Anaconda custom (64-bit)


In [37]:
!gensim --version

/bin/sh: gensim: command not found
