# Diagnosis System version 2 -- test
This is a testing version for our diagnosis system. Virtual patients are generated and diagnosed with the system, and the accuracy is calculated.

In [3]:
import pandas as pd
import numpy as np

WM = pd.read_csv('../WeightMatrix/Dis_Sym_30.csv', index_col=0)

dis2sym = pd.read_csv('../UMLS/dis_symptom.csv', header=None)

dis2sym.fillna(method='ffill',inplace=True)

umls_dis = {}
umls_sym = {}
dis_num = {}
for i in dis2sym.index:
    temp = dis2sym.loc[i][0]
    items = temp.split('^')
    item = items[0].strip('UMLS:').split('_')
    if len(item) != 2: continue
    umls_dis[item[0]] = item[1]
    dis_num[item[0]] = int(dis2sym.loc[i][1])
for i in dis2sym.index:
    temp = dis2sym.loc[i][2]
    items = temp.split('^')
    item = items[0].strip('UMLS:').split('_')
    if len(item) != 2: continue
    umls_sym[item[0]] = item[1]
    
rev_sym = {v: k for k, v in umls_sym.items()}
rev_dis = {v: k for k, v in umls_dis.items()}

In [4]:
patients = {}

for i in WM.index:
    patients[i] = []
    for j in WM.columns:
        if WM.loc[i][j] != 0:
            patients[i].append(j)
#patients

In [5]:
#patients

In [6]:
def SelectedMatrix(sym):
    selected = WM[WM[sym] != 0]
    selected = selected.drop(columns=[sym])
    for c in selected.columns:
        if sum(selected[c]) == 0:
            selected.drop(columns=[c],inplace=True)
    return selected
    

In [7]:
def renorm(dia):
    for c in dia.index:
        dia[c] *= dis_num[c]**(1/3)
    dia.sort_values(ascending=False, inplace=True)
    temp = dia**3
    
    s = sum(temp[:5])
    return temp/s

In [25]:
def diagnosis(dis, sym):
        
    selected = SelectedMatrix(sym)
    
    #The response vector
    res = pd.Series(index=WM.columns, data=[0]*len(WM.columns))
    res[sym] = 1
    
    #Diagnosis process
    output = [0,0] # result: accuracy, number of questions asked
    while True:
        dia = WM.dot(res)
        if len(selected) == 1:
            dia[selected.index[0]] = 1
            dia.sort_values(ascending=False, inplace=True)
            if dia.keys()[0] == dis:
                output[0] = 1
                return output
            else:
                output[0] = -1
                return output
        elif len(selected.columns) == 1:
            output[0] = 0
            return output
        
        output[1] += 1
        #choose the most relevant symptom to ask: The symptom that are least shared with other diseases
        next_i = selected.columns[0]
        s = 100       
        for i in selected.columns:   
            if 0 in selected[i].value_counts():
                pri = abs(selected[i].value_counts()[0] - len(selected)/2)
                if pri < s:
                    s = pri
                    next_i = i      
            else:
                res[next_i] = 1
                selected = selected[selected[next_i]!=0]  
        
        if next_i in patients[dis]:
            res[next_i] = 1
            selected = selected[selected[next_i]!=0]
        else:
            res[next_i] = 0
            selected = selected[selected[next_i]==0]
            
        selected.drop(columns=[next_i], inplace = True)
    

In [26]:
diagnosis('C0020538', 'C0039070')

[1, 4]

In [10]:
#patients.items()

In [27]:
def performance():
    perf = pd.DataFrame(columns = ['result' ,'number of question'])
    N = 0
    for d, v in patients.items():
        for s in v:
            N += 1
            perf.loc[N] = diagnosis(d,s)
            if N%100 == 0: print(N)
    perf.to_csv('testlog2.csv')

In [28]:
%%time
import time
performance()

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
CPU times: user 7h 28min 17s, sys: 17h 44min 33s, total: 1d 1h 12min 51s
Wall time: 2h 28min 43s


In [None]:
a = pd.read_csv('testlog.csv')
b = pd.read_csv('testlog2.csv')

In [48]:
N = 0
t = 0
f = 0
for d, v in patients.items():
    for s in v:
        N += 1
        if diagnosis(d,s) == 1:
            t += 1
        if diagnosis(d,s) == 0:
            f += 1
print('total:', N)
print('Correct:', t)
print('Wrong:', N-t-f)
print('fail:', f)

total: 1854
Correct: 684
Wrong: 1170
fail: 0


In [43]:
N = 0
t = 0
f = 0
for d, v in patients.items():
    for s in v:
        N += 1
        if diagnosis(d,s) == 1:
            t += 1
        if diagnosis(d,s) == 0:
            f += 1
print('total:', N)
print('Correct:', t)
print('Wrong:', N-t-f)
print('fail:', f)

total: 1854
Correct: 1854
Wrong: 0
fail: 0


In [None]:
#cube, 0.8
N = 0
t = 0
f = 0
for d, v in patients.items():
    for s in v:
        N += 1
        if diagnosis(d,s) == 1:
            t += 1
        if diagnosis(d,s) == 0:
            f += 1
print('total:', N)
print('Correct:', t)
print('Wrong:', N-t-f)
print('fail:', f)

In [None]:
#cube, 0.5
N = 0
t = 0
f = 0
for d, v in patients.items():
    for s in v:
        N += 1
        if diagnosis(d,s) == 1:
            t += 1
        if diagnosis(d,s) == 0:
            f += 1
print('total:', N)
print('Correct:', t)
print('Wrong:', N-t-f)
print('fail:', f)

In [None]:
#square, 0.5
N = 0
t = 0
f = 0
for d, v in patients.items():
    for s in v:
        N += 1
        if diagnosis(d,s) == 1:
            t += 1
        if diagnosis(d,s) == 0:
            f += 1
print('total:', N)
print('Correct:', t)
print('Wrong:', N-t-f)
print('fail:', f)

In [None]:
#quad, 0.8
N = 0
t = 0
f = 0
for d, v in patients.items():
    for s in v:
        N += 1
        if diagnosis(d,s) == 1:
            t += 1
        if diagnosis(d,s) == 0:
            f += 1
print('total:', N)
print('Correct:', t)
print('Wrong:', N-t-f)
print('fail:', f)

In [46]:
1041/1854

0.5614886731391586

In [12]:
import pandas as pd
a = pd.DataFrame(index = [1,2], columns = ['a','b', 'c', 'd'], data = [[1,2,3,4],[1,2,3,4]])
b = pd.DataFrame(index = [1], columns = ['a'], data = [2])

In [14]:
for i in b.index:
    for j in b.columns:
        if b.loc[i][j]:
            a.loc[i][j] += b.loc[i,j]

In [16]:
a.loc[3]['a'] = 3

KeyError: 'the label [3] is not in the [index]'