In [None]:
import pickle
import pandas as pd
import numpy as np
import re

# CCS

## dimSize

In [2]:
# dimSize
seqs = pickle.load(open('../process_mimic/CCS.seqs', 'rb'))
codeSet = set()
for patient in seqs:
    for visit in patient:
        for code in visit:
            codeSet.add(code)
print(max(codeSet) + 1)

272


## Frequencies

In [3]:
# first rank all labels by their frequencies
df_freq = pd.DataFrame(np.arange(0,272), columns=['code'])
codeList = []
for patient in seqs: 
        for visit in patient: 
            for code in visit:
                codeList.append(code)
temp = pd.DataFrame(pd.value_counts(codeList)/len(codeList), columns=['freq'])
df_freq = df_freq.join(temp, how='left')
df_freq = df_freq.sort_values(by=['freq'], ascending=False)
# equally divide them into five groups
df_freq['group'] = 1
df_freq.loc[df_freq['freq']>np.quantile(df_freq['freq'], 0.2),'group'] = 2
df_freq.loc[df_freq['freq']>np.quantile(df_freq['freq'], 0.4),'group'] = 3
df_freq.loc[df_freq['freq']>np.quantile(df_freq['freq'], 0.6),'group'] = 4
df_freq.loc[df_freq['freq']>np.quantile(df_freq['freq'], 0.8),'group'] = 5
df_freq

Unnamed: 0,code,freq,group
0,0,0.042324,5
12,12,0.042063,5
44,44,0.034841,5
14,14,0.030623,5
3,3,0.028307,5
...,...,...,...
247,247,0.000004,1
242,242,0.000004,1
265,265,0.000004,1
270,270,0.000004,1


## testSet

In [4]:
testSet = np.array(pickle.load(open('testSet_CCS', 'rb')))
testSet.shape

(3,)

In [5]:
len(testSet[1])

1130

In [6]:
max(len(i) for i in testSet[1])

34

In [7]:
for i in range(len(testSet[1])):
    if len(testSet[1][i]) > 2:
        print(i,testSet[1][i])
        break

764 [[1, 123, 12, 14], [202, 214, 183, 183, 50, 122, 14, 1, 129], [14, 15, 154, 17, 15, 99, 99, 99, 1, 14, 124, 84, 40, 18]]


## testPred

In [8]:
testPred = np.array(pickle.load(open('testPred_CCS', 'rb')))
testPred.shape

(33, 1130, 272)

In [9]:
testPred[:,0].shape

(33, 272)

In [10]:
testPred[:,764]

array([[8.54155552e-03, 1.11913936e-01, 2.35966600e-02, ...,
        5.24361584e-05, 3.04615151e-05, 3.04776115e-05],
       [9.46822969e-03, 5.22517795e-02, 1.68086964e-02, ...,
        1.48974532e-04, 8.30134534e-05, 8.30784141e-05],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       ...,
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
       [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
        0.00000000e+00, 0.00000000e+00, 0.00000000e+00]])

In [11]:
L2 = testPred[:,1016][0]
sortAsc = np.argsort(L2)
sortDesc = list(sortAsc)
sortDesc.reverse()
sortDesc[:20]

[114,
 12,
 56,
 39,
 35,
 14,
 44,
 28,
 9,
 18,
 42,
 103,
 45,
 3,
 37,
 36,
 119,
 53,
 2,
 58]

## Auccary

For every nonzero entry z in every real label vector y, counter[z] += 1 if top 20 indices of y_hat contain z, accuracy_z_@20=counter[z] / #occurrences[z]; similarly we can calculate accuracy@20 for groups of labels by (sum_z counter[z]) / (sum_z #occurrences[z]).

In [12]:
df_freq['y'] = 0
df_freq['top20'] = 0

n_person = testPred.shape[1]
for patient in range(n_person): 
        n_pred = len(testSet[1][patient]) - 1
        for visit in range(n_pred):
            label = testSet[1][patient][visit+1]
            prediction = testPred[:,patient][visit]
            sortAsc = np.argsort(prediction)
            sortDesc = list(sortAsc)
            sortDesc.reverse()
            top20 = sortDesc[:20]
            for code in label:
                df_freq.loc[code,'y'] += 1
                if code in top20:
                    df_freq.loc[code,'top20'] += 1
df_freq

Unnamed: 0,code,freq,group,y,top20
0,0,0.042324,5,1070,992
12,12,0.042063,5,1136,1074
44,44,0.034841,5,920,920
14,14,0.030623,5,758,709
3,3,0.028307,5,669,624
...,...,...,...,...,...
247,247,0.000004,1,0,0
242,242,0.000004,1,0,0
265,265,0.000004,1,0,0
270,270,0.000004,1,0,0


In [14]:
auc = []
for i in range(1,6):
    temp = df_freq[df_freq['group']==i]
    auc.append(sum(temp['top20'])/sum(temp['y']))
    
res = pd.DataFrame(columns=['0-20', '20-40', '40-60', '60-80', '80-100'])
res.loc['auc'] = auc
res

Unnamed: 0,0-20,20-40,40-60,60-80,80-100
auc,0.0,0.038991,0.195719,0.211983,0.699736


# MIMIC

In [16]:
# dimSize
seqs = pickle.load(open('../process_mimic/mimic.3digitICD9.seqs', 'rb'))
codeSet = set()
for patient in seqs:
    for visit in patient:
        for code in visit:
            codeSet.add(code)
print(max(codeSet) + 1)

942


In [17]:
# first rank all labels by their frequencies
df_freq = pd.DataFrame(np.arange(0,942), columns=['code'])
codeList = []
for patient in seqs: 
        for visit in patient: 
            for code in visit:
                codeList.append(code)
temp = pd.DataFrame(pd.value_counts(codeList)/len(codeList), columns=['freq'])
df_freq = df_freq.join(temp, how='left')
df_freq = df_freq.sort_values(by=['freq'], ascending=False)
# equally divide them into five groups
df_freq['group'] = 1
df_freq.loc[df_freq['freq']>np.quantile(df_freq['freq'], 0.2),'group'] = 2
df_freq.loc[df_freq['freq']>np.quantile(df_freq['freq'], 0.4),'group'] = 3
df_freq.loc[df_freq['freq']>np.quantile(df_freq['freq'], 0.6),'group'] = 4
df_freq.loc[df_freq['freq']>np.quantile(df_freq['freq'], 0.8),'group'] = 5
df_freq

Unnamed: 0,code,freq,group
6,6,0.031150,5
14,14,0.028785,5
13,13,0.028613,5
57,57,0.027955,5
39,39,0.027837,5
...,...,...,...
840,840,0.000004,1
839,839,0.000004,1
836,836,0.000004,1
723,723,0.000004,1


In [18]:
testSet = np.array(pickle.load(open('testSet', 'rb')))
testPred = np.array(pickle.load(open('testPred', 'rb')))
df_freq['y'] = 0
df_freq['top20'] = 0

n_person = testPred.shape[1]
for patient in range(n_person): 
        n_pred = len(testSet[1][patient]) - 1
        for visit in range(n_pred):
            label = testSet[1][patient][visit+1]
            prediction = testPred[:,patient][visit]
            sortAsc = np.argsort(prediction)
            sortDesc = list(sortAsc)
            sortDesc.reverse()
            top20 = sortDesc[:20]
            for code in label:
                df_freq.loc[code,'y'] += 1
                if code in top20:
                    df_freq.loc[code,'top20'] += 1
df_freq

Unnamed: 0,code,freq,group,y,top20
6,6,0.031150,5,675,628
14,14,0.028785,5,672,652
13,13,0.028613,5,679,659
57,57,0.027955,5,678,676
39,39,0.027837,5,651,618
...,...,...,...,...,...
840,840,0.000004,1,0,0
839,839,0.000004,1,0,0
836,836,0.000004,1,0,0
723,723,0.000004,1,0,0


In [19]:
auc = []
for i in range(1,6):
    temp = df_freq[df_freq['group']==i]
    auc.append(sum(temp['top20'])/sum(temp['y']))
    
res = pd.DataFrame(columns=['0-20', '20-40', '40-60', '60-80', '80-100'])
res.loc['auc'] = auc
res

Unnamed: 0,0-20,20-40,40-60,60-80,80-100
auc,0.0,0.0,0.016349,0.093987,0.547594
