In [1]:
# Similar patients
# Time-series data can be compared using two approaches: (1) similarity-based (distance) and (2) feature-based.

In [2]:
from gensim.corpora import Dictionary
from gensim.models import ldamodel
from gensim.matutils import kullback_leibler, jaccard, hellinger, sparse2full
import numpy
import pandas as pd

## Diagnoses

In [3]:
# Load the diagnoses table. ICD9_CODE is forced to text: the codes are
# identifiers, not numbers (e.g. '0389', 'V4975'), and letting pandas infer the
# dtype chunk by chunk can produce a mixed int/str column and strip leading zeros.
diagnoses = pd.read_csv('/Users/grace/mimic/csv/DIAGNOSES_ICD.csv', sep=',',
                        dtype={'ICD9_CODE': str})
# (rows, columns) of the diagnosis rows carrying one of the selected ICD-9 codes.
diagnoses[diagnoses.ICD9_CODE.isin(['77181', '99591', '99592', '67020', '67022', '67024'])].shape

(5409, 5)

In [4]:
# Distinct SUBJECT_IDs that have at least one diagnosis row with one of the
# selected ICD-9 codes.
sepsis_codes = ['77181', '99591', '99592', '67020', '67022', '67024']
has_sepsis_code = diagnoses['ICD9_CODE'].isin(sepsis_codes)
sepsis_patients = diagnoses.loc[has_sepsis_code, 'SUBJECT_ID'].unique()
sepsis_patients

array([  117,   124,    64, ..., 95803, 97143, 97158])

In [5]:
# Seed patient: the distinct ICD-9 codes recorded for SUBJECT_ID 25030,
# in order of first appearance in the diagnoses table.
is_seed_row = diagnoses['SUBJECT_ID'] == 25030
unique_d_for_25030 = diagnoses.loc[is_seed_row, 'ICD9_CODE'].unique()
unique_d_for_25030

array(['0389', '4275', '78551', '4260', '4210', '40391', '25041', '2767',
       'V4975', '99592', '41092', '99662', '03842', '2869', '99681',
       '4254', '2851', 'V5867', '51889', '7904', '71941', '0383', '70703',
       '78552', '07070', '28521', '40301', '43491', '4372', '2720',
       '25051', '2761', '36201', '431', '3314', '7070', '25081', '78039',
       '51881'], dtype=object)

In [6]:
# The seed patient is the one everybody else is compared against, so it is
# left out of the comparison dictionary.
SEED_SUBJECT_ID = 25030

sepsis_diagnoses = diagnoses[diagnoses.SUBJECT_ID.isin(sepsis_patients)]

# patients: SUBJECT_ID -> list of that patient's distinct ICD-9 codes (as str).
patients = {}
for subject_id, grp in sepsis_diagnoses.groupby('SUBJECT_ID'):
    if subject_id == SEED_SUBJECT_ID:
        continue
    # dropna() first: a missing code would otherwise be stringified into the
    # bogus diagnosis 'nan' and inflate every union it takes part in.
    # astype(str) before unique() so that codes differing only in type
    # (e.g. 4280 vs '4280') are not counted twice.
    patients[subject_id] = grp.ICD9_CODE.dropna().astype(str).unique().tolist()

# Largest number of distinct codes held by any single comparison patient
# (0 if there are no comparison patients at all).
max_d = max((len(codes) for codes in patients.values()), default=0)
max_d

144

In [7]:
def jaccard_index(first_set, second_set):
    """ Computes the Jaccard index |A & B| / |A | B| of two collections
        Arguments:
          first_set(set or iterable of hashables): first collection
          second_set(set or iterable of hashables): second collection
            Inputs that are not already a set/frozenset (lists, tuples,
            arrays, ...) are converted to sets, so duplicates are ignored.
        Returns:
          index(float): Jaccard index between the two collections; it is
            between 0.0 and 1.0. When both are empty it is defined to be 1.0.
    """
    set_types = (set, frozenset)
    # Only copy when needed; real sets are used as they are.
    left = first_set if isinstance(first_set, set_types) else set(first_set)
    right = second_set if isinstance(second_set, set_types) else set(second_set)

    union_size = len(left | right)
    if union_size == 0:
        # Both collections are empty: avoid 0/0 and use the conventional 1.0.
        return 1.0

    return float(len(left & right)) / union_size

In [8]:
# Sanity check: Jaccard index between the code sets of patients 21 and 38.
first_set, second_set = set(patients[21]), set(patients[38])
index = jaccard_index(first_set, second_set)
print(index)

0.09523809523809523


In [9]:
# Number of comparison patients held in `patients` (the seed patient 25030 was
# skipped when the dictionary was built).
len(patients)

4780

In [10]:
#find similar patients to 25030

# Normalise the seed codes the same way the comparison sets are built (every
# code stringified), so a code can never fail to match just because of its
# type. Missing codes are dropped: NaN is not a diagnosis.
d_for_25030 = {str(code) for code in unique_d_for_25030 if pd.notna(code)}

def compute_jaccard(base_set, rest_dic):
    """ Scores every entry of rest_dic against base_set
        Arguments:
          base_set(set): reference set passed to jaccard_index
          rest_dic(dict): maps an identifier to a collection of items
        Returns:
          records(list of dict): one record per key of rest_dic, in the
            dict's iteration order, holding 'SUBJECT_ID' (the key),
            'jaccard_index' (base_set versus set(value)) and 'set'
            (the value exactly as stored in rest_dic)
    """
    return [
        {
            'SUBJECT_ID': subject_id,
            'jaccard_index': jaccard_index(base_set, set(codes)),
            'set': codes,
        }
        for subject_id, codes in rest_dic.items()
    ]

# Despite the name, this is a list: one record (dict) per comparison patient,
# scored against the seed patient's code set.
jac_dic = compute_jaccard(d_for_25030, patients)

In [11]:
# One row per comparison patient; the columns come from the record keys
# ('SUBJECT_ID', 'jaccard_index', 'set').
jac_df = pd.DataFrame(jac_dic)
# jac_df.head()

Unnamed: 0,SUBJECT_ID,jaccard_index,set
0,21,0.090909,"[41071, 78551, 5781, 5849, 40391, 4280, 4592, ..."
1,38,0.04,"[60883, 0389, 99592, 5849, 4280, 42731, 9982, ..."
2,61,0.032258,"[20280, 2880, 2875, 5781, 2851, 2639, 9998, 57..."
3,62,0.0,"[1125, 03849, 7100, 99591, 2859, 7140]"
4,64,0.021277,"[03811, 99591, 30400, 1120, 5111, 7895, 71107,..."


In [17]:
# jac_df[jac_df.jaccard_index>0].head()

Unnamed: 0,SUBJECT_ID,jaccard_index,set
0,21,0.090909,"[41071, 78551, 5781, 5849, 40391, 4280, 4592, 5070, 42731, 4271, 41401, 25000, 28521, 1122, 2720, 2749, V1046, 43889, 0388, 78552, 70709, 5119, 6823, 99859, 00845, 5720, 99592, V0980, 2859, 185, 4439, 2449, E8788]"
1,38,0.04,"[60883, 0389, 99592, 5849, 4280, 42731, 9982, 9974, 5601, 6084, E8708, 41400, V4581]"
2,61,0.032258,"[20280, 2880, 2875, 5781, 2851, 2639, 9998, 5790, 6930, 2848, 99685, 56983, 5672, 2762, 0389, 99591, 2763, 7994, 1120, 2765, 4589, 42731, 2554, 3510, 0880]"
4,64,0.021277,"[03811, 99591, 30400, 1120, 5111, 7895, 71107, 71104, 4210]"
5,85,0.014925,"[0389, 486, V4281, 23875, 42731, 41400, 3004, 2724, 2449, 60001, 78843, 32723, 3320, 3051, V1079, V4581, V433, 99591, 5853, 4241, 20280, V4282, 99811, 30000, 53081, 60000, E8798, 41401, 4412]"


In [13]:
# Show every row and never truncate cell contents, so the full per-patient
# code lists stay readable in the rendered tables.
pd.set_option('display.max_rows', None)
# None means "no limit". The old spelling -1 was deprecated in pandas 1.0 in
# favour of None.
pd.set_option('display.max_colwidth', None)

In [16]:
# Ten most similar patients among those whose Jaccard index exceeds 0.1,
# highest score first.
(
    jac_df
    .loc[jac_df['jaccard_index'] > 0.1]
    .sort_values('jaccard_index', ascending=False)
    .head(10)
)

Unnamed: 0,SUBJECT_ID,jaccard_index,set
1038,12733,0.208333,"[44023, 4280, 5856, 40391, 70714, 25000, V1082, 0389, 51881, 42842, 78552, 4271, 41071, 00845, 72886, 7854, 99592, 2720, V4975, 4439, V4581, 412, 70705, 70703, 70720, 0272, 1125, 71103, 99662, 6824, 99702, 43491, 4275, 6822, 570, 70723, 94421, E9248, 28521, 43310, V5867, 25080, 25050, 36201, 99769, 5789, 2851, 4589]"
146,1795,0.203704,"[0389, 41071, 4280, 78552, 40391, 5856, 44024, 2761, 4254, 5119, 570, 431, 2869, 25070, 99592, 4589, 41401, 25060, 25050, 5363, 36201, 2720, 04111, 73300, V5867, V090]"
1404,17564,0.2,"[99662, 03811, 99592, 78552, 5990, 70714, 40391, 5856, 2761, 4538, 2851, 00845, 0417, 44023, 25040, V5867, 45184, 73028, 72290, 4592, 4280, 0389, 51881, 2767, 5070, 70703, 37601]"
4185,82512,0.190476,"[4241, 5845, 0389, 99592, 51881, 486, 4254, 42822, 70714, 4271, 2851, 99681, 5119, 2762, 2761, V707, V4501, V4973, 4019, 2720, 4168, 4439, V1251, 41401, V4582, 42731, 4280, 32723, 58381, 25041, 4275, 42741, 2875, E8780, 78551, 2767]"
1568,19632,0.175439,"[431, 496, 4280, 3314, 4439, 25000, 53081, 4019, 99662, 03811, 99592, 51881, 78552, 40391, 2874, 4538, V4581, 28521, 25060, 3572, 25040, 4271, 2639, 2869, V090, V4975, V4976, V180]"
1604,20133,0.174419,"[4373, 486, 4280, 99681, 5849, 25001, 4538, 2762, 07070, 5119, 99811, 2930, 32723, 4019, V5867, 25013, 5856, 42822, 4254, 25043, V4511, 25083, E9323, 28521, 07054, 3542, 7810, V1582, 4809, V420, 25051, 36201, 25081, 42823, 51881, 40391, 7830, 34982, E8780, V5865, 2767, 2724, V454, 53081, 79092, 45821, 27652, 4275, V667, V4986, 2760, 0389, 78552, 5121, 2867, 57510, 99592, 78065, E8798, V4502, 5730, 5715]"
1634,20546,0.173913,"[0389, 5849, 51881, 5070, 48241, 2767, 78039, 2851, 5780, 78552, 99592, 7070, 2762, 5789, 2859]"
425,4962,0.171875,"[07044, 5856, 40391, 5723, 78959, 2762, 2930, 45821, 5712, 25000, 28521, V4511, 0389, 78552, 486, 2866, 5724, 5849, 5789, 2851, 99592, 07054, 33829, 51881, 30500, 07070, 78039, 25022, 2761, 5715, 2875, 3558, 2767, E8801, 81408, 81502]"
682,8452,0.168675,"[25013, 27651, 5849, 40391, 99681, 2767, V5867, 41401, 25063, 5363, 25053, 36201, 3572, 03811, 78552, 51881, 41071, 5856, 6827, V420, 42822, 78959, 99592, 25061, 70715, 4280, 25051, V4582, 4439, 2384, 53570, 4264, 4168, 6111, 2728, 78791, 2768, 2859, V4511, V1254, 45821, 25541, E8791, 412, 28521, 27652, 78551, 7854, 2762, 73007, 00845, 25071, 25081, 7354, 27800, 0389, 6826, 4275]"
4050,78474,0.166667,"[0389, 51881, 78552, 5070, 570, 70724, 3481, 5849, 42822, 2761, 2869, 2639, 99592, 4275, 78551, 4280, 4019, 60000, 70703, 70707, 70709, 70720, 2875, V667]"
