In [1]:
import numpy as np
import pandas as pd
import re
from os.path import join

from tools import *

In [2]:
## rename EXTRACT_FEATURES to DIMENSION_REDUCTION
read_prefix = "/data/MIMIC3/"
write_prefix = "/data/liu/LDA"

## Read Data

In [2]:
diag_df = read_data(join(read_prefix,"DIAGNOSES_ICD"))
diag_df.head()

In [None]:
pres_df = read_data(join(read_prefix,"PRESCRIPTIONS"),dtype={'NDC':str})
pres_df.head()

## Get Matrix of All Patients and All Items

In [None]:
diag_disease_all = diag_df[['SUBJECT_ID','ICD9_CODE','HADM_ID']].dropna().drop_duplicates()
pres_drug_all = pres_df[['SUBJECT_ID','NDC','HADM_ID']].dropna().drop_duplicates()
pres_drug_all = pres_drug_all[pres_drug_all['NDC']!='0']

In [None]:
print("Diagnoses: %d patients and %d diseases" 
      %(len(diag_disease_all.SUBJECT_ID.unique()),len(diag_disease_all.ICD9_CODE.unique())))
print("Prescriptions: %d patients and %d diseases" 
      %(len(pres_drug_all.SUBJECT_ID.unique()),len(pres_drug_all.NDC.unique())))

In [None]:
## randomly select one HADM_ID for each patient
diag_hadm_all = inner_join(diag_disease_all[['SUBJECT_ID','HADM_ID']].drop_duplicates(),\
                           pres_drug_all[['SUBJECT_ID','HADM_ID']].drop_duplicates(),'SUBJECT_ID')
diag_hadm_all = diag_hadm_all[diag_hadm_all['HADM_ID_x']==diag_hadm_all['HADM_ID_y']]
diag_hadm_all.head()

In [None]:
size = 1        # sample size
fn = lambda obj: obj.loc[np.random.choice(obj.index, size),:]
hadm_all = diag_hadm_all.groupby('SUBJECT_ID', as_index=False).apply(fn)['HADM_ID_x']

In [None]:
diag_hadm = diag_disease_all[diag_disease_all['HADM_ID'].isin(hadm_all)][['SUBJECT_ID','ICD9_CODE']]
pres_hadm = pres_drug_all[pres_drug_all['HADM_ID'].isin(hadm_all)][['SUBJECT_ID','NDC']]
diag_hadm.head()

In [None]:
diag_hadm['VALUE']=1
pres_hadm['VALUE']=1

In [None]:
diag_matrix = diag_hadm.pivot(index='SUBJECT_ID', columns='ICD9_CODE', values='VALUE').fillna(0).reset_index()
diag_matrix.head()

In [None]:
pres_matrix = pres_hadm.pivot(index='SUBJECT_ID', columns='NDC', values='VALUE').fillna(0).reset_index()
pres_matrix.head()

In [None]:
print("diag_shape: %s \npres_shape: %s"%(diag_matrix.shape,pres_matrix.shape))

In [None]:
write2file(diag_matrix,join(write_prefix,"diag_matrix"))
write2file(pres_matrix,join(write_prefix,"pres_matrix"))

## Sampling

In [None]:
diag_disease_all = diag_df[['ICD9_CODE']].dropna().drop_duplicates()
pres_drug_all = pres_df[['NDC']].dropna().drop_duplicates()
sample_disease_1000 = diag_disease_all.sample(n=1000, random_state=2019)
sample_drug_1000 = pres_drug_all.sample(n=1000, random_state=2019)

In [None]:
diag_df_1000 = diag_df[diag_df['ICD9_CODE'].isin(sample_disease_1000['ICD9_CODE'])]
pres_df_1000 = pres_df[pres_df['NDC'].isin(sample_drug_1000['NDC'])]

write2file(diag_df_1000,join(write_prefix,"diag_df_1kdisease"))
write2file(pres_df_1000,join(write_prefix,"pres_df_1kdrug"))

In [None]:
## select common 1000 users after filter 1000 drugs and diseases
common_user_all = inner_join(diag_df_1000[['SUBJECT_ID']].drop_duplicates(),\
                             pres_df_1000[['SUBJECT_ID']].drop_duplicates(),'SUBJECT_ID')
sample_user_1000 = common_user_all.sample(n=1000, random_state=2019)
diag_df_1k_1k = diag_df_1000[diag_df_1000['SUBJECT_ID'].isin(sample_user_1000['SUBJECT_ID'])]
pres_df_1k_1k = pres_df_1000[pres_df_1000['SUBJECT_ID'].isin(sample_user_1000['SUBJECT_ID'])]

In [None]:
write2file(diag_df_1k_1k,join(write_prefix,"diag_df_1k_1k"))
write2file(pres_df_1k_1k,join(write_prefix,"pres_df_1k_1k"))

## Diag_df_1000 -> patients*clusters  *  cluster*elements

In [None]:
diag_df_1k_1k = read_data(join(write_prefix,"diag_df_1k_1k"))
pres_df_1k_1k = read_data(join(write_prefix,"pres_df_1k_1k"))

In [None]:
diag_df_1k_1k.head()

In [None]:
## randomly select one HADM_ID for each patient
diag_hadm_all = inner_join(diag_df_1k_1k[['SUBJECT_ID','HADM_ID']].drop_duplicates(),\
                           pres_df_1k_1k[['SUBJECT_ID','HADM_ID']].drop_duplicates(),'SUBJECT_ID')
diag_hadm_all = diag_hadm_all[diag_hadm_all['HADM_ID_x']==diag_hadm_all['HADM_ID_y']]
diag_hadm_all.head()

In [None]:
size = 1        # sample size
fn = lambda obj: obj.loc[np.random.choice(obj.index, size),:]
sample_hadm_1k = diag_hadm_all.groupby('SUBJECT_ID', as_index=False).apply(fn)['HADM_ID_x']

In [None]:
diag_df_1k_1k_hadm = diag_df_1k_1k[diag_df_1k_1k['HADM_ID']\
                                   .isin(sample_hadm_1k)][['SUBJECT_ID','ICD9_CODE']].drop_duplicates()
diag_df_1k_1k_hadm['VALUE']=1

In [None]:
diag_matrix = diag_df_1k_1k_hadm.pivot(index='SUBJECT_ID', columns='ICD9_CODE', values='VALUE').fillna(0).reset_index()
print(diag_matrix.shape)
diag_matrix.head()

In [None]:
pres_df_1k_1k_hadm = pres_df_1k_1k[pres_df_1k_1k['HADM_ID']\
                                   .isin(sample_hadm_1k)][['SUBJECT_ID','NDC']].drop_duplicates()
pres_df_1k_1k_hadm['VALUE']=1


In [None]:
pres_matrix = pres_df_1k_1k_hadm.pivot(index='SUBJECT_ID', columns='NDC', values='VALUE').fillna(0).reset_index()
print(pres_matrix.shape)
pres_matrix.head()


In [None]:
write2file(diag_matrix,join(write_prefix,"diag_matrix"))
write2file(pres_matrix,join(write_prefix,"pres_matrix"))

## Running LDA 

In [17]:
## Select top10 drugs in sider4
sideffect_prefix = '/data/liu/adverse_events'
ade_df = read_data(join(sideffect_prefix, 'ndc_icd9_side_effects'), dtype={'NDC':str, 'ICD_CODE':str})
ade_df.head()

  if (yield from self.run_code(code, result)):


Unnamed: 0,NDC,drugbank_id,UMLS_CUI,ICD_CODE,SNOMED_CID,ICD9_CODE
0,3029305,DB00620,C0000731,789.36,60728008,78936
1,3049420,DB00620,C0000731,789.36,60728008,78936
2,9001104,DB00620,C0000731,789.36,60728008,78936
3,9001201,DB00620,C0000731,789.36,60728008,78936
4,9003101,DB00620,C0000731,789.36,60728008,78936


In [31]:
top10_NDC_ade = ade_df.groupby('NDC')['ICD9_CODE'].count().reset_index(name='count').sort_values(['count'], ascending=False).head(10)
top10_NDCs = top10_NDC_ade['NDC']
write2file(top10_NDC_ade,join(write_prefix,'top10_NDC_ade'))

In [3]:
diag_matrix = read_data(join(write_prefix, "diag_matrix"))
pres_matrix = read_data(join(write_prefix,"pres_matrix"))
diag_matrix.head()

Unnamed: 0,SUBJECT_ID,0030,0031,0038,0039,0041,0048,0051,00581,0059,...,V8745,V8801,V8811,V8812,V8821,V9010,V902,V9039,V9081,V9103
0,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
from sklearn.decomposition import LatentDirichletAllocation
from scipy.spatial import distance

In [None]:
# from numpy.linalg import norm

# def cosine_sim(v1, v2):
#     return np.matmul(v1,v2)/(norm(v1)*norm(v2))

# def euclidean(v1, v2):
#     return norm(v1-v2)


In [35]:
n_comp = 6
lda = LatentDirichletAllocation(n_components=n_comp, random_state=2019)
#                                 ,doc_topic_prior=0.01,topic_word_prior=0.0001)
pres_Z = lda.fit_transform(pres_matrix.iloc[:,1:]) 
pres_Y = lda.components_

In [5]:
n_comp = 6
lda_2 = LatentDirichletAllocation(n_components=n_comp, random_state=2019)
#                                   ,doc_topic_prior=0.001,topic_word_prior=0.00001)
diag_Z = lda_2.fit_transform(diag_matrix.iloc[:,1:]) 
diag_Y = lda_2.components_

In [41]:
all_NDCs = pres_matrix.columns[1:]
all_NDC_map = {}
for i in range(len(all_NDCs)):
    all_NDC_map[all_NDCs[i]]=i
    
    
top10_NDC_indexes = [all_NDC_map[ndc] for ndc in top10_NDCs]
top10_NDC_indexes

[408, 409, 2731, 1835, 1834, 2205, 2092, 1716, 2849, 3770]

In [44]:
np.sum(pres_Y[:,top10_NDC_indexes],axis=1)

array([110.14552219, 195.81035247,  70.36553255, 731.88292703,
         6.71897983, 212.07668593])

In [82]:
def test_lda(diag_Z, pres_Z, pres_index, n_comp =2):
    
    l = []
    for i in range(n_comp):
        sim_i_j = distance.euclidean(diag_Z[:,i],pres_Z[:,pres_index])
        l = l+[sim_i_j]
        print("Distance between diag_%d: %f"%(i, sim_i_j))
    print("Min:%f arg_min:%d"%(np.min(l),np.argmin(l)))

In [83]:
## Selected diag 3 and pres 2
test_lda(diag_Z,pres_Z,3,6)

Distance between diag_0: 104.995317
Distance between diag_1: 96.885347
Distance between diag_2: 100.242077
Distance between diag_3: 107.200800
Distance between diag_4: 105.076139
Distance between diag_5: 84.545936
Min:84.545936 arg_min:5


In [None]:
# for i in range(n_comp):
#     top_indices = np.where(pres_Y[i] >= 30)
# top_pres_indices = pres_Y[1].argsort()[-100:][::-1]
# print(top_indices)
# print(len(top_indices))

In [None]:
# all_NDCs = pres_matrix.columns[1:]
# select_NDCs = [all_NDCs[i] for i in top_pres_indices]
# write2file(pd.DataFrame({'NDC':select_NDCs}),join(write_prefix,'top100_ndc'))

In [23]:
all_ICD9s = diag_matrix.columns[1:]
def get_top_diag(feature_index):
    top_diag_indices = diag_Y[feature_index].argsort()[-1000:][::-1]
    select_ICD9s = [all_ICD9s[i] for i in top_diag_indices]
    return select_ICD9s
# write2file(pd.DataFrame({'ICD9_CODE':select_ICD9s}),join(write_prefix,'top1000_icd9'))

In [75]:
top_diag_indices[:10]

array([6289, 6112, 4028, 6294, 6293, 3987, 3999, 4068, 6388, 3969])

## Validation

In [56]:
select_ICD9s = read_data(join(write_prefix,'top1000_icd9'),dtype={'ICD9_CODE':str})
# select_NDCs = read_data(join(write_prefix,'top100_ndc'),dtype={'NDC':str})
select_ICD9s.head()

Unnamed: 0,ICD9_CODE
0,V290
1,V053
2,7742
3,V3001
4,V3000


In [18]:
# select_NDCs['NDC'][:10]
top10_NDC_ade = read_data(join(write_prefix,'top10_NDC_ade'),dtype={'NDC':str})

In [19]:
real_diseases  = ade_df[ade_df['NDC'].isin(top10_NDC_ade['NDC'])]
real_diseases.head()

Unnamed: 0,NDC,drugbank_id,UMLS_CUI,ICD_CODE,SNOMED_CID,ICD9_CODE
38,24208063562,DB00620,C0000731,789.36,60728008,78936
87,24208063562,DB00741,C0000731,789.36,60728008,78936
208,25152031,DB00188,C0000731,789.36,60728008,78936
209,25152531,DB00188,C0000731,789.36,60728008,78936
211,472011720,DB00188,C0000731,789.36,60728008,78936


In [20]:
actual_ICD9s = real_diseases['ICD9_CODE'].unique()
len(actual_ICD9s)

800

In [35]:
for i in range(n_comp):
#     print(len(set(get_top_diag(i)[:100])-set(actual_ICD9s)))
    matched_disease = set(get_top_diag(i)[:100]).intersection(set(actual_ICD9s))
    print(len(matched_disease))

35
34
28
47
51
4


In [33]:
for i in range(n_comp):
#     print(len(set(get_top_diag(i)[:100])-set(actual_ICD9s)))
    print(len(set(get_top_diag(i)).intersection(set(actual_ICD9s))))

214
194
162
231
244
65


In [40]:
matched_disease

{'36221', '4589', '53081', '7852'}

In [41]:
mimic_disease = read_data(join(read_prefix,"D_ICD_DIAGNOSES"),dtype={'ICD9_CODE':str})
mimic_disease[mimic_disease['ICD9_CODE'].isin(matched_disease)]

Unnamed: 0,ROW_ID,ICD9_CODE,SHORT_TITLE,LONG_TITLE
3561,3945,36221,Retrolental fibroplasia,Retrolental fibroplasia
5016,5396,4589,Hypotension NOS,"Hypotension, unspecified"
6009,6013,53081,Esophageal reflux,Esophageal reflux
13137,12986,7852,Cardiac murmurs NEC,Undiagnosed cardiac murmurs


In [50]:
# mimic_drugs = read_data(join(read_prefix,"PRESCRIPTIONS"),dtype={'NDC':str})[['NDC','DRUG']].drop_duplicates()
 mimic_drugs[mimic_drugs['NDC'].isin(top10_NDC_ade['NDC'])]

Unnamed: 0,NDC,DRUG
1394,338114403,Amino Acids 4.25% W/ Dextrose 5%
3989,264934155,Starter PN D10
4952,264934155,Starter PN D5
30497,61958070101,Emtricitabine-Tenofovir
42287,338114703,Amino Acids 5%-Dextrose 15%
69573,25152531,Celecoxib
86309,25152531,Celebrex
170619,61958070101,Emtricitabine-Tenofovir (Truvada)
208002,25152031,Celebrex
505887,49702020218,LaMIVudine-Zidovudine (Combivir)


## Draft

In [None]:
def test_lda(diag_params, pres_params, n_comp =2):
    
#     n_comp = 2
    ## doc_topic_prior, control the sparsity of probability
    diag_topic, diag_word = diag_params
    lda = LatentDirichletAllocation(n_components=n_comp, random_state=2019,\
                                    doc_topic_prior=diag_topic,topic_word_prior=diag_word)
    diag_Z = lda.fit_transform(diag_matrix.iloc[:,1:]) 
    # print(diag_Z.shape)
#     print("The first 5 rows of diagnoses_Z:\n %s"%diag_Z[:5])

    pres_topic, pres_word = pres_params
    lda_2 = LatentDirichletAllocation(n_components=n_comp, random_state=2019,\
                                      doc_topic_prior=pres_topic,topic_word_prior=pres_word)
    pres_Z = lda_2.fit_transform(pres_matrix.iloc[:,1:]) 
    # print(pres_Z.shape)
#     print("The first 5 rows of prescriptions_Z:\n %s"%pres_Z[:5])

    for i in range(n_comp):
        l = []
        for j in range(n_comp):
            sim_i_j = cosine_sim(diag_Z[:,i],pres_Z[:,j])
    #         print(sim_i_j)
            l = l + [sim_i_j]
        current_range = np.max(l)-np.min(l)
        print("Similarity between diag_%d and pres_j(j∈[0,%d): %s"%(i,n_comp, l))
        print("Max:%f  Range:%f arg_max:%d"%(np.max(l),current_range,np.argmax(l)))
        
    return diag_Z, pres_Z

In [None]:
test_lda([0.1,0.001],[0.1,0.001],2)

In [None]:
for n_comp in range(2,11):
    print("=============================================================\nn_comp: %d"%n_comp)
    test_lda([0.1,0.001],[0.1,0.001],n_comp)    

In [None]:
## fix params for diagnoses, tune params for prescriptions
## increase by 0.1
for param in np.arange(0.2,1,0.2):
    print("pres_params: %s"%([param,param*0.01]))
    test_lda([0.1,0.001],[param,param*0.01],6)
## decrease by 0.01
for param in np.arange(0.01,0.1,0.02):
    print("pres_params: %s"%([param,param*0.01]))
    test_lda([0.1,0.001],[param,param*0.01],6) 

In [None]:
diag_Y = lda.components_
diag_Y