In [1]:
import os, sys, yaml,argparse
from tqdm import tqdm_notebook as tqdm

from utils.data_analysis import *
from utils.preprocessing import *
from utils.readers import *
from utils.read_csv import *
from utils.subject_split import *


In [2]:
cfg = yaml.load(open("./config.yaml","r"), Loader=yaml.FullLoader)
mimic3_path = cfg["mimic3_path"]
output_path = cfg["output_path"]
output_subject_path = cfg["output_subject_path"]
phenotype_definitions = cfg["phenotype_definitions"]
itemids_file = cfg["itemids_file"]
reference_range_file = cfg["variable_ranges"]
event_tables = cfg["event_tables"]

In [5]:
definitions = yaml.load(open(phenotype_definitions, 'r'), Loader=yaml.FullLoader)
icd9_codes = set()
for dx in definitions:
    for code in definitions[dx]['codes']:
        icd9_codes.add(code)

# DIAGNOSES_ICD

In [9]:
df_diagnosis = pd.read_csv(os.path.join(mimic3_path,"DIAGNOSES_ICD.csv"),low_memory=False)
df_diagnosis

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1297,109,172335,1.0,40301
1,1298,109,172335,2.0,486
2,1299,109,172335,3.0,58281
3,1300,109,172335,4.0,5855
4,1301,109,172335,5.0,4254
...,...,...,...,...,...
651042,639798,97503,188195,2.0,20280
651043,639799,97503,188195,3.0,V5869
651044,639800,97503,188195,4.0,V1279
651045,639801,97503,188195,5.0,5275


In [10]:
code_diagnoses = df_diagnosis.ICD9_CODE.unique()

In [11]:
print("#icd ",len(code_diagnoses))
print("#icd not in icds ",len(set(code_diagnoses) - set(icd9_codes)))
for i in code_diagnoses:
    if i in icd9_codes:
        print(i)

#icd  6985
#icd not in icds  1
40301
486
58281
5855
4254
2762
7100
2767
7243
45829
2875
28521
28529
27541
5856
58381
5589
32723
22804
33829
78900
79092
V4511
53100
41071
2859
41401
725
1915
3314
53081
4111
48283
2720
3051
1940
1977
2553
4240
5845
99859
6822
5119
5990
4280
34982
4019
V1000
V453
V5865
0413
2518
E9320
V3001
V053
V290
5715
7895
07054
2851
2765
25000
570
07044
5712
5849
5724
5118
2867
51882
0389
2800
2639
2761
99592
30393
5723
2449
1122
V3000
88122
9032
9033
9551
9555
9552
E956
30590
7452
7622
41011
4412
496
07070
4478
41402
44021
5854
9971
4400
40390
2749
42731
V1046
V1011
V1209
V090
56983
56089
49121
99831
42833
41400
V4581
43331
99812
436
43320
49390
99671
4241
4168
V3101
76503
769
7793
7742
7707
76524
77081
77981
7470
7766
7726
77989
36221
19889
1972
V103
74689
7661
V293
85300
78039
E8889
5185
73028
25060
3572
77214
76518
76528
430
2939
4439
V1582
3963
3659
135
311
E9352
72141
2148
72401
4375
42732
24290
60000
V4582
1890
03811
99591
30400
1120
5111
71107
71104
80012
305

E9888
37556
46430
6040
6952
76499
V7651
7149
71903
V023
5989
V1088
3442
9947
70901
V626
2442
81412
81519
8442
8361
95215
E8282
3580
34981
V188
20005
2537
34541
V1241
78891
7723
V5849
81406
81407
28489
99563
3562
74741
64844
74362
72705
60010
37800
1610
29421
7613
53400
4219
6190
3419
V4361
28652
9995
51189
58189
78799
95214
33372
1464
27411
V560
1568
80324
92231
7758
34551
74687
7428
7981
47811
7818
43813
30022
30580
E9293
30502
5804
76508
7792
66932
64292
65441
64682
V5862
29041
E8854
65613
4748
1400
67403
56202
55200
6203
38900
V448
43840
7284
27903
20890
20192
29384
V5812
53781
78863
8930
33818
90453
8441
86103
75430
5562
4958
81001
75016
4479
53210
1418
20281
4541
78760
34481
80341
5283
0846
30541
3093
53520
8793
9839
9767
E9507
E9581
9899
55300
47411
3315
3952
1543
5935
5239
37730
V4984
47820
V653
7949
E9192
9038
71954
9059
E9470
80160
E9651
81512
4848
74789
74609
90301
9534
7817
9557
7589
E8126
6205
V549
78459
E9364
V0950
E8131
80223
78834
7791
3910
7282
9140
49302
7620
63572
635

# PROCEDURES_ICD

In [12]:
df_proce = pd.read_csv(os.path.join(mimic3_path,"PROCEDURES_ICD.csv"),low_memory=False)
df_proce

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,944,62641,154460,3,3404
1,945,2592,130856,1,9671
2,946,2592,130856,2,3893
3,947,55357,119355,1,9672
4,948,55357,119355,2,331
...,...,...,...,...,...
240090,228330,67415,150871,5,3736
240091,228331,67415,150871,6,3893
240092,228332,67415,150871,7,8872
240093,228333,67415,150871,8,3893


In [13]:
code_proce = df_proce.ICD9_CODE.unique()
print("#icd ",len(code_proce))
print("#icd not in icds ",len(set(code_proce) - set(icd9_codes)))
for i in code_proce:
    if i in icd9_codes:
        print(i)

#icd  2009
#icd not in icds  2009
