In [25]:
import pandas as pd
import os

data_dir = "../data/mimic-iv/physionet.org/files/mimiciv/3.1/"
icd_file = "diagnoses_icd.csv.gz"

icd_codes = {
    "diagnosis": ["Scoliosis", 
                  "Kyphosis", 
                  "Lodorosis", 
                  "Dorsopathy",
                 "Vertebral Fracture",
                 "Spondylosis",
                 "Spondyloisthesis"
                 ],
    "icd_10_code": ["M41", 
                    "M40.0", 
                    "M40.5",
                    "M40-3, M50-4",
                    "S12, S22.0, S32.0-2",
                    "M47",
                    "M43.1"
                   ],
    "icd_10_regex": ["^M41[0-9]*",
                    "^M40.0[0-9]*",
                    "^M40.5[0-9]*",
                    "(^M4[0-4][0-9]*)|(^M5[0-4][0-9]*)",
                    "^S12[0-9]*|^S220[0-9]*|^S32[0-2][0-9]*",
                    "^M47[0-9]*",
                    "^M431[0-9]*"
                    ],
    "icd_9_code": ["737.3",
                   "737.1",
                   "737.2",
                   "720-724",
                   "805-806, 839.0-5",
                   "721",
                   "756.12, 738.4"
                  ],
    "icd_9_regex": [
        "^7373[0-9]*",
        "^7311[0-9]*",
        "^7322[0-9]*",
        "^72[0-4][0-9]*",
        "(^80[5-6][0-9]*)|(^839[0-5][0-9]*)",
        "^721[0-9]*",
        "(^75612[0-9]*)|(^7384[0-9]*)"
    ],
}

icd_codes = pd.DataFrame.from_dict(icd_codes)
print(icd_codes)

data = pd.read_csv(os.path.join(data_dir, icd_file))
icd_9_data = data[data['icd_version'] == 9]
icd_10_data = data[data['icd_version'] == 10]

            diagnosis          icd_10_code  \
0           Scoliosis                  M41   
1            Kyphosis                M40.0   
2           Lodorosis                M40.5   
3          Dorsopathy         M40-3, M50-4   
4  Vertebral Fracture  S12, S22.0, S32.0-2   
5         Spondylosis                  M47   
6    Spondyloisthesis                M43.1   

                             icd_10_regex        icd_9_code  \
0                              ^M41[0-9]*             737.3   
1                            ^M40.0[0-9]*             737.1   
2                            ^M40.5[0-9]*             737.2   
3       (^M4[0-4][0-9]*)|(^M5[0-4][0-9]*)           720-724   
4  ^S12[0-9]*|^S220[0-9]*|^S32[0-2][0-9]*  805-806, 839.0-5   
5                              ^M47[0-9]*               721   
6                             ^M431[0-9]*     756.12, 738.4   

                          icd_9_regex  
0                         ^7373[0-9]*  
1                         ^7311[0-9]*  
2     

In [26]:
import pandas as pd
import os

data_dir = "../data/mimic-iv/physionet.org/files/mimiciv/3.1/"
data_file = "hosp/d_labitems.csv.gz"

# labs
data = pd.read_csv(os.path.join(data_dir, data_file))
print(data.to_string())

      itemid                                       label                fluid    category
0      50801                  Alveolar-arterial Gradient                Blood   Blood Gas
1      50802                                 Base Excess                Blood   Blood Gas
2      50803         Calculated Bicarbonate, Whole Blood                Blood   Blood Gas
3      50804                        Calculated Total CO2                Blood   Blood Gas
4      50805                           Carboxyhemoglobin                Blood   Blood Gas
5      50806                       Chloride, Whole Blood                Blood   Blood Gas
6      50808                                Free Calcium                Blood   Blood Gas
7      50809                                     Glucose                Blood   Blood Gas
8      50810                      Hematocrit, Calculated                Blood   Blood Gas
9      50811                                  Hemoglobin                Blood   Blood Gas
10     508

In [27]:
print(data['category'].unique())

['Blood Gas' 'Chemistry' 'Hematology']


In [28]:
data_file = "hosp/diagnoses_icd.csv.gz"

data = pd.read_csv(os.path.join(data_dir, data_file))
print(data)

         subject_id   hadm_id  seq_num icd_code  icd_version
0          10000032  22595853        1     5723            9
1          10000032  22595853        2    78959            9
2          10000032  22595853        3     5715            9
3          10000032  22595853        4    07070            9
4          10000032  22595853        5      496            9
...             ...       ...      ...      ...          ...
6364483    19999987  23865745        7    41401            9
6364484    19999987  23865745        8    78039            9
6364485    19999987  23865745        9     0413            9
6364486    19999987  23865745       10    36846            9
6364487    19999987  23865745       11     7810            9

[6364488 rows x 5 columns]


In [29]:
print(icd_10_data)

         subject_id   hadm_id  seq_num icd_code  icd_version
40         10000084  23052089        1    G3183           10
41         10000084  23052089        2    F0280           10
42         10000084  23052089        3     R441           10
43         10000084  23052089        4     R296           10
44         10000084  23052089        5     E785           10
...             ...       ...      ...      ...          ...
6364453    19999828  29734428       18    Z9049           10
6364454    19999828  29734428       19   Z87891           10
6364455    19999828  29734428       20    B9620           10
6364456    19999828  29734428       21    Z1611           10
6364457    19999828  29734428       22    I9581           10

[3455747 rows x 5 columns]


In [30]:
print(icd_9_data['icd_code'].value_counts())

icd_code
4019     102368
2724      67293
53081     48628
25000     43077
42731     37070
          ...  
71431         1
E0100         1
09484         1
5270          1
E8736         1
Name: count, Length: 9143, dtype: int64


In [31]:
print(icd_9_data[icd_9_data['icd_code'].str.contains("7373")])

         subject_id   hadm_id  seq_num icd_code  icd_version
114        10000690  23280645       15    73730            9
7561       10011365  26948481        8    73730            9
14834      10022880  27708593        2    73730            9
29993      10046440  22950616        4    73730            9
36220      10057009  20254095        9    73730            9
...             ...       ...      ...      ...          ...
6316465    19924481  23990922        6    73730            9
6331810    19949814  28015124        7    73730            9
6335803    19956723  27397573       19    73730            9
6341654    19965443  27874810       14    73739            9
6346017    19971734  29379955        9    73730            9

[1115 rows x 5 columns]


In [32]:
print(icd_10_data['icd_code'].value_counts())

icd_code
E785       84570
I10        83775
Z87891     62806
K219       56157
F329       41876
           ...  
M66332         1
S01522A        1
M4655          1
T452X1A        1
O359XX2        1
Name: count, Length: 19440, dtype: int64


In [33]:
print(icd_10_data[icd_10_data['icd_code'].str.contains("^M41[0-9]*", regex=True)])

         subject_id   hadm_id  seq_num icd_code  icd_version
10006      10014451  21157275        2     M419           10
10532      10014967  25888738        3    M4156           10
27720      10041429  28466281        2     M419           10
36228      10057009  28491028        4    M4150           10
36728      10058522  26150386       21     M419           10
...             ...       ...      ...      ...          ...
6320294    19930893  24642043       10    M4180           10
6323020    19933841  29103434        2    M4186           10
6347819    19973795  23822974        3     M419           10
6357788    19990427  29695607       14     M419           10
6364451    19999828  29734428       16     M419           10

[1015 rows x 5 columns]


In [34]:
for diagnosis in icd_codes.itertuples():
    icd_9_regex = diagnosis.icd_9_code
    icd_10_regex = diagnosis.icd_10_code
    filtered_icd9 = icd_9_data[icd_9_data['icd_code'].str.contains(icd_9_regex)]
    filtered_icd10 = icd_10_data[icd_10_data['icd_code'].str.contains(icd_10_regex)]
    print(diagnosis.diagnosis, len(filtered_icd9.subject_id.unique()), len(filtered_icd10.subject_id.unique()))

Scoliosis 29 806
Kyphosis 42 202
Lodorosis 15 7
Dorsopathy 0 0
Vertebral Fracture 0 0
Spondylosis 4835 2816
Spondyloisthesis 0 1


In [35]:
data = icd_9_data[icd_9_data['icd_code'].str.contains("^7373[0-9]*")]
unique_ids = data["subject_id"].unique()
print(len(unique_ids))
print(data["icd_code"].value_counts())

805
icd_code
73730    1018
73739      67
73734      19
73732       3
73733       1
Name: count, dtype: int64


In [36]:
data = icd_10_data[icd_10_data['icd_code'].str.contains("^M41[0-9]*")]
unique_ids = data["subject_id"].unique()
print(len(unique_ids))
print(data["icd_code"].value_counts())

806
icd_code
M419      808
M4186      88
M4180      24
M4185      17
M4156      15
M4126      11
M4187       9
M4120       7
M4140       7
M41129      4
M4135       4
M4150       4
M4134       4
M4184       3
M4182       2
M4124       2
M4152       1
M4154       1
M41126      1
M41124      1
M4183       1
M41127      1
Name: count, dtype: int64


In [37]:
import re
import pandas as pd
import os

data_dir = "../data/mimic-iv/physionet.org/files/mimic-iv-note/2.2/note/"
data_file = "discharge.csv.gz"

data = pd.read_csv(os.path.join(data_dir, data_file))

positive_keywords = [
    r"\bposture\b",
    r"stooped",
    r"kyphotic",
    r"slouched",
    r"hunched",
    r"poor posture",
    r"flexed posture",
    r"slumped posture",
    r"extended posture",
    r"bent posture",
    r"erect posture",
    r"forward head posture",
    r"asymmetrical posture",
    r"decortiate posture",
]

exclude_keywords = [
    r"normal posture",
    r"normal gait and posture",
    r"normal stance and posture",
    r"posture normal",
    r"good posture",
    r"appropriate posture",
    r"no postural abnormality",
    r"no abnormal posture"
]

pos_pattern = re.compile("|".join(positive_keywords), re.IGNORECASE)
exc_pattern = re.compile("|".join(exclude_keywords), re.IGNORECASE)

abnormal = data['text'].str.contains(pos_pattern, regex=True)
normal = data['text'].str.contains(exc_pattern, regex=True)
posture_notes = data[abnormal & ~normal]
print(posture_notes)
print(len(posture_notes))

FileNotFoundError: [Errno 2] No such file or directory: '../data/mimic-iv/physionet.org/files/mimic-iv-note/2.2/note/discharge.csv.gz'

In [None]:
print(len(posture_notes['subject_id'].unique()))
posture_notes.to_csv("../output/abnormal_posture_notes.csv")

2695


In [None]:
normal_posture_notes = data[normal & ~abnormal]
print(len(normal_posture_notes))
normal_posture_notes.to_csv("../output/normal_posture_notes.csv")

2


In [None]:
import pandas as pd
import os

data_dir = "../data/mimic-iv/physionet.org/files/mimiciv/3.1/"
icd_file = "hosp/diagnoses_icd.csv.gz"
drg_codes = "hosp/drgcodes.csv.gz"

posture_icd_codes_regex = [
    "R293"
    "78192"
]

data = pd.read_csv(os.path.join(data_dir, icd_file))
drg_data = pd.read_csv(os.path.join(data_dir, drg_codes))

In [None]:
abnormal_posture_patients = data[data['icd_code'].str.contains("^R293|^78192", regex=True)]['subject_id']

for i in list(abnormal_posture_patients):
    print(data[data['subject_id'] == i])

        subject_id   hadm_id  seq_num icd_code  icd_version
129928    10218466  23510074        1    M4126           10
129929    10218466  23510074        2      D62           10
129930    10218466  23510074        3    F5000           10
129931    10218466  23510074        4    M5116           10
129932    10218466  23510074        5   M48061           10
129933    10218466  23510074        6     M545           10
129934    10218466  23510074        7     R293           10
129935    10218466  23510074        8   M47896           10
129936    10218466  23510074        9      I10           10
129937    10218466  23510074       10     F909           10
129938    10218466  23510074       11     K219           10
129939    10218466  23510074       12     F329           10
129940    10218466  23510074       13     D649           10
129941    10218466  27209421        1     R109           10
129942    10218466  27209421        2     R933           10
129943    10218466  27209421        3   