In [8]:
import spacy
import pandas as pd
import os
import re
import numpy as np

In [100]:
data_dir = "../data/mimic-iv/physionet.org/files/mimic-iv-note/2.2/note/"
data_file = "discharge.csv.gz"

data = pd.read_csv(os.path.join(data_dir, data_file))

In [101]:
posture_keyword = [
    r"\bposture\b",
]

In [102]:
pos_pattern = re.compile("|".join(positive_keywords), re.IGNORECASE)
exc_pattern = re.compile("|".join(exclude_keywords), re.IGNORECASE)

abnormal = data['text'].str.contains(pos_pattern, regex=True)
normal = data['text'].str.contains(exc_pattern, regex=True)
posture_notes = data[abnormal & ~normal]
print(posture_notes)
print(len(posture_notes))
posture_notes.to_csv("../output/posture_mention_notes.csv", index=False)

               note_id  subject_id   hadm_id note_type  note_seq  \
270      10006825-DS-8    10006825  29669544        DS         8   
399     10011938-DS-19    10011938  24772774        DS        19   
439     10013502-DS-20    10013502  20363238        DS        20   
535      10015487-DS-8    10015487  20588720        DS         8   
734      10022041-DS-3    10022041  28909879        DS         3   
...                ...         ...       ...       ...       ...   
330801  19968936-DS-12    19968936  20663275        DS        12   
331085  19978248-DS-30    19978248  28868652        DS        30   
331193   19981958-DS-8    19981958  21856502        DS         8   
331279  19985349-DS-10    19985349  20308632        DS        10   
331602  19994259-DS-21    19994259  21818613        DS        21   

                  charttime            storetime  \
270     2150-08-10 00:00:00  2150-08-10 22:56:00   
399     2132-01-30 00:00:00  2132-01-31 16:46:00   
439     2159-01-13 00:00:00

In [5]:
!uv run spacy download en_core_web_sm # need to run everytime because not persistent in nix environment


[38;5;1m✘ No compatible package found for 'en_core_sci_sm' (spaCy v3.8.8)[0m



In [48]:
import spacy
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

nlp = spacy.load("en_core_web_sm")  # biomedical model

def extract_posture_descriptors(text):
    doc = nlp(text)
    descriptors = []
    for token in doc:
        if token.lemma_.lower() != "posture":
            continue
        # Find adjectives directly modifying 'posture'
        for child in token.children:
            if child.dep_ == "amod":
                # include left modifiers (advmod/compound)
                modifier_tokens = [t.text for t in child.lefts if t.dep_ in ("advmod", "compound")]
                modifier_tokens.append(child.text)
    
                # include negation or determiner if present
                for det in token.children:
                    if det.dep_ in ("det", "neg"):
                        modifier_tokens.insert(0, det.text)
    
                # join into phrase
                phrase = " ".join(modifier_tokens)
                descriptors.append(phrase)

    # deduplicate
    descriptors = list(dict.fromkeys(descriptors))

    return descriptors

model = SentenceTransformer("emilyalsentzer/Bio_ClinicalBERT")

No sentence-transformers model found with name emilyalsentzer/Bio_ClinicalBERT. Creating a new one with mean pooling.


In [103]:
data = pd.read_csv("../output/posture_mention_notes.csv")
print(data)

             note_id  subject_id   hadm_id note_type  note_seq  \
0      10006825-DS-8    10006825  29669544        DS         8   
1     10011938-DS-19    10011938  24772774        DS        19   
2     10013502-DS-20    10013502  20363238        DS        20   
3      10015487-DS-8    10015487  20588720        DS         8   
4      10022041-DS-3    10022041  28909879        DS         3   
...              ...         ...       ...       ...       ...   
1919  19968936-DS-12    19968936  20663275        DS        12   
1920  19978248-DS-30    19978248  28868652        DS        30   
1921   19981958-DS-8    19981958  21856502        DS         8   
1922  19985349-DS-10    19985349  20308632        DS        10   
1923  19994259-DS-21    19994259  21818613        DS        21   

                charttime            storetime  \
0     2150-08-10 00:00:00  2150-08-10 22:56:00   
1     2132-01-30 00:00:00  2132-01-31 16:46:00   
2     2159-01-13 00:00:00  2159-01-13 09:49:00   
3     2

In [104]:
descriptors = []
for idx, row in data.iterrows():
    descriptors.append(extract_posture_descriptors(row['text']))

In [105]:
data['posture_descriptors'] = descriptors

In [106]:
all_descriptors = [d for desc_list in data['posture_descriptors'] for d in desc_list]
descriptor_counts = pd.Series(all_descriptors).value_counts()
print(descriptor_counts.to_string())

normal                     102
good                        42
stooped                     31
upright                     26
well groomed                25
poor                        24
kyphotic                    20
flexed                      19
a flexed                    18
stated                      16
a stooped                   12
intermittent                12
Kyphotic                    12
fair                        11
grooming                     8
supine                       8
an upright                   8
rigid                        7
a forward                    7
tearful                      6
groomed                      6
hunched                      5
Slightly stooped             5
anxious                      5
seated                       5
intense                      5
contracted                   5
cooperative                  5
standing                     4
appropriate                  4
mild                         4
clean                        4
a more e

In [107]:
data.to_csv("../output/posture_mention_notes.csv", index=False)

In [34]:
normal_posture_descriptors = [
    r'normal posture',
    r'good posture',
    r'upright posture',
    r"normal gait and posture",
    r"normal stance and posture",
    r"posture normal",
    r"good posture",
    r"appropriate posture",
    r"no postural abnormality",
    r"no abnormal posture"
]

abnormal_posture_descriptors = {
    "kyphotic": r'\bkyphotic\b',
    "stooped": r'\bstoop(ed)?\b',
    "hunched": r'\bhunch(ed)?\b',
    "flexed": r'flexed posture',
    "rigid": r'rigid posture',
    "contracted": r'contracted posture',
    "poor": r'poor posture',
}

severity_descriptors = [
    "mild",
    "slightly",
    "somewhat",
    "fair",
    "severe",
    "very",
    "exaggerated",
    "intense",
]

normal_posture_regex = re.compile("|".join(normal_posture_descriptors), re.IGNORECASE)

In [5]:
data_dir = "../data/mimic-iv/physionet.org/files/mimic-iv-note/2.2/note/"
data_file = "discharge.csv.gz"

data = pd.read_csv(os.path.join(data_dir, data_file))

KeyError: 'descriptor'

In [28]:
data['descriptor'] = np.empty((len(data), 0)).tolist()

normal_idx = data['text'].str.contains(normal_posture_regex, regex=True)

labeled_index = normal_idx
for i in data["descriptor"][normal_idx]:
    i.append("normal")

for key, value in abnormal_posture_descriptors.items():
    abnormal_posture_regex = re.compile(value, re.IGNORECASE)
    abnormal_idx = data['text'].str.contains(abnormal_posture_regex, regex=True)
    labeled_index = labeled_index | abnormal_idx
    for i in data['descriptor'][abnormal_idx]:
        

labeled_data = data[labeled_index]
labeled_data

Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text,descriptor
114,10002930-DS-13,10002930,22733922,DS,13,2198-05-04 00:00:00,2198-05-04 19:18:00,\nName: ___ Unit No: ___\...,[normal]
220,10005606-DS-17,10005606,29646384,DS,17,2143-12-16 00:00:00,2143-12-16 09:33:00,\nName: ___ ___ No: ___\n \nA...,[kyphotic]
285,10007920-DS-20,10007920,26693451,DS,20,2136-08-30 00:00:00,2136-09-05 12:44:00,\nName: ___. Unit No: ___\n \...,[hunched]
536,10015487-DS-9,10015487,23914645,DS,9,2172-09-22 00:00:00,2172-09-22 12:27:00,\nName: ___ Unit No: ___...,[normal]
870,10026754-DS-3,10026754,22691839,DS,3,2136-08-18 00:00:00,2136-08-18 13:27:00,\nName: ___ Unit No: ___\...,[normal]
...,...,...,...,...,...,...,...,...,...
330692,19965533-DS-9,19965533,25388109,DS,9,2188-02-05 00:00:00,2188-02-05 11:54:00,\nName: ___ Unit No: ...,[hunched]
330776,19968054-DS-19,19968054,27515437,DS,19,2174-12-17 00:00:00,2174-12-17 12:46:00,\nName: ___ Unit No: ___\n...,[flexed]
331296,19985545-DS-22,19985545,28568303,DS,22,2144-02-06 00:00:00,2144-02-06 22:12:00,\nName: ___ Unit No: ...,[normal]
331320,19985885-DS-9,19985885,25048461,DS,9,2176-08-23 00:00:00,2176-08-26 10:10:00,\nName: ___ Unit No: _...,[kyphotic]


In [29]:
labeled_data.to_csv("../output/labeled_posture.csv", index=False)

In [32]:
all_descriptors = [d for desc_list in data['descriptor'] for d in desc_list]
descriptor_counts = pd.Series(all_descriptors).value_counts()
print(descriptor_counts.to_string())

normal        954
kyphotic      713
hunched       273
stooped       257
flexed         26
poor           13
contracted      6
rigid           6


In [38]:
data = pd.read_csv("../output/labeled_posture.csv")
print(len(data['subject_id'].unique()))

2015


In [48]:
data_dir = "../data/mimic-iv/physionet.org/files/mimiciv/3.1/hosp/"
admission_file = "admissions.csv.gz"
posture_subject_ids = data['subject_id'].unique()

admission_data = pd.read_csv(os.path.join(data_dir, admission_file))
print(labeled_data['text'])

KeyError: 0