In [8]:
import spacy
import pandas as pd
import os
import re
import numpy as np

In [100]:
data_dir = "../data/mimic-iv/physionet.org/files/mimic-iv-note/2.2/note/"
data_file = "discharge.csv.gz"

data = pd.read_csv(os.path.join(data_dir, data_file))

In [101]:
posture_keyword = [
    r"\bposture\b",
]

In [102]:
pos_pattern = re.compile("|".join(positive_keywords), re.IGNORECASE)
exc_pattern = re.compile("|".join(exclude_keywords), re.IGNORECASE)

abnormal = data['text'].str.contains(pos_pattern, regex=True)
normal = data['text'].str.contains(exc_pattern, regex=True)
posture_notes = data[abnormal & ~normal]
print(posture_notes)
print(len(posture_notes))
posture_notes.to_csv("../output/posture_mention_notes.csv", index=False)

               note_id  subject_id   hadm_id note_type  note_seq  \
270      10006825-DS-8    10006825  29669544        DS         8   
399     10011938-DS-19    10011938  24772774        DS        19   
439     10013502-DS-20    10013502  20363238        DS        20   
535      10015487-DS-8    10015487  20588720        DS         8   
734      10022041-DS-3    10022041  28909879        DS         3   
...                ...         ...       ...       ...       ...   
330801  19968936-DS-12    19968936  20663275        DS        12   
331085  19978248-DS-30    19978248  28868652        DS        30   
331193   19981958-DS-8    19981958  21856502        DS         8   
331279  19985349-DS-10    19985349  20308632        DS        10   
331602  19994259-DS-21    19994259  21818613        DS        21   

                  charttime            storetime  \
270     2150-08-10 00:00:00  2150-08-10 22:56:00   
399     2132-01-30 00:00:00  2132-01-31 16:46:00   
439     2159-01-13 00:00:00

In [5]:
!uv run spacy download en_core_web_sm # need to run everytime because not persistent in nix environment


[38;5;1m✘ No compatible package found for 'en_core_sci_sm' (spaCy v3.8.8)[0m



In [48]:
import spacy
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans

nlp = spacy.load("en_core_web_sm")  # biomedical model

def extract_posture_descriptors(text):
    doc = nlp(text)
    descriptors = []
    for token in doc:
        if token.lemma_.lower() != "posture":
            continue
        # Find adjectives directly modifying 'posture'
        for child in token.children:
            if child.dep_ == "amod":
                # include left modifiers (advmod/compound)
                modifier_tokens = [t.text for t in child.lefts if t.dep_ in ("advmod", "compound")]
                modifier_tokens.append(child.text)
    
                # include negation or determiner if present
                for det in token.children:
                    if det.dep_ in ("det", "neg"):
                        modifier_tokens.insert(0, det.text)
    
                # join into phrase
                phrase = " ".join(modifier_tokens)
                descriptors.append(phrase)

    # deduplicate
    descriptors = list(dict.fromkeys(descriptors))

    return descriptors

model = SentenceTransformer("emilyalsentzer/Bio_ClinicalBERT")

No sentence-transformers model found with name emilyalsentzer/Bio_ClinicalBERT. Creating a new one with mean pooling.


In [103]:
data = pd.read_csv("../output/posture_mention_notes.csv")
print(data)

             note_id  subject_id   hadm_id note_type  note_seq  \
0      10006825-DS-8    10006825  29669544        DS         8   
1     10011938-DS-19    10011938  24772774        DS        19   
2     10013502-DS-20    10013502  20363238        DS        20   
3      10015487-DS-8    10015487  20588720        DS         8   
4      10022041-DS-3    10022041  28909879        DS         3   
...              ...         ...       ...       ...       ...   
1919  19968936-DS-12    19968936  20663275        DS        12   
1920  19978248-DS-30    19978248  28868652        DS        30   
1921   19981958-DS-8    19981958  21856502        DS         8   
1922  19985349-DS-10    19985349  20308632        DS        10   
1923  19994259-DS-21    19994259  21818613        DS        21   

                charttime            storetime  \
0     2150-08-10 00:00:00  2150-08-10 22:56:00   
1     2132-01-30 00:00:00  2132-01-31 16:46:00   
2     2159-01-13 00:00:00  2159-01-13 09:49:00   
3     2

In [104]:
descriptors = []
for idx, row in data.iterrows():
    descriptors.append(extract_posture_descriptors(row['text']))

In [105]:
data['posture_descriptors'] = descriptors

In [106]:
all_descriptors = [d for desc_list in data['posture_descriptors'] for d in desc_list]
descriptor_counts = pd.Series(all_descriptors).value_counts()
print(descriptor_counts.to_string())

normal                     102
good                        42
stooped                     31
upright                     26
well groomed                25
poor                        24
kyphotic                    20
flexed                      19
a flexed                    18
stated                      16
a stooped                   12
intermittent                12
Kyphotic                    12
fair                        11
grooming                     8
supine                       8
an upright                   8
rigid                        7
a forward                    7
tearful                      6
groomed                      6
hunched                      5
Slightly stooped             5
anxious                      5
seated                       5
intense                      5
contracted                   5
cooperative                  5
standing                     4
appropriate                  4
mild                         4
clean                        4
a more e

In [107]:
data.to_csv("../output/posture_mention_notes.csv", index=False)

In [10]:
import pandas as pd
import re
import os

normal_posture_descriptors = [
    r'normal posture',
    r'good posture',
    r'upright posture',
    r"normal gait and posture",
    r"normal stance and posture",
    r"posture normal",
    r"good posture",
    r"appropriate posture",
    r"no postural abnormality",
    r"no abnormal posture"
]

abnormal_posture_descriptors = {
    "kyphotic": r'\bkyphotic\b',
    "stooped": r'\bstoop(?:ed)?\b',
    "hunched": r'\bhunch(?:ed)?\b',
    "flexed": r'flexed posture',
    "rigid": r'rigid posture',
    "contracted": r'contracted posture',
    "poor": r'poor posture',
}

severity_descriptors = [
    r"mild(?:ly)?",
    r"slight(?:ly)?",
    r"somewhat",
    r"fair(?:ly)?",
    r"severe(?:ly)?",
    r"very",
    r"exaggerated",
    r"intens(?:ly|e)?",
]

severity_patterns = [(re.compile(p, flags=re.I), p) for p in severity_descriptors]

# Precompile severity
severity_re = r"(" + "|".join(severity_descriptors) + r")"

# Precompile normal
normal_patterns = [re.compile(p, flags=re.I) for p in normal_posture_descriptors]


# ----- Main function -----

def find_posture_and_severity(text):
    text_l = text.lower()
    results = []

    # ---------- NORMAL POSTURE ----------
    for patt in normal_patterns:
        if patt.search(text_l):
            results.append({"type": "normal"})
            break  # only add once regardless of multiple matches

    # ---------- ABNORMAL POSTURE ----------
    for label, desc_pat in abnormal_posture_descriptors.items():

        # severity before descriptor
        combined = re.compile(rf"{severity_re}\s+{desc_pat}", re.I)
        simple = re.compile(desc_pat, re.I)

        # First: severity + descriptor
        for m in combined.finditer(text_l):
            severity = m.group(1)
            results.append({
                "type": "abnormal",
                "descriptor": label,
                "severity": severity
            })

        # If none with severity, add simple descriptor
        if not any(r.get("descriptor") == label for r in results):
            for m in simple.finditer(text_l):
                results.append({
                    "type": "abnormal",
                    "descriptor": label,
                    "severity": None
                })

    return results or None





In [5]:
data_dir = "../data/mimic-iv/physionet.org/files/mimic-iv-note/2.2/note/"
data_file = "discharge.csv.gz"

data = pd.read_csv(os.path.join(data_dir, data_file))

In [11]:
# Apply to DataFrame
data["posture_labels"] = data["text"].apply(find_posture_and_severity)


Unnamed: 0,note_id,subject_id,hadm_id,note_type,note_seq,charttime,storetime,text,posture_labels
0,10000032-DS-21,10000032,22595853,DS,21,2180-05-07 00:00:00,2180-05-09 15:26:00,\nName: ___ Unit No: _...,
1,10000032-DS-22,10000032,22841357,DS,22,2180-06-27 00:00:00,2180-07-01 10:15:00,\nName: ___ Unit No: _...,
2,10000032-DS-23,10000032,29079034,DS,23,2180-07-25 00:00:00,2180-07-25 21:42:00,\nName: ___ Unit No: _...,
3,10000032-DS-24,10000032,25742920,DS,24,2180-08-07 00:00:00,2180-08-10 05:43:00,\nName: ___ Unit No: _...,
4,10000084-DS-17,10000084,23052089,DS,17,2160-11-25 00:00:00,2160-11-25 15:09:00,\nName: ___ Unit No: __...,
...,...,...,...,...,...,...,...,...,...
331788,19999828-DS-6,19999828,29734428,DS,6,2147-08-04 00:00:00,2147-08-12 15:36:00,\nName: ___ Unit No: ___...,
331789,19999828-DS-7,19999828,25744818,DS,7,2149-01-18 00:00:00,2149-01-19 07:03:00,\nName: ___ Unit No: ___...,
331790,19999840-DS-20,19999840,26071774,DS,20,2164-07-28 00:00:00,2164-07-29 14:52:00,\nName: ___ Unit No: ___\...,
331791,19999840-DS-21,19999840,21033226,DS,21,2164-09-17 00:00:00,2164-09-18 01:36:00,\nName: ___ Unit No: ___\...,


In [16]:
filtered_data = data[data['posture_labels'].notna()]
filtered_data.to_csv("../output/labeled_posture.csv", index=False)

In [18]:
from collections import Counter
import pandas as pd

# Flatten results into a list of descriptors
abnormal_list = []

for labels in filtered_data["posture_labels"].dropna():
    for item in labels:
        if item["type"] == "abnormal":
            abnormal_list.append(item["descriptor"])

descriptor_counts = pd.Series(Counter(abnormal_list)).sort_values(ascending=False)
print(descriptor_counts)

kyphotic      867
stooped       323
hunched       308
flexed         30
poor           16
rigid           7
contracted      6
dtype: int64


In [38]:
data = pd.read_csv("../output/labeled_posture.csv")
print(len(data['subject_id'].unique()))

2015


In [48]:
data_dir = "../data/mimic-iv/physionet.org/files/mimiciv/3.1/hosp/"
admission_file = "admissions.csv.gz"
posture_subject_ids = data['subject_id'].unique()

admission_data = pd.read_csv(os.path.join(data_dir, admission_file))
print(labeled_data['text'])

KeyError: 0