# Posture-Based Random Forest (90-Day Mortality)

Train a Random Forest classifier that predicts 90-day mortality from posture-derived labels plus basic demographics.


## Imports and notebook configuration

In [15]:
import ast
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import StratifiedKFold, cross_validate, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Notebook-wide display settings: never truncate columns when rendering a
# frame, and show floats with thousands separators and two decimals.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:,.2f}'.format


## Load the abridged posture dataset

In [16]:
# Preprocessed posture extract, addressed relative to the notebook's directory.
data_path = Path('..') / 'output' / 'posture_data_preprocessed_abridged.csv'

# Fail early with an explicit message when the extract has not been generated.
if not data_path.exists():
    raise FileNotFoundError(f'Data file not found at {data_path!s}.')

df = pd.read_csv(data_path)
print(f'Source rows: {len(df):,}')

# Last expression of the cell: rich preview of the first rows.
df.head()


Source rows: 2,249


Unnamed: 0,subject_id,hadm_id,gender,age,posture_labels,hospital_stay_duration,90_day_mort
0,10002930,22733922,F,53,{'normal'},11,False
1,10005606,29646384,M,38,{'kyphotic'},10,False
2,10007920,26693451,M,56,{'hunched'},2,False
3,10015487,23914645,M,52,{'normal'},24,False
4,10026754,22691839,F,58,{'normal'},9,False


## Feature engineering

* Parse the set-like `posture_labels` column to a single categorical label.
* Convert `90_day_mort` to a binary target and drop rows missing the required predictors.


In [17]:
def extract_primary_label(value: object) -> str:
    """Reduce a (possibly serialized) collection of posture labels to one label.

    Args:
        value: Either a string such as ``"{'kyphotic'}"`` (a Python literal
            written out as text), a bare label such as ``"kyphotic"``, or an
            actual ``set``/``frozenset``/``list``/``tuple`` of labels.

    Returns:
        A stripped, lower-cased label. For ordered collections (list/tuple)
        this is the first element. For sets it is the alphabetically first
        label, because set iteration order for strings is not stable between
        interpreter runs. ``'unknown'`` is returned for blank strings, empty
        collections, and any other input (for example a missing value).
    """
    if isinstance(value, str):
        text = value.strip()
        # ``set()`` is the text form of an empty set; handle it up front since
        # ast.literal_eval only accepts that form on newer Python versions.
        if not text or text == 'set()':
            return 'unknown'
        try:
            parsed = ast.literal_eval(text)
        except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
            # Not a Python literal (e.g. the bare word ``kyphotic``): the text
            # itself is the label.
            return text.lower()
        if not isinstance(parsed, (set, frozenset, list, tuple)):
            # Scalar literal such as a quoted string: use its text form.
            return str(parsed).lower()
        value = parsed

    if isinstance(value, (set, frozenset)):
        # Sort so the chosen label does not depend on hash ordering.
        labels = sorted(str(item).strip().lower() for item in value)
        return labels[0] if labels else 'unknown'
    if isinstance(value, (list, tuple)):
        return str(value[0]).strip().lower() if value else 'unknown'
    return 'unknown'


def mortality_to_int(value: object) -> float:
    """Map a raw 90-day mortality flag to 1 (died) or 0 (survived).

    Args:
        value: A boolean, a number, or a textual flag such as ``'True'``,
            ``'f'``, ``'yes'`` or ``'0'`` (case and surrounding whitespace are
            ignored).

    Returns:
        The integer ``1`` or ``0`` for recognized values. ``float('nan')`` is
        returned for missing values (``None``/NaN/``pd.NA``) and for text that
        is not a recognized flag, so that a later ``dropna`` on the target
        column removes those rows instead of counting them as deaths.
    """
    missing = float('nan')

    if isinstance(value, str):
        token = value.strip().lower()
        if token in {'true', '1', '1.0', 't', 'y', 'yes'}:
            return 1
        if token in {'false', '0', '0.0', 'f', 'n', 'no'}:
            return 0
        # Blank or unrecognized text: report as missing rather than guess.
        return missing

    # bool(NaN) is True, so missing values must be caught before the cast below.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return missing

    return int(bool(value))

# Derive the two modeling columns on the loaded frame: a single posture label
# per row and a numeric mortality flag.
df['posture_label'] = df['posture_labels'].map(extract_primary_label)
df['mortality_90d'] = df['90_day_mort'].map(mortality_to_int)

# Keep only rows where every required column is present; copy so later edits
# to the modeling frame never touch the loaded frame.
required_columns = ['gender', 'age', 'posture_label', 'mortality_90d']
df_model = df[df[required_columns].notna().all(axis=1)].copy()

print(f'Rows available for modeling: {len(df_model):,}')
print('90-day mortality distribution:')
print(df_model['mortality_90d'].value_counts().rename(index={0: 'survived', 1: 'died'}))

# Summary statistics for the same columns, numeric and categorical alike.
summary_columns = list(required_columns)
df_model[summary_columns].describe(include='all')


Rows available for modeling: 2,249
90-day mortality distribution:
mortality_90d
survived    2104
died         145
Name: count, dtype: int64


Unnamed: 0,gender,age,posture_label,mortality_90d
count,2249,2249.0,2249,2249.0
unique,2,,23,
top,F,,normal,
freq,1267,,941,
mean,,57.44,,0.06
std,,23.01,,0.25
min,,18.0,,0.0
25%,,37.0,,0.0
50%,,59.0,,0.0
75%,,78.0,,0.0


## Train/test split (80/20) and Random Forest training

In [18]:
# Predictors and target for the 90-day mortality classifier.
feature_columns = ['gender', 'age', 'posture_label']
target_column = 'mortality_90d'

X = df_model[feature_columns]
y = df_model[target_column]

# Hold out 20% of the rows; stratifying on y keeps the class proportions the
# same in the training and test portions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

categorical_features = ['gender', 'posture_label']
numeric_features = ['age']

# One-hot encode the categorical columns and pass age through unchanged.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros
# instead of raising at prediction time.
preprocess = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('numeric', 'passthrough', numeric_features),
    ],
    verbose_feature_names_out=False,
)

# class_weight='balanced' reweights the classes inversely to their frequency
# in the training data.
rf_classifier = RandomForestClassifier(
    n_estimators=500,
    random_state=42,
    n_jobs=-1,
    min_samples_leaf=2,
    class_weight='balanced',
)

# Bundle preprocessing and model so the encoder is fitted on training rows only.
rf_pipeline = Pipeline(steps=[
    ('preprocess', preprocess),
    ('model', rf_classifier),
])

rf_pipeline.fit(X_train, y_train)
predictions = rf_pipeline.predict(X_test)
# Second column of predict_proba for the held-out rows; not used by the
# metrics reported in this cell.
probabilities = rf_pipeline.predict_proba(X_test)[:, 1]

accuracy = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions, zero_division=0)
recall = recall_score(y_test, predictions, zero_division=0)
f1 = f1_score(y_test, predictions, zero_division=0)

print('Random Forest classification performance (held-out 20% test set)')
print(f'  Accuracy : {accuracy:,.3f}')
print(f'  Precision: {precision:,.3f}')
print(f'  Recall   : {recall:,.3f}')
print(f'  F1-score : {f1:,.3f}')

print('\nClassification report:\n')
print(classification_report(y_test, predictions, digits=3))

# Pin the label order so the matrix is always 2x2 (row/column 0 then 1) and
# matches the hard-coded axis names below, even if one class is absent from
# both y_test and the predictions.
cm = confusion_matrix(y_test, predictions, labels=[0, 1])
cm_df = pd.DataFrame(
    cm,
    index=pd.Index(['Actual 0', 'Actual 1'], name='True'),
    columns=pd.Index(['Pred 0', 'Pred 1'], name='Predicted'),
)
display(cm_df)


Random Forest classification performance (held-out 20% test set)
  Accuracy : 0.811
  Precision: 0.122
  Recall   : 0.310
  F1-score : 0.175

Classification report:

              precision    recall  f1-score   support

           0      0.947     0.846     0.893       421
           1      0.122     0.310     0.175        29

    accuracy                          0.811       450
   macro avg      0.534     0.578     0.534       450
weighted avg      0.894     0.811     0.847       450



Predicted,Pred 0,Pred 1
True,Unnamed: 1_level_1,Unnamed: 2_level_1
Actual 0,356,65
Actual 1,20,9


## Rigorous accuracy evaluation

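Stratified 5-fold cross-validation scores the classifier on five non-overlapping folds (about 450 rows each) and reports accuracy, precision, recall, F1, and ROC AUC, giving a more robust estimate of mortality-prediction performance than the single 80/20 split. Folds are assigned by row, so they are patient-disjoint only if each `subject_id` appears once in the dataset.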


In [19]:
# Seeded, shuffled stratified folds: reproducible splits that keep the class
# ratio in every fold.
# NOTE(review): folds are assigned per row; confirm each patient appears only
# once, otherwise rows from one patient can land in both train and test folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Each name doubles as the sklearn scoring string and the result column label.
scoring = {
    name: name
    for name in ('accuracy', 'precision', 'recall', 'f1', 'roc_auc')
}

cv_results = cross_validate(
    rf_pipeline,
    X,
    y,
    scoring=scoring,
    cv=cv,
    n_jobs=-1,
    return_train_score=False,
)

# One row per fold, one column per metric.
fold_metrics = pd.DataFrame(
    {metric: cv_results[f'test_{metric}'] for metric in scoring},
    index=[f'fold_{i}' for i in range(1, cv.get_n_splits() + 1)],
)

# Mean and standard deviation of every metric across the folds.
summary = fold_metrics.agg(['mean', 'std'])

print(f'Evaluated {len(y)} patients via stratified 5-fold cross-validation.')
display(fold_metrics, summary)


Evaluated 2249 patients via stratified 5-fold cross-validation.


Unnamed: 0,accuracy,precision,recall,f1,roc_auc
fold_1,0.83,0.16,0.38,0.22,0.75
fold_2,0.8,0.14,0.41,0.21,0.8
fold_3,0.85,0.21,0.48,0.29,0.83
fold_4,0.8,0.16,0.48,0.24,0.78
fold_5,0.81,0.15,0.45,0.23,0.8


Unnamed: 0,accuracy,precision,recall,f1,roc_auc
mean,0.82,0.16,0.44,0.24,0.79
std,0.02,0.03,0.04,0.03,0.03


## Feature importance

Inspect which encoded features the Random Forest relied on the most.

In [20]:
# Pull the fitted steps back out of the pipeline by name.
preprocessor = rf_pipeline['preprocess']
model = rf_pipeline['model']

# Encoded column names, in the order the forest received them.
feature_names = preprocessor.get_feature_names_out()

# Pair each encoded feature with its importance, largest first, with a fresh
# 0..n-1 index.
feature_importances = pd.DataFrame(
    {'feature': feature_names, 'importance': model.feature_importances_}
).sort_values('importance', ascending=False, ignore_index=True)

feature_importances.head(10)


Unnamed: 0,feature,importance
0,age,0.63
1,posture_label_normal,0.12
2,posture_label_kyphotic,0.11
3,posture_label_stooped,0.03
4,posture_label_flexed,0.02
5,posture_label_hunched,0.02
6,gender_M,0.01
7,gender_F,0.01
8,posture_label_mild kyphotic,0.01
9,posture_label_very kyphotic,0.01


## Using the trained model for new patients

In [21]:
# Two illustrative patients described with the same columns used for training.
example_patients = pd.DataFrame({
    'gender': ['F', 'M'],
    'age': [72, 55],
    'posture_label': ['kyphotic', 'normal'],
})

# Second column of predict_proba, as in the held-out evaluation.
probabilities = rf_pipeline.predict_proba(example_patients)[:, 1]

# Attach the probabilities on a copy so the input frame stays untouched.
results = example_patients.assign(mortality_prob=probabilities)
results


Unnamed: 0,gender,age,posture_label,mortality_prob
0,F,72,kyphotic,0.77
1,M,55,normal,0.02
