# Posture-Based Random Forest (Age Cohorts)

Train Random Forest regressors that model hospital stay duration and surface feature importance overall and within age cohorts.


## Imports and notebook configuration

In [1]:
import ast
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

pd.set_option('display.max_columns', None)
pd.set_option('display.float_format', '{:,.2f}'.format)


## Load the abridged posture dataset

In [2]:
data_path = Path('../output/posture_data_preprocessed_abridged.csv')
if not data_path.exists():
    raise FileNotFoundError(f'Data file not found at {data_path!s}.')

df = pd.read_csv(data_path)
print(f'Source rows: {len(df):,}')
df.head()


Source rows: 2,249


Unnamed: 0,subject_id,hadm_id,gender,age,posture_labels,hospital_stay_duration,90_day_mort
0,10002930,22733922,F,53,{'normal'},11,False
1,10005606,29646384,M,38,{'kyphotic'},10,False
2,10007920,26693451,M,56,{'hunched'},2,False
3,10015487,23914645,M,52,{'normal'},24,False
4,10026754,22691839,F,58,{'normal'},9,False


## Feature engineering

* Parse the set-like `posture_labels` column to a single categorical label.
* Introduce broad age cohorts (<=40, 41-65, 66+) to stratify downstream analysis.
* Keep the core predictors (gender, age, posture) and drop rows missing required values.


In [3]:
def extract_primary_label(value: object) -> str:
    """Return the first posture label found in the serialized set/iterable."""
    if isinstance(value, str):
        value = value.strip()
        if not value:
            return 'unknown'
        try:
            parsed = ast.literal_eval(value)
        except (SyntaxError, ValueError):
            return value.lower()
        if isinstance(parsed, (set, list, tuple)) and parsed:
            first = next(iter(parsed))
            return str(first).lower()
        return str(parsed).lower()
    if isinstance(value, (set, list, tuple)) and value:
        return str(next(iter(value))).lower()
    return 'unknown'

df['posture_label'] = df['posture_labels'].apply(extract_primary_label)

required_columns = ['gender', 'age', 'posture_label', 'hospital_stay_duration']
df_model = df.dropna(subset=required_columns).copy()

age_bins = [0, 40, 65, 120]
age_labels = ['<=40', '41-65', '66+']
df_model = df_model[df_model['age'].between(age_bins[0], age_bins[-1], inclusive='both')]
df_model['age_cohort'] = pd.cut(
    df_model['age'], bins=age_bins, labels=age_labels, include_lowest=True, right=True
)

print(f'Rows available for modeling: {len(df_model):,}')
print('Age cohort counts:')
print(df_model['age_cohort'].value_counts().sort_index())

summary_columns = ['gender', 'age', 'posture_label', 'hospital_stay_duration']
df_model[summary_columns].describe(include='all')


Rows available for modeling: 2,249
Age cohort counts:
age_cohort
<=40     627
41-65    700
66+      922
Name: count, dtype: int64


Unnamed: 0,gender,age,posture_label,hospital_stay_duration
count,2249,2249.0,2249,2249.0
unique,2,,23,
top,F,,normal,
freq,1267,,938,
mean,,57.44,,7.65
std,,23.01,,11.35
min,,18.0,,0.0
25%,,37.0,,2.0
50%,,59.0,,5.0
75%,,78.0,,9.0


## Train/test split (80/20) and Random Forest training

In [4]:
feature_columns = ['gender', 'age', 'posture_label']
target_column = 'hospital_stay_duration'

categorical_features = ['gender', 'posture_label']
numeric_features = ['age']

def build_rf_pipeline() -> Pipeline:
    preprocess = ColumnTransformer(
        transformers=[
            ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_features),
            ('numeric', 'passthrough', numeric_features),
        ],
        verbose_feature_names_out=False,
    )

    rf_regressor = RandomForestRegressor(
        n_estimators=400,
        random_state=42,
        n_jobs=-1,
        min_samples_leaf=2,
    )

    return Pipeline(steps=[
        ('preprocess', preprocess),
        ('model', rf_regressor),
    ])


def train_and_evaluate(X, y, *, test_size=0.2, random_state=42):
    pipeline = build_rf_pipeline()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, shuffle=True
    )
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)

    metrics = {
        'train_rows': len(y_train),
        'test_rows': len(y_test),
        'mae': mean_absolute_error(y_test, predictions),
        'rmse': np.sqrt(mean_squared_error(y_test, predictions)),
        'r2': r2_score(y_test, predictions),
    }
    return pipeline, metrics


X = df_model[feature_columns]
y = df_model[target_column]
rf_pipeline, global_metrics = train_and_evaluate(X, y)

print('Random Forest performance (held-out 20% test set)')
print(f"  Train rows: {global_metrics['train_rows']:,}")
print(f"  Test rows : {global_metrics['test_rows']:,}")
print(f"  MAE       : {global_metrics['mae']:,.2f} days")
print(f"  RMSE      : {global_metrics['rmse']:,.2f} days")
print(f"  R^2       : {global_metrics['r2']:,.3f}")


Random Forest performance (held-out 20% test set)
  Train rows: 1,799
  Test rows : 450
  MAE       : 6.88 days
  RMSE      : 12.51 days
  R^2       : -0.112


## Feature importance

Inspect which encoded features the Random Forest relied on the most.

In [5]:
preprocessor = rf_pipeline.named_steps['preprocess']
model = rf_pipeline.named_steps['model']
feature_names = preprocessor.get_feature_names_out()
feature_importances = (
    pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_,
    })
    .sort_values(by='importance', ascending=False)
    .reset_index(drop=True)
)
feature_importances.head(10)


Unnamed: 0,feature,importance
0,age,0.7
1,posture_label_normal,0.07
2,gender_M,0.05
3,gender_F,0.05
4,posture_label_kyphotic,0.04
5,posture_label_hunched,0.04
6,posture_label_stooped,0.03
7,posture_label_flexed,0.01
8,posture_label_slight stooped,0.01
9,posture_label_mild kyphotic,0.0


## Age cohort analysis

Train separate Random Forest models within each age cohort to compare performance and surface which encoded variables carry the most weight.


In [6]:
cohort_results = []
cohort_importances = []
min_rows = 80  # ensure enough samples for a stable 80/20 split

for cohort_label, cohort_df in df_model.groupby('age_cohort', dropna=True):
    cohort_label = str(cohort_label)
    if len(cohort_df) < min_rows:
        print(f'Skipping cohort {cohort_label} (only {len(cohort_df)} rows).')
        continue

    X_cohort = cohort_df[feature_columns]
    y_cohort = cohort_df[target_column]
    pipeline, metrics = train_and_evaluate(X_cohort, y_cohort, random_state=42)

    cohort_results.append({
        'age_cohort': cohort_label,
        'rows': len(cohort_df),
        **{k: v for k, v in metrics.items() if k not in {'train_rows', 'test_rows'}}
    })

    feature_names = pipeline.named_steps['preprocess'].get_feature_names_out()
    importances = (
        pd.DataFrame({
            'feature': feature_names,
            'importance': pipeline.named_steps['model'].feature_importances_,
        })
        .sort_values(by='importance', ascending=False)
        .reset_index(drop=True)
    )
    importances['age_cohort'] = cohort_label
    cohort_importances.append(importances)

if cohort_results:
    cohort_metrics = pd.DataFrame(cohort_results).sort_values('age_cohort')
    display(cohort_metrics)
else:
    print('No cohorts met the minimum row threshold for training.')

if cohort_importances:
    combined = pd.concat(cohort_importances, ignore_index=True)
    top_features = (
        combined.sort_values(by=['age_cohort', 'importance'], ascending=[True, False])
        .groupby('age_cohort', group_keys=True)
        .head(5)
        .reset_index(drop=True)
    )
    display(top_features)
else:
    print('No feature importances to display.')


  for cohort_label, cohort_df in df_model.groupby('age_cohort', dropna=True):


Unnamed: 0,age_cohort,rows,mae,rmse,r2
1,41-65,700,8.63,13.35,-0.29
2,66+,922,5.18,8.81,0.01
0,<=40,627,5.29,7.75,-0.25


Unnamed: 0,feature,importance,age_cohort
0,age,0.57,41-65
1,posture_label_normal,0.09,41-65
2,gender_F,0.08,41-65
3,gender_M,0.08,41-65
4,posture_label_kyphotic,0.06,41-65
5,age,0.58,66+
6,posture_label_normal,0.13,66+
7,gender_M,0.07,66+
8,gender_F,0.07,66+
9,posture_label_kyphotic,0.05,66+


## Using the trained model for new patients

In [7]:
example_patients = pd.DataFrame([
    {'gender': 'F', 'age': 72, 'posture_label': 'kyphotic'},
    {'gender': 'M', 'age': 55, 'posture_label': 'normal'},
])
rf_pipeline.predict(example_patients)


array([4.09612603, 6.06582701])