# Posture-Based Random Forest Model

Train a Random Forest regressor that predicts hospital stay duration from posture-derived labels and patient metadata.

## Imports and notebook configuration

In [25]:
import ast
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

# Notebook-wide pandas display settings: never truncate the column list, and
# render floats with thousands separators and two decimal places.
pd.options.display.max_columns = None
pd.options.display.float_format = '{:,.2f}'.format


## Load the abridged posture dataset

In [26]:
# Location of the preprocessed CSV, relative to the notebook's working directory.
data_path = Path('../output/posture_data_preprocessed_abridged.csv')

# Fail fast with an explicit message instead of letting read_csv surface the error.
if data_path.exists():
    df = pd.read_csv(data_path)
else:
    raise FileNotFoundError(f'Data file not found at {data_path!s}.')

print(f'Source rows: {len(df):,}')
df.head()


Source rows: 2,249


Unnamed: 0,subject_id,hadm_id,gender,age,posture_labels,hospital_stay_duration,90_day_mort
0,10002930,22733922,F,53,{'normal'},11,False
1,10005606,29646384,M,38,{'kyphotic'},10,False
2,10007920,26693451,M,56,{'hunched'},2,False
3,10015487,23914645,M,52,{'normal'},24,False
4,10026754,22691839,F,58,{'normal'},9,False


## Feature engineering

* Parse the set-like `posture_labels` column into a single categorical label (`posture_label`).
* Keep the core predictors (gender, age, posture) and drop rows that are missing any required value.


In [27]:
def extract_primary_label(value: object) -> str:
    """Return one normalized posture label for a serialized or in-memory collection.

    Strings are stripped and parsed with ``ast.literal_eval``; text that is not a
    valid Python literal is treated as a bare label. Every returned label is
    whitespace-stripped and lower-cased.

    Selection rules:

    * set / frozenset: the alphabetically smallest label. Set iteration order
      depends on string hashing, which can change between interpreter runs, so
      "the first element" would not be reproducible.
    * list / tuple: the first non-empty label, in the given order.
    * any other literal parsed from text (e.g. a quoted string or a number):
      its string form.

    Args:
        value: Raw cell value, typically a string such as ``"{'normal'}"``,
            but sets, lists, tuples and missing values are accepted too.

    Returns:
        The selected label, or ``'unknown'`` for blank text, empty containers,
        and values that are neither strings nor containers (e.g. NaN).
    """
    parsed_from_text = False
    if isinstance(value, str):
        text = value.strip()
        if not text:
            return 'unknown'
        try:
            value = ast.literal_eval(text)
        except (SyntaxError, ValueError, TypeError):
            # Not a Python literal (e.g. a bare word like ``normal``, or a set
            # containing an unhashable element): use the text itself.
            return text.lower()
        parsed_from_text = True

    if isinstance(value, (set, frozenset)):
        # Sort so the choice is deterministic regardless of hash ordering.
        labels = sorted(str(item).strip().lower() for item in value)
    elif isinstance(value, (list, tuple)):
        labels = [str(item).strip().lower() for item in value]
    elif parsed_from_text:
        # A scalar or mapping literal parsed from text: keep its string form.
        return str(value).strip().lower()
    else:
        return 'unknown'

    # Drop blank entries so an empty container never yields a bogus category.
    labels = [label for label in labels if label]
    return labels[0] if labels else 'unknown'

# Collapse each serialized label collection into one categorical value.
# Note: values that extract_primary_label cannot interpret come back as the
# string 'unknown', not NaN, so the dropna below does not remove those rows.
df['posture_label'] = df['posture_labels'].map(extract_primary_label)

required_columns = ['gender', 'age', 'posture_label', 'hospital_stay_duration']

# Keep only rows with every required field present; copy so later edits to
# df_model never write back into df.
complete_rows = df.dropna(subset=required_columns)
df_model = complete_rows.copy()

print(f'Rows available for modeling: {len(df_model):,}')
df_model.loc[:, required_columns].describe(include='all')


Rows available for modeling: 2,249


Unnamed: 0,gender,age,posture_label,hospital_stay_duration
count,2249,2249.0,2249,2249.0
unique,2,,22,
top,F,,normal,
freq,1267,,953,
mean,,57.44,,7.65
std,,23.01,,11.35
min,,18.0,,0.0
25%,,37.0,,2.0
50%,,59.0,,5.0
75%,,78.0,,9.0


## Train/test split (80/20) and Random Forest training

In [28]:
# Column roles. The ColumnTransformer below selects columns by name.
feature_columns = ['gender', 'age', 'posture_label']
target_column = 'hospital_stay_duration'
categorical_features = ['gender', 'posture_label']
numeric_features = ['age']

X = df_model[feature_columns]
y = df_model[target_column]

# Hold out 20% of the rows; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

# One-hot encode the categorical columns and pass age through unchanged.
# handle_unknown='ignore' lets prediction proceed on categories not seen in training.
preprocess = ColumnTransformer(
    transformers=[
        ('categorical', OneHotEncoder(handle_unknown='ignore'), categorical_features),
        ('numeric', 'passthrough', numeric_features),
    ],
    verbose_feature_names_out=False,
)

rf_regressor = RandomForestRegressor(
    n_estimators=400,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1,
)

rf_pipeline = Pipeline(
    steps=[
        ('preprocess', preprocess),
        ('model', rf_regressor),
    ]
)

# Fit on the training rows, then score the held-out rows.
predictions = rf_pipeline.fit(X_train, y_train).predict(X_test)

mae = mean_absolute_error(y_test, predictions)
mse = mean_squared_error(y_test, predictions)
r2 = r2_score(y_test, predictions)
rmse = np.sqrt(mse)  # same unit (days) as the target

print(
    'Random Forest performance (held-out 20% test set)',
    f'  MAE : {mae:,.2f} days',
    f'  RMSE: {rmse:,.2f} days',
    f'  R^2 : {r2:,.3f}',
    sep='\n',
)


Random Forest performance (held-out 20% test set)
  MAE : 6.83 days
  RMSE: 12.46 days
  R^2 : -0.104


## Feature importance

Inspect which encoded features the Random Forest relied on the most.

In [29]:
# Retrieve the fitted steps from the pipeline by name.
preprocessor, model = (
    rf_pipeline.named_steps[step_name] for step_name in ('preprocess', 'model')
)

# Names of the encoded columns, in the order the model received them.
feature_names = preprocessor.get_feature_names_out()

# Pair each encoded column with its importance and rank from highest to lowest.
feature_importances = pd.DataFrame(
    {'feature': feature_names, 'importance': model.feature_importances_}
).sort_values(by='importance', ascending=False, ignore_index=True)

feature_importances.head(10)


Unnamed: 0,feature,importance
0,age,0.7
1,posture_label_normal,0.08
2,gender_M,0.05
3,gender_F,0.05
4,posture_label_kyphotic,0.04
5,posture_label_hunched,0.03
6,posture_label_stooped,0.02
7,posture_label_flexed,0.01
8,posture_label_slight stooped,0.01
9,posture_label_mild kyphotic,0.0


## Using the trained model for new patients

In [30]:
# Two illustrative patients, one per row, using the same columns the pipeline was trained on.
example_patients = pd.DataFrame(
    {
        'gender': ['F', 'M'],
        'age': [72, 55],
        'posture_label': ['kyphotic', 'normal'],
    }
)

# Predicted hospital stay duration for each example row.
rf_pipeline.predict(example_patients)


array([4.08312919, 6.08004428])