In [77]:
import pandas as pd
import numpy as np
import os 
import sys
sys.path.append(os.path.abspath('..'))
import utils.utils as utils
from sklearn.preprocessing import MinMaxScaler

train = pd.read_csv('../data/mimic-iv-private/anchor_year_group_datasets/2014_-_2016/train_dataset.csv')
test = pd.read_csv('../data/mimic-iv-private/anchor_year_group_datasets/2017_-_2019/test_dataset.csv')

In [78]:
len(train['pain'].unique())

346

Normalize Vital Signs

In [79]:

# List of vital signs columns
vital_signs_cols = ['temperature', 'heartrate', 'resprate', 'o2sat', 'sbp', 'dbp', 'pain']

# Ensure vital signs are numeric and handle missing values
train[vital_signs_cols] = train[vital_signs_cols].apply(pd.to_numeric, errors='coerce')
test[vital_signs_cols] = test[vital_signs_cols].apply(pd.to_numeric, errors='coerce')

In [56]:
train['pain'].unique()

array([ 0.  ,  7.  , 10.  ,  6.  , 13.  ,   nan,  2.  ,  3.  ,  8.  ,
        9.  ,  5.  ,  1.  ,  4.  ,  1.5 ,  8.59,  8.6 ,  0.5 ,  7.5 ,
       11.  ,  4.5 ,  9.5 ,  6.5 ])

Fill in NAs

In [80]:
train[vital_signs_cols] = train[vital_signs_cols].fillna(train[vital_signs_cols].mean())
test[vital_signs_cols] = test[vital_signs_cols].fillna(test[vital_signs_cols].mean())

# Normalize vital signs using Min-Max scaling
scaler = MinMaxScaler()
vital_signs_normalized = scaler.fit_transform(train[vital_signs_cols])
vital_signs_normalized_test = scaler.fit_transform(test[vital_signs_cols])
print(f"NaN in embeddings: {np.isnan(vital_signs_normalized).any()}")
train[vital_signs_cols] = vital_signs_normalized
test[vital_signs_cols] = vital_signs_normalized_test


NaN in embeddings: False


In [81]:

train['gender'] = train['gender'].map({'F': 0, 'M': 1})
test['gender'] = test['gender'].map({'F': 0, 'M': 1})

In [82]:
from sklearn.preprocessing import OneHotEncoder

# Create the encoder; set sparse=False to get a dense array and handle_unknown to ignore unseen categories.
encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# Fit the encoder on the train data for the specified columns
train_categories = train[['arrival_transport', 'race']]
test_categories = test[['arrival_transport', 'race']]

encoder.fit(train_categories)

# Transform both train and test data
train_encoded = encoder.transform(train_categories)
test_encoded = encoder.transform(test_categories)

# Get the new column names from the encoder
encoded_columns = encoder.get_feature_names_out(['arrival_transport', 'race'])

# Convert the arrays to DataFrames with the appropriate index
train_encoded_df = pd.DataFrame(train_encoded, columns=encoded_columns, index=train.index)
test_encoded_df = pd.DataFrame(test_encoded, columns=encoded_columns, index=test.index)

# Drop the original columns and join the new dummy columns back to train and test
train = train.drop(columns=['arrival_transport', 'race']).join(train_encoded_df)
test = test.drop(columns=['arrival_transport', 'race']).join(test_encoded_df)

In [83]:
train = train.drop(columns=['subject_id', 'stay_id','chiefcomplaint','anchor_year_group','has_null'])
test = test.drop(columns=['subject_id', 'stay_id','chiefcomplaint','anchor_year_group','has_null'])

In [84]:
# import re

# # Define a function to extract numeric values or map text descriptions
# def clean_pain(value):
#     # Attempt to extract numeric values using regex
#     match = re.search(r'\d+(\.\d+)?', str(value))  # Matches numbers like '8', '9.5'
#     if match:
#         return float(match.group())
#     # Map text descriptions to numeric values
#     text_mapping = {
#         'none': 0, 'mild': 2, 'moderate': 5, 'severe': 8, 'very bad': 10,
#         'unbearable': 10, 'uncomfortable': 4, 'not bad': 1
#     }
#     value_lower = str(value).lower()
#     for key, num in text_mapping.items():
#         if key in value_lower:
#             return num
#     # Return NaN for unprocessable values
#     return None

# # Apply the function to the pain column
# train['pain'] = train['pain'].apply(clean_pain)
# test['pain'] = test['pain'].apply(clean_pain)
# # Fill missing values with the median pain value
# train['pain'] = train['pain'].fillna(train['pain'].median())
# test['pain'] = test['pain'].fillna(test['pain'].median())

#convert acuity to int
train['acuity'] = train['acuity'].apply(lambda x: int(x))
test['acuity'] = test['acuity'].apply(lambda x: int(x))
# Adjust class labels to start from 0
train['acuity'] = train['acuity'] - 1
test['acuity'] = test['acuity'] - 1


In [85]:
train['race_AMERICAN INDIAN/ALASKA NATIVE'].unique()

array([0., 1.])

In [87]:
train

Unnamed: 0,temperature,heartrate,resprate,o2sat,sbp,dbp,pain,acuity,gender,anchor_age,...,race_PATIENT DECLINED TO ANSWER,race_PORTUGUESE,race_SOUTH AMERICAN,race_UNABLE TO OBTAIN,race_UNKNOWN,race_WHITE,race_WHITE - BRAZILIAN,race_WHITE - EASTERN EUROPEAN,race_WHITE - OTHER EUROPEAN,race_WHITE - RUSSIAN
0,0.097900,0.467811,0.008348,0.099305,0.000761,0.001490,0.074627,2,0,28.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.098407,0.407725,0.009461,0.096326,0.000993,0.001475,0.000000,1,1,43.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.100132,0.330472,0.008348,0.095333,0.000940,0.001352,0.074627,2,0,48.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.099117,0.356223,0.009461,0.099305,0.000993,0.001336,0.052239,2,1,63.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.099117,0.399142,0.010574,0.098312,0.001026,0.001367,0.059701,2,1,60.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
104863,0.099117,0.321888,0.008348,0.099305,0.000907,0.001183,0.052239,3,0,22.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
104864,0.099138,0.362458,0.009203,0.097871,0.000892,0.001232,0.031683,0,0,67.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
104865,0.102059,0.493562,0.009461,0.092354,0.001019,0.001290,0.059701,2,1,60.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
104866,0.099117,0.454936,0.010574,0.097871,0.000834,0.001232,0.074627,0,1,58.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


### Logistic Regression

In [84]:
# Logistic Regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression


X_train = train.drop(columns=['acuity'])
y_train = train['acuity']
X_test = test.drop(columns=['acuity'])
y_test = test['acuity']

# Subsets of training data
train_sizes = [0.01, 0.1, 1.0]  # 1%, 10%, 100%
metrics_results = []

# Train and evaluate for each subset
for size in train_sizes:
    if size < 1.0:
        # Subset the training data for 1% or 10%
        X_train_subset, _, y_train_subset, _ = train_test_split(X_train, y_train, train_size=size, random_state=42)
    else:
        # Use the entire training set for 100%
        X_train_subset, y_train_subset = X_train, y_train
    
    # Train logistic regression
    model = LogisticRegression(max_iter=1000, random_state=42, n_jobs=4)
    model.fit(X_train_subset, y_train_subset)
    
    # Make predictions
    y_pred = model.predict(X_test)

    # Store results
    evaluation_metrics = utils.evaluate_predictions(y_pred, y_test, ordinal=True,flexibility=1, by_class=True)

    metrics_results.append({
        "Training Size": f"{int(size * 100)}%",
        "Metrics": evaluation_metrics
    })

# Display results
results_df = pd.DataFrame(metrics_results)
results_df


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    

Unnamed: 0,Training Size,Metrics
0,1%,"{'overall': {'accuracy': 0.5664, 'precision': ..."
1,10%,"{'overall': {'accuracy': 0.57, 'precision': 0...."
2,100%,"{'overall': {'accuracy': 0.5616, 'precision': ..."


In [86]:
results_df.loc[0,'Metrics']

{'overall': {'accuracy': 0.5664,
  'precision': 0.5064142127803721,
  'recall': 0.5664,
  'f1_score': 0.47597480192151725,
  'adjusted_accuracy': 0.97,
  'adjusted_precision': 0.9674674852150139,
  'adjusted_recall': 0.97,
  'adjusted_f1': 0.9614351248487119,
  'mae': 0.4636,
  'mse': 0.5236,
  'quadratic_kappa': 0.12403315745293209},
 'by_class': {'0': {'precision': 0.5714285714285714,
   'recall': 0.046511627906976744,
   'f1-score': 0.08602150537634409,
   'support': 86.0},
  '1': {'precision': 0.4980694980694981,
   'recall': 0.15412186379928317,
   'f1-score': 0.2354014598540146,
   'support': 837.0},
  '2': {'precision': 0.5743061772605192,
   'recall': 0.9210337401292176,
   'f1-score': 0.7074717397298043,
   'support': 1393.0},
  '3': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 177.0},
  '4': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 7.0},
  'accuracy': 0.5664,
  'macro avg': {'precision': 0.3287608493517177,
   'recall': 0.2243334463670954

In [61]:
pd.DataFrame({'yo': np.array(y_pred), 'yo1': 'a'}).groupby('yo').count()

Unnamed: 0_level_0,yo1
yo,Unnamed: 1_level_1
0,519
1,332
2,383
3,618
4,648


In [36]:
pd.DataFrame(y_pred+1).to_csv('lr_pred.csv', index=False)

In [62]:
print(results_df.loc[2,'Metrics'])

{'overall': {'accuracy': 0.2048, 'precision': 0.5185326864809315, 'recall': 0.2048, 'f1_score': 0.2558379230308958, 'adjusted_accuracy': 0.5944, 'adjusted_precision': 0.921639640385146, 'adjusted_recall': 0.5944, 'adjusted_f1': 0.7008622376025149, 'mae': 1.3288, 'mse': 2.6664, 'quadratic_kappa': 0.09429269068929624}, 'by_class': {'0': {'precision': 0.08477842003853564, 'recall': 0.5116279069767442, 'f1-score': 0.14545454545454545, 'support': 86.0}, '1': {'precision': 0.41566265060240964, 'recall': 0.16487455197132617, 'f1-score': 0.23609923011120615, 'support': 837.0}, '2': {'precision': 0.660574412532637, 'recall': 0.18162239770279973, 'f1-score': 0.2849099099099099, 'support': 1393.0}, '3': {'precision': 0.11812297734627832, 'recall': 0.4124293785310734, 'f1-score': 0.18364779874213835, 'support': 177.0}, '4': {'precision': 0.006172839506172839, 'recall': 0.5714285714285714, 'f1-score': 0.012213740458015267, 'support': 7.0}, 'accuracy': 0.2048, 'macro avg': {'precision': 0.2570622600

In [37]:
kate_df = pd.read_csv('../results/Triage-Private-Stratified/Triage-Private-Stratified_CoT_openai-gpt-4o-chat_json_detailed0_2500.csv')

In [43]:
kate_df['lr_pred'] = y_pred+1
kate_df[['Estimated_Acuity','lr_pred','acuity']].to_csv('2preds_1test.csv', index=False)

### XGBoost

In [67]:
from xgboost import XGBClassifier

from sklearn.metrics import (accuracy_score, f1_score, precision_score, recall_score, cohen_kappa_score, 
                             classification_report, mean_absolute_error, mean_squared_error)

# Subsets of training data
train_sizes = [0.01, 0.1, 1.0]  # 1%, 10%, 100%
metrics_results_xgb = []

# Train and evaluate for each subset
for size in train_sizes:
    if size < 1.0:
        # Subset the training data for 1% or 10%
        X_train_subset, _, y_train_subset, _ = train_test_split(X_train, y_train, train_size=size, random_state=42)
    else:
        # Use the entire training set for 100%
        X_train_subset, y_train_subset = X_train, y_train
    
    # Train XGBoost classifier
    model = XGBClassifier(use_label_encoder=False, eval_metric=cohen_kappa_score, random_state=42)
    model.fit(X_train_subset, y_train_subset)
    
    # Make predictions
    y_pred = model.predict(X_test)
    
    # Store results
    evaluation_metrics = utils.evaluate_predictions(y_pred, y_test, ordinal=True,flexibility=1, by_class=True)

    metrics_results_xgb.append({
        "Training Size": f"{int(size * 100)}%",
        "Metrics": evaluation_metrics
    })

# Display results
results_df_xgb = pd.DataFrame(metrics_results_xgb)
results_df_xgb


Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
Parameters: { "use_label_encoder" } are not used.

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.cap

Unnamed: 0,Training Size,Metrics
0,1%,"{'overall': {'accuracy': 0.5648, 'precision': ..."
1,10%,"{'overall': {'accuracy': 0.5884, 'precision': ..."
2,100%,"{'overall': {'accuracy': 0.602, 'precision': 0..."


In [69]:
results_df_xgb.loc[2,'Metrics']

{'overall': {'accuracy': 0.602,
  'precision': 0.5516411100478469,
  'recall': 0.602,
  'f1_score': 0.5497740145090242,
  'adjusted_accuracy': 0.98,
  'adjusted_precision': 0.9774265382757239,
  'adjusted_recall': 0.98,
  'adjusted_f1': 0.9769243508357329,
  'mae': 0.4184,
  'mse': 0.46,
  'quadratic_kappa': 0.31933974771771656},
 'by_class': {'0': {'precision': 0.7454545454545455,
   'recall': 0.47674418604651164,
   'f1-score': 0.5815602836879432,
   'support': 86.0},
  '1': {'precision': 0.56,
   'recall': 0.3010752688172043,
   'f1-score': 0.3916083916083916,
   'support': 837.0},
  '2': {'precision': 0.6075187969924812,
   'recall': 0.8700646087580761,
   'f1-score': 0.7154663518299882,
   'support': 1393.0},
  '3': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 177.0},
  '4': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 7.0},
  'accuracy': 0.602,
  'macro avg': {'precision': 0.3825946684894054,
   'recall': 0.32957681272435846,
   'f1-score': 0.337

In [73]:
test.drop(columns=['acuity']).values.shape

(2500, 7)

### BioBERT

In [88]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Load training and test data
embeddings = np.load('../data/mimic-iv-private/train_full_chiefcomplaint_embeddings_reduced.npy', allow_pickle=True)
test_embeddings= np.load('../data/mimic-iv-private/test_chiefcomplaint_embeddings_reduced.npy', allow_pickle=True)

# Combine vitals with embeddings
X_train = np.hstack([train.drop(columns=['acuity']).values, embeddings])  # Combine vitals and embeddings
y_train = train['acuity'].values

X_test = np.hstack([test.drop(columns=['acuity']).values, test_embeddings])  # Replace with inference embeddings if different
y_test = test['acuity'].values

# Subsets of training data
train_sizes = [1.0]  # 1%, 10%, 100%
metrics_results = []

# Train and evaluate for each subset
X_train_subset, y_train_subset = X_train, y_train

# Train an MLP classifier
model = MLPClassifier(
    hidden_layer_sizes=(1000, 1000),  # Small network
    random_state=42,
    early_stopping=True
)
print("Training MLP...")
model.fit(X_train_subset, y_train_subset)

Training MLP...


In [75]:
from sklearn.metrics import classification_report, mean_squared_error, cohen_kappa_score

# Make predictions on the test set
y_pred = model.predict(X_test)

# Get classification report as a dictionary
evaluation_metrics = classification_report(y_test, y_pred, output_dict=True)

# Compute additional metrics

# Mean Squared Error (MSE)
mse = mean_squared_error(y_test, y_pred)

# Quadratic Weighted Kappa (QWK)
qwk = cohen_kappa_score(y_test, y_pred, weights='quadratic')

# Undertriage: Proportion of cases where predicted acuity is lower than true acuity
undertriage_rate = np.mean(y_pred < y_test)

# Overtriage: Proportion of cases where predicted acuity is higher than true acuity
overtriage_rate = np.mean(y_pred > y_test)

# Helper function to flatten the classification report dictionary
def flatten_classification_report(report):
    """
    Flattens the nested dictionary from sklearn's classification_report into a single-level dictionary.
    """
    flat_report = {}
    for key, value in report.items():
        if isinstance(value, dict):
            for metric, metric_value in value.items():
                flat_report[f"{key}_{metric}"] = metric_value
        else:
            flat_report[key] = value
    return flat_report

# Flatten the classification report and add the additional metrics
flat_metrics = flatten_classification_report(evaluation_metrics)
flat_metrics['MSE'] = mse
flat_metrics['QWK'] = qwk
flat_metrics['undertriage_rate'] = undertriage_rate
flat_metrics['overtriage_rate'] = overtriage_rate

# Create a DataFrame for a neat display of the metrics
results_df = pd.DataFrame([flat_metrics])
print(results_df)

   0_precision  0_recall  0_f1-score  0_support  1_precision  1_recall  \
0     0.565217  0.220339    0.317073       59.0     0.627841  0.610497   

   1_f1-score  1_support  2_precision  2_recall  ...  macro avg_f1-score  \
0    0.619048      362.0     0.681282  0.798419  ...            0.404855   

   macro avg_support  weighted avg_precision  weighted avg_recall  \
0             1000.0                 0.64473                0.656   

   weighted avg_f1-score  weighted avg_support    MSE       QWK  \
0               0.639527                1000.0  0.432  0.497833   

   undertriage_rate  overtriage_rate  
0             0.156            0.188  

[1 rows x 33 columns]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [76]:
display(results_df)

Unnamed: 0,0_precision,0_recall,0_f1-score,0_support,1_precision,1_recall,1_f1-score,1_support,2_precision,2_recall,...,macro avg_f1-score,macro avg_support,weighted avg_precision,weighted avg_recall,weighted avg_f1-score,weighted avg_support,MSE,QWK,undertriage_rate,overtriage_rate
0,0.565217,0.220339,0.317073,59.0,0.627841,0.610497,0.619048,362.0,0.681282,0.798419,...,0.404855,1000.0,0.64473,0.656,0.639527,1000.0,0.432,0.497833,0.156,0.188


In [76]:
utils.evaluate_predictions(y_pred,y_test,ordinal=True, by_class=True)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'overall': {'accuracy': 0.7036,
  'precision': 0.69382771984375,
  'recall': 0.7036,
  'f1_score': 0.6916408420473397,
  'adjusted_accuracy': 0.9916,
  'adjusted_precision': 0.9916944355529995,
  'adjusted_recall': 0.9916,
  'adjusted_f1': 0.991091579591339,
  'mae': 0.3048,
  'mse': 0.3216,
  'quadratic_kappa': 0.5809021487834903},
 'by_class': {'0': {'precision': 0.7894736842105263,
   'recall': 0.3488372093023256,
   'f1-score': 0.4838709677419355,
   'support': 86.0},
  '1': {'precision': 0.6786155747836835,
   'recall': 0.6559139784946236,
   'f1-score': 0.6670716889428918,
   'support': 837.0},
  '2': {'precision': 0.7272727272727273,
   'recall': 0.8155061019382628,
   'f1-score': 0.7688663282571913,
   'support': 1393.0},
  '3': {'precision': 0.4835164835164835,
   'recall': 0.24858757062146894,
   'f1-score': 0.3283582089552239,
   'support': 177.0},
  '4': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 7.0},
  'accuracy': 0.7036,
  'macro avg': {'precision': 0

In [88]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Load training and test data
embeddings = np.load('../data/mimic-iv-private/symptom_embeddings.npy', allow_pickle=True)
test_embeddings = np.load('../data/mimic-iv-private/symptom_embeddings_test.npy', allow_pickle=True)

# Combine vitals with embeddings
X_train = np.hstack([train.drop(columns=['acuity']).values, embeddings])  # Combine vitals and embeddings
y_train = train['acuity'].values

X_test = np.hstack([test.drop(columns=['acuity']).values, test_embeddings])  # Replace with inference embeddings if different
y_test = test['acuity'].values

# Subsets of training data
train_sizes = [0.01, 0.1]  # 1%, 10%, 100%
metrics_results = []

# Train and evaluate for each subset
for size in train_sizes:
    # Create a subset of the training data
    subset_size = int(len(X_train) * size)
    X_train_subset, _, y_train_subset, _ = train_test_split(
        X_train, y_train, train_size=subset_size, random_state=42, stratify=y_train
    )

    # Train an MLP classifier
    model = MLPClassifier(
        hidden_layer_sizes=(775, 64),  # Small network
        random_state=42,
        early_stopping=True
    )
    model.fit(X_train_subset, y_train_subset)

    # Make predictions
    y_pred = model.predict(X_test)

    # Evaluate the predictions
    evaluation_metrics = utils.evaluate_predictions(y_pred,y_test,ordinal=True, by_class=True)

    # Store results
    metrics_results.append(evaluation_metrics)

# Display results
results_df = pd.DataFrame(metrics_results)
print(results_df)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                             overall  \
0  {'accuracy': 0.6396, 'precision': 0.6202773153...   
1  {'accuracy': 0.6776, 'precision': 0.6687991886...   

                                            by_class  
0  {'0': {'precision': 0.42424242424242425, 'reca...  
1  {'0': {'precision': 0.53125, 'recall': 0.19767...  


In [91]:
results_df.loc[1]['overall']

{'accuracy': 0.6776,
 'precision': 0.6687991886557905,
 'recall': 0.6776,
 'f1_score': 0.6661262548531941,
 'adjusted_accuracy': 0.9864,
 'adjusted_precision': 0.9861556950476779,
 'adjusted_recall': 0.9864,
 'adjusted_f1': 0.985951754713654,
 'mae': 0.336,
 'mse': 0.3632,
 'quadratic_kappa': 0.5416948648827627}

In [92]:
results_df.loc[1]['by_class']

{'0': {'precision': 0.53125,
  'recall': 0.19767441860465115,
  'f1-score': 0.288135593220339,
  'support': 86.0},
 '1': {'precision': 0.6739446870451238,
  'recall': 0.5531660692951016,
  'f1-score': 0.6076115485564304,
  'support': 837.0},
 '2': {'precision': 0.7041383570105003,
  'recall': 0.8183776022972002,
  'f1-score': 0.7569721115537849,
  'support': 1393.0},
 '3': {'precision': 0.45962732919254656,
  'recall': 0.4180790960451977,
  'f1-score': 0.4378698224852071,
  'support': 177.0},
 '4': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 7.0},
 'accuracy': 0.6776,
 'macro avg': {'precision': 0.47379207464963413,
  'recall': 0.39745943724843014,
  'f1-score': 0.4181178151631523,
  'support': 2500.0},
 'weighted avg': {'precision': 0.6687991886557905,
  'recall': 0.6776,
  'f1-score': 0.6661262548531941,
  'support': 2500.0}}