

# Small_MIMIC Attempt

<hr>

***By:*** Laxmi Vijayan, Aganze Mihigo   

***Based On:*** Classifying Unstructured Clinical Notes via Automatic Weak Supervision
**Authors:** Arnab Dey, Chufan Gao, Mononito Goswami, correspondence to &lt;mgoswami@andrew.cmu.edu&gt;




##  2.3 Setting Up the Environment

Run the following cells to install the packages this notebook depends on.

In [None]:
!pip install pyyaml

In [None]:
!pip install sentence-transformers

In [None]:
!pip install snorkel transformers sentence-transformers cleantext pyhealth gdown

## 2.4 Mounting Google Drive

In [1]:
# Mount Google Drive at /content/drive/ so the later cells can read and write
# the KeyClass files stored there; force_remount=True requests a fresh mount
# even when the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)

Mounted at /content/drive/


In [2]:
import sys
base_path = '/content/drive/MyDrive/KeyClass/'
sys.path.append(base_path + 'keyclass/')
sys.path.append(base_path + 'scripts/')
sys.path.append(base_path + 'data/')

import argparse
import pandas, plotly, matplotlib, seaborn
import label_data, encode_datasets, train_downstream_model
import torch
import pickle
import numpy as np
import os
from os.path import join, exists
from datetime import datetime
import utils
import models
import create_lfs
import train_classifier

# --- Run configuration ---
# Location of the configuration file parsed into `args` below.
config_file_path = base_path+'/config_files/config_mimic.yml'
# Seed later passed to torch.manual_seed / np.random.seed by the training cells.
random_seed = 0

# Parse the configuration file into the `args` mapping used by every cell.
args = utils.Parser(config_file_path=config_file_path).parse()

# Place the encoder on the GPU when one is visible to PyTorch, else on the CPU.
_encoder_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# The configuration selects between the two encoder wrappers.
if not args['use_custom_encoder']:
    model = models.Encoder(model_name=args['base_encoder'],
                           device=_encoder_device)
else:
    model = models.CustomEncoder(pretrained_model_name_or_path=args['base_encoder'],
                                 device=_encoder_device)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [3]:
# Process-wide settings for the CUDA allocator and the tokenizers library.
# NOTE(review): PYTORCH_CUDA_ALLOC_CONF is presumably only honoured if it is
# set before PyTorch first initialises CUDA; the encoder cell above may already
# have done that, in which case this needs to move earlier (or the runtime
# needs a restart) to take effect -- confirm.
os.environ.update({
    'PYTORCH_CUDA_ALLOC_CONF': 'expandable_segments:True',
    'TOKENIZERS_PARALLELISM': 'true',
})

In [None]:
# Encode the raw text of each split with the encoder built above and cache the
# result as `<split>_embeddings.pkl` inside the dataset directory, where the
# labeling cells load it from.
_dataset_dir = join(args['data_path'], args['dataset'])
for split in ('train', 'test'):
    sentences = utils.fetch_data(dataset=args['dataset'], split=split, path=args['data_path'])
    embeddings = model.encode(
        sentences=sentences,
        batch_size=args['end_model_batch_size'],
        show_progress_bar=args['show_progress_bar'],
        normalize_embeddings=args['normalize_embeddings'],
    )
    _embeddings_file = join(_dataset_dir, f'{split}_embeddings.pkl')
    with open(_embeddings_file, 'wb') as f:
        pickle.dump(embeddings, f)

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

<a id='label'></a>
### 2.5 Probabilistically Labeling the Data

*KeyClass* creates a matrix that represents how different labeling functions agree or disagree on labeling the training documents. Then, it uses the open-source label model implementation from the Snorkel Python library to turn these agreements and disagreements into probabilistic labels: rather than assigning each document a single hard label, it assigns labels together with a measure of confidence.

This approach helps in handling complex or noisy data where simple labeling might be difficult or unreliable.



In [5]:
import gc

# Run a garbage-collection pass first so unreachable Python objects are
# reclaimed before the CUDA cache is trimmed.
gc.collect()

# Ask PyTorch to release cached CUDA memory that is no longer in use.
# NOTE(review): objects still referenced by notebook globals (e.g. `model`
# from the encoder cell) are not released by either call -- presumably
# relevant to the CUDA out-of-memory error recorded under the next cell;
# confirm.
torch.cuda.empty_cache()

In [7]:
# Weakly label the training split: fit the label model on the training text,
# store its probabilistic predictions, and log metrics when labels exist.

# Load training data
train_text = utils.fetch_data(dataset=args['dataset'], path=args['data_path'], split='train')

# Optional labels file: one integer class id per line.
train_labels = join(args['data_path'], args['dataset'], 'train_labels.txt')
training_labels_present = exists(train_labels)
if training_labels_present:
    with open(train_labels, 'r') as f:
        # Skip blank lines (e.g. a trailing empty line) instead of failing in
        # int(''); int() itself tolerates the surrounding whitespace/newline.
        y_train = np.array([int(line) for line in f if line.strip()])
else:
    y_train = None
    print('No training labels found!')

# Cached embeddings written by the encoding cell.
with open(join(args['data_path'], args['dataset'], 'train_embeddings.pkl'), 'rb') as f:
    X_train = pickle.load(f)

# Print dataset statistics
print(f"Getting labels for the {args['dataset']} data...")
print(f'Size of the data: {len(train_text)}')
if training_labels_present:
    print('Class distribution', np.unique(y_train, return_counts=True))

# Load label names/descriptions: every config entry whose key contains 'target'.
label_names = [args[key] for key in args if 'target' in key]

# Creating labeling functions
labeler = create_lfs.CreateLabellingFunctions(base_encoder=args['base_encoder'],
                                            device=torch.device(args['device']),
                                            label_model=args['label_model'])
print(labeler.encoder)
proba_preds = labeler.get_labels(text_corpus=train_text, label_names=label_names, min_df=args['min_df'],
                                ngram_range=args['ngram_range'], topk=args['topk'], y_train=y_train,
                                label_model_lr=args['label_model_lr'], label_model_n_epochs=args['label_model_n_epochs'],
                                verbose=True, n_classes=args['n_classes'])

# Hard labels: the most probable class of each document.
y_train_pred = np.argmax(proba_preds, axis=1)
# NOTE(review): this writes the label-model predictions to the same
# train_labels.txt that was read above as `y_train`, so any labels already in
# that file are replaced and a re-run will treat these predictions as the
# reference labels -- confirm this is intended.
np.savetxt(train_labels, y_train_pred, fmt='%d', delimiter='\n')

# Save the predictions
os.makedirs(args['preds_path'], exist_ok=True)
with open(join(args['preds_path'], f"{args['label_model']}_proba_preds.pkl"), 'wb') as f:
    pickle.dump(proba_preds, f)

# Print statistics
print('Label Model Predictions: Unique value and counts', np.unique(y_train_pred, return_counts=True))
if training_labels_present:
    print('Label Model Training Accuracy', np.mean(y_train_pred==y_train))

    # Log the metrics
    training_metrics_with_gt = utils.compute_metrics(y_preds=y_train_pred, y_true=y_train, average=args['average'])
    utils.log(metrics=training_metrics_with_gt, filename='label_model_with_ground_truth',
        results_dir=args['results_path'], split='train')

Getting labels for the mimic data...
Size of the data: 10
Class distribution (array([ 0,  1,  4,  8, 10, 12, 14, 15, 16, 17]), array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]))
CustomEncoder(
  (model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
        

  0%|          | 0/100 [00:07<?, ?epoch/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 4.92 GiB. GPU 0 has a total capacity of 22.17 GiB of which 4.84 GiB is free. Process 1277457 has 17.32 GiB memory in use. Of the allocated memory 15.86 GiB is allocated by PyTorch, and 1.24 GiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Label the test split with the label model and persist the results.
# NOTE(review): no test labels are loaded here (y_test is None) and the
# predictions below are written to test_labels.txt, so later cells that read
# that file are presumably evaluating against label-model output -- confirm.

# Load testing data
test_text = utils.fetch_data(dataset=args['dataset'], path=args['data_path'], split='test')

# Cached embeddings written by the encoding cell.
with open(join(args['data_path'], args['dataset'], 'test_embeddings.pkl'), 'rb') as f:
    X_test = pickle.load(f)

# Print dataset statistics
print(f"Getting labels for the {args['dataset']} data...")
print(f'Size of the data: {len(test_text)}')

y_test = None

# Load label names/descriptions: every config entry whose key contains 'target'.
label_names = [args[key] for key in args if 'target' in key]

# Creating labeling functions
labeler = create_lfs.CreateLabellingFunctions(base_encoder=args['base_encoder'],
                                            device=torch.device(args['device']),
                                            label_model=args['label_model'])

proba_preds = labeler.get_labels(text_corpus=test_text, label_names=label_names, min_df=args['min_df'],
                                ngram_range=args['ngram_range'], topk=args['topk'], y_train=y_test,
                                label_model_lr=args['label_model_lr'], label_model_n_epochs=args['label_model_n_epochs'],
                                verbose=True, n_classes=args['n_classes'])

# Hard labels: the most probable class of each document.
y_test_pred = np.argmax(proba_preds, axis=1)
test_labels = join(args['data_path'], args['dataset'], 'test_labels.txt')
np.savetxt(test_labels, y_test_pred, fmt='%d', delimiter='\n')

# Save the predictions.
# The training-split cell pickles its probabilities to
# "<label_model>_proba_preds.pkl"; writing the test-split probabilities to
# that same path would replace the training artefact, so they get a separate
# "_test" file.
os.makedirs(args['preds_path'], exist_ok=True)
with open(join(args['preds_path'], f"{args['label_model']}_proba_preds_test.pkl"), 'wb') as f:
    pickle.dump(proba_preds, f)


<a id='exp_training'></a>
## 3. Experimentation: Training

<a id='downstream'></a>
### 3.1 Training the Downstream Model

Now, we have a probabilistically labeled training dataset that can be used to train our downstream classifier. KeyClass uses the top-*k* documents with the most confident label estimates to train the classifier. This model will be saved under './models/{dataset_name}' as `end_model_<date>.pth`.

In [4]:
# Re-parse the configuration file for the downstream-training stage.
args = utils.Parser(config_file_path=config_file_path).parse()

# Seed torch and numpy from the notebook-level `random_seed`.
torch.manual_seed(random_seed)
np.random.seed(random_seed)

# Arrays and flags prepared by the project's loader for this stage.
(X_train_embed_masked, y_train_lm_masked, y_train_masked,
 X_test_embed, y_test, training_labels_present,
 sample_weights_masked, proba_preds_masked) = train_downstream_model.load_data(args)

# Train a downstream classifier

# The configuration selects between the two encoder wrappers.
if args['use_custom_encoder']:
    encoder = models.CustomEncoder(
        pretrained_model_name_or_path=args['base_encoder'],
        device=args['device'],
    )
else:
    encoder = models.Encoder(
        model_name=args['base_encoder'],
        device=args['device'],
    )

# NOTE(review): `eval` below executes strings taken from the configuration
# file (activation, criterion, learning rate, weight decay); only use
# configuration files you trust.
classifier = models.FeedForwardFlexible(
    encoder_model=encoder,
    h_sizes=args['h_sizes'],
    activation=eval(args['activation']),
    device=torch.device(args['device']),
)
print('\n===== Training the downstream classifier =====\n')

# Per-sample weights are only passed on when the noise-aware loss is enabled.
if args['use_noise_aware_loss']:
    _train_sample_weights = sample_weights_masked
else:
    _train_sample_weights = None

model = train_classifier.train(
    model=classifier,
    device=torch.device(args['device']),
    X_train=X_train_embed_masked,
    y_train=y_train_lm_masked,
    sample_weights=_train_sample_weights,
    epochs=args['end_model_epochs'],
    batch_size=args['end_model_batch_size'],
    criterion=eval(args['criterion']),
    raw_text=False,
    lr=eval(args['end_model_lr']),
    weight_decay=eval(args['end_model_weight_decay']),
    patience=args['end_model_patience'],
)

# Persist the trained model under a date-stamped file name.
if not os.path.exists(args['model_path']):
    os.makedirs(args['model_path'])
current_time = datetime.now()
model_name = f'end_model_{current_time.strftime("%d-%b-%Y")}.pth'
print(f'Saving model {model_name}...')
with open(join(args['model_path'], model_name), 'wb') as f:
    torch.save(model, f)

# Class probabilities of the trained model on both splits.
# NOTE(review): the testing cell uses a batch size of 512 for the same calls;
# 516 here looks like a typo -- confirm.
_predict_batch_size = 516
end_model_preds_train = model.predict_proba(
    torch.from_numpy(X_train_embed_masked),
    batch_size=_predict_batch_size,
    raw_text=False,
)
end_model_preds_test = model.predict_proba(
    torch.from_numpy(X_test_embed),
    batch_size=_predict_batch_size,
    raw_text=False,
)

# Save the predictions
with open(join(args['preds_path'], 'end_model_preds_train.pkl'), 'wb') as f:
    pickle.dump(end_model_preds_train, f)
with open(join(args['preds_path'], 'end_model_preds_test.pkl'), 'wb') as f:
    pickle.dump(end_model_preds_test, f)

# Print statistics
_train_hard_preds = np.argmax(end_model_preds_train, axis=1)

# End model vs. the labels loaded from disk (only when they are available).
if training_labels_present:
    training_metrics_with_gt = utils.compute_metrics(
        y_preds=_train_hard_preds,
        y_true=y_train_masked,
        average=args['average'],
    )
    utils.log(
        metrics=training_metrics_with_gt,
        filename='end_model_with_ground_truth',
        results_dir=args['results_path'],
        split='train',
    )

# End model vs. the label-model labels it was trained on.
training_metrics_with_lm = utils.compute_metrics(
    y_preds=_train_hard_preds,
    y_true=y_train_lm_masked,
    average=args['average'],
)
utils.log(
    metrics=training_metrics_with_lm,
    filename='end_model_with_label_model',
    results_dir=args['results_path'],
    split='train',
)

# Bootstrapped test-split metrics.
testing_metrics = utils.compute_metrics_bootstrap(
    y_preds=np.argmax(end_model_preds_test, axis=1),
    y_true=y_test,
    average=args['average'],
    n_bootstrap=args['n_bootstrap'],
    n_jobs=args['n_jobs'],
)
utils.log(
    metrics=testing_metrics,
    filename='end_model_with_ground_truth',
    results_dir=args['results_path'],
    split='test',
)

Confidence of least confident data point of class 0: 0.0
Confidence of least confident data point of class 1: 0.0
Confidence of least confident data point of class 2: 0.0
Confidence of least confident data point of class 3: 1.534037343171497e-304
Confidence of least confident data point of class 4: 0.0
Confidence of least confident data point of class 5: 3.33921741682772e-252
Confidence of least confident data point of class 6: 0.0
Confidence of least confident data point of class 7: 0.0
Confidence of least confident data point of class 8: 0.0
Confidence of least confident data point of class 9: 1.9832534801807736e-251
Confidence of least confident data point of class 10: 0.0
Confidence of least confident data point of class 11: 0.0
Confidence of least confident data point of class 12: 0.0
Confidence of least confident data point of class 13: 4.751756841670959e-306
Confidence of least confident data point of class 14: 7.34160645626098e-220
Confidence of least confident data point of cl

Epoch 19: 100%|██████████| 20/20 [00:01<00:00, 19.02batch/s, best_loss=2.88, running_loss=2.88, tolerance_count=2]


Saving model end_model_08-May-2024.pth...


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Saving results in /content/drive/MyDrive/KeyClass/results/mimic/train_end_model_with_ground_truth_08-May-2024-04_11_46.txt...
Saving results in /content/drive/MyDrive/KeyClass/results/mimic/train_end_model_with_label_model_08-May-2024-04_11_46.txt...


  pid = os.fork()
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    2.4s


Saving results in /content/drive/MyDrive/KeyClass/results/mimic/test_end_model_with_ground_truth_08-May-2024-04_11_52.txt...


[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    3.0s finished


<a id='self'></a>
### 3.2 Self-Training the Model
Lastly, KeyClass self-trains on the entire training dataset to refine the end model classifier further. It saves this model to the same location as end_model_with_self_training and the date.

In [5]:
# Self-train the downstream classifier on the raw training text, then save the
# model, its test-split predictions and the bootstrapped test metrics.
# The test split is scored once and the metrics computed once; the previous
# version repeated both steps and discarded the first metrics result.

# Fetching the raw text data for self-training
X_train_text = utils.fetch_data(dataset=args['dataset'], path=args['data_path'], split='train')
X_test_text = utils.fetch_data(dataset=args['dataset'], path=args['data_path'], split='test')

# NOTE(review): `eval` executes strings taken from the configuration file;
# only use configuration files you trust.
model = train_classifier.self_train(model=model,
                                    X_train=X_train_text,
                                    X_val=X_test_text,
                                    y_val=y_test,
                                    device=torch.device(args['device']),
                                    lr=eval(args['self_train_lr']),
                                    weight_decay=eval(args['self_train_weight_decay']),
                                    patience=args['self_train_patience'],
                                    batch_size=args['self_train_batch_size'],
                                    q_update_interval=args['q_update_interval'],
                                    self_train_thresh=eval(args['self_train_thresh']),
                                    print_eval=True)

# Score the test split before saving, as the original cell did, so the model
# object is pickled in the same state.
end_model_preds_test = model.predict_proba(X_test_text, batch_size=args['self_train_batch_size'], raw_text=True)

# Persist the self-trained model under a date-stamped file name.
os.makedirs(args['model_path'], exist_ok=True)
current_time = datetime.now()
model_name = f'end_model_self_trained_{current_time.strftime("%d %b %Y")}.pth'
print(f'Saving model {model_name}...')
with open(join(args['model_path'], model_name), 'wb') as f:
    torch.save(model, f)

# Save the predictions
os.makedirs(args['preds_path'], exist_ok=True)
with open(join(args['preds_path'], 'end_model_self_trained_preds_test.pkl'), 'wb') as f:
    pickle.dump(end_model_preds_test, f)

# Print statistics
testing_metrics = utils.compute_metrics_bootstrap(
    y_preds=np.argmax(end_model_preds_test, axis=1),
    y_true=y_test,
    average=args['average'],
    n_bootstrap=args['n_bootstrap'],
    n_jobs=args['n_jobs'])
utils.log(metrics=testing_metrics,
          filename='end_model_with_ground_truth_self_trained',
          results_dir=args['results_path'],
          split='test')

0batch [00:00, ?batch/s]
[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  40 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.0s finished


Saving model end_model_self_trained_08 May 2024.pth...
Saving results in /content/drive/MyDrive/KeyClass/results/mimic/test_end_model_with_ground_truth_self_trained_08-May-2024-04_12_02.txt...


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  40 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.0s finished


<a id='exp_testing'></a>
## 4. Experimentation: Testing

In [7]:
# Evaluate the two saved models: the end model on the cached embeddings and
# the self-trained model on the raw test text.

# Checkpoints written by the training cells (date-stamped, run-specific paths).
end_model_path='/content/drive/MyDrive/KeyClass/models/mimic/end_model_08-May-2024.pth'
end_model_self_trained_path='/content/drive/MyDrive/KeyClass/models/mimic/end_model_self_trained_08 May 2024.pth'

args = utils.Parser(config_file_path=config_file_path).parse()

# Set random seeds
torch.manual_seed(random_seed)
np.random.seed(random_seed)

# Arrays and flags prepared by the project's loader for this stage.
(X_train_embed_masked, y_train_lm_masked, y_train_masked,
 X_test_embed, y_test, training_labels_present,
 sample_weights_masked, proba_preds_masked) = train_downstream_model.load_data(args)

# The checkpoints were written with torch.save(model, f), i.e. they are whole
# pickled modules rather than state dicts. Recent PyTorch releases default
# torch.load to weights_only=True, which refuses such files, so the full
# unpickling is requested explicitly.
# SECURITY: unpickling can execute arbitrary code -- only load checkpoints
# produced by this notebook or another trusted source.
model = torch.load(end_model_path, weights_only=False)

end_model_preds_train = model.predict_proba(torch.from_numpy(X_train_embed_masked), batch_size=512, raw_text=False)
end_model_preds_test = model.predict_proba(torch.from_numpy(X_test_embed), batch_size=512, raw_text=False)

# Print statistics
if training_labels_present:
    training_metrics_with_gt = utils.compute_metrics(y_preds=np.argmax(end_model_preds_train, axis=1),
                                                     y_true=y_train_masked,
                                                     average=args['average'])
    print('training_metrics_with_gt', training_metrics_with_gt)

training_metrics_with_lm = utils.compute_metrics(y_preds=np.argmax(end_model_preds_train, axis=1),
                                                 y_true=y_train_lm_masked,
                                                 average=args['average'])
print('training_metrics_with_lm', training_metrics_with_lm)

testing_metrics = utils.compute_metrics_bootstrap(y_preds=np.argmax(end_model_preds_test, axis=1),
                                                  y_true=y_test,
                                                  average=args['average'],
                                                  n_bootstrap=args['n_bootstrap'],
                                                  n_jobs=args['n_jobs'])
print('testing_metrics', testing_metrics)


# No training happens below: the already self-trained checkpoint is loaded
# and evaluated (the banner text is kept as originally printed).
print('\n===== Self-training the downstream classifier =====\n')

# Fetching the raw text data for self-training
X_train_text = utils.fetch_data(dataset=args['dataset'], path=args['data_path'], split='train')
X_test_text = utils.fetch_data(dataset=args['dataset'], path=args['data_path'], split='test')

# Same trusted-checkpoint caveat as for the first load.
model = torch.load(end_model_self_trained_path, weights_only=False)

end_model_preds_test = model.predict_proba(X_test_text, batch_size=args['self_train_batch_size'], raw_text=True)


# Print statistics
testing_metrics = utils.compute_metrics_bootstrap(y_preds=np.argmax(end_model_preds_test, axis=1),
                                                  y_true=y_test,
                                                  average=args['average'],
                                                  n_bootstrap=args['n_bootstrap'],
                                                  n_jobs=args['n_jobs'])
print('testing_metrics after self train', testing_metrics)

utils.log(metrics=testing_metrics,
          filename='end_model_with_ground_truth_self_trained',
          results_dir=args['results_path'],
          split='test')


Confidence of least confident data point of class 0: 0.0
Confidence of least confident data point of class 1: 0.0
Confidence of least confident data point of class 2: 0.0
Confidence of least confident data point of class 3: 1.534037343171497e-304
Confidence of least confident data point of class 4: 0.0
Confidence of least confident data point of class 5: 3.33921741682772e-252
Confidence of least confident data point of class 6: 0.0
Confidence of least confident data point of class 7: 0.0
Confidence of least confident data point of class 8: 0.0
Confidence of least confident data point of class 9: 1.9832534801807736e-251
Confidence of least confident data point of class 10: 0.0
Confidence of least confident data point of class 11: 0.0
Confidence of least confident data point of class 12: 0.0
Confidence of least confident data point of class 13: 4.751756841670959e-306
Confidence of least confident data point of class 14: 7.34160645626098e-220
Confidence of least confident data point of cl


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


Precision is ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.


os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    2.4s
[Parallel(n_jobs=10)]: Done  81 out of 100 | elapsed:    2.4s remaining:    0.6s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    2.9s finished


testing_metrics [[0. 0.]
 [0. 0.]
 [0. 0.]]

===== Self-training the downstream classifier =====

testing_metrics after self train [[0. 0.]
 [0. 0.]
 [0. 0.]]
Saving results in /content/drive/MyDrive/KeyClass/results/mimic/test_end_model_with_ground_truth_self_trained_08-May-2024-04_21_02.txt...


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  40 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.1s finished


## 5. Plotting Results

Examine the Accuracy, Precision, and Recall values of the different models.

In [6]:
import json
import plotly.graph_objects as go
import os
args = utils.Parser(config_file_path=config_file_path).parse()

# Results files to compare, keyed by the label shown on the x-axis.
# The paths are absolute; the previous os.path.join(base_path, ..., <absolute>)
# already resolved to exactly these strings, because os.path.join discards
# everything before an absolute component.
# NOTE(review): the file names carry run-specific timestamps, and the last one
# (04_12_48) does not appear among the "Saving results in" outputs recorded in
# this notebook -- confirm the file exists before running.
files = {
    'Train End Model with Label Model': '/content/drive/MyDrive/KeyClass/results/mimic/train_end_model_with_label_model_08-May-2024-04_11_46.txt',
    'Test End Model with Ground Truth': '/content/drive/MyDrive/KeyClass/results/mimic/test_end_model_with_ground_truth_08-May-2024-04_11_52.txt',
    'Test Self-Trained End Model with Ground Truth': '/content/drive/MyDrive/KeyClass/results/mimic/test_end_model_with_ground_truth_self_trained_08-May-2024-04_12_48.txt'
}

_METRIC_NAMES = ('Accuracy', 'Precision', 'Recall')


def _read_metrics(file_path):
    """Read one JSON results file.

    Returns a pair ``(values, errors)``; each is a list ordered as
    accuracy, precision, recall. Files that store ``"<metric> (mean, std)"``
    entries yield the mean as value and the std as error; files that store
    plain ``"<metric>"`` entries yield that value with an error of 0.
    Raises KeyError when a file contains neither form for a metric.
    """
    with open(file_path, 'r') as f:
        data = json.load(f)

    # Decide by the actual key rather than by searching the stringified dict,
    # which would also match 'mean' anywhere inside keys or values.
    if 'Accuracy (mean, std)' in data:
        pairs = [data[f'{name} (mean, std)'] for name in _METRIC_NAMES]
        return [pair[0] for pair in pairs], [pair[1] for pair in pairs]
    return [data[name] for name in _METRIC_NAMES], [0, 0, 0]


# Initialize lists to store the data
labels = []
accuracy, precision, recall = [], [], []
accuracy_err, precision_err, recall_err = [], [], []

# Read each file and extract the metrics
for label, file_path in files.items():
    (acc, prec, rec), (acc_err, prec_err, rec_err) = _read_metrics(file_path)
    labels.append(label)
    accuracy.append(acc)
    precision.append(prec)
    recall.append(rec)
    accuracy_err.append(acc_err)
    precision_err.append(prec_err)
    recall_err.append(rec_err)

# Creating the plot with Plotly
fig = go.Figure()

# Adding Accuracy, Precision, and Recall traces (error bars from the std values)
fig.add_trace(go.Bar(name='Accuracy', x=labels, y=accuracy, error_y=dict(type='data', array=accuracy_err)))
fig.add_trace(go.Bar(name='Precision', x=labels, y=precision, error_y=dict(type='data', array=precision_err)))
fig.add_trace(go.Bar(name='Recall', x=labels, y=recall, error_y=dict(type='data', array=recall_err)))

# Update the layout
fig.update_layout(
    barmode='group',
    title='Performance Metrics Across Different Models',
    xaxis_title='Small MIMIC-III KeyClass Model',
    yaxis_title='Metric Value',
    legend_title='Metric'
)

# Show the plot
fig.show()