In [1]:
import sys
sys.path.append('../keyclass/')
sys.path.append('../scripts/')

import argparse
import label_data, encode_datasets, train_downstream_model
import torch
import pickle
import numpy as np
import os
from os.path import join, exists
from datetime import datetime
import utils
import models
import create_lfs
import train_classifier
from transformers import AutoTokenizer, AutoModel

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/milesjg2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# NOTE(review): presumably the column positions of the ICD-9 code and note
# text in the raw CSVs -- nothing in this cell reads them; confirm they are
# still needed.
codeIdx, textIdx = 9, 6

# Raw train / validation note tables fed to the project pre-processing helper.
train_path = '../fastag_data/icd9NotesDataTable_train.csv'
test_path = '../fastag_data/icd9NotesDataTable_valid.csv'

# Third positional argument is '../data/mimic/' (presumably the output
# directory -- TODO confirm against utils.load_and_process_data).
utils.load_and_process_data(
    train_path,
    test_path,
    '../data/mimic/',
    max_length=1000,
    random_state=1234,
    sample_size=0.1,
)

  return np.array(ret)


In [3]:
# `!export ...` runs in a short-lived subshell, so the variable never reached
# the kernel process and PyTorch never saw it.  Setting it through os.environ
# changes the environment of the process that actually runs torch.
# It has to be in place before the first CUDA allocation to take effect.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'max_split_size_mb:512'

In [4]:
# Seed applied to torch / numpy by the downstream-training cell.
random_seed = 0
# Specify path to the configuration file (parsed with utils.Parser in later cells).
config_file_path = '../config_files/config_mimic2.yml'

In [5]:
# Parse the experiment configuration; `args` is indexed by option name below.
args = utils.Parser(config_file_path=config_file_path).parse()

# Instantiate the encoder selected by the config, on GPU when one is visible
# and on CPU otherwise.
if not args['use_custom_encoder']:
    model = models.Encoder(
        model_name=args['base_encoder'],
        device='cuda' if torch.cuda.is_available() else 'cpu',
    )
else:
    model = models.CustomEncoder(
        pretrained_model_name_or_path=args['base_encoder'],
        device='cuda' if torch.cuda.is_available() else 'cpu',
    )

# Encode each split once and pickle the result next to the dataset so the
# later cells can load `<split>_embeddings.pkl` instead of re-encoding.
for split in ('train', 'test'):
    sentences = utils.fetch_data(
        dataset=args['dataset'],
        split=split,
        path=args['data_path'],
    )
    embeddings = model.encode(
        sentences=sentences,
        batch_size=args['end_model_batch_size'],
        show_progress_bar=args['show_progress_bar'],
        normalize_embeddings=args['normalize_embeddings'],
    )
    embeddings_file = join(args['data_path'], args['dataset'], f'{split}_embeddings.pkl')
    with open(embeddings_file, 'wb') as f:
        pickle.dump(embeddings, f)

Some weights of the model checkpoint at bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Batches:   0%|          | 0/31 [00:00<?, ?it/s]

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

In [6]:
import gc

# Free the encoder before the label-model stage.  Order matters: the notebook's
# reference must be dropped first, then Python garbage is collected, and only
# then can the CUDA caching allocator hand its now-unused blocks back.
del model
gc.collect()
torch.cuda.empty_cache()

In [7]:
# Re-read the experiment configuration for the labelling stage.
args = utils.Parser(config_file_path=config_file_path).parse()
dataset_dir = join(args['data_path'], args['dataset'])

train_text = utils.fetch_data(dataset=args['dataset'], path=args['data_path'], split='train')

# Ground-truth training labels are optional: one integer class id per line of
# train_labels.txt when the file exists.
train_labels_file = join(dataset_dir, 'train_labels.txt')
training_labels_present = exists(train_labels_file)
if training_labels_present:
    with open(train_labels_file, 'r') as f:
        y_train = np.array([int(line.replace('\n', '')) for line in f])
else:
    y_train = None
    print('No training labels found!')

# Embeddings pickled by the encoding cell of this notebook (trusted local file).
with open(join(dataset_dir, 'train_embeddings.pkl'), 'rb') as f:
    X_train = pickle.load(f)

# Print dataset statistics
print(f"Getting labels for the {args['dataset']} data...")
print(f'Size of the data: {len(train_text)}')
if training_labels_present:
    class_distribution = np.unique(y_train, return_counts=True)
    print('Class distribution', class_distribution)
    class_counts = class_distribution[1]
    class_balance = class_counts / class_counts.sum()
else:
    # Previously np.unique(None) was evaluated here, which silently produced a
    # one-element balance of [1.0].  Without labels there is nothing to
    # estimate the prior from, so fall back to a uniform prior over the
    # configured number of classes.
    # NOTE(review): assumes get_labels accepts any length-n_classes prior --
    # confirm against create_lfs.
    class_balance = np.full(args['n_classes'], 1.0 / args['n_classes'])

# Load label names/descriptions: every config entry whose key contains 'target'.
label_names = [args[key] for key in args if 'target' in key]

# Creating labeling functions
labeler = create_lfs.CreateLabellingFunctions(custom_encoder=args['use_custom_encoder'],
                                              base_encoder=args['base_encoder'],
                                              device=torch.device(args['device']),
                                              label_model=args['label_model'])
proba_preds = labeler.get_labels(text_corpus=train_text, label_names=label_names, max_df=0.3, min_df=0.001,
                                 ngram_range=(1, 1), topk=args['topk'], y_train=y_train,
                                 label_model_lr=args['label_model_lr'],
                                 label_model_n_epochs=args['label_model_n_epochs'],
                                 verbose=True, n_classes=args['n_classes'], class_balance=class_balance,
                                 min_topk=False)

# Hard label = most probable class per row of the label-model output.
y_train_pred = np.argmax(proba_preds, axis=1)

# Save the predictions
os.makedirs(args['preds_path'], exist_ok=True)
with open(join(args['preds_path'], f"{args['label_model']}_proba_preds.pkl"), 'wb') as f:
    pickle.dump(proba_preds, f)

# Print statistics
print('Label Model Predictions: Unique value and counts', np.unique(y_train_pred, return_counts=True))
if training_labels_present:
    print('Label Model Training Accuracy', np.mean(y_train_pred == y_train))

    # Log the metrics
    training_metrics_with_gt = utils.compute_metrics(y_preds=y_train_pred, y_true=y_train, average=args['average'])
    utils.log(metrics=training_metrics_with_gt, filename='label_model_with_ground_truth',
              results_dir=args['results_path'], split='train')

Getting labels for the mimic data...
Size of the data: 3922
Class distribution (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]), array([ 349,  284,  161,   42,   79,  105,    5, 1443,  440,  424,  124,
         13,   24,   51,   33,  283,   62]))


Some weights of the model checkpoint at bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Found assigned category counts [ 539  585 1386 2956  505  603  101 2524 1586   32  589  246  437  126
  360  435  651]
labeler.vocabulary:
 13661
labeler.word_indicator_matrix.shape (3922, 510)
Len keywords 510
assigned_category: Unique and Counts (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]), array([30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30]))
tuberculosis,  found,  unspecified,  bacilli,  tubercle,  bacteriological,  examination,  histological,  specified,  sputum,  confirmed,  microscopy,  infection,  tuberculous,  due,  bacterial,  done,  unknown,  present,  animals,  inoculation,  culture,  histologically,  methods,  acute ['afibflutter' 'asbestosrelated' 'astsgot' 'bacterianone' 'bacteriarare'
 'bacterimany' 'bacterimod' 'bacterirare' 'culturefinal' 'culturenegative'
 'culturepreliminary' 'feverchills' 'fluidselectrolytesnutrition'
 'foulsmelling' 'havnegative' 'mthdonepos' 'mucousrare'
 'murmursrubsgallops' 'nonaerosol' 'op

INFO:root:Computing O...
INFO:root:Estimating \mu...
INFO:root:Using GPU...
  0%|                                                                               | 0/100 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.100]
100%|█████████████████████████████████████████████████████████████████████| 100/100 [03:09<00:00,  1.90s/epoch]
INFO:root:Finished Training


Label Model Predictions: Unique value and counts (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]), array([376, 196, 130, 119, 104,  65, 118, 858,  91, 611, 116, 159, 100,
       202, 481, 150,  46]))
Label Model Training Accuracy 0.15170831208567057
Saving results in ../results/mimic/train_label_model_with_ground_truth_29-Apr-2023-20_52_54.txt...


In [8]:
import gc

# Free the labelling-function object before training the downstream model.
# Order matters: drop the reference, collect Python garbage, then ask the CUDA
# caching allocator to release the blocks that became unused.
del labeler
gc.collect()
torch.cuda.empty_cache()

In [9]:
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm, trange

def custom_train(model, device, X_train, y_train, epochs, batch_size, lr):
    """Train `model` with Adam and cross-entropy loss on in-memory data.

    Every epoch draws a fresh random permutation of the samples and takes one
    optimizer step per mini-batch.  The sample-weighted mean loss of each
    epoch is printed.

    Args:
        model: Object exposing `parameters()` and
            `forward(batch, mode='inference', raw_text=False)`; the forward
            output is passed to `nn.CrossEntropyLoss` as the prediction.
        device: Device each mini-batch is moved to before the forward pass.
        X_train: Inputs as a `np.ndarray` or tensor, indexed along axis 0.
        y_train: Targets as a `np.ndarray` or tensor, aligned with `X_train`.
        epochs: Number of passes over the data.
        batch_size: Maximum number of samples per mini-batch.
        lr: Learning rate handed to `torch.optim.Adam`.

    Returns:
        The same `model` object, with its parameters updated in place.
    """
    if isinstance(y_train, np.ndarray):
        y_train = torch.from_numpy(y_train)
    if isinstance(X_train, np.ndarray):
        X_train = torch.from_numpy(X_train)

    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    N = len(X_train)
    # The bar advances once per epoch, so label its unit accordingly.
    pbar = trange(epochs, unit="epoch")
    for nep in pbar:
        pbar.set_description(f"Epoch {nep}")
        permutation = torch.randperm(N)
        running_loss = 0.0
        for i in range(0, N, batch_size):
            optimizer.zero_grad()
            indices = permutation[i:i + batch_size]

            batch_x = X_train[indices].to(device)
            batch_y = y_train[indices].to(device)

            out = model.forward(batch_x, mode='inference', raw_text=False)
            loss = criterion(out, batch_y)
            loss.backward()
            optimizer.step()

            # Weight by the number of samples actually in this batch: the last
            # batch can be shorter than `batch_size`, and weighting it by
            # `batch_size` over-counted its loss in the epoch mean.
            running_loss += loss.item() * len(indices) / N
        print(running_loss)
    return model
            

In [10]:
# import importlib
# importlib.reload(models)

In [11]:
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
args = utils.Parser(config_file_path=config_file_path).parse()

# Set random seeds (random_seed comes from the configuration cell).
torch.manual_seed(random_seed)
np.random.seed(random_seed)

(X_train_embed_masked, y_train_lm_masked, y_train_masked,
 X_test_embed, y_test, training_labels_present,
 sample_weights_masked, proba_preds_masked) = train_downstream_model.load_data(
    args, class_balance=class_balance, max_num=2000)

# X_train_embed_masked = normalize(X_train_embed_masked)
# X_test_embed = normalize(X_test_embed)
# with open(join(args['data_path'], args['dataset'], f'train_embeddings.pkl'), 'rb') as f:
#     X_train = pickle.load(f)
# Train a downstream classifier

if args['use_custom_encoder']:
    encoder = models.CustomEncoder(pretrained_model_name_or_path=args['base_encoder'], device=args['device'])
else:
    encoder = models.Encoder(model_name=args['base_encoder'], device=args['device'])

# NOTE: eval() executes whatever string the config file holds for
# 'activation' / 'criterion' -- only run this with a trusted config.
classifier = models.FeedForwardFlexible(encoder_model=encoder,
                                        h_sizes=[768, 1024, 512, 256, 64, 256, 512, 1024, 64, 17],
                                        activation=eval(args['activation']),
                                        device=torch.device(args['device']))

# classifier = models.FeedForwardTCN(encoder_model=encoder,
#                                    input_size=768,
#                                    output_size=768,
#                                    kernel_size=3,
#                                    stride=1,
#                                    dilation=1,
#                                    padding=2,
#                                    dropout=0.1,
#                                    device=torch.device(args['device']))
print('\n===== Training the downstream classifier =====\n')

# Shuffle the per-sample weights together with the features and labels.
# Previously only X and y were permuted, so with use_noise_aware_loss enabled
# each sample was paired with another sample's weight.  sklearn applies one
# permutation to every array passed in, and for a fixed random_state that
# permutation depends only on the sample count, so X and y end up in the same
# order as before.
# NOTE(review): assumes sample_weights_masked holds one weight per row of
# X_train_embed_masked -- confirm against train_downstream_model.load_data.
# y_train_masked and proba_preds_masked are NOT shuffled and stay in load order.
if sample_weights_masked is None:
    X_train_embed_masked, y_train_lm_masked = shuffle(
        X_train_embed_masked, y_train_lm_masked, random_state=2)
else:
    X_train_embed_masked, y_train_lm_masked, sample_weights_masked = shuffle(
        X_train_embed_masked, y_train_lm_masked, sample_weights_masked, random_state=2)

# NOTE(review): lr=1, weight_decay=0 and batch_size=512 are hard-coded here
# rather than read from the config, and the recorded run stops early with the
# loss stuck at 2.5 -- confirm these values are intentional.
model = train_classifier.train(model=classifier,
                               device=torch.device(args['device']),
                               X_train=X_train_embed_masked,
                               y_train=y_train_lm_masked,
                               sample_weights=sample_weights_masked if args['use_noise_aware_loss'] else None,
                               epochs=args['end_model_epochs'],
                               batch_size=512,
                               criterion=eval(args['criterion']),
                               raw_text=False,
                               lr=1,
                               weight_decay=0,
                               patience=args['end_model_patience'])

end_model_preds_train = model.predict_proba(torch.from_numpy(X_train_embed_masked), batch_size=512, raw_text=False)
end_model_preds_test = model.predict_proba(torch.from_numpy(X_test_embed), batch_size=512, raw_text=False)

Confidence of least confident data point of class 0: 0.915886329789535
Confidence of least confident data point of class 1: 0.762174885049864
Confidence of least confident data point of class 2: 0.7060419607467858
Confidence of least confident data point of class 3: 0.9681031078989644
Confidence of least confident data point of class 4: 0.9328886085307548
Confidence of least confident data point of class 5: 0.5216979878548468
Confidence of least confident data point of class 6: 0.9999999998846478
Confidence of least confident data point of class 7: 0.36792452830188677
Confidence of least confident data point of class 8: 0.11218765935747067
Confidence of least confident data point of class 9: 0.9991456225285645
Confidence of least confident data point of class 10: 0.9999350583429301
Confidence of least confident data point of class 11: 0.9999999335410602
Confidence of least confident data point of class 12: 0.9991276155324404
Confidence of least confident data point of class 13: 0.99947

Some weights of the model checkpoint at bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12 were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



===== Training the downstream classifier =====



Epoch 4:  10%|█          | 2/20 [00:00<00:00, 18.82batch/s, best_loss=2.5, running_loss=2.5, tolerance_count=3]

Stopping early...


Epoch 4:  20%|██▏        | 4/20 [00:00<00:00, 20.13batch/s, best_loss=2.5, running_loss=2.5, tolerance_count=3]


In [19]:
from sklearn.manifold import TSNE
# Project the training embeddings used for the downstream classifier to 2-D
# with t-SNE (cosine distance, fixed seed so the layout is repeatable).
tsne = TSNE(
    metric="cosine",
    n_iter=15000,
    random_state=1,
)
embs = tsne.fit_transform(X_train_embed_masked)
embs



array([[ -37.617546 ,   -8.576798 ],
       [  41.166924 ,    1.4761266],
       [-131.00194  ,    8.082056 ],
       ...,
       [  -5.4491243,  -29.049335 ],
       [-124.237625 ,   11.969514 ],
       [  50.849026 ,   28.376295 ]], dtype=float32)

In [23]:
import plotly.express as px
# Scatter of the 2-D t-SNE coordinates, coloured by the label-model class.
# Labels are cast to str so plotly treats them as discrete categories.
# Arrays passed directly get the default names 'x', 'y' and 'color'; the
# `labels` mapping replaces those with readable axis / legend titles.
fig = px.scatter(
    x=embs[:, 0],
    y=embs[:, 1],
    color=y_train_lm_masked.astype(str),
    labels={'x': 't-SNE dimension 1', 'y': 't-SNE dimension 2', 'color': 'Label-model class'},
    title='t-SNE of training embeddings (cosine metric)',
)
fig.show()

In [None]:
# Reload the train / test embeddings pickled under <data_path>/<dataset>/.
embeddings_dir = join(args['data_path'], args['dataset'])
with open(join(embeddings_dir, 'train_embeddings.pkl'), 'rb') as f:
    X_train_embed = pickle.load(f)
with open(join(embeddings_dir, 'test_embeddings.pkl'), 'rb') as f:
    X_test_embed = pickle.load(f)

# Self-training hyper-parameters from the config.  The lr / weight-decay /
# threshold entries are stored as strings and evaluated here, so the config
# file must be trusted.
self_train_options = dict(
    lr=eval(args['self_train_lr']),
    weight_decay=eval(args['self_train_weight_decay']),
    patience=args['self_train_patience'],
    batch_size=args['self_train_batch_size'],
    q_update_interval=args['q_update_interval'],
    self_train_thresh=eval(args['self_train_thresh']),
)

# Continue training the downstream model on all training embeddings, using
# the test split for the evaluation printed during self-training.
model = train_classifier.self_train(
    model=model,
    X_train=X_train_embed,
    X_val=X_test_embed,
    y_val=y_test,
    device=torch.device(args['device']),
    print_eval=True,
    raw_text=False,
    **self_train_options,
)

# predict_proba is fed a tensor here, so convert the numpy test embeddings.
X_test_embed = torch.from_numpy(X_test_embed)
end_model_preds_test = model.predict_proba(
    X_test_embed,
    batch_size=args['self_train_batch_size'],
    raw_text=False,
)

# Print statistics
y_test_pred = np.argmax(end_model_preds_test, axis=1)
testing_metrics = utils.compute_metrics_bootstrap(
    y_preds=y_test_pred,
    y_true=y_test,
    average=args['average'],
    n_bootstrap=args['n_bootstrap'],
    n_jobs=args['n_jobs'],
)
print(testing_metrics)

In [None]:
# Persist the self-trained model's parameters (state dict only) in the
# notebook's working directory.
torch.save(obj=model.state_dict(), f='self_train_model.pt')

In [None]:
# Final cleanup.  Relies on `gc` having been imported by an earlier cell.
# Order matters: drop the model reference, collect Python garbage, then let
# the CUDA caching allocator release the blocks that became unused.
del model
gc.collect()
torch.cuda.empty_cache()