In [1]:
import sys
sys.path.append('../keyclass/')
sys.path.append('../scripts/')

import argparse
import label_data, encode_datasets, train_downstream_model
import torch
import pickle
import numpy as np
import os
from os.path import join, exists
from datetime import datetime
import utils
import models
import create_lfs
import train_classifier
from transformers import AutoTokenizer, AutoModel

import itertools
from sklearn.metrics import precision_recall_curve


[nltk_data] Downloading package stopwords to
[nltk_data]     /home/milesjg2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
!export 'PYTORCH_CUDA_ALLOC_CONF=max_split_size_mb:512'

In [3]:
config_file_path = r'../config_files/config_mimic3.yml' # Specify path to the configuration file
random_seed = 0

In [4]:
args = utils.Parser(config_file_path=config_file_path).parse()

if args['use_custom_encoder']:
    model = models.CustomEncoder(pretrained_model_name_or_path=args['base_encoder'], 
        device='cuda' if torch.cuda.is_available() else 'cpu')
else:
    model = models.Encoder(model_name=args['base_encoder'], 
        device='cuda' if torch.cuda.is_available() else 'cpu')

for split in ['train', 'test']:
    sentences = utils.fetch_data(dataset=args['dataset'], split=split, path=args['data_path'])
    embeddings = model.encode(sentences=sentences, batch_size=args['end_model_batch_size'], 
                                show_progress_bar=args['show_progress_bar'], 
                                normalize_embeddings=args['normalize_embeddings'])
    with open(join(args['data_path'], args['dataset'], f'{split}_embeddings.pkl'), 'wb') as f:
        pickle.dump(embeddings, f)

Some weights of the model checkpoint at bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Batches:   0%|          | 0/31 [00:00<?, ?it/s]

Batches:   0%|          | 0/11 [00:00<?, ?it/s]

In [5]:
import gc

del model
gc.collect()
torch.cuda.empty_cache()

In [6]:
args = utils.Parser(config_file_path=config_file_path).parse()

train_text = utils.fetch_data(dataset=args['dataset'], path=args['data_path'], split='train')

training_labels_present = False
if exists(join(args['data_path'], args['dataset'], 'train_labels_all.txt')):
    with open(join(args['data_path'], args['dataset'], 'train_labels_all.txt'), 'r') as f:
        y_train = f.readlines()
    # y_train = np.array([int(i.replace('\n','')) for i in y_train])
    y_train = np.array([np.array([int(i) for i in sub.strip().split()]) for sub in y_train], dtype=object)
    print(y_train)

[array([ 7,  5, 10,  2,  4]) array([10,  2,  8,  7, 16,  3]) array([7, 2])
 ... array([ 9, 10, 16,  7,  2])
 array([ 7,  3,  2, 13, 10,  6,  5,  8,  4,  9]) array([10])]


In [7]:
from sklearn.preprocessing import MultiLabelBinarizer

In [8]:
mlb = MultiLabelBinarizer()
# print(y_train[0])
# print(mlb.fit_transform(y_train)[0])
# print(mlb.classes_)
y_transform = mlb.fit_transform(y_train)
type(y_transform)
y_transform

array([[0, 0, 1, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 0, 0],
       ...,
       [0, 0, 1, ..., 0, 0, 1],
       [0, 0, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [9]:
import importlib
importlib.reload(create_lfs)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/milesjg2/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


<module 'create_lfs' from '../keyclass/create_lfs.py'>

In [11]:
args = utils.Parser(config_file_path=config_file_path).parse()

train_text = utils.fetch_data(dataset=args['dataset'], path=args['data_path'], split='train')

training_labels_present = False
if exists(join(args['data_path'], args['dataset'], 'train_labels_all.txt')):
    with open(join(args['data_path'], args['dataset'], 'train_labels_all.txt'), 'r') as f:
        y_train = f.readlines()
    y_train = np.array([[int(i) for i in sub.strip().split()] for sub in y_train], dtype=object)
    
    training_labels_present = True
else:
    y_train = None
    training_labels_present = False
    print('No training labels found!')

with open(join(args['data_path'], args['dataset'], f'train_embeddings.pkl'), 'rb') as f:
    X_train = pickle.load(f)

# Convert to MultiLabel format
mlb = MultiLabelBinarizer()
y_train_ml = mlb.fit_transform(y_train)

# Print dataset statistics
print(f"Getting labels for the {args['dataset']} data...")
print(f'Size of the data: {len(train_text)}')
if training_labels_present:
    print('Class distribution', np.unique(np.hstack(y_train.flatten()), return_counts=True))
class_balance = np.unique(np.hstack(y_train.flatten()), return_counts=True)[1] / np.unique(np.hstack(y_train.flatten()), return_counts=True)[1].sum()
# Load label names/descriptions
label_names = []
for a in args:
    if 'target' in a: label_names.append(args[a])


        
# Creating labeling functions
labeler = create_lfs.CreateLabellingFunctions(custom_encoder=args['use_custom_encoder'],
                                              base_encoder=args['base_encoder'], 
                                            device=torch.device(args['device']),
                                            label_model=args['label_model'])
# obtain predicted probabilities and labels from label model
proba_preds, y_preds = labeler.get_labels(text_corpus=train_text, label_names=label_names, max_df = 1.0, min_df=0.001, 
                                ngram_range=(1,3), topk=args['topk'], y_train=y_train_ml, 
                                label_model_lr=args['label_model_lr'], label_model_n_epochs=args['label_model_n_epochs'], 
                                verbose=True, n_classes=args['n_classes'], class_balance=class_balance, min_topk=False)

y_train_pred = y_preds


# Save the predictions
if not os.path.exists(args['preds_path']): os.makedirs(args['preds_path'])
with open(join(args['preds_path'], f"{args['label_model']}_proba_preds.pkl"), 'wb') as f:
    pickle.dump(proba_preds, f)

# Print statistics
print('Label Model Predictions: Unique value and counts', np.unique(
    y_preds.flatten(), return_counts=True
))
if training_labels_present:
    print('Label Model Training Accuracy', np.mean([(y_train_pred[i] in labels) for i,labels in enumerate(y_train)]))


    # Log the metrics
    training_metrics_with_gt = utils.compute_metrics(y_preds=y_train_pred, y_true=y_train, average=args['average'])
    utils.log(metrics=training_metrics_with_gt, filename='label_model_with_ground_truth', 
        results_dir=args['results_path'], split='train')

Getting labels for the mimic data...
Size of the data: 3922
Class distribution (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]), array([1021,  643, 2666, 1418, 1192,  937,  284, 3079, 1797, 1567, 1577,
         13,  440,  742,  200,  300, 1413]))


Some weights of the model checkpoint at bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Found assigned category counts [ 7616  6929 13482 11563  6102  5822  2215 30058 11979   687  7085  2541
  6894  1949  7479  6713  9065]
labeler.vocabulary:
 138179
labeler.word_indicator_matrix.shape (3922, 510)
Len keywords 510
assigned_category: Unique and Counts (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]), array([30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30]))
tuberculosis,  found,  unspecified,  bacilli,  tubercle,  bacteriological,  examination,  histological,  specified,  sputum,  confirmed,  microscopy,  infection,  tuberculous,  due,  bacterial,  done,  unknown,  present,  animals,  inoculation,  culture,  histologically,  methods,  acute ['amphetmnneg mthdoneneg pm' 'amphetmnneg mthdonepos'
 'bacteriamany yeastnone' 'bacteriamany yeastnone epi'
 'bacteriamod yeastnone epi' 'bacteriarare yeastnone'
 'bacteriarare yeastnone epi' 'bacterimany yeastnone'
 'bacterimod yeastnone' 'csfspinal fluid gram'
 'culture routinefinal inp

INFO:root:Computing O...
INFO:root:Estimating \mu...
INFO:root:Using GPU...
  0%|                                                                               | 0/100 [00:00<?, ?epoch/s]INFO:root:[0 epochs]: TRAIN:[loss=0.028]
100%|█████████████████████████████████████████████████████████████████████| 100/100 [03:06<00:00,  1.86s/epoch]
INFO:root:Finished Training


Label Model Predictions: Unique value and counts (array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16]), array([  71,   84,  404,  162,  107,  106,  150, 1871,   86,   71,   48,
         98,   89,  180,   89,  102,  204]))
Label Model Training Accuracy 0.5678225395206528
Saving results in ../results/mimic/train_label_model_with_ground_truth_04-May-2023-02_42_39.txt...


  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
import gc

del labeler
gc.collect()
torch.cuda.empty_cache()

In [13]:
import torch.optim as optim
import torch.nn as nn
from tqdm import tqdm, trange

def custom_train(model, device, X_train, y_train, epochs, batch_size, lr):
    if isinstance(y_train, np.ndarray):
        y_train = torch.from_numpy(y_train)

    if isinstance(X_train, np.ndarray):
        X_train = torch.from_numpy(X_train)
        
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=lr)
    N = len(X_train)
    pbar = trange(epochs, unit="batch")
    for nep in pbar:
        pbar.set_description(f"Epoch {nep}")
        permutation = torch.randperm(N)
        running_loss = 0.0
        for i in range(0, N, batch_size):
            optimizer.zero_grad()
            indices = permutation[i:i + batch_size]

            batch_x, batch_y = X_train[indices], y_train[indices]
            batch_y = batch_y.to(device)
            batch_x = batch_x.to(device)
            
            out = model.forward(batch_x, mode='inference', raw_text=False)
            loss = criterion(out, batch_y)
            loss.backward()
            optimizer.step()
            running_loss = running_loss + (loss.cpu().detach().numpy() *
                                           batch_size / N)
        print(running_loss)
    return model

In [14]:
import gc

gc.collect()
torch.cuda.empty_cache()

In [15]:
from sklearn.preprocessing import normalize
from sklearn.utils import shuffle
args = utils.Parser(config_file_path=config_file_path).parse()

# Set random seeds
random_seed = random_seed
torch.manual_seed(random_seed)
np.random.seed(random_seed)

X_train_embed_masked, y_train_lm_masked, y_train_masked, \
	X_test_embed, y_test, training_labels_present, \
	sample_weights_masked, proba_preds_masked = train_downstream_model.load_data_all(args, class_balance=class_balance, max_num=2000)

# X_train_embed_masked = normalize(X_train_embed_masked)
# X_test_embed = normalize(X_test_embed)
# with open(join(args['data_path'], args['dataset'], f'train_embeddings.pkl'), 'rb') as f:
#     X_train = pickle.load(f)
# Train a downstream classifier


if args['use_custom_encoder']:
	encoder = models.CustomEncoder(pretrained_model_name_or_path=args['base_encoder'], device=args['device'])
else:
	encoder = models.Encoder(model_name=args['base_encoder'], device=args['device'])
    

classifier = models.FeedForwardTCN(encoder_model=encoder,
										num_inputs=768, 
										num_channels1=[128, 128, 128], 
										num_channels2=[64, 64, 64],
										h_sizes=[128,64],
										kernel_size=3, 
										dropout=0.1, 
										batch_size=512, 
										device=torch.device(args['device']))
# classifier = models.FeedForwardFlexible(encoder_model=encoder,
# 										h_sizes=[768, 1024, 512, 256, 64, 256, 512, 1024, 64, 17], 
# 										activation=eval(args['activation']),
# 										device=torch.device(args['device']))


print('\n===== Training the downstream classifier =====\n')
X_train_embed_masked, y_train_lm_masked = shuffle(X_train_embed_masked, y_train_lm_masked, random_state=2)

model = train_classifier.train_multi_label(model=classifier, 
							device=torch.device(args['device']),
							X_train=X_train_embed_masked, 
							y_train=y_train_lm_masked,
							sample_weights=sample_weights_masked if args['use_noise_aware_loss'] else None, 
							epochs=args['end_model_epochs'], 
							batch_size=512, 
							criterion=eval(args['criterion']), 
							raw_text=False, 
							lr=0.001, 
							weight_decay=0,
							patience=args['end_model_patience'])

end_model_preds_train = model.predict_proba(torch.from_numpy(X_train_embed_masked), batch_size=512, raw_text=False)
end_model_preds_test = model.predict_proba(torch.from_numpy(X_test_embed), batch_size=512, raw_text=False)

Confidence of least confident data point of class 0: 0.05293172274353259
Confidence of least confident data point of class 1: 0.8517329214149676
Confidence of least confident data point of class 2: 0.9962841604905776
Confidence of least confident data point of class 3: 0.7814581440263394
Confidence of least confident data point of class 4: 0.21091249901671633
Confidence of least confident data point of class 5: 0.631550790285461
Confidence of least confident data point of class 6: 0.9917990024352776
Confidence of least confident data point of class 7: 0.9330492741922651
Confidence of least confident data point of class 8: 0.09316190574939086
Confidence of least confident data point of class 9: 0.08123801130177821
Confidence of least confident data point of class 10: 0.08175644149515268
Confidence of least confident data point of class 11: 0.9999999999051195
Confidence of least confident data point of class 12: 0.9578520456979125
Confidence of least confident data point of class 13: 0.9

Some weights of the model checkpoint at bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12 were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).



===== Training the downstream classifier =====



Epoch 19: 100%|█████████| 20/20 [00:02<00:00,  9.33batch/s, best_loss=0.2, running_loss=0.2, tolerance_count=0]


In [16]:
with open(
        join(args['data_path'], args['dataset'], f'train_embeddings.pkl'),
        'rb') as f:
    X_train_embed = pickle.load(f)
with open(join(args['data_path'], args['dataset'], f'test_embeddings.pkl'),
          'rb') as f:
    X_test_embed = pickle.load(f)

model = train_classifier.self_train(model=model, 
									X_train=X_train_embed, 
									X_val=X_test_embed, 
									y_val=y_test, 
									device=torch.device(args['device']), 
									lr=eval(args['self_train_lr']), 
									weight_decay=eval(args['self_train_weight_decay']),
									patience=args['self_train_patience'], 
									batch_size=args['self_train_batch_size'], 
									q_update_interval=args['q_update_interval'],
									self_train_thresh=eval(args['self_train_thresh']), 
									print_eval=True,
									raw_text=False, 
									train_multilabel=True)

X_test_embed = torch.from_numpy(X_test_embed)

##This is 1/2  what you should return. ONE RETURN FOR FEED FORWARD, ONE FOR TCN
model.forward(torch.tensor(X_train_embed).to("cuda"), raw_text=False).cpu().detach().numpy()


#IGNORE THIS
# end_model_preds_test = model.predict(X_test_embed, batch_size=args['self_train_batch_size'], raw_text=False)


# # Print statistics
# testing_metrics = utils.compute_metrics_bootstrap(y_preds=end_model_preds_test,
# 													y_true=y_test, 
# 													average=args['average'], 
# 													n_bootstrap=args['n_bootstrap'], 
# 													n_jobs=args['n_jobs'], 
# 													multilabel=True)
# print(testing_metrics)

Epoch 0: 100%|█| 1/1 [00:01<00:00,  1.10s/batch, self_train_agreement=0.844, tolerance_count=0, validation_accu


array([[0.2635968 , 0.27614966, 0.36435422, ..., 0.17386761, 0.19087194,
        0.3986823 ],
       [0.23661907, 0.27418262, 0.22898012, ..., 0.09838374, 0.19112627,
        0.34077525],
       [0.04292928, 0.12411701, 0.18535729, ..., 0.04165442, 0.03059874,
        0.1658059 ],
       ...,
       [0.0346226 , 0.02063135, 0.21299645, ..., 0.00964527, 0.01055443,
        0.0689842 ],
       [0.02287608, 0.03450003, 0.16921255, ..., 0.00537035, 0.01754055,
        0.05892736],
       [0.01842624, 0.01368896, 0.4027123 , ..., 0.02738493, 0.03685291,
        0.03441577]], dtype=float32)

### DONT WORRY ABOUT BEYOND THIS

In [17]:
def predict_on_cust_thresholds(model, X_train, X_test, y_train, y_test):
    model.eval()
    np.random.RandomState(1234)

    
    repeats = [len(x) for x in y_train]
    y_train_multi = np.array(list(itertools.chain.from_iterable(y_train)))
    y_pred_train = model.forward(torch.tensor(X_train).to("cuda"), raw_text=False).cpu().detach().numpy()
    y_pred_train_multi = np.repeat(y_pred_train,repeats,axis=0)
    
    _, n_classes = y_pred_train_multi.shape
    overall_thresholds = []
    for i in range(n_classes):
        
        # Computing best threshold for i-th class
        precision, recall, thresholds = precision_recall_curve(y_train_multi, y_pred_train_multi[:, i], pos_label=i)

        # compute f-1
        f1 = 2 * precision * recall / (precision + recall)

        # pick up the best threshold's index
        best_idx = np.argmax(f1)
        overall_thresholds.append(thresholds[best_idx])
    
    overall_thresholds = np.array(overall_thresholds)
    y_pred_test = model.forward(X_test_embed.to("cuda"), raw_text=False).cpu().detach().numpy()
    y_pred_bool = y_pred_test > overall_thresholds[None,:]
    return y_pred_bool

In [22]:
def precision_recall_accuracy(y_pred, y_true):
    mlb = MultiLabelBinarizer()
    y_true = mlb.fit_transform(y_true)
    actual_pos_sum = y_true.sum(axis=0)
    pred_pos_sum = y_pred.sum(axis=0)
    pred_actual_intersect = (y_true&y_pred).sum(axis=0)
    
    class_precision = np.nan_to_num(pred_actual_intersect/pred_pos_sum)
    overall_precision = pred_actual_intersect.sum() / pred_pos_sum.sum()
    class_recall = np.nan_to_num(pred_actual_intersect/actual_pos_sum)
    overall_recall = pred_actual_intersect.sum() / actual_pos_sum.sum()
    
    class_accuracy = (y_true==y_test_pred).sum(axis=0) / y_true.shape[0]
    overall_accuracy = (y_true==y_test_pred).sum() / (y_true.shape[0] * y_true.shape[1])
    
    return class_precision, class_recall, class_accuracy, overall_precision, overall_recall, overall_accuracy

In [24]:
y_test_pred = predict_on_cust_thresholds(model, X_train_embed, X_test_embed, y_train, y_test)
precision_recall_accuracy(y_test_pred, y_test)

  


(array([0.33333333, 0.        , 0.66488141, 0.        , 0.29266282,
        0.25519288, 0.        , 0.80169101, 0.        , 0.33333333,
        0.4078643 , 0.        , 0.2       , 0.2       , 0.        ,
        0.        , 0.        ]),
 array([0.00275482, 0.        , 0.99885057, 0.        , 0.92447917,
        0.77945619, 0.        , 0.99428027, 0.        , 0.00193798,
        0.9906367 , 0.        , 0.00675676, 0.0045045 , 0.        ,
        0.        , 0.        ]),
 array([0.72171254, 0.83944954, 0.66437309, 0.62691131, 0.32186544,
        0.36850153, 0.91896024, 0.79816514, 0.54434251, 0.60474006,
        0.40902141, 0.87155963, 0.88455657, 0.82798165, 0.93654434,
        0.93807339, 0.60474006]),
 0.4834782608695652,
 0.47147702744372494,
 0.6989116747616477)