In [85]:
import shap
import sklearn
import numpy as np
import pandas as pd
import sklearn
import sklearn.ensemble
import sklearn.metrics
import accelerate
import pytorch_lightning as pl
from transformers import pipeline, AutoTokenizer, AutoModel, DataCollatorWithPadding, EvalPrediction, TrainingArguments, Trainer, OPTForSequenceClassification, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig
from torch.optim import AdamW
from torch.utils.data import TensorDataset
import torch
import torch.nn as nn
import evaluate
import tqdm.notebook as tq
from datasets import load_dataset
from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import os 

In [86]:
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [87]:
# Hyperparameters
MAX_LEN = 2048
MODEL = "facebook/opt-350m"
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
TEST_BATCH_SIZE = 4
EPOCHS = 5
LEARNING_RATE = 3e-05

In [88]:
train_short_path = "data/train_10_top50_short.csv"
val_short_path = "data/val_10_top50_short.csv"
test_short_path = "data/test_10_top50_short.csv"
labels_path = 'data/icd10_codes_top50.csv'

train_10_top50_shorten = pd.read_csv(train_short_path)
val_10_top50_shorten = pd.read_csv(val_short_path)
test_10_top50_shorten = pd.read_csv(test_short_path)
labels_10_top50 = pd.read_csv(labels_path)

In [89]:
train_10_top50_shorten['text'][0]

'Sex:   M\n \nService: MEDICINE\n \nAllergies: \nmeropenem / posaconazole / vancomycin\n \n ___.\n \nChief Complaint:\nHiDAC\n \nMajor Surgical or Invasive Procedure:\ntunneled line placement\n\n \nHistory of Present Illness:\n A ___ year old M with PMHx of morbid obesity and\nosteoarthritis with recent diagnosis of CBF AML s/p 7+3 now\npresenting for cycle 1 of high dose cytarabine consolidation.\n\n \nPast Medical History:\nMorbid obesity\nVitamin D deficiency\nOsteoarthritis\nSleep apnea\nGout one flare per year controlled with diet not currently\nrequiring medications , was on allopurinol previously\nPAST SURGICAL HISTORY:\nAppendectomy\n\n \nSocial History:\n___\nFamily History:\nNo family history of malignancy other than a paternal cousin \nwith\nbreast cancer.\n \nPhysical Exam:\nADMISSION PHYSICAL EXAM;\nVS: TC 98.4 115/73 81 20 96%RA\nWT: 278.99 lb. \nHEENT: EOMI, no conjunctival injection or icterus. Moist mucus\nmembranes without lesion. \nCV: RRR, S1/S2. No murmurs apprecia

In [90]:
classes = [class_ for class_ in labels_10_top50["icd_code"] if class_]
class2id = {class_: id for id, class_ in enumerate(classes)}
id2class = {id: class_ for class_, id in class2id.items()}

In [91]:
lora_config = LoraConfig(
    r=16,
    target_modules=["q_proj", "v_proj"],
    task_type=TaskType.SEQ_CLS,
    lora_alpha=32,
    lora_dropout=0.05,
)

config, unused_kwargs = AutoConfig.from_pretrained(
    MODEL,
    num_labels=len(classes),
    id2label=id2class,
    label2id=class2id,
    problem_type="multi_label_classification",
    return_unused_kwargs=True,
)


model = OPTForSequenceClassification.from_pretrained(
    MODEL,
    config=config,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,598,464 || all params: 332,820,480 || trainable%: 0.4802781367300474


In [92]:
config = PeftConfig.from_pretrained("./OPT-350m-events_classification_biotech/")
model = PeftModel.from_pretrained(model, "./OPT-350m-events_classification_biotech/", is_trainable=False,)

In [93]:
model.print_trainable_parameters()

trainable params: 25,600 || all params: 332,820,480 || trainable%: 0.007691834348655467


In [94]:
class TokenizerWrapper:
    def __init__(self, tokenizer, MAX_LEN):
        self.tokenizer = tokenizer
        self.max_length = MAX_LEN
        self.classes = [class_ for class_ in labels_10_top50["icd_code"] if class_]
        self.class2id = {class_: id for id, class_ in enumerate(classes)}
        self.id2class = {id: class_ for class_, id in class2id.items()}
        
    def multi_labels_to_ids(self, labels: list[str]) -> list[float]:
        ids = [0.0] * len(self.class2id)  # BCELoss requires float as target type
        for label in labels:
            ids[self.class2id[label]] = 1.0
        return ids
    
    def tokenize_function(self, example):
        result = self.tokenizer(
            example["text"],
            max_length = self.max_length,
            padding = 'max_length',
            truncation = True,
            return_tensors='pt'
        )
        result["label"] = torch.tensor([self.multi_labels_to_ids(eval(label)) for label in example["label"]])
        return result

In [95]:
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m", cache_dir='./model_ckpt/')

In [96]:
data_files = {
        "train": train_short_path,
        "validation": val_short_path,
        "test": test_short_path,
    }

tokenizer_wrapper = TokenizerWrapper(tokenizer, MAX_LEN)
dataset = load_dataset("csv", data_files=data_files)
dataset = dataset.map(tokenizer_wrapper.tokenize_function, batched=True, num_proc=1)

Map: 100%|██████████| 5000/5000 [00:07<00:00, 678.08 examples/s]
Map: 100%|██████████| 4221/4221 [00:06<00:00, 696.19 examples/s]
Map: 100%|██████████| 1000/1000 [00:01<00:00, 679.98 examples/s]


In [97]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 5000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 4221
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 1000
    })
})

In [98]:
sampleToTest = dataset['train']['label'][0]
check = train_10_top50_shorten['label'][0]
print(sampleToTest)
print(check)
ids = [0.0] * len(class2id)  # BCELoss requires float as target type
for label in eval(check):
    ids[class2id[label]] = 1.0
print(ids)

[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
['d-G4733', 'd-Z86718', 'p-02HV33Z']
[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]


In [100]:
tokenizerOut = tokenizer(dataset['train']['text'][0], max_length=MAX_LEN, padding='max_length', truncation=True, return_tensors='pt')
print(tokenizerOut)

{'input_ids': tensor([[    2, 35581,    35,  ...,     4,  1437, 50118]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1]])}


In [101]:
classifier = pipeline("text-classification", model=model,
                                        tokenizer=tokenizer,
                                        device=device)

The model 'PeftModelForSequenceClassification' is not supported for text-classification. Supported models are ['AlbertForSequenceClassification', 'BartForSequenceClassification', 'BertForSequenceClassification', 'BigBirdForSequenceClassification', 'BigBirdPegasusForSequenceClassification', 'BioGptForSequenceClassification', 'BloomForSequenceClassification', 'CamembertForSequenceClassification', 'CanineForSequenceClassification', 'LlamaForSequenceClassification', 'ConvBertForSequenceClassification', 'CTRLForSequenceClassification', 'Data2VecTextForSequenceClassification', 'DebertaForSequenceClassification', 'DebertaV2ForSequenceClassification', 'DistilBertForSequenceClassification', 'ElectraForSequenceClassification', 'ErnieForSequenceClassification', 'ErnieMForSequenceClassification', 'EsmForSequenceClassification', 'FalconForSequenceClassification', 'FlaubertForSequenceClassification', 'FNetForSequenceClassification', 'FunnelForSequenceClassification', 'GemmaForSequenceClassification'

In [107]:
classifier("I have cancer")

[{'label': 'd-J45909', 'score': 0.9755363464355469}]

In [116]:
def predictor(texts):
    outputs = model(**tokenizer(texts, max_length=MAX_LEN, padding='max_length', truncation=True, return_tensors='pt'))
    tensor_logits = outputs.logits
    probas = F.softmax(tensor_logits).detach().numpy()
    return probas

In [117]:
model(**tokenizer("I have cancer", max_length=MAX_LEN, padding='max_length', truncation=True, return_tensors='pt')).logits

tensor([[-2.7499, -1.8709, -2.0599, -0.6182, -0.7577, -0.0824, -2.1531, -1.9502,
          1.0857, -1.8505, -2.2305,  1.4303,  0.3326, -1.8009,  2.2412,  0.2839,
          0.8848, -0.3478,  3.6858,  2.6791,  1.7907, -0.7664, -0.2892, -4.6475,
          1.7753, -4.2389,  1.2407,  0.6144, -2.0558,  0.6384, -2.4194, -1.6926,
         -0.7171, -0.3246, -1.0963,  0.9743, -1.5627, -1.2289, -1.2280, -0.5481,
         -1.0376, -0.7625, -1.1220,  1.8353, -0.4621,  0.0883, -0.0982,  0.1386,
          1.1662, -2.4850]], grad_fn=<IndexBackward0>)

In [118]:
explainer = shap.Explainer(predictor, tokenizer)

In [120]:
dataset['test']['text'][:10]

 'Sex:   F\n \nService: PLASTIC\n \nAllergies: \ncucumber / Tegaderm\n \n ___\n \nChief Complaint:\nSurgical absence of L breast\n \nMajor Surgical or Invasive Procedure:\n1) ___ - Right prophylactic mastectomy, bilateral ___ \nreconstruction\n2) ___ - take back to OR for exploration of left flap \nvessels\n \nHistory of Present Illness:\n___ is a ___ year old female with history of L breast\ncancer (Stage I IDC and Paget\'s) and previous left sided \nmastectomy & SLNB. She was admitted to the hospital after her \nprophylactic R mastectomy with ___ reconstruction on \n___. She was taken back to the OR on ___ for flap \nexploration due to declining Vioptix recordings. \n \nPast Medical History:\nPNC:\n- ___ ___ by US\n- Labs: Rh+/ab neg/RPRNR/RI/HBsAg neg/HIV neg/ GBS unknown\n- Genetics: LR ERA\n- FFS: wnl \n- GLT: wnl\n- US: ___, 67%, breech, ___, nl fluid, anterior placenta\n- Issues:\n*) breast cancer in pregnancy: unilateral mastectomy w/ sentinel\nLN biopsy, s/p chemotherapy compl

In [119]:
shap_values = explainer(dataset['test']['text'][:10], fixed_context=1, batch_size=2)

ValueError: text input must be of type `str` (single example), `List[str]` (batch or single pretokenized example) or `List[List[str]]` (batch of pretokenized examples).

In [None]:
shap.summary_plot(shap_values, dataset['train'], feature_names=id2class)