In [1]:
import shap
import sklearn
import numpy as np
import pandas as pd
import sklearn
import sklearn.ensemble
import sklearn.metrics
import accelerate
import pytorch_lightning as pl
from transformers import pipeline, AutoTokenizer, AutoModel, DataCollatorWithPadding, EvalPrediction, TrainingArguments, Trainer, OPTForSequenceClassification, AutoConfig, AutoModelForCausalLM, BitsAndBytesConfig
from torch.optim import AdamW
from torch.utils.data import TensorDataset
import torch
import torch.nn as nn
import evaluate
import tqdm.notebook as tq
from datasets import load_dataset
from peft import PeftModel, PeftConfig, LoraConfig, get_peft_model, TaskType
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
import os 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
torch.cuda.empty_cache()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu


In [3]:
# Hyperparameters
MAX_LEN = 2048
MODEL = "facebook/opt-350m"
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 4
TEST_BATCH_SIZE = 4
EPOCHS = 5
LEARNING_RATE = 3e-05

In [4]:
model_checkpoint_path = './OPT-350m-events_classification_biotech'

In [9]:
dataset = load_dataset('knowledgator/events_classification_biotech')

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [39]:
classes = dataset['train'].features['label 1'].names
classes = [c for c in classes if c]
id2class = {i: label for i, label in enumerate(classes)}
class2id = {label: i for i, label in enumerate(classes)}

In [40]:
lora_config = LoraConfig(
    r=16,
    target_modules=["q_proj", "v_proj"],
    task_type=TaskType.SEQ_CLS,
    lora_alpha=32,
    lora_dropout=0.05,
)

config, unused_kwargs = AutoConfig.from_pretrained(
    MODEL,
    num_labels=len(classes),
    id2label=id2class,
    label2id=class2id,
    problem_type="multi_label_classification",
    return_unused_kwargs=True,
)


model = OPTForSequenceClassification.from_pretrained(
    MODEL,
    config=config,
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

Some weights of OPTForSequenceClassification were not initialized from the model checkpoint at facebook/opt-350m and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


trainable params: 1,588,224 || all params: 332,800,000 || trainable%: 0.47723076923076924


In [41]:
config = PeftConfig.from_pretrained("./OPT-350m-events_classification_biotech/")
model = PeftModel.from_pretrained(model, "./OPT-350m-events_classification_biotech/", is_trainable=False,)

In [42]:
model.print_trainable_parameters()

trainable params: 15,360 || all params: 332,800,000 || trainable%: 0.004615384615384616


In [43]:
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint_path)

In [51]:
def predict(texts):
    encoded_input = tokenizer(texts, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors='pt')
    with torch.no_grad():
        output = model(**encoded_input)
    probabilities = torch.nn.functional.softmax(output.logits, dim=-1).numpy()
    return probabilities

In [52]:
data_subset = dataset['train'][:10]

In [62]:
texts = data_subset['content']

In [67]:
# Tokenize the texts
tokenized_background = tokenizer(
    texts,
    padding='max_length',
    truncation=True,
    max_length=MAX_LEN,
    return_tensors='pt'
)

In [None]:
background_data = tokenized_background['input_ids'].detach().numpy()
explainer = shap.KernelExplainer(predict, background_data)

In [None]:
shap_values = explainer.shap_values(background_data)

In [None]:
shap.initjs()
shap.force_plot(explainer.expected_value[0], shap_values[0], texts)