In [7]:
import argparse
from pprint import pprint

import evaluate
import numpy as np
import pandas as pd
import torch
from torch import nn
from datasets import load_dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    EvalPrediction,
    OPTForSequenceClassification,
    Trainer,
    TrainingArguments,
)

# Hugging Face Hub checkpoint id; passed to AutoTokenizer.from_pretrained further down.
MODEL = "emilyalsentzer/Bio_ClinicalBERT"
# NOTE(review): presumably the encoder's maximum sequence length in tokens; it is not
# referenced in the visible cells — confirm where (or whether) it is applied.
MAX_POSITION_EMBEDDINGS = 512

from dataclasses import dataclass

In [10]:
# Relative paths to the train/validation/test CSV splits and to the CSV that lists the label codes.
# NOTE(review): "DATSET" in two of the names below is a typo for "DATASET"; the names are
# referenced by the data_files mapping in the loading cell, so rename them together.
TRAIN_DATSET_PATH = "./data/train_10_top50.csv"
VAL_DATASET_PATH = "./data/val_10_top50.csv"
TEST_DATSET_PATH = "./data/test_10_top50.csv"
CODE_PATH = "./data/icd10_codes_top50.csv"

# The device is pinned to the CPU; the commented-out line would select CUDA when it is available.
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = "cpu"

In [11]:
# Load the tokenizer, the three CSV splits and the table of label codes.
# NOTE(review): `device` is forwarded to the tokenizer as an extra keyword argument;
# confirm it has any effect there.
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True, device=device)

data_files = dict(
    train=TRAIN_DATSET_PATH,
    validation=VAL_DATASET_PATH,
    test=TEST_DATSET_PATH,
)
code_labels = pd.read_csv(CODE_PATH)
dataset = load_dataset("csv", data_files=data_files)

# Build the code <-> index lookup tables from the "icd_code" column (falsy entries are skipped).
classes = list(filter(None, code_labels["icd_code"]))
class2id = dict(zip(classes, range(len(classes))))
id2class = {index: code for code, index in class2id.items()}


def multi_labels_to_ids(labels: list[str]) -> list[float]:
    """Encode a list of class names as a multi-hot vector.

    Args:
        labels: Class names; each must be a key of the module-level ``class2id``.

    Returns:
        A list with one entry per class in ``class2id``: 1.0 at the index of
        every class named in ``labels`` and 0.0 everywhere else.

    Raises:
        KeyError: If a label is not present in ``class2id``.
    """
    active = {class2id[name] for name in labels}
    # Floats rather than ints: BCELoss requires float as target type.
    return [1.0 if position in active else 0.0 for position in range(len(class2id))]


def preprocess_function(example):
    """Tokenize a batch of texts and turn their label strings into multi-hot vectors.

    Intended for ``dataset.map(..., batched=True)``: ``example["text"]`` and
    ``example["label"]`` are iterated as per-row sequences.

    Args:
        example: Batch with a "text" column and a "label" column. Each label is
            assumed to be the string form of a Python list of class names
            (e.g. "['A', 'B']") — confirm against the CSV files.

    Returns:
        The tokenizer output for the texts, with "label" set to one multi-hot
        list of floats per row (see ``multi_labels_to_ids``).

    Raises:
        ValueError, SyntaxError: If a label string is not a valid Python literal.
        KeyError: If a parsed label is not a known class.
    """
    # Function-scope import keeps this cell self-contained.
    import ast

    # No truncation/padding arguments are passed, so sequences keep their full tokenized length.
    result = tokenizer(example["text"])
    # literal_eval only accepts Python literals, so text read from the CSV can
    # never execute code the way the built-in eval() would.
    result["label"] = [
        multi_labels_to_ids(ast.literal_eval(label)) for label in example["label"]
    ]
    return result


# Tokenize every split and attach the multi-hot labels, batch by batch in a single process.
# NOTE(review): this rebinds `dataset`, so the cell is not idempotent — preprocess_function
# parses the "label" strings, and after one run that column no longer holds strings.
dataset = dataset.map(preprocess_function, num_proc=1, batched=True)

Map:   0%|          | 0/33768 [00:00<?, ? examples/s]

Map:   0%|          | 0/4221 [00:00<?, ? examples/s]

Map:   0%|          | 0/4221 [00:00<?, ? examples/s]

In [12]:
# Display the train split to inspect its columns and row count after the map step.
dataset["train"]

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 33768
})

In [None]:
import numpy as np
# Collect the tokenized length (number of input_ids) of every example in all splits;
# the mean and maximum are reported in the next cell.
train_dataset = dataset['train']

input_ids_lengths = []
label_lengths = []
sets = ['train', 'test', 'validation']
# The loop variable is deliberately not called `set`: at notebook top level that would
# shadow the builtin `set` for every cell executed afterwards.
for split_name in sets:
    input_ids_lengths.extend(len(example['input_ids']) for example in dataset[split_name])


In [None]:
# Report the mean, then the maximum, tokenized length over all splits.
for summarize in (np.mean, np.max):
    print(summarize(input_ids_lengths))

3041.86259180289
6262


In [None]:
# Count the labels assigned to each example, i.e. the number of entries equal to 1
# in its multi-hot "label" vector, across all splits.
label_lengths = []

sets = ["train", "test", "validation"]
# The loop variable is deliberately not called `set`: at notebook top level that would
# shadow the builtin `set` for every cell executed afterwards.
for split_name in sets:
    label_lengths.extend(example['label'].count(1) for example in dataset[split_name])


In [None]:
# Report the mean, then the maximum, number of labels per example over all splits.
for summarize in (np.mean, np.max):
    print(summarize(label_lengths))

5.023146173892442
23
