In [1]:
import pandas as pd
import numpy as np
import os

from tlt.datasets import dataset_factory
from tlt.models.model_factory import get_model

from torch.utils.data import DataLoader
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from datasets.arrow_dataset import Dataset
import datasets

# Resolve the working directories. The DATASET_DIR / OUTPUT_DIR environment
# variables take precedence; otherwise fall back to folders under the user's
# home directory. os.path.expanduser("~") is used rather than
# os.environ["HOME"] so the fallback does not raise KeyError on platforms
# where HOME is not defined (e.g. Windows).

# Directory the dataset is read from / downloaded to
dataset_dir = os.environ.get(
    "DATASET_DIR", os.path.join(os.path.expanduser("~"), "datasets")
)

# Directory for output
output_dir = os.environ.get(
    "OUTPUT_DIR", os.path.join(os.path.expanduser("~"), "outputs")
)

print("Dataset directory:", dataset_dir)
print("Output directory:", output_dir)

# Path of the annotation CSV inside the dataset directory; the last print
# reports whether the file is actually present.
annotation_file = os.path.join(dataset_dir, 'annotation.csv')
print("annotation file: ", annotation_file)
print(os.path.exists(annotation_file))

  from .autonotebook import tqdm as notebook_tqdm


Dataset directory: /nfs/site/home/hramayan/datasets
Output directory: /nfs/site/home/hramayan/saved_models
annotation file:  /nfs/site/home/hramayan/datasets/annotation.csv
True


# Load the "annotation" CSV data

In [2]:
def label_map_func(label):
    """Translate a diagnosis label into its integer class id.

    'Normal' -> 0, 'Benign' -> 1, 'Malignant' -> 2.

    Any other value (different casing, surrounding whitespace, missing
    values, ...) matches nothing and the function returns None.
    """
    # Checked in the same order as the ids they map to.
    known_labels = (('Normal', 0), ('Benign', 1), ('Malignant', 2))
    for class_name, class_id in known_labels:
        if label == class_name:
            return class_id
    return None

In [3]:
# Load the BRCA annotation file as a PyTorch text-classification dataset.
# class_names is given in label-id order so that index i names class i and
# agrees with label_map_func (Normal -> 0, Benign -> 1, Malignant -> 2).
dataset = dataset_factory.load_dataset(
    dataset_dir=dataset_dir,
    use_case='text_classification',
    framework='pytorch',
    dataset_name='brca',
    file_name='annotation',
    label_map_func=label_map_func,
    class_names=['Normal', 'Benign', 'Malignant'],
    header=0,
    usecols=[3, 4],  # in the recorded run these load as 'label' and 'symptoms'
    shuffle_files=True,
)


print(dataset.dataset)

Dataset({
    features: ['label', 'symptoms'],
    num_rows: 1644
})


# Preprocess

In [4]:
# Tokenize the text with the Bio_ClinicalBERT tokenizer, padding/truncating
# every sample to max_length=64 tokens.
# NOTE(review): batch_size=5 presumably also sets the training batch size
# (1315 train samples -> 263 steps in the recorded run) - confirm.
dataset.preprocess(
    'emilyalsentzer/Bio_ClinicalBERT',
    batch_size=5,
    padding="max_length",
    max_length=64,
    truncation=True,
)

Downloading: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 385/385 [00:00<00:00, 523kB/s]
Downloading: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 208k/208k [00:00<00:00, 248kB/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00, 22.98ba/s]

tokenized_dataset: Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1644
})





# Shuffle split

In [5]:
# Split into 80% train / 20% validation (no test subset). A fixed seed keeps
# the train/validation membership identical across kernel restarts, so the
# metrics reported below are reproducible.
dataset.shuffle_split(train_pct=0.8, val_pct=0.2, seed=42)

Dataset split into:
-------------------
1315 train samples
0 test samples
328 validation samples


# Get the ClinicalBERT model

In [6]:
# Fetch the 'clinical-bert' PyTorch model. num_labels=3 matches the three
# class ids produced by label_map_func (0, 1, 2).
model = get_model(
    model_name='clinical-bert',
    framework='pytorch',
    num_labels=3,
)

2022-11-17 17:43:11.528547: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2022-11-17 17:43:11.531891: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-17 17:43:11.531901: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████| 385/385 [00:00<00:00, 515kB/s]
Downloading: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 416M/416M [00:30<00:00, 14.5MB/s]
Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This I

# Train the model

In [7]:
# Fine-tune for 5 epochs; the call is the cell's last expression so the
# returned training history is displayed as the cell output.
# NOTE(review): in the recorded run the loss and accuracy are identical
# across all 5 epochs (train acc ~0.288, val acc ~0.308), i.e. the model does
# not appear to be learning - investigate before relying on these results.
model.train(
    dataset=dataset,
    output_dir=output_dir,
    epochs=5,
)



Epoch 1/5
----------


100%|██████████████████████████████████████████████████| 263/263 [00:51<00:00,  5.14it/s]                                                                     
100%|██████████████████████████████████████████████████| 66/66 [00:03<00:00, 19.29it/s]                                                                       


Epoch 2/5
----------


100%|██████████████████████████████████████████████████| 263/263 [00:39<00:00,  6.58it/s]                                                                     
100%|██████████████████████████████████████████████████| 66/66 [00:03<00:00, 19.02it/s]                                                                       


Epoch 3/5
----------


100%|██████████████████████████████████████████████████| 263/263 [00:39<00:00,  6.71it/s]                                                                     
100%|██████████████████████████████████████████████████| 66/66 [00:03<00:00, 18.15it/s]                                                                       


Epoch 4/5
----------


100%|██████████████████████████████████████████████████| 263/263 [00:39<00:00,  6.70it/s]                                                                     
100%|██████████████████████████████████████████████████| 66/66 [00:03<00:00, 19.56it/s]                                                                       


Epoch 5/5
----------


100%|██████████████████████████████████████████████████| 263/263 [00:38<00:00,  6.77it/s]                                                                     
100%|██████████████████████████████████████████████████| 66/66 [00:03<00:00, 19.17it/s]                                                                       


{'Loss': [6.209681091879711,
  6.209681102078224,
  6.209681124741587,
  6.209681059017834,
  6.209681105477729],
 'Acc': [0.28821292775665397,
  0.28821292775665397,
  0.28821292775665397,
  0.28821292775665397,
  0.28821292775665397],
 'Val Loss': [6.009616067012151,
  6.009616123004393,
  6.00961601282611,
  6.009616044434634,
  6.009616020954017],
 'Val Acc': [0.3079268292682927,
  0.3079268292682927,
  0.3079268292682927,
  0.3079268292682927,
  0.3079268292682927]}