In [None]:
!pip install sentencepiece
!pip install transformers
!pip install datasets
!pip install rouge_score
!pip install bert_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from nltk.tokenize import sent_tokenize
import pandas as pd

In [None]:
import nltk

nltk.download("punkt")
from datasets import load_metric

rouge_score = load_metric("rouge")
bert_score = load_metric("bertscore")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
  rouge_score = load_metric("rouge")


Downloading builder script:   0%|          | 0.00/2.16k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/2.92k [00:00<?, ?B/s]

## Initialize model and tokenizer

In [None]:
from transformers import AutoTokenizer, AutoModelForMaskedLM, AutoModelForSeq2SeqLM
from transformers import DataCollatorForSeq2Seq
model_checkpoint = "GanjinZero/biobart-v2-large"
model = AutoModelForSeq2SeqLM.from_pretrained(model_checkpoint)
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

Downloading:   0%|          | 0.00/1.68k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.77G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

## Prepare Data

In [None]:
from datasets import DatasetDict, Dataset


In [None]:
filtered_df = pd.read_csv('/content/drive/MyDrive/NLP/Cleaned100length.csv')


In [None]:
filtered_df.rename(columns = {'findings':'FINDINGS', 'impressions':'IMPRESSION'}, inplace = True)

In [None]:
filtered_df = filtered_df[:65000]

In [None]:
filtered_df.head()

Unnamed: 0.1,Unnamed: 0,FINDINGS,impression
0,0,previously seen hyperintensities flare sequenc...,small foci of bilateral calcification within t...
1,1,stable cardiomegaly without evidence pulmonary...,picc line terminates at mid svc without eviden...
2,2,heart stably enlarged patient status post cabg...,stable cardiomegaly minimal if any cardiac fai...
3,3,stable cardiomegaly minimal upper lung zone re...,no evidence of pneumonia stable cardiomegaly a...
4,4,tip right sided picc line seen overlying proxi...,tip of rightsided picc line overlying the prox...


In [None]:
filtered_df.shape

(65000, 3)

Train-test Split

In [None]:
filtered_df["splits"] = "train"
filtered_df["splits"][int(len(filtered_df)*0.8):] = "test"

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["splits"][int(len(filtered_df)*0.8):] = "test"


In [None]:
train_df = filtered_df[filtered_df["splits"] == "train"][["FINDINGS","impression"]].copy().reset_index()
test_df = filtered_df[filtered_df["splits"] == "test"][["FINDINGS","impression"]].copy().reset_index()

#### Tokenize Dataset

In [None]:
max_input_length = 1024
max_target_length = 300

def preprocess_dataset(examples):
    model_inputs = tokenizer(examples["FINDINGS"], max_length=max_input_length, truncation=True)

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(examples["impression"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs



In [None]:

dataset = DatasetDict(
    train=Dataset.from_pandas(train_df.iloc[:]),
    test=Dataset.from_pandas(test_df.iloc[:])
)


In [None]:
tokenized_dataset = dataset.map(
    preprocess_dataset,
    batched=True,
    batch_size=2
)

  0%|          | 0/26000 [00:00<?, ?ba/s]



  0%|          | 0/6500 [00:00<?, ?ba/s]

In [None]:
tokenized_dataset = tokenized_dataset.remove_columns(["FINDINGS", "impression", "index"])
tokenized_dataset.set_format("torch")

In [None]:
# tokenized_dataset

In [None]:
len(tokenized_dataset)

2

In [None]:
tokenized_dataset["train"][6]

{'input_ids': tensor([    0, 12690, 35961, 54635, 36236, 61092,  2340, 64103,  1766, 37256,
         37531, 19147,   699, 52414, 22089, 26841,  3793,  1437,    90, 25268,
         59130,  6609, 40618, 36774,     2]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]),
 'labels': tensor([    0,  2362,  1283,     9,   181, 61191,     2])}

In [None]:
features = [tokenized_dataset["train"][i] for i in range(len(tokenized_dataset['train']))]
data_collator(features)
print(features)

You're using a BartTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



## Model Training

In [None]:
import torch
torch.cuda.empty_cache()



In [None]:
with torch.no_grad():
    torch.cuda.empty_cache()

In [None]:
print(torch.cuda.is_available())

True


In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm.notebook import tqdm

num_epochs = 3
num_training_steps = 3 * len(train_df)
optimizer = AdamW(model.parameters(), lr=5e-5, weight_decay=0.01)
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)





In [None]:
!pip install transformers
from transformers import TrainingArguments, Trainer
import numpy as np
from transformers import Seq2SeqTrainingArguments
from transformers import Seq2SeqTrainer

output_folder = "/content/drive/MyDrive/NLP/BioBartV2_trained"

batch_size = 1
num_train_epochs =4

# Show the training loss with every epoch
logging_steps = len(tokenized_dataset["train"]) // batch_size // 5
model_name = 'GanjinZero/biobart-v2-large'

args = Seq2SeqTrainingArguments(
    output_dir=f"/content/{model_name}-finetuned-mimiccxr",
    evaluation_strategy="epoch",
    learning_rate=5.6e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=num_train_epochs,
    predict_with_generate=True,
    logging_steps=logging_steps,
    push_to_hub=False,
)

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # Decode generated summaries into text
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    # Replace -100 in the labels as we can't decode them
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    # Decode reference summaries into text
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    # ROUGE expects a newline after each sentence
    decoded_preds = ["\n".join(sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(sent_tokenize(label.strip())) for label in decoded_labels]
    # Compute ROUGE scores
    result = rouge_score.compute(
        predictions=decoded_preds, references=decoded_labels, use_stemmer=True
    )
    # Extract the median scores
    result = {key: value.mid.fmeasure * 100 for key, value in result.items()}
    return {k: round(v, 4) for k, v in result.items()}

trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

In [None]:
from transformers import TrainerCallback, EarlyStoppingCallback

class LoggingCallback(TrainerCallback):
    def __init__(self, log_path):
        self.log_path = log_path

    def on_log(self, args, state, control, logs=None, **kwargs):
        _ = logs.pop("total_flos", None)
        if state.is_local_process_zero:
            with open(self.log_path, "a") as f:
                f.write(json.dumps(logs) + "\n")


In [None]:
trainer.train()

In [None]:
trainer.save_model('/content/drive/MyDrive/NLP/BioBartV2_trained')

In [None]:
!zip -r 't5_gec_model.zip' '/content/drive/MyDrive/NLP/BioBartV2_trained'

In [None]:
!nvidia-smi

Wed Dec 21 20:01:35 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  A100-SXM4-40GB      Off  | 00000000:00:04.0 Off |                    0 |
| N/A   28C    P0    49W / 400W |   9536MiB / 40536MiB |      0%      Default |
|                               |                      |             Disabled |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
!sudo fuser -v /dev/nvidia*

!sudo kill -9 PID

                     USER        PID ACCESS COMMAND
/dev/nvidia0:        root       8208 F...m python3
/dev/nvidiactl:      root       8208 F...m python3
/dev/nvidia-uvm:     root       8208 F...m python3
kill: failed to parse argument: 'PID'


## Evaluating Model

In [None]:
evalute_result=trainer.evaluate()

***** Running Evaluation *****
  Num examples = 13000
  Batch size = 1


In [None]:
print(evalute_result)

{'eval_loss': 1.784794569015503, 'eval_rouge1': 44.8557, 'eval_rouge2': 29.0105, 'eval_rougeL': 42.0328, 'eval_rougeLsum': 42.0384, 'eval_runtime': 3650.2586, 'eval_samples_per_second': 3.561, 'eval_steps_per_second': 3.561, 'epoch': 4.0}


In [None]:
evalute_result

{'eval_loss': 1.784794569015503,
 'eval_rouge1': 44.8557,
 'eval_rouge2': 29.0105,
 'eval_rougeL': 42.0328,
 'eval_rougeLsum': 42.0384,
 'eval_runtime': 3650.2586,
 'eval_samples_per_second': 3.561,
 'eval_steps_per_second': 3.561,
 'epoch': 4.0}