In [1]:
!pip install transformers huggingface_hub datasets wandb evaluate rouge_score accelerate

Collecting evaluate
  Downloading evaluate-0.4.2-py3-none-any.whl.metadata (9.3 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Downloading evaluate-0.4.2-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24936 sha256=c5c289ebcb52a24ecabd4c335b5ebc71fab42c6ce31118946220cfbbd48d8d4a
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.2 rouge_score-0.1.2
[0m

In [2]:
from transformers import (
    T5Tokenizer,
    T5ForConditionalGeneration,
    TrainingArguments,
    Trainer
)
from datasets import load_dataset
import numpy as np
import evaluate
import torch
import wandb
import os

2024-06-20 12:56:45.812137: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-20 12:56:45.812196: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-20 12:56:45.814142: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-20 12:56:45.896969: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
from getpass import getpass

hf_token = getpass("Enter you hugging face token: ")
wandb_key = getpass("Enter your wandb key: ")

Enter you hugging face token:  ·····································
Enter your wandb key:  ········································


In [4]:
from huggingface_hub import login

login(token=hf_token)
wandb.login(key=wandb_key)

Token will not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: fineGrained).
Your token has been saved to /root/.cache/huggingface/token
Login successful


[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [5]:
from datasets import load_dataset, DatasetDict

data_files = 'created_simplification_data.json'

dataset = load_dataset("json", data_files=data_files)

dataset

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['simple', 'medical'],
        num_rows: 6967
    })
})

In [6]:
dataset = dataset['train']

num_samples = len(dataset)
num_train = int(0.8 * num_samples)
num_val = int(0.1 * num_samples)
num_test = num_samples - num_train - num_val

shuffled_dataset = dataset.shuffle(seed=42)

train_dataset = shuffled_dataset.select(range(num_train))
val_dataset = shuffled_dataset.select(range(num_train, num_train + num_val))
test_dataset = shuffled_dataset.select(range(num_train + num_val, num_samples))

dataset_dict = DatasetDict({
    'train': train_dataset,
    'valid': val_dataset,
    'test': test_dataset
})

dataset_dict

DatasetDict({
    train: Dataset({
        features: ['simple', 'medical'],
        num_rows: 5573
    })
    valid: Dataset({
        features: ['simple', 'medical'],
        num_rows: 696
    })
    test: Dataset({
        features: ['simple', 'medical'],
        num_rows: 698
    })
})

In [7]:
model_name = "google-t5/t5-base"
new_model = "t5-base-ft-medical-simplifier"

tokenizer = T5Tokenizer.from_pretrained(model_name)

model = T5ForConditionalGeneration.from_pretrained(model_name)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [8]:
def preprocess_function(examples):
    inputs = [f"simplify: {medical_text}" for medical_text in examples['medical']]

    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')

    targets = [simple_text for simple_text in examples['simple']]

    with tokenizer.as_target_tokenizer():
        labels = tokenizer(targets, max_length=512, truncation=True, padding='max_length')

    model_inputs['labels'] = labels['input_ids']

    return model_inputs

In [9]:
tokenized_dataset = dataset_dict.map(
    preprocess_function,
    batched=True,
)

Map:   0%|          | 0/5573 [00:00<?, ? examples/s]



Map:   0%|          | 0/696 [00:00<?, ? examples/s]

Map:   0%|          | 0/698 [00:00<?, ? examples/s]

In [10]:
rouge = evaluate.load('rouge')

def compute_metrics(eval_pred):
    predictions, labels = eval_pred.predictions[0], eval_pred.label_ids

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(
      predictions=decoded_preds,
      references=decoded_labels,
      use_stemmer=True,
      rouge_types=[
          'rouge1',
          'rouge2',
          'rougeL'
      ]
    )

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [11]:
def preprocess_logits_for_metrics(logits, labels):
    pred_ids = torch.argmax(logits[0], dim=-1)
    return pred_ids, labels

In [12]:
device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
model.to(device)

# set the wandb project where this run will be logged
os.environ["WANDB_PROJECT"]="t5_base_ft_medical_simplifier"

# save your trained model checkpoint to wandb
os.environ["WANDB_LOG_MODEL"]="true"

# turn off watch to log faster
os.environ["WANDB_WATCH"]="false"

training_args = TrainingArguments(
    output_dir='medical_simplifer_t5_base_results',
    num_train_epochs=10,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=1000,
    weight_decay=0.01,
    logging_dir="logs",
    logging_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to='wandb',
    learning_rate=3e-5,
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['valid'],
    preprocess_logits_for_metrics=preprocess_logits_for_metrics,
    compute_metrics=compute_metrics
)

trainer.train()

wandb.finish()

[34m[1mwandb[0m: Currently logged in as: [33manishbasnet1600[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Gen Len
500,4.6234,0.063776,0.6187,0.3955,0.6063,22.8477
1000,0.0606,0.042756,0.7246,0.5172,0.7173,23.8491
1500,0.0496,0.037668,0.7508,0.5536,0.7442,23.8491
2000,0.0433,0.034106,0.7767,0.5952,0.7703,23.8491
2500,0.0401,0.031915,0.7878,0.6121,0.7822,23.8491
3000,0.0381,0.030482,0.7934,0.6237,0.7884,23.8491
3500,0.0334,0.029397,0.8017,0.6357,0.7966,23.8491
4000,0.0344,0.027964,0.8085,0.6464,0.8036,23.8491
4500,0.0318,0.027087,0.8146,0.6571,0.8098,23.8491
5000,0.0306,0.026255,0.82,0.6656,0.8158,23.8491


In [None]:
trainer.model.save_pretrained(new_model)
model.config.use_cache=True
model.eval()

In [None]:
def simplify(text, model, tokenizer, max_length=512, num_beams=2):
    
    inputs = tokenizer.encode(
        text,
        return_tensors='pt',
        max_length=max_length,
        truncation=True
    ).to(device)
    
    generated_ids = model.generate(
        inputs,
        max_new_tokens=1024,
        num_beams=num_beams,
        early_stopping=True
    )
    
    return tokenizer.decode(generated_ids[0], skip_special_tokens=True)

In [19]:
trainer.model.push_to_hub(new_model, use_temp_dir=False)

CommitInfo(commit_url='https://huggingface.co/anishbasnet/t5-base-ft-medical-simplifier/commit/5983c8a36d6e467566fa4f1356e7db05d9df111b', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='5983c8a36d6e467566fa4f1356e7db05d9df111b', pr_url=None, pr_revision=None, pr_num=None)

In [21]:
trainer.push_to_hub(new_model)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.54k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/anishbasnet/medical_simplifer_t5_base_results/commit/c94f4f8221fa85c7e3af62d546aa9df57cf8a7e8', commit_message='t5-base-ft-medical-simplifier', commit_description='', oid='c94f4f8221fa85c7e3af62d546aa9df57cf8a7e8', pr_url=None, pr_revision=None, pr_num=None)

In [26]:
tokenizer.push_to_hub(new_model)

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/anishbasnet/t5-base-ft-medical-simplifier/commit/65febfff20dc6af884c6391e6c8b6b4d686ff3bc', commit_message='Upload tokenizer', commit_description='', oid='65febfff20dc6af884c6391e6c8b6b4d686ff3bc', pr_url=None, pr_revision=None, pr_num=None)