In [1]:
from transformers import PegasusTokenizer, PegasusForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer
import evaluate
import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset
from transformers import DataCollatorForSeq2Seq
import evaluate

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
file_path = "mtsamples_v1.csv"
df_sum = pd.read_csv(file_path)
df_sum1 = df_sum[['transcription', 'summary']]

In [4]:
df_dataset = Dataset.from_pandas(df_sum1)

In [5]:
billsum = df_dataset.train_test_split(test_size=0.2)

In [6]:
billsum['test'][0]

{'transcription': 'HISTORY OF PRESENT ILLNESS:,  The patient is a 26-year-old gravida 2, para 1-0-0-1, at 28-1/7 weeks who presents to the emergency room with left lower quadrant pain, reports no bowel movement in two weeks as well as nausea and vomiting for the last 24 hours or so.  She states that she has not voided in the last 24 hours as well due to pain.  She denies any leaking of fluid, vaginal bleeding, or uterine contractions.  She reports good fetal movement.  She denies any fevers, chills, or burning with urination.,REVIEW OF SYSTEMS: , Positive for back pain in her lower back only.  Her mother reports that she has been eating food without difficulty and that the current nausea and vomiting is much less than when she is not pregnant.  She continues to yell out for requesting pain medication and about how much "it hurts.",PAST MEDICAL HISTORY:,1.  Irritable bowel syndrome.,2.  Urinary tract infections times three.  The patient is unsure if pyelo is present or not.,PAST SURGICA

In [7]:
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")

In [8]:
# def preprocess_function(examples):
#     inputs = examples["transcription"]
#     model_inputs = tokenizer(inputs, truncation=True, padding="longest")

#     labels = examples["summary"]
#     model_inputs["labels"] = tokenizer(labels, truncation=True, padding="longest").input_ids
#     return model_inputs

def preprocess_function(examples):
    inputs = [str(text) for text in examples["transcription"]]
    model_inputs = tokenizer(inputs, truncation=True, padding="longest")

    labels = [str(summary) for summary in examples["summary"]]
    model_inputs["labels"] = tokenizer(labels, truncation=True, padding="longest").input_ids
    return model_inputs

In [9]:
tokenized_billsum = billsum.map(preprocess_function, batched=True)

Map:   0%|          | 0/3999 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [10]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [11]:
rouge = evaluate.load("rouge")

In [12]:
# def compute_metrics(eval_pred):
#     predictions, labels = eval_pred
#     decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
#     decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

#     result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

#     prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
#     result["gen_len"] = np.mean(prediction_lens)

#     return {k: round(v.mid.fmeasure * 100, 4) for k, v in result.items()}

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    result = rouge.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=True)

    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) for k, v in result.items()}

In [13]:
training_args = Seq2SeqTrainingArguments(
    output_dir="PegasusMedicalSummary",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=4,
    predict_with_generate=True,
    fp16=True,
    push_to_hub=True,
    gradient_accumulation_steps=4,
)

In [14]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_billsum["train"],
    eval_dataset=tokenized_billsum["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

/home/jupyter/HealthGPT/PegasusMedicalSummary is already a clone of https://huggingface.co/renegarza/PegasusMedicalSummary. Make sure you pull the latest changes with `repo.git_pull()`.


In [15]:
# Start training
trainer.train()



Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
0,6.5172,0.17838,0.4161,0.2373,0.3388,0.3384,52.102
1,0.3174,0.155042,0.4236,0.2434,0.343,0.3428,54.458
2,0.2632,0.146186,0.4269,0.2467,0.3465,0.3464,55.503
3,0.2477,0.143825,0.4318,0.2525,0.3524,0.3525,55.882


TrainOutput(global_step=3996, training_loss=1.0827994303660349, metrics={'train_runtime': 11059.6924, 'train_samples_per_second': 1.446, 'train_steps_per_second': 0.361, 'total_flos': 2.3092599598350336e+16, 'train_loss': 1.0827994303660349, 'epoch': 4.0})

In [16]:
# Push the trained model to the Hugging Face Model Hub
trainer.push_to_hub()

Upload file pytorch_model.bin:   0%|          | 1.00/2.12G [00:00<?, ?B/s]

To https://huggingface.co/renegarza/PegasusMedicalSummary
   6937902..e5e94d4  main -> main

   6937902..e5e94d4  main -> main

To https://huggingface.co/renegarza/PegasusMedicalSummary
   e5e94d4..893fd7b  main -> main

   e5e94d4..893fd7b  main -> main



'https://huggingface.co/renegarza/PegasusMedicalSummary/commit/e5e94d49fcd8a8c63ee7b7302ee8256eb7e77b05'

In [17]:
# Save the fine-tuned model
model.save_pretrained("PegasusMedicalSummary")

In [24]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, SummarizationPipeline

model_name = "renegarza/PegasusMedicalSummary"
tokenizer = PegasusTokenizer.from_pretrained(model_name)
model = PegasusForConditionalGeneration.from_pretrained(model_name)

summarizer = SummarizationPipeline(model=model, tokenizer=tokenizer)

In [28]:
text = "1.  The left ventricular cavity size and wall thickness appear normal.  The wall motion and left ventricular systolic function appears hyperdynamic with estimated ejection fraction of 70% to 75%.  There is near-cavity obliteration seen.  There also appears to be increased left ventricular outflow tract gradient at the mid cavity level consistent with hyperdynamic left ventricular systolic function.  There is abnormal left ventricular relaxation pattern seen as well as elevated left atrial pressures seen by Doppler examination.,2.  The left atrium appears mildly dilated.,3.  The right atrium and right ventricle appear normal.,4.  The aortic root appears normal.,5.  The aortic valve appears calcified with mild aortic valve stenosis, calculated aortic valve area is 1.3 cm square with a maximum instantaneous gradient of 34 and a mean gradient of 19 mm.,6.  There is mitral annular calcification extending to leaflets and supportive structures with thickening of mitral valve leaflets with mild mitral regurgitation.,7.  The tricuspid valve appears normal with trace tricuspid regurgitation with moderate pulmonary artery hypertension.  Estimated pulmonary artery systolic pressure is 49 mmHg.  Estimated right atrial pressure of 10 mmHg.,8.  The pulmonary valve appears normal with trace pulmonary insufficiency.,9.  There is no pericardial effusion or intracardiac mass seen.,10.  There is a color Doppler suggestive of a patent foramen ovale with lipomatous hypertrophy of the interatrial septum.,11.  The study was somewhat technically limited and hence subtle abnormalities could be missed from the study.,"
summary = summarizer(text)
print(summary)

[{'summary_text': 'This is a case report of a patient with left ventricular ejection fraction (LVEF) of 70% and left atrial pressure of 10 mmHg who was admitted to the intensive care unit for suspected heart failure.,2. The aortic valve appears calcified with mild aortic valve stenosis, calculated aortic valve area is 1.3 cm'}]
