In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from torch.utils.data import Dataset

In [3]:
df = pd.read_csv('ext_train_data.csv')

In [4]:
print(df.keys())

Index(['medical_history', 'ground_truth_summary'], dtype='object')


In [6]:
class abstractive_summ(Dataset):
    def __init__(self, tokenizer, file_path, num_samples, input_length, output_length, print_text=False):         
        self.dataset =  pd.read_csv(file_path)
        if num_samples:
            self.dataset = self.dataset[:num_samples]
        self.input_length = input_length
        self.tokenizer = tokenizer
        self.output_length = output_length
        self.print_text = print_text
  
    def __len__(self):
        return self.dataset.shape[0]
    
    def clean_text(self, text):
        text = text.replace('\n','')
        text = text.replace('``', '')
        text = text.replace('"', '')
        
        return text
    
    
    def convert_to_features(self, example_batch):
        # Tokenize contexts and questions (as pairs of inputs)
        
        if self.print_text:
            print("Input Text: ", self.clean_text(example_batch['medical_history']))
#         input_ = self.clean_text(example_batch['text']) + " </s>"
#         target_ = self.clean_text(example_batch['headline']) + " </s>"
        
        input_ = self.clean_text(example_batch['medical_history'])
        target_ = self.clean_text(example_batch['ground_truth_summary'])
        
        source = self.tokenizer.batch_encode_plus([input_], max_length=self.input_length, 
                                                     padding='max_length', truncation=True, return_tensors="pt")
        
        targets = self.tokenizer.batch_encode_plus([target_], max_length=self.output_length, 
                                                     padding='max_length', truncation=True, return_tensors="pt")
    
       
        return source, targets
  
    def __getitem__(self, index):
        source, targets = self.convert_to_features(self.dataset.iloc[index])
        
        source_ids = source["input_ids"].squeeze()
        target_ids = targets["input_ids"].squeeze()

        src_mask    = source["attention_mask"].squeeze()
        target_mask = targets["attention_mask"].squeeze()

        return {"source_ids": source_ids, "source_mask": src_mask, "target_ids": target_ids, "target_mask": target_mask}
        
    

In [8]:
def get_dataset(tokenizer, type_path, num_samples, args):
    file_path = f"{type_path}_data.csv"  # train_data.csv, validation_data.csv, test_data.csv
    dataset = abstractive_summ(tokenizer=tokenizer, file_path=file_path, num_samples=num_samples, input_length=args.max_input_length, output_length=args.max_output_length)
    return dataset


In [10]:
import torch
from torch.utils.data import DataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer, AdamW, get_linear_schedule_with_warmup
import pytorch_lightning as pl
from datasets import load_metric
import time
import numpy as np

class T5FineTuner(pl.LightningModule):
    def __init__(self, hparams):
        super(T5FineTuner, self).__init__()
        self.save_hyperparameters(hparams)
        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
        self.rouge_metric = load_metric('rouge') 
        
        if self.hparams.freeze_embeds:
            self.freeze_embeds()
        if self.hparams.freeze_encoder:
            self.freeze_params(self.model.get_encoder())
            assert_all_frozen(self.model.get_encoder())
            
        self.n_obs = {
            "train": self.hparams.n_train,
            "validation": self.hparams.n_val,
            "test": self.hparams.n_test
        }
        
    def freeze_params(self, model):
        for par in model.parameters():
            par.requires_grad = False
            
    def freeze_embeds(self):
        try:
            self.freeze_params(self.model.shared)
            for d in [self.model.encoder, self.model.decoder]:
                self.freeze_params(d.embed_tokens)
        except AttributeError:
            self.freeze_params(self.model.shared)
            for d in [self.model.encoder, self.model.decoder]:
                self.freeze_params(d.embed_tokens)
    
    def forward(self, input_ids, attention_mask=None, decoder_input_ids=None, decoder_attention_mask=None, lm_labels=None):
        return self.model(
            input_ids,
            attention_mask=attention_mask,
            decoder_input_ids=decoder_input_ids,
            decoder_attention_mask=decoder_attention_mask,
            labels=lm_labels,
        )

    def _step(self, batch):
        lm_labels = batch["target_ids"]
        lm_labels[lm_labels[:, :] == self.tokenizer.pad_token_id] = -100

        outputs = self(
            input_ids=batch["source_ids"],
            attention_mask=batch["source_mask"],
            lm_labels=lm_labels,
            decoder_attention_mask=batch['target_mask']
        )

        loss = outputs.loss

        return loss

    def training_step(self, batch, batch_idx):
        loss = self._step(batch)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss
  
    def validation_step(self, batch, batch_idx):
        loss = self._step(batch)
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def configure_optimizers(self):
        optimizer = AdamW(self.model.parameters(), lr=self.hparams.learning_rate, eps=self.hparams.adam_epsilon)
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=self.hparams.warmup_steps, num_training_steps=self.hparams.total_steps
        )
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

    def train_dataloader(self):   
        train_dataset = get_dataset(self.tokenizer, 'train', self.n_obs['train'], self.hparams)
        return DataLoader(train_dataset, batch_size=self.hparams.train_batch_size, shuffle=True, num_workers=4)

    def val_dataloader(self):
        validation_dataset = get_dataset(self.tokenizer, 'validation', self.n_obs['validation'], self.hparams)
        return DataLoader(validation_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)

    def test_dataloader(self):
        test_dataset = get_dataset(self.tokenizer, 'test', self.n_obs['test'], self.hparams)
        return DataLoader(test_dataset, batch_size=self.hparams.eval_batch_size, num_workers=4)


In [13]:
from argparse import Namespace

args_dict = {
    'model_name_or_path': 't5-large',
    'tokenizer_name_or_path': 't5-large',
    'max_input_length': 512,
    'max_output_length': 512,
    'freeze_encoder': False,
    'freeze_embeds': False,
    'learning_rate': 1e-4,
    'weight_decay': 0.0,
    'adam_epsilon': 1e-8,
    'warmup_steps': 0,
    'train_batch_size': 16,
    'eval_batch_size': 16,
    'num_train_epochs': 5,
    'n_train': 2000,  #training samples
    'n_val': 400,     # validation samples
    'n_test': 400,    #test samples
    'gradient_accumulation_steps': 16,
    'n_gpu': 1,
    'early_stop_callback': False,
    'fp_16': False,  
    'opt_level': 'O1', 
    'max_grad_norm': 1.0,
    'seed': 42,
    #'total_steps': (n_train / train_batch_size / gradient_accumulation_steps) * num_train_epochs
    'total_steps': 23
}
args = Namespace(**args_dict)


In [15]:
model = T5FineTuner(args)

In [17]:
from pytorch_lightning import Trainer
trainer = Trainer(
    max_epochs=args.num_train_epochs
)


Trainer will use only 1 of 2 GPUs because it is running inside an interactive / notebook environment. You may try to set `Trainer(devices=2)` but please note that multi-GPU inside interactive / notebook environments is considered experimental and unstable. Your mileage may vary.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [21]:
trainer.fit(model)


/home/goodone/.local/lib/python3.8/site-packages/pytorch_lightning/callbacks/model_checkpoint.py:639: Checkpoint directory /home/goodone/Desktop/mlhc/lightning_logs/version_3/checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

  | Name  | Type                       | Params
-----------------------------------------------------
0 | model | T5ForConditionalGeneration | 60.5 M
-----------------------------------------------------
60.5 M    Trainable params
0         Non-trainable params
60.5 M    Total params
242.026   Total estimated model params size (MB)


Sanity Checking: |                 | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=5` reached.


In [23]:
model.eval()  # set the model to evaluation mode

# Function to generate summary
def generate_summary(text, max_length=512):
    # Tokenize the input text
    inputs =  T5Tokenizer.from_pretrained('t5-small').encode("summarize: " + text, return_tensors="pt", max_length=512, truncation=True)

    # Generate summary
    summary_ids = model.model.generate(inputs, max_length=max_length, min_length=40, length_penalty=2.0, num_beams=4, early_stopping=True)
    
    # Decode and return the summary
    return T5Tokenizer.from_pretrained('t5-small').decode(summary_ids[0], skip_special_tokens=True)


In [40]:
test_example = pd.read_csv('test_data.csv')
input_text = test_example['medical_history'].iloc[115]
summary = generate_summary(input_text)
print(summary)

the 38-year-old was admitted to the NI 9224** service with close neurological monitoring. she underwent coiling of an aneurysm of the left internal capillary. she was taken to the angiography suite on [**2108-9-24**] where she underwent coiling of an aneurysm.


In [42]:
print(test_example['ground_truth_summary'].iloc[115])

[**Known firstname **] [**Known lastname 57383**] is a 38-year-old
female who is transferred here from an outside hospital for
evaluation of questionable subarachnoid hemorrhage with a
negative CT.  She notes that one day prior to admission, at
approximately 2:30 p.m. her head and neck "felt funny."  It
was not painful, and she had had a gradual onset of
increasing pain and then had sudden onset 2 hours later at
4:30 in the afternoon of pounding whole head headache and
neck pain that she describes as the worst of her life.  She
also had some photophobia.  She had no fever or chills.  She
had no history of trauma.  Lumbar puncture at an outside
hospital showed tube number two 117,000 red blood cells and
in tube number three 95,000 red cells.  Total protein was
241, glucose was 54, and there was no xanthochromia and no
opening pressure was recorded.

ALLERGIES:  She has no known drug allergies.

MEDICATIONS ON ADMISSION:  None.

SOCIAL HISTORY:  She does not drink alcohol and is not a
no

In [6]:
df1= pd.read_csv('../test_data.csv')

In [10]:
df1['ground_truth_summary'].iloc[123]

"The patient is a 73 year old\nmale with a history of coronary artery disease and aortic\nstenosis who has had a jaw tightness with walking short\ndistances.  He has been followed by his cardiologist given\nhis history of coronary artery disease and was discovered to\nhave aortic stenosis.  This aortic stenosis is followed by\nechocardiogram.  The patient's coronary history is\nsignificant for percutaneous transluminal coronary\nangioplasty with stent to obtuse marginal one.  This\npercutaneous transluminal coronary angioplasty was\ncomplicated by formation of a right femoral AV fistula and\npseudoaneurysm which eventually required surgical repair.\nCardiac catheterization in [**2196-12-7**], showed 80 percent\nin-stent restenosis of the obtuse marginal one which was\ntreated with roto.  Cardiac catheterization in [**2198-1-6**],\nshowed 30 percent in-stent restenosis of obtuse marginal one.\nAlso at that time, the patient was discovered to have a\nmoderate to severe aortic stenosis wi

In [11]:
df1['medical_history'].iloc[123]

'Admission Date: [**2200-6-25**]        Discharge Date: [**2200-6-29**]\n\nDate of Birth:  [**2126-9-17**]        Sex:  M\n\nService:  CSU\n\n\nPAST MEDICAL HISTORY:\n1. Coronary artery disease, status post percutaneous\n   transluminal coronary angioplasty with multiple in-stent\n   restenoses complicated by right femoral pseudoaneurysm\n   requiring surgical repair, worsening aortic stenosis.\n2. History of hypertension.\n3. History of diabetes mellitus.\n4. History of hypercholesterolemia.\n\n\nALLERGIES:  The patient denies any allergies to medications.\n\nMEDICATIONS ON ADMISSION:\n1. Aspirin 81 mg p.o. once daily.\n2. Lipitor 10 mg p.o. once daily.\n3. Lisinopril 40 mg p.o. once daily.\n4. Atenolol 50 mg p.o. once daily.\n5. Glipizide 10 mg p.o. once daily.\n\n\nSOCIAL HISTORY:  The patient denies any smoking and reports\noccasional alcohol use.\n\nREVIEW OF SYMPTOMS:  Otherwise, review of systems is\nunremarkable.\n\nPHYSICAL EXAMINATION:  The patient was afebrile with stable\nv