In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

In [None]:
# Path of the raw transcription dataset, relative to the working directory.
file_path = "mtsamples_original.csv"
# Read the whole CSV into a DataFrame; later cells add a column to it.
df_sum = pd.read_csv(filepath_or_buffer=file_path)

In [None]:
# The tokenizer and the model weights are loaded from the same checkpoint, so
# the identifier is named once and reused for both.
pegasus_checkpoint = "google/pegasus-xsum"
tokenizer = PegasusTokenizer.from_pretrained(pegasus_checkpoint)
pretrained_peg = PegasusForConditionalGeneration.from_pretrained(pegasus_checkpoint)

In [None]:
# Generate one summary per transcription, in row order, so that the resulting
# list lines up one-to-one with the rows of df_sum.
summaries = []
for text in df_sum["transcription"]:
    # pandas reads empty CSV fields as NaN (a float). Formatting that into the
    # prompt would ask the model to summarise the literal text "nan" and store
    # a fabricated summary, so missing/blank rows get an empty placeholder
    # instead (keeping the list aligned with the DataFrame rows).
    if not isinstance(text, str) or not text.strip():
        summaries.append("")
        continue

    input_text = f"Please provide only the key medical notes of this transcription: {text}"

    # Calling the tokenizer (rather than .encode) returns the attention mask
    # together with the token ids. Inputs are truncated to 300 tokens; a single
    # sequence needs no padding, so none is requested.
    encoded = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=300)
    # Keep the inputs on the same device as the model weights.
    input_ids = encoded["input_ids"].to(pretrained_peg.device)
    attention_mask = encoded["attention_mask"].to(pretrained_peg.device)

    summary_ids = pretrained_peg.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=300,
        min_length=50,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=5.0,
        early_stopping=True,
    )
    # generate() returns a batch; there is exactly one sequence per call here.
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)
    summaries.append(summary)

In [None]:
# Attach the generated summaries to the DataFrame as a new "summary" column.
# `summaries` was built by iterating df_sum["transcription"] in order, so the
# i-th list entry belongs to the i-th row.
df_sum["summary"] = summaries

In [None]:
# Persist the DataFrame (including the new "summary" column) to a new CSV;
# index=False keeps the row index out of the file.
df_sum.to_csv(path_or_buf="mtsamples_v1.csv", index=False)