# Google Colab setup

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# project folder stored on Drive can be read from (and written to) below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Recursively copy the Grounding_LM project folder from the mounted Drive into
# the current working directory (the relative ./drive path assumes the
# notebook runs from /content, where Drive was mounted above).
%cp -R ./drive/MyDrive/Grounding_LM/ ./

# Packages & Imports

In [3]:
%pip install -q pytorch-lightning
%pip install -q transformers
%pip install -q datasets
# The OpenAI cells in this notebook call `openai.Completion.create` and
# `openai.ChatCompletion.create`. Both were removed in openai 1.0, so the
# package must stay on the 0.x line or those cells fail on a fresh install.
%pip install -q "openai<1.0"
%pip install -q tiktoken
# %pip install -q -r requirements.txt

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/720.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━[0m [32m317.4/720.6 kB[0m [31m9.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m720.6/720.6 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.2/519.2 kB[0m [31m48.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m66.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.5/114.5 kB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m31.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m149.6/149.6 kB[0m [31m18.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━

In [4]:
import os
import random
from getpass import getpass

import openai
import pandas as pd
import pytorch_lightning as pl
import tiktoken
import torch
from datasets import load_dataset, load_from_disk
from pytorch_lightning.callbacks import ModelCheckpoint
from torch.nn.utils.rnn import pad_sequence
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Data

## Dataset

In [5]:
# Set Dataset (used as the sub-folder name under Grounding_LM/data/)
dataset = "xsum"

# Download Dataset
# data = load_dataset("xsum")
# data = load_dataset("cnn_dailymail", "3.0.0")
# data = load_dataset("webis/tldr-17")

# Split Dataset
# df_train = pd.DataFrame(data=data['train'])
# df_val = pd.DataFrame(data=data['validation'])
# df_test = pd.DataFrame(data=data['test'])

# Rename columns for later usage
# df_train.columns = ['text', 'summary', 'id']
# df_val.columns = ['text', 'summary', 'id']
# df_test.columns = ['text', 'summary', 'id']

# Save to data folder (inside Grounding_LM folder)
# df_train.to_csv(f'Grounding_LM/data/{dataset}/train.csv', index=False)
# df_val.to_csv(f'Grounding_LM/data/{dataset}/validation.csv', index=False)
# df_test.to_csv(f'Grounding_LM/data/{dataset}/test.csv', index=False)

# Read Dataset from folder
df_train = pd.read_csv(f'Grounding_LM/data/{dataset}/train.csv')
df_val = pd.read_csv(f'Grounding_LM/data/{dataset}/validation.csv')
df_test = pd.read_csv(f'Grounding_LM/data/{dataset}/test.csv')

# Empty cells come back from read_csv as float NaN, which breaks both the
# tokenizer and the string concatenation used to build the few-shot prompt.
# Cast the text columns to str in *every* split (not only the test split);
# missing values become the literal string 'nan'.
df_train[['text', 'summary']] = df_train[['text', 'summary']].astype(str)
df_val[['text', 'summary']] = df_val[['text', 'summary']].astype(str)
df_test[['text', 'summary']] = df_test[['text', 'summary']].astype(str)

df_train.head()

Unnamed: 0,text,summary,id
0,"The full cost of damage in Newton Stewart, one...",Clean-up operations are continuing across the ...,35232142
1,A fire alarm went off at the Holiday Inn in Ho...,Two tourist buses have been destroyed by fire ...,40143035
2,Ferrari appeared in a position to challenge un...,Lewis Hamilton stormed to pole position at the...,35951548
3,"John Edward Bates, formerly of Spalding, Linco...",A former Lincolnshire Police officer carried o...,36266422
4,Patients and staff were evacuated from Cerahpa...,An armed man who locked himself into a room at...,38826984


## Custom Datamodule

In [6]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one (text, summary) row per item.

    Args:
        data: DataFrame-like object supporting ``len()`` and ``.iloc[idx]`` whose
            rows expose the keys ``'text'``, ``'summary'`` and ``'id'``.
        tokenizer: Callable tokenizer; it is invoked with Hugging Face style
            keyword arguments and must return ``'input_ids'`` and
            ``'attention_mask'`` tensors.
        text_max_len: Length the source text is padded/truncated to.
        summary_max_len: Length the summary is padded/truncated to.
    """

    def __init__(self, data, tokenizer, text_max_len = 512, summary_max_len = 128):
        self.data = data
        self.tokenizer = tokenizer
        self.text_max_len = text_max_len
        self.summary_max_len = summary_max_len

    def __len__(self):
        return len(self.data)

    def _encode(self, value, max_length):
        """Tokenize ``value`` to exactly ``max_length`` tokens (pad or truncate)."""
        return self.tokenizer(
            value,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            add_special_tokens=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx: int):
        """Return the raw strings, the id and the tokenized tensors for row ``idx``.

        ``labels`` is a flat tensor in which every padding position is replaced
        by -100; ``input_ids`` / ``attention_mask`` / ``labels_attention_mask``
        are returned exactly as the tokenizer produced them.
        """
        # Look the row up once instead of once per column.
        row = self.data.iloc[idx]
        text = row['text']
        summary = row['summary']
        sample_id = row['id']

        text_encoding = self._encode(text, self.text_max_len)
        summary_encoding = self._encode(summary, self.summary_max_len)

        # The padding id depends on the tokenizer (e.g. it is not 0 for BART,
        # where 0 is the start-of-sequence token), so ask the tokenizer rather
        # than assuming 0. Fall back to 0 when the tokenizer does not expose one.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is None:
            pad_id = 0

        # Clone so the tokenizer output itself is left untouched.
        labels = summary_encoding['input_ids'].clone()
        labels[labels == pad_id] = -100

        return {
            'text': text,
            'summary': summary,
            'id': sample_id,
            'input_ids': text_encoding['input_ids'],
            'attention_mask': text_encoding['attention_mask'],
            'labels': labels.flatten(),
            'labels_attention_mask': summary_encoding['attention_mask']
        }

In [7]:
class CustomDataModule(pl.LightningDataModule):
    """Lightning data module wrapping the train/validation/test DataFrames.

    Args:
        df_train, df_val, df_test: Frames handed to ``CustomDataset``.
        tokenizer: Tokenizer shared by all three datasets.
        batch: Batch size used by every dataloader.
        text_max_len: Token length of the encoded source text.
        summary_max_len: Token length of the encoded summary.
    """

    def __init__(self, df_train, df_val, df_test, tokenizer, batch = 8, text_max_len = 512, summary_max_len = 128):
        super().__init__()
        self.df_train = df_train
        self.df_val = df_val
        self.df_test = df_test
        self.tokenizer = tokenizer
        self.batch = batch
        self.text_max_len = text_max_len
        self.summary_max_len = summary_max_len

    def _build_dataset(self, frame):
        """Create a ``CustomDataset`` for ``frame`` with this module's settings."""
        return CustomDataset(frame, self.tokenizer, self.text_max_len, self.summary_max_len)

    def setup(self, stage=None):
        """Instantiate the three datasets (``stage`` is accepted but not used)."""
        self.train_dataset = self._build_dataset(self.df_train)
        self.val_dataset = self._build_dataset(self.df_val)
        self.test_dataset = self._build_dataset(self.df_test)

    def collate_fn(self, batch):
        """Stack a list of dataset items into one batch dictionary.

        String/id fields are collected into Python lists; tensor fields are
        flattened per item and stacked with ``pad_sequence(batch_first=True)``.
        """
        # Should the items ever differ in length, pad with values that stay
        # neutral: the tokenizer's pad id for inputs, -100 for labels and 0
        # (pad_sequence's default) for the attention masks.
        pad_id = getattr(self.tokenizer, 'pad_token_id', None)
        if pad_id is None:
            pad_id = 0

        texts = [item['text'] for item in batch]
        summaries = [item['summary'] for item in batch]
        ids = [item['id'] for item in batch]
        text_input_ids = pad_sequence(
            [item['input_ids'].flatten() for item in batch],
            batch_first=True, padding_value=pad_id)
        text_attention_masks = pad_sequence(
            [item['attention_mask'].flatten() for item in batch],
            batch_first=True)
        labels = pad_sequence(
            [item['labels'] for item in batch],
            batch_first=True, padding_value=-100)
        labels_attention_masks = pad_sequence(
            [item['labels_attention_mask'].flatten() for item in batch],
            batch_first=True)

        return {
            'text': texts,
            'summary': summaries,
            'id': ids,
            'input_ids': text_input_ids,
            'attention_mask': text_attention_masks,
            'labels': labels,
            'labels_attention_mask': labels_attention_masks
        }

    def _make_loader(self, dataset, shuffle):
        """Build a DataLoader with the shared batch size, workers and collate_fn."""
        return DataLoader(dataset, batch_size=self.batch, shuffle=shuffle, num_workers=2, collate_fn=self.collate_fn)

    def train_dataloader(self):
        # Only the training split is shuffled.
        return self._make_loader(self.train_dataset, shuffle=True)

    def val_dataloader(self):
        # Evaluation order is kept fixed so runs are reproducible.
        return self._make_loader(self.val_dataset, shuffle=False)

    def test_dataloader(self):
        # Evaluation order is kept fixed so runs are reproducible.
        return self._make_loader(self.test_dataset, shuffle=False)

## Model

In [None]:
class SummaryModel(pl.LightningModule):
  """LightningModule around a pretrained seq2seq model for summarization.

  Args:
    model: Name or path passed to ``AutoModelForSeq2SeqLM.from_pretrained``.
    lr: Learning rate handed to AdamW (defaults to the previous fixed 1e-4).
  """

  def __init__(self, model, lr=0.0001):
    super().__init__()
    # Store the constructor arguments in the checkpoint; without this,
    # `SummaryModel.load_from_checkpoint(path)` cannot supply the required
    # `model` argument when it re-instantiates the module.
    self.save_hyperparameters()
    self.lr = lr
    self.model = AutoModelForSeq2SeqLM.from_pretrained(model, return_dict=True)

  def forward(self, input_ids, attention_mask, decoder_attention_mask, labels=None):
    """Run the wrapped model and return its output object unchanged."""
    output = self.model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        decoder_attention_mask=decoder_attention_mask,
                        labels=labels,
    )
    return output

  def _shared_step(self, batch, stage):
    """Compute the loss for ``batch`` and log it as ``f"{stage}_loss"``."""
    output = self(
      batch["input_ids"],
      batch["attention_mask"],
      batch["labels_attention_mask"],
      batch["labels"],
    )
    loss = output.loss
    self.log(f"{stage}_loss", loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
    return loss

  def training_step(self, batch, batch_idx):
    return self._shared_step(batch, "train")

  def validation_step(self, batch, batch_idx):
    return self._shared_step(batch, "val")

  def test_step(self, batch, batch_idx):
    return self._shared_step(batch, "test")

  def configure_optimizers(self):
    """Optimize all parameters with AdamW at the configured learning rate."""
    return AdamW(self.parameters(), lr=self.lr)

# Fine-Tuned models

### Pretrained T5 model

In [None]:
# N_EPOCHS = 5
# BATCH_SIZE = 32

# tokenizer = AutoTokenizer.from_pretrained('t5-base')
# data_module = CustomDataModule(df_train, df_val, df_test, tokenizer, batch=BATCH_SIZE)
# model = SummaryModel('t5-base')

# checkpoint_callback = ModelCheckpoint(dirpath="checkpoints",
#                                       filename="best_checkpoints",
#                                       save_top_k=1,
#                                       verbose=True,
#                                       monitor="val_loss",
#                                       mode="min")

# trainer = pl.Trainer(callbacks=checkpoint_callback, max_epochs=N_EPOCHS, accelerator="gpu", enable_progress_bar=True)

In [None]:
# trainer.fit(model, data_module)

# trained_model = SummaryModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
# trained_model.freeze()

In [None]:
# Tokenizer and model weights are loaded from the same Hub checkpoint id.
T5_CHECKPOINT = "sysresearch101/t5-large-finetuned-xsum-cnn"
tokenizer = AutoTokenizer.from_pretrained(T5_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(T5_CHECKPOINT)

Downloading pytorch_model.bin:   0%|          | 0.00/2.95G [00:00<?, ?B/s]

In [None]:
# Build the datasets with the tokenizer loaded above, 8 samples per batch.
data_module = CustomDataModule(
    df_train=df_train,
    df_val=df_val,
    df_test=df_test,
    tokenizer=tokenizer,
    batch=8,
)
data_module.setup()

In [None]:
# Use the GPU when one is available, otherwise stay on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)

# Generated summaries and the ids of the documents they belong to,
# appended in the same order so they can be zipped into a table later.
summaries, ids = [], []

# Decoding settings passed to `model.generate` for every batch.
generation_kwargs = dict(
    max_length=150,
    num_beams=2,
    repetition_penalty=2.5,
    length_penalty=1.0,
    early_stopping=True,
)

with torch.no_grad():
    for batch in tqdm(data_module.test_dataloader()):
        summary_ids = model.generate(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            **generation_kwargs,
        )

        # Turn each generated token sequence back into plain text.
        for sum_id in summary_ids:
            decoded = tokenizer.decode(sum_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            summaries.append(decoded)
        ids.extend(batch['id'])

In [None]:
# One row per test document: its id next to the generated summary.
data = dict(id=ids, generated=summaries)
df_new = pd.DataFrame(data)
df_new.to_csv(path_or_buf="t5_large_xsum.csv", index=False)

In [None]:
# Copy the T5 results CSV into the project's results folder on the mounted Drive.
%cp t5_large_xsum.csv ./drive/MyDrive/Grounding_LM/results/

### BART

In [None]:
# N_EPOCHS = 5
# BATCH_SIZE = 32

# tokenizer = AutoTokenizer.from_pretrained('facebook/bart-base')
# data_module = CustomDataModule(df_train, df_val, df_test, tokenizer, batch=BATCH_SIZE)
# model = SummaryModel('facebook/bart-base')

# checkpoint_callback = ModelCheckpoint(dirpath="checkpoints",
#                                       filename="best_checkpoints",
#                                       save_top_k=1,
#                                       verbose=True,
#                                       monitor="val_loss",
#                                       mode="min")

# trainer = pl.Trainer(callbacks=checkpoint_callback, max_epochs=N_EPOCHS, accelerator="gpu", enable_progress_bar=True)

In [None]:
# trainer.fit(model, data_module)

# trained_model = SummaryModel.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)
# trained_model.freeze()

In [8]:
# Tokenizer and model weights are loaded from the same Hub checkpoint id.
BART_CHECKPOINT = "facebook/bart-large-xsum"
tokenizer = AutoTokenizer.from_pretrained(BART_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(BART_CHECKPOINT)

Downloading (…)okenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.51k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/309 [00:00<?, ?B/s]

In [9]:
# Rebuild the datasets so they use the BART tokenizer, 8 samples per batch.
data_module = CustomDataModule(
    df_train=df_train,
    df_val=df_val,
    df_test=df_test,
    tokenizer=tokenizer,
    batch=8,
)
data_module.setup()

In [10]:
# Use the GPU when one is available, otherwise stay on the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
model.to(device)

# Generated summaries and the ids of the documents they belong to,
# appended in the same order so they can be zipped into a table later.
summaries, ids = [], []

dataloader = data_module.test_dataloader()

# Decoding settings passed to `model.generate` for every batch.
generation_kwargs = dict(
    num_beams=6,
    length_penalty=2.0,
    no_repeat_ngram_size=4,
    min_length=10,
    max_length=60,
    early_stopping=True,
)

with torch.no_grad():
    for batch in tqdm(dataloader):
        summary_ids = model.generate(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['attention_mask'].to(device),
            **generation_kwargs,
        )

        # Turn each generated token sequence back into plain text.
        for sum_id in summary_ids:
            decoded = tokenizer.decode(sum_id, skip_special_tokens=True, clean_up_tokenization_spaces=True)
            summaries.append(decoded)
        ids.extend(batch['id'])

  0%|          | 0/1417 [00:00<?, ?it/s]

In [11]:
# One row per test document: its id next to the generated summary.
data = dict(id=ids, generated=summaries)
df_new = pd.DataFrame(data)
df_new.to_csv(path_or_buf="bart_large_xsum.csv", index=False)

In [12]:
# Copy the BART results CSV into the project's results folder on the mounted Drive.
%cp bart_large_xsum.csv ./drive/MyDrive/Grounding_LM/results/

# OpenAI Models


In [None]:
# Never hard-code the API key in the notebook: it ends up in version control
# and in every shared copy. Read it from the OPENAI_API_KEY environment
# variable, or prompt for it (hidden input) when the variable is not set.
# NOTE: a key that was ever saved in this notebook must be treated as leaked
# and revoked in the OpenAI dashboard.
openai.api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")

### text-davinci-003

In [None]:
def format_prompt(example_text, input_text):
    """Build the few-shot completion prompt for one document.

    Args:
        example_text: Pre-formatted few-shot examples inserted after the
            instruction sentence.
        input_text: Document to summarize; converted to ``str`` when formatted.

    Returns:
        The prompt string, ending in ``",output: "`` so the model continues
        with the summary.
    """
    # Assembled from explicit pieces so that neither the source indentation
    # nor a stray "+" operator (previously inside the literal after
    # "Document:") leaks into the text sent to the model.
    return (
        "System: You are an extractive summarizer that follows the output pattern.\n"
        "User: The following examples are successful extractive summarization instances: "
        f"{example_text}.\n"
        f"Please summarize the following document. Document: {input_text},output: "
    )

def generate_summary(input_list, example_text, output_path="davinci003_xsum.csv"):
  """Summarize every document with text-davinci-003 and save the results.

  Args:
    input_list: Iterable of documents to summarize.
    example_text: Few-shot examples forwarded to ``format_prompt``.
    output_path: CSV file the (text, summary) pairs are written to.

  Returns:
    DataFrame with the columns ``text`` and ``summary`` (one row per
    document that was summarized successfully).

  Whatever has been collected is written to ``output_path`` even when a
  request raises, so a failure late in the run does not discard the
  summaries already paid for; the exception is then re-raised.
  """
  model_name = "text-davinci-003"
  max_new_tokens = 128
  # Prompt and completion share one context window of 4097 tokens (the
  # boundary noted in the token-length check cell), so the prompt may use at
  # most the window minus the completion budget.
  prompt_budget = 4097 - max_new_tokens
  # Re-tokenizing a shortened document can shift token boundaries slightly;
  # cut a few extra tokens to stay safely below the limit.
  safety_margin = 16
  encoding = tiktoken.encoding_for_model(model_name)

  sum_dict = {"text": [], "summary": []}

  try:
    for input_text in tqdm(input_list):
      prompt = format_prompt(example_text, input_text)

      # Shorten the document (never the instructions or examples) when the
      # full prompt would not fit into the context window.
      overflow = len(encoding.encode(prompt, disallowed_special=())) - prompt_budget
      if overflow > 0:
        doc_tokens = encoding.encode(str(input_text), disallowed_special=())
        keep = max(len(doc_tokens) - overflow - safety_margin, 0)
        prompt = format_prompt(example_text, encoding.decode(doc_tokens[:keep]))

      response = openai.Completion.create(
        model=model_name,
        prompt=prompt,
        temperature=0.5,
        max_tokens=max_new_tokens,
        top_p=1.0,
        frequency_penalty=0.0,
        presence_penalty=0.0
      )

      # Append both columns only after the request succeeded so the two
      # lists always have the same length.
      sum_dict["text"].append(input_text)
      sum_dict["summary"].append(response['choices'][0]['text'])
  finally:
    sum_df = pd.DataFrame(sum_dict)
    sum_df.to_csv(output_path, index=False)

  return sum_df

In [None]:
text_inputs = df_test['text'].tolist()

# Draw the two few-shot examples with a fixed seed so the prompt (and with it
# the token-length check and the generated summaries) is the same on every run.
random_rows = df_train.sample(n=2, random_state=42)

# str() guards against non-string cells (e.g. NaN read back from the CSV),
# which would otherwise make the concatenation raise a TypeError.
examples_prompt = "".join(
  "input: " + str(row['text']) + ",output: " + str(row['summary']).replace('\n', '') + "\n"
  for _, row in random_rows.iterrows()
)

# print(examples_prompt + " \n\n Total wordcount examples: " + str(len(examples_prompt)))

In [None]:
# Ensure max token length of prompt is below 4097 token boundary (Note: Take into account max_tokens in 'openai.Completion.create' above)
encoding = tiktoken.encoding_for_model("text-davinci-003")

# Token count of the complete prompt for every test document. The prompt is
# not stored under the name `input`: that would shadow the builtin for the
# rest of the kernel session.
token_len = [
  len(encoding.encode(format_prompt(examples_prompt, text)))
  for text in text_inputs
]

max(token_len)

4424

In [None]:
# Summarize every test document using the few-shot examples built above.
generate_summary(input_list=text_inputs, example_text=examples_prompt)

  0%|          | 0/11490 [00:00<?, ?it/s]

In [None]:
# Copy the davinci results CSV into the project's results folder on the mounted Drive.
%cp davinci003_xsum.csv ./drive/MyDrive/Grounding_LM/results/

### gpt-3.5-turbo

In [None]:
def generate_summary(input_list, output_path="gpt35_turbo_xsum.csv", limit=None):
  """Summarize documents with gpt-3.5-turbo.

  Note: this replaces the text-davinci-003 helper of the same name that is
  defined earlier in the notebook.

  Args:
    input_list: Either an iterable of documents or a single document string
      (the latter is what ``Series.apply(generate_summary)`` passes in).
    output_path: CSV file the (text, summary) pairs are written to when an
      iterable of documents is given. Defaults to its own file so the
      davinci results are not overwritten.
    limit: Optional maximum number of documents to process (handy for a
      cheap smoke test); ``None`` processes everything.

  Returns:
    For a single string: the summary text (``None`` if nothing was
    generated). For an iterable: a DataFrame with the columns ``text`` and
    ``summary``; it is also written to ``output_path``, even when a request
    raises part-way through.

  Documents longer than the model's context window make the API call raise;
  truncate them beforehand if necessary.
  """
  # A str is iterable too - treat it as ONE document, not as characters.
  single_document = isinstance(input_list, str)
  documents = [input_list] if single_document else input_list

  sum_dict = {"text": [], "summary": []}
  try:
    for i, input_text in enumerate(tqdm(documents, disable=single_document)):
      if limit is not None and i >= limit:
        break

      response = openai.ChatCompletion.create(
                  model= "gpt-3.5-turbo",
                  messages= [{"role":"system", "content":"You are assistant who replies with a clear and concise summary for every document."},
                            {"role": "user", "content":"Please summarize the following document. Document: " + str(input_text)}],
                  temperature=0.5,
                  max_tokens=128,
                  top_p=1.0,
                  frequency_penalty=0.0,
                  presence_penalty=0.0
              )

      # Chat completions return the text under message.content; the 'text'
      # key only exists on the legacy Completion endpoint.
      sum_dict["text"].append(input_text)
      sum_dict["summary"].append(response['choices'][0]['message']['content'])
  finally:
    if not single_document:
      pd.DataFrame(sum_dict).to_csv(output_path, index=False)

  if single_document:
    return sum_dict["summary"][0] if sum_dict["summary"] else None
  return pd.DataFrame(sum_dict)

In [None]:
text_inputs = df_test['text'].tolist()

# Count the tokens of every test document with the gpt-3.5-turbo encoding
# to see the longest document and the total volume.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
token_len = [len(encoding.encode(str(text))) for text in text_inputs]

print(f"max token length: {max(token_len)}, total tokens: {sum(token_len)}")

max token length: 15255, total tokens: 5395732


In [None]:
# No `df` is defined in the cells above (NameError); the documents to
# summarize are in df_test. The column is named after the model (gpt, not get).
df_test['gpt-3.5-turbo'] = df_test['text'].apply(generate_summary)

In [None]:
# No `df` is defined in the cells above (NameError); the frame that received
# the summary column is df_test.
df_test.to_csv('summary_output.csv', index=False)