<a target="_blank" href="https://colab.research.google.com/github/https://colab.research.google.com/github/ianuragbhatt/text-summarization/blob/main/ts_abstractive_GPT2.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [1]:
# !pip install -q transformers datasets -y
# !pip install -q tensorflow-gpu -y
# !pip upgrade -q numpy scipy -y 

## Importing Libraries

In [2]:
# !pip install torch

In [3]:
import pandas as pd
import numpy as np

import torch
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)

2023-05-11 15:59:54.863942: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-05-11 15:59:54.889923: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Preparing Datasets

In [4]:
# importing datasets 
data_path = 'assets/datasets/sample_findsum/'
train_data_path = data_path + "sample_findsum_train.csv"
test_data_path = data_path + "sample_findsum_test.csv"
val_data_path = data_path + "sample_findsum_val.csv"

In [5]:
# Load datasets
train_data = pd.read_csv(train_data_path)
test_data = pd.read_csv(test_data_path)
val_data = pd.read_csv(val_data_path)

In [6]:
train_data.shape, test_data.shape, val_data.shape

((1322, 2), (414, 2), (331, 2))

In [7]:
# train_dataset = load_dataset("csv", data_files=data_path + "sample_findsum_train.csv")
# test_dataset = load_dataset("csv", data_files=data_path + "sample_findsum_test.csv")
# val_dataset = load_dataset("csv", data_files=data_path + "sample_findsum_val.csv")

In [8]:
# train_dataset, test_dataset, val_dataset

In [9]:
## Tokenize datasets

In [10]:
# Initialize tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2", max_split_size_mb=20)
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

In [11]:
# Define data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [12]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [13]:
X_train_encodings = tokenizer(list(train_data["document"]), padding=True, truncation=True, max_length=128, return_tensors='pt')
y_train_encodings = tokenizer(list(train_data["summary"]), padding=True, truncation=True, max_length=64, return_tensors='pt')

In [14]:
training_set = Dataset(X_train_encodings, y_train_encodings['input_ids'])

In [15]:
X_val_encodings = tokenizer(list(val_data["document"]), padding=True, truncation=True, max_length=128, return_tensors='pt')
y_val_encodings = tokenizer(list(val_data["summary"]), padding=True, truncation=True, max_length=64, return_tensors='pt')

In [16]:
val_set = Dataset(X_val_encodings, y_val_encodings['input_ids'])

In [17]:
X_test_encodings = tokenizer(list(test_data["document"]), padding=True, truncation=True, max_length=128, return_tensors='pt')
y_test_encodings = tokenizer(list(test_data["summary"]), padding=True, truncation=True, max_length=64, return_tensors='pt')

In [18]:
test_set = Dataset(X_test_encodings, y_test_encodings['input_ids'])

## Finetuned the GPT-2 Model

In [19]:
# !pip install nltk

# from nltk.translate.bleu_score import corpus_bleu
# from transformers import EvalPrediction

# define custom evaluation function
# def corpus_bleu(p: EvalPrediction):
#     references = [refs for refs in p.label_ids]
#     hypotheses = [hyps for hyps in p.predictions]
#     bleu_scores = corpus_bleu(references, hypotheses)
#     return {"bleu": bleu_scores}

In [20]:
# Define training arguments
training_args = TrainingArguments(
    output_dir="assets/results",
    overwrite_output_dir=True,
    num_train_epochs=2,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    evaluation_strategy='steps',
    save_strategy='steps',
    eval_steps=500,
    save_steps=500,
    warmup_steps=500,
    logging_dir='assets/logs',
    logging_steps=500,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=True # This line enables mixed precision training
)

In [21]:
# Define trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=training_set,
    eval_dataset=val_set,
    data_collator=data_collator,
#     compute_metrics=corpus_bleu
)

In [22]:
# Fine-tune the model
trainer.train()

# free up GPU memory
# del train_data, val_data, trainer, model, tokenizer
# del train_data, val_data, model, tokenizer
# torch.cuda.empty_cache()



  0%|          | 0/1322 [00:00<?, ?it/s]

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


{'loss': 3.481, 'learning_rate': 4.96e-05, 'epoch': 0.76}


  0%|          | 0/166 [00:00<?, ?it/s]

{'eval_loss': 3.0985708236694336, 'eval_runtime': 1.5787, 'eval_samples_per_second': 209.667, 'eval_steps_per_second': 105.15, 'epoch': 0.76}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


{'loss': 3.0273, 'learning_rate': 1.982968369829684e-05, 'epoch': 1.51}


  0%|          | 0/166 [00:00<?, ?it/s]

{'eval_loss': 2.966459035873413, 'eval_runtime': 1.7768, 'eval_samples_per_second': 186.295, 'eval_steps_per_second': 93.429, 'epoch': 1.51}


  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


{'train_runtime': 86.2976, 'train_samples_per_second': 30.638, 'train_steps_per_second': 15.319, 'train_loss': 3.1607398236573374, 'epoch': 2.0}


TrainOutput(global_step=1322, training_loss=3.1607398236573374, metrics={'train_runtime': 86.2976, 'train_samples_per_second': 30.638, 'train_steps_per_second': 15.319, 'train_loss': 3.1607398236573374, 'epoch': 2.0})

In [23]:
# Evaluate the model on the test dataset
trainer.evaluate(test_set)

  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
  item['labels'] = torch.tensor(self.labels[idx])


  0%|          | 0/207 [00:00<?, ?it/s]

{'eval_loss': 2.908712148666382,
 'eval_runtime': 2.1165,
 'eval_samples_per_second': 195.605,
 'eval_steps_per_second': 97.802,
 'epoch': 2.0}

In [24]:
# Save the model and tokenizer
tokenizer.save_pretrained("assets/finetuned_gpt2_model")
model.save_pretrained("assets/finetuned_gpt2_model")

## Check BLEU Score of test dataset
* We will use BLEU Score and see how good our model is generating summarization.
* I will use `corpus_bleu` from `nltk` so, that I will get one `bleu_score` of whole test dataset.

In [25]:
# load required libraries
import pandas as pd
from nltk.translate.bleu_score import corpus_bleu
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [27]:
# load test dataset
data_path = 'assets/datasets/sample_findsum/'
test_data_path = data_path + "sample_findsum_test.csv"
test_df = pd.read_csv(test_data_path)

In [28]:
# define path to saved model
# model_path = "./finetuned_gpt2_model"
model_path = "gpt2"

In [29]:
# Use GPU if available, otherwise use CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# initialize the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(model_path)
model = GPT2LMHeadModel.from_pretrained(model_path).to(device)

# set the padding token to the end-of-sequence token
tokenizer.pad_token = tokenizer.eos_token

In [30]:
def predict_summary(model, tokenizer, text):
    # encode the input sequence to a maximum length of 512 tokens

    inputs = tokenizer.encode_plus(
        text, max_length=128, truncation=True, padding=True, return_tensors='pt').to(device)
    # generate the summary using the model
    summary_ids = model.generate(
        inputs['input_ids'],
        max_length=129,
        num_beams=4,
        no_repeat_ngram_size=2,
        early_stopping=True,
        attention_mask=inputs['attention_mask'],
        pad_token_id=tokenizer.eos_token_id
    )
    
    # decode the summary tokens
    summary = tokenizer.decode(summary_ids.squeeze(), skip_special_tokens=True)
    
    # decode the input tokens
    decoded_inputs = tokenizer.decode(inputs['input_ids'][0])
    return decoded_inputs

In [31]:
# tokenize the reference and predicted summaries
ref_summaries = [[ref] for ref in test_df['summary'].tolist()]
pred_summaries = [predict_summary(model, tokenizer, text) for text in test_df['document'].tolist()]
pred_summaries = [[pred] for pred in pred_summaries]

In [32]:
# calculate the BLEU score
bleu_score = corpus_bleu(ref_summaries, pred_summaries)
print(f"BLEU score: {bleu_score}")

BLEU score: 0


## Generating paragraph summary

In [43]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

def compute_bleu_score(preds, targets):
    bleu_scores = []
    smoothing_func = SmoothingFunction().method1
    for pred, target in zip(preds, targets):
        pred = pred.replace('<s>', '').replace('</s>', '').strip()
        target = target.replace('<s>', '').replace('</s>', '').strip()
        pred_tokens = pred.split()
        target_tokens = target.split()
        bleu_score = sentence_bleu([target_tokens], pred_tokens, smoothing_function=smoothing_func)
        bleu_scores.append(bleu_score)
    avg_bleu_score = sum(bleu_scores) / len(bleu_scores)
    return {'bleu_score': avg_bleu_score}


In [33]:
input_text = '''
"on september 5 , 2012 , we acquired tog , a precision machined metal and alloy parts provider to original 
equipment manufacturers for the steam and natural gas turbine power generation market.
the addition of koontz-wagner 's engineered packaged control house solutions expanded our 
products portfolio to our current customers , and supports the global expansion into 
adjacent markets such as oil and gas pipelines . the acquisition of tog expanded our 
products portfolio to serve the steam turbine market and , combined with our consolidated 
fabricators business unit , established a growth platform for aftermarket energy parts sales .
the tog repair and replacement parts business provides a relatively stable revenue stream .
the financial results of the koontz-wagner acquisition and the tog acquisition have been included
in our product solutions segment .
'''

In [39]:
input_ids = tokenizer.encode(input_text, return_tensors='pt').to(device)

In [40]:
summary_ids = model.generate(input_ids, max_length=100, do_sample=False).to(device)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Input length of input_ids is 177, but `max_length` is set to 100. This can lead to unexpected behavior. You should consider increasing `max_new_tokens`.


In [41]:
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [45]:
print("BLEU Score: ", compute_bleu_score(input_text, summary)["bleu_score"], "\n")
print("Paragraph: \n", input_text)
print("Summary: \n", summary)

BLEU Score:  0.008458173528027359 

Paragraph: 
 
"on september 5 , 2012 , we acquired tog , a precision machined metal and alloy parts provider to original 
equipment manufacturers for the steam and natural gas turbine power generation market.
the addition of koontz-wagner 's engineered packaged control house solutions expanded our 
products portfolio to our current customers , and supports the global expansion into 
adjacent markets such as oil and gas pipelines . the acquisition of tog expanded our 
products portfolio to serve the steam turbine market and , combined with our consolidated 
fabricators business unit , established a growth platform for aftermarket energy parts sales .
the tog repair and replacement parts business provides a relatively stable revenue stream .
the financial results of the koontz-wagner acquisition and the tog acquisition have been included
in our product solutions segment .

Summary: 
 
"on september 5, 2012, we acquired tog, a precision machined metal a

## Conclusion
* GPT2 finedtuned and without finetuning is not working well on `FindSum` dataset.