# Mount Drive

In [None]:
# Mount Google Drive at /content/drive; later cells read and write files under this path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Install Requirements

In [None]:
!pip install sentencepiece
!pip install transformers
# !pip install datasets

Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l[K     |▎                               | 10 kB 30.7 MB/s eta 0:00:01[K     |▌                               | 20 kB 28.1 MB/s eta 0:00:01[K     |▉                               | 30 kB 19.5 MB/s eta 0:00:01[K     |█                               | 40 kB 16.9 MB/s eta 0:00:01[K     |█▍                              | 51 kB 8.9 MB/s eta 0:00:01[K     |█▋                              | 61 kB 10.4 MB/s eta 0:00:01[K     |██                              | 71 kB 9.4 MB/s eta 0:00:01[K     |██▏                             | 81 kB 10.5 MB/s eta 0:00:01[K     |██▍                             | 92 kB 10.8 MB/s eta 0:00:01[K     |██▊                             | 102 kB 8.5 MB/s eta 0:00:01[K     |███                             | 112 kB 8.5 MB/s eta 0:00:01[K     |███▎                            | 122 kB 8.5 MB/s eta 0:00:01[K     |███▌   

# Imports

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch
import pandas as pd
import numpy as np
import pickle
import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
# from datasets import load_dataset

# Pre-processing

In [None]:
# Preprocessing

# Lookup table mapping English contractions to their expanded forms.
# text_cleaner looks tokens up in this dictionary to expand contractions.
contraction_mapping = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not",

                           "didn't": "did not", "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not",

                           "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",

                           "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would",

                           "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would",

                           "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam",

                           "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have",

                           "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock",

                           "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have",

                           "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is",

                           "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as",

                           "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would",

                           "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have",

                           "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have",

                           "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are",

                           "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",

                           "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is",

                           "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have",

                           "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have",

                           "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all",

                           "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have",

                           "you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have",

                           "you're": "you are", "you've": "you have"}

In [None]:
import nltk
from nltk.corpus import stopwords 
from bs4 import BeautifulSoup   #Package for pulling data out of HTML and XML files
import re

# Download the NLTK stop-word corpus that the next statement reads.
nltk.download('stopwords')

# English stop words as a set; text_cleaner filters tokens against it when num == 0.
stop_words = set(stopwords.words('english')) 
def text_cleaner(text,num):
    """Normalise a raw article or summary into space-separated lowercase words.

    Steps, in order: lowercase the text, strip HTML markup, drop parenthesised
    spans and double quotes, expand contractions via ``contraction_mapping``,
    remove possessive ``'s``, replace every non-letter with a space, optionally
    remove stop words, and discard one-character tokens.

    Args:
        text: Raw input string.
        num: ``0`` removes the words found in ``stop_words``; any other value
            keeps them.

    Returns:
        The cleaned words joined by single spaces.
    """
    cleaned = text.lower()
    cleaned = BeautifulSoup(cleaned, "lxml").text   # keep only the text content of any markup
    cleaned = re.sub(r'\([^)]*\)', '', cleaned)     # drop "(...)" spans
    cleaned = re.sub('"', '', cleaned)
    # Split on any run of whitespace (not only single spaces) so a contraction that
    # sits next to a newline or tab is still found in the lookup table.
    cleaned = ' '.join(contraction_mapping.get(token, token) for token in cleaned.split())
    cleaned = re.sub(r"'s\b", "", cleaned)          # possessive 's
    cleaned = re.sub("[^a-zA-Z]", " ", cleaned)     # anything that is not a letter becomes a space
    tokens = cleaned.split()
    if num == 0:
        tokens = [word for word in tokens if word not in stop_words]
    # One-character tokens are discarded.
    return " ".join(word for word in tokens if len(word) > 1)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


# Fine-tuning

The following script defines the `PegasusDataset` class and the helper functions used to fine-tune Pegasus. 
The function *prepare_fine_tuning* contains all the necessary configuration for tuning Pegasus. 

In [None]:
"""Script for fine-tuning Pegasus
Example usage:
  # use XSum dataset as example, with first 1000 docs as training data
  from datasets import load_dataset
  dataset = load_dataset("xsum")
  train_texts, train_labels = dataset['train']['document'][:1000], dataset['train']['summary'][:1000]
  
  # use Pegasus Large model as base for fine-tuning
  model_name = 'google/pegasus-large'
  train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
  trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
  trainer.train()
 
Reference:
  https://huggingface.co/transformers/master/custom_datasets.html
"""

from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import torch


class PegasusDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenized inputs with tokenized target summaries.

    ``encodings`` and ``labels`` are mappings from field name to a per-example
    sequence; ``labels['input_ids']`` supplies the target token ids.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # One example per target sequence.
        return len(self.labels['input_ids'])

    def __getitem__(self, idx):
        # Copy every encoder field for this example, then attach the target ids.
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels['input_ids'][idx])
        return sample

      
def prepare_data(model_name, 
                 train_texts, train_labels, 
                 val_texts=None, val_labels=None, 
                 test_texts=None, test_labels=None):
  """
  Tokenize the text/label splits and wrap each one in a PegasusDataset.

  The tokenizer is loaded with PegasusTokenizer.from_pretrained(model_name).
  The training split is always built; the validation and test splits are built
  only when both their texts and their labels are supplied, otherwise None is
  returned in their place.

  Returns:
    (train_dataset, val_dataset, test_dataset, tokenizer)
  """
  tokenizer = PegasusTokenizer.from_pretrained(model_name)

  def tokenize_pair(texts, labels):
    # Inputs and targets go through the same tokenizer settings.
    source_tokens = tokenizer(texts, truncation=True, padding=True)
    target_tokens = tokenizer(labels, truncation=True, padding=True)
    return PegasusDataset(source_tokens, target_tokens)

  def optional_split(texts, labels):
    # An optional split is skipped unless both halves were provided.
    if texts is None or labels is None:
      return None
    return tokenize_pair(texts, labels)

  train_dataset = tokenize_pair(train_texts, train_labels)
  val_dataset = optional_split(val_texts, val_labels)
  test_dataset = optional_split(test_texts, test_labels)

  return train_dataset, val_dataset, test_dataset, tokenizer


def prepare_fine_tuning(model_name, tokenizer, train_dataset, val_dataset=None, freeze_encoder=False, output_dir='/content/drive/MyDrive/Project', logging_dir=None, num_train_epochs=100):
  """
  Load the base model and build a Trainer configured for fine-tuning.

  Args:
    model_name: Checkpoint name or path passed to
      PegasusForConditionalGeneration.from_pretrained.
    tokenizer: Tokenizer handed to the Trainer.
    train_dataset: Dataset used for training.
    val_dataset: Optional evaluation dataset. When given, evaluation runs every
      100 update steps with an evaluation batch size of 1.
    freeze_encoder: If True, the encoder parameters have requires_grad set to
      False so only the remaining parameters are updated.
    output_dir: Directory where checkpoints are written.
    logging_dir: Directory for training logs. None (the default) means
      "same as output_dir", so logs follow a custom output_dir.
    num_train_epochs: Total number of training epochs.

  Returns:
    A transformers.Trainer ready for .train().
  """
  torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
  model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)

  if freeze_encoder:
    for param in model.model.encoder.parameters():
      param.requires_grad = False

  # Settings shared by the with-validation and without-validation cases.
  training_kwargs = dict(
    output_dir=output_dir,                 # output directory
    num_train_epochs=num_train_epochs,     # total number of training epochs
    per_device_train_batch_size=1,         # batch size per device during training, can increase if memory allows
    save_steps=500,                        # number of update steps before checkpoint saves
    save_total_limit=5,                    # limit the total amount of checkpoints and delete the older ones
    warmup_steps=500,                      # number of warmup steps for learning rate scheduler
    weight_decay=0.01,                     # strength of weight decay
    logging_dir=output_dir if logging_dir is None else logging_dir,  # directory for storing logs
    logging_steps=10,
  )

  # Evaluation settings are only added when there is something to evaluate on.
  if val_dataset is not None:
    training_kwargs.update(
      per_device_eval_batch_size=1,        # batch size for evaluation, can increase if memory allows
      evaluation_strategy='steps',         # evaluation strategy to adopt during training
      eval_steps=100,                      # number of update steps before evaluation
    )

  trainer = Trainer(
    model=model,                           # the instantiated Transformers model to be trained
    args=TrainingArguments(**training_kwargs),
    train_dataset=train_dataset,           # training dataset
    eval_dataset=val_dataset,              # evaluation dataset (None disables evaluation)
    tokenizer=tokenizer
  )

  return trainer


# if __name__=='__main__':
#   # use XSum dataset as example, with first 1000 docs as training data
#   from datasets import load_dataset
#   dataset = load_dataset("xsum")
#   train_texts, train_labels = dataset['train']['document'][:1000], dataset['train']['summary'][:1000]
  
#   # use Pegasus Large model as base for fine-tuning
#   model_name = 'google/pegasus-large'
#   train_dataset, _, _, tokenizer = prepare_data(model_name, train_texts, train_labels)
#   trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)
#   trainer.train()

## Load Train Data

### From TensorFlow

In [None]:
# TensorFlow Datasets builder for the CNN/DailyMail dataset.
builder = tfds.builder(name = 'cnn_dailymail')
# Name of the split that builder.as_dataset loads in the next cell.
split = 'test'

INFO:absl:No config specified, defaulting to first: cnn_dailymail/plain_text
INFO:absl:Load pre-computed DatasetInfo (eg: splits, num examples,...) from GCS: cnn_dailymail/plain_text/3.0.0
INFO:absl:Load dataset info from /tmp/tmprhq6iaactfds
INFO:absl:Field info.description from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.config_name from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.config_description from disk and from code do not match. Keeping the one from code.
INFO:absl:Field info.citation from disk and from code do not match. Keeping the one from code.


In [None]:
# 1. Download and prepare the dataset on disk
builder.download_and_prepare()
# 2. Load the `tf.data.Dataset`
ds = builder.as_dataset(split=split, shuffle_files=True)



# NOTE(review): the loop below is disabled, so no active cell in this notebook creates `df`;
# the later `df.to_pickle(...)` cell presumably relies on it having been run once — confirm.
# for example in ds:  # example is `{'article': tf.Tensor, 'highlights': tf.Tensor}`
#   # print(list(example.keys()))
#   text = example["article"]
#   sum = example["highlights"]
#   df = df.append({'article':text, 'summary':sum}, ignore_index = True)

INFO:absl:Generating dataset cnn_dailymail (/root/tensorflow_datasets/cnn_dailymail/plain_text/3.0.0)


[1mDownloading and preparing dataset cnn_dailymail/plain_text/3.0.0 (download: 558.32 MiB, generated: 1.27 GiB, total: 1.82 GiB) to /root/tensorflow_datasets/cnn_dailymail/plain_text/3.0.0...[0m


Dl Completed...: 0 url [00:00, ? url/s]

Dl Size...: 0 MiB [00:00, ? MiB/s]

Extraction completed...: 0 file [00:00, ? file/s]

INFO:absl:Downloading https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfTHk4NFg2SndKcjQ into /root/tensorflow_datasets/downloads/ucexport_download_id_0BwmD_VLjROrfTHk4NFg2SndKG8BdJPpt2iRo6Dpzz23CByJuAePEilB-pxbcBCHaWDs.tmp.fb678fd762dc4e7ab352857d542c2216...
INFO:absl:Downloading https://drive.google.com/uc?export=download&id=0BwmD_VLjROrfM1BxdkxVaTY2bWs into /root/tensorflow_datasets/downloads/ucexport_download_id_0BwmD_VLjROrfM1BxdkxVaTY2zVV-G71RIXPssrrvSAjt19Cy91r-9CQ2F9DMKA0uFk0.tmp.fbe8032564af4a56a33dd51d5a348430...
INFO:absl:Downloading https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_test.txt into /root/tensorflow_datasets/downloads/raw.gith.com_abis_cnn-dail_mast_url_list_a705_isK790OHOPsIZX-ACsObq_vchU9r5Uduh6ULX3c.txt.tmp.77304ebcbf0d4053b4802a2b8fe820e6...
INFO:absl:Downloading https://raw.githubusercontent.com/abisee/cnn-dailymail/master/url_lists/all_train.txt into /root/tensorflow_datasets/downloads/raw.gith.com_abis_cnn-dail_mast_url






INFO:absl:Generating split train


0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/cnn_dailymail/plain_text/3.0.0.incompleteDBNE07/cnn_dailymail-train.tfrecord


  0%|          | 0/287113 [00:00<?, ? examples/s]

INFO:absl:Done writing /root/tensorflow_datasets/cnn_dailymail/plain_text/3.0.0.incompleteDBNE07/cnn_dailymail-train.tfrecord. Shard lengths: [17945, 17944, 17945, 17944, 17945, 17944, 17945, 17944, 17945, 17945, 17944, 17945, 17944, 17945, 17944, 17945]
INFO:absl:Generating split validation


0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/cnn_dailymail/plain_text/3.0.0.incompleteDBNE07/cnn_dailymail-validation.tfrecord


  0%|          | 0/13368 [00:00<?, ? examples/s]

INFO:absl:Done writing /root/tensorflow_datasets/cnn_dailymail/plain_text/3.0.0.incompleteDBNE07/cnn_dailymail-validation.tfrecord. Shard lengths: [13368]
INFO:absl:Generating split test


0 examples [00:00, ? examples/s]

Shuffling and writing examples to /root/tensorflow_datasets/cnn_dailymail/plain_text/3.0.0.incompleteDBNE07/cnn_dailymail-test.tfrecord


  0%|          | 0/11490 [00:00<?, ? examples/s]

INFO:absl:Done writing /root/tensorflow_datasets/cnn_dailymail/plain_text/3.0.0.incompleteDBNE07/cnn_dailymail-test.tfrecord. Shard lengths: [11490]
INFO:absl:Skipping computing stats for mode ComputeStatsMode.SKIP.
INFO:absl:Constructing tf.data.Dataset for split test, from /root/tensorflow_datasets/cnn_dailymail/plain_text/3.0.0


[1mDataset cnn_dailymail downloaded and prepared to /root/tensorflow_datasets/cnn_dailymail/plain_text/3.0.0. Subsequent calls will reuse this data.[0m


In [None]:
# Peek at a single record to confirm the feature names and value types.
for example in ds.take(1):  # each record is a dict of tf.Tensor values
  print(list(example.keys()))
  print(example)

['article', 'highlights']
{'article': <tf.Tensor: shape=(), dtype=string, numpy=b"Ever noticed how plane seats appear to be getting smaller and smaller? With increasing numbers of people taking to the skies, some experts are questioning if having such packed out planes is putting passengers at risk. They say that the shrinking space on aeroplanes is not only uncomfortable - it's putting our health and safety in danger. More than squabbling over the arm rest, shrinking space on planes putting our health and safety in danger? This week, a U.S consumer advisory group set up by the Department of Transportation said at a public hearing that while the government is happy to set standards for animals flying on planes, it doesn't stipulate a minimum amount of space for humans. 'In a world where animals have more rights to space and food than humans,' said Charlie Leocha, consumer representative on the committee.\xc2\xa0'It is time that the DOT and FAA take a stand for humane treatment of passe

#### Save dataframe
Save the DataFrame as a pickle for later use.

In [None]:
# NOTE(review): `df` is not assigned by any earlier active cell (the loop that would build it is
# commented out), and this directory differs from the ".../Utrecht/AI/PatternRecognition/project/"
# directory that the read_pickle cells load from — confirm both before re-running.
df.to_pickle(f"/content/drive/MyDrive/PatternRecognition/Project/{split}_data")

In [None]:
# Load the pickled training articles and summaries (presumably written in an earlier session).
# NOTE(review): unpickling can execute arbitrary code; only load files you created yourself.
train_df = pd.read_pickle("/content/drive/MyDrive/Utrecht/AI/PatternRecognition/project/train_data")

In [None]:
train_df.head()

Unnamed: 0,article,summary
0,"PUBLISHED: . 07:04 EST, 9 January 2014 . | . U...","Zhu Sanni, 23, had been left alone at home for..."
1,"Kabul, Afghanistan (CNN) -- Thousands of bottl...",Official: Bottles are almost exclusively from ...
2,Even death couldn't part two skeletons excavat...,"Two skeletons were found holding hands, buried..."
3,(CNN)The New York Police Department faced a ne...,Danny Cevallos: Arrests over tweets threatenin...
4,By . John Drayton . Lionel Messi took matters ...,Messi led the Argentina team talk between full...


### From Hugging Face

In [None]:
# from datasets import load_dataset
# dataset = load_dataset("cnn_dailymail", "3.0.0")
# train_texts, train_labels = dataset['train']['article'], dataset['train']['highlights']
# # train_texts, train_labels = list(df['text'].values[:1000]), list(df['sum'].values[:1000])

In [None]:
# Pull the two columns out of the frame as plain Python lists.
train_texts = list(train_df['article'].values)
train_labels = list(train_df['summary'].values)

In [None]:
# Length limits, counted in whitespace-separated words.
max_text_len = 300
max_summary_len = 30

# Keep only the (article, summary) pairs where both sides fit within the limits.
short_pairs = [
    (article, summary)
    for article, summary in zip(train_texts, train_labels)
    if len(summary.split()) <= max_summary_len and len(article.split()) <= max_text_len
]
short_text = [pair[0] for pair in short_pairs]
short_summary = [pair[1] for pair in short_pairs]

train_df = pd.DataFrame({'text': short_text, 'summary': short_summary})

In [None]:
len(train_df)

4090

In [None]:
# num=0 makes text_cleaner strip stop words as well, for both articles and summaries.
train_df['clean_text'] = train_df['text'].apply(text_cleaner, args=(0,))
train_df['clean_summary'] = train_df['summary'].apply(text_cleaner, args=(0,))

In [None]:
# Fixed-seed random subset of 100 cleaned pairs used for fine-tuning.
df100 = train_df.sample(n=100, random_state=1)
train_texts = list(df100['clean_text'].values)
train_labels = list(df100['clean_summary'].values)

In [None]:
# Fine-tune starting from the Pegasus Large checkpoint.
model_name = 'google/pegasus-large'
# Only a training split is passed, so the validation and test slots come back as None.
train_dataset, _val_dataset, _test_dataset, tokenizer = prepare_data(model_name, train_texts, train_labels)
trainer = prepare_fine_tuning(model_name, tokenizer, train_dataset)

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.02k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

In [None]:
import transformers
# Set the transformers log level to INFO.
transformers.logging.set_verbosity_info()

In [None]:
# Continue training from the checkpoint saved at step 3500 instead of starting over.
resume_checkpoint = "/content/drive/MyDrive/Project/checkpoint-3500"
trainer.train(resume_checkpoint)

Loading model from /content/drive/MyDrive/Project/checkpoint-3500).
***** Running training *****
  Num examples = 100
  Num Epochs = 100
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 10000
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 35
  Continuing training from global step 3500
  Will skip the first 35 epochs then the first 0 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


0it [00:00, ?it/s]

Step,Training Loss
3510,0.0042
3520,0.0933
3530,0.0278
3540,0.0196
3550,0.0021
3560,0.0028
3570,0.0314
3580,0.0033
3590,0.0008
3600,0.0238


Saving model checkpoint to /content/drive/MyDrive/Project/checkpoint-4000
Configuration saved in /content/drive/MyDrive/Project/checkpoint-4000/config.json
Model weights saved in /content/drive/MyDrive/Project/checkpoint-4000/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Project/checkpoint-4000/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/Project/checkpoint-4000/special_tokens_map.json
Deleting older checkpoint [/content/drive/MyDrive/Project/checkpoint-1000] due to args.save_total_limit
Deleting older checkpoint [/content/drive/MyDrive/Project/checkpoint-1500] due to args.save_total_limit
Saving model checkpoint to /content/drive/MyDrive/Project/checkpoint-4500
Configuration saved in /content/drive/MyDrive/Project/checkpoint-4500/config.json
Model weights saved in /content/drive/MyDrive/Project/checkpoint-4500/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/Project/checkpoint-4500/tokenizer_config.json
Spec

TrainOutput(global_step=10000, training_loss=0.008019686486077262, metrics={'train_runtime': 2657.909, 'train_samples_per_second': 3.762, 'train_steps_per_second': 3.762, 'total_flos': 6743964794880000.0, 'train_loss': 0.008019686486077262, 'epoch': 100.0})

# Infer models for evaluation

## Load test data from pickle

In [None]:
split = 'test'

In [None]:
# df.to_pickle(f"/content/drive/MyDrive/Utrecht/AI/PatternRecognition/project/{split}_data")
# df = pd.read_pickle(f"/content/drive/MyDrive/Utrecht/AI/PatternRecognition/project/{split}_data")
# Load the pickled test split.
# NOTE(review): unpickling can execute arbitrary code; only load files you created yourself.
df = pd.read_pickle('/content/drive/MyDrive/Utrecht/AI/PatternRecognition/project/test_data')

In [None]:
df.head()

Unnamed: 0,article,summary
0,"tf.Tensor(b""Ever noticed how plane seats appea...",tf.Tensor(b'Experts question if packed out pl...
1,"tf.Tensor(b""A drunk teenage boy had to be resc...","tf.Tensor(b""Drunk teenage boy climbed into lio..."
2,"tf.Tensor(b""Dougie Freedman is on the verge of...","tf.Tensor(b""Nottingham Forest are close to ext..."
3,"tf.Tensor(b""Liverpool target Neto is also want...",tf.Tensor(b'Fiorentina goalkeeper Neto has bee...
4,"tf.Tensor(b""Bruce Jenner will break his silenc...","tf.Tensor(b""Tell-all interview with the realit..."


In [None]:
# Each cell holds a scalar tf.Tensor of bytes; convert every one to a Python str.
# Iterating the columns directly keeps row order without per-row label lookups,
# which are slow and fail when the index is not exactly 0..n-1.
summaries = [tensor.numpy().decode("utf-8") for tensor in df['summary']]
articles = [tensor.numpy().decode("utf-8") for tensor in df['article']]

In [None]:
print(summaries[0])

Experts question if  packed out planes are putting passengers at risk .
U.S consumer advisory group says minimum space must be stipulated .
Safety tests conducted on planes with more leg room than airlines offer .


# Fine-tuned Pegasus

In [None]:
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
# model_name = 'google/pegasus-cnn_dailymail'
model_name = 'google/pegasus-large'

In [None]:
# Directory of the fine-tuned checkpoint written by the Trainer.
checkpoint_dir = '/content/drive/MyDrive/Project/checkpoint-4000'

# Build the architecture from the base checkpoint first (otherwise `model` does not exist
# on a fresh kernel), then overwrite its weights with the fine-tuned ones.
model = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
# map_location lets weights saved on a GPU load on a CPU-only runtime as well.
model.load_state_dict(torch.load(f'{checkpoint_dir}/pytorch_model.bin', map_location=torch_device))

# Load the tokenizer through from_pretrained so the saved tokenizer_config.json is applied;
# constructing it from the bare vocab file leaves it without a maximum length, which makes
# `truncation=True` a no-op at inference time.
tokenizer = PegasusTokenizer.from_pretrained(checkpoint_dir)


model.eval()

Downloading:   0%|          | 0.00/3.02k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

PegasusForConditionalGeneration(
  (model): PegasusModel(
    (shared): Embedding(96103, 1024, padding_idx=0)
    (encoder): PegasusEncoder(
      (embed_tokens): Embedding(96103, 1024, padding_idx=0)
      (embed_positions): PegasusSinusoidalPositionalEmbedding(1024, 1024)
      (layers): ModuleList(
        (0): PegasusEncoderLayer(
          (self_attn): PegasusAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): LayerNorm((1024,), eps=1e-05, element

# Pretrained Pegasus

In [None]:
# Baseline for comparison: model and tokenizer loaded straight from `model_name`,
# with no fine-tuned weights applied.
model2 = PegasusForConditionalGeneration.from_pretrained(model_name).to(torch_device)
tokenizer2 = PegasusTokenizer.from_pretrained(model_name)

Downloading:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.12G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.82M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

In [None]:
# Length limits, counted in whitespace-separated words.
max_text_len = 300
max_summary_len = 30

# Work on NumPy string arrays of the decoded test articles and summaries.
cleaned_text = np.array(articles)
cleaned_summary = np.array(summaries)

short_text = []
short_summary = []

# Skip any pair in which the summary or the article exceeds its limit.
for article, summary in zip(cleaned_text, cleaned_summary):
    too_long = len(summary.split()) > max_summary_len or len(article.split()) > max_text_len
    if too_long:
        continue
    short_text.append(article)
    short_summary.append(summary)

test_df = pd.DataFrame({'text': short_text, 'summary': short_summary})

In [None]:
# Same cleaning as for the training data: num=0 also removes stop words.
test_df['clean_text'] = test_df['text'].apply(text_cleaner, args=(0,))
test_df['clean_summary'] = test_df['summary'].apply(text_cleaner, args=(0,))

In [None]:
test_df.head()

Unnamed: 0,text,summary,clean_text,clean_summary
0,"(CNN)So, you'd like a ""Full House"" reunion and...","Show will return with a one-hour special, foll...",would like full house reunion spinoff got dude...,show return one hour special followed spinoff ...
1,(CNN)They used to do the guarding at Florida p...,The men are current or former Florida prison g...,used guarding florida prisons ones behind bars...,men current former florida prison guards charg...
2,(CNN)A photo of a baby boy being pulled from t...,Baby Sonit Awal found in rubble of Nepal earth...,photo baby boy pulled rubble nepal earthquake ...,baby sonit awal found rubble nepal earthquake ...
3,(CNN)Call it a little piece of heaven for a fa...,Sierra Sharry was eight months pregnant when h...,call little piece heaven family torn apart tra...,sierra sharry eight months pregnant son father...
4,(CNN)Marvel Comics superhero Hawkeye is a mast...,Renner showed off his vocal skills .\nHe sang ...,marvel comics superhero hawkeye master bow arr...,renner showed vocal skills sang ed sheeran hit


In [None]:
# True -> evaluate on the cleaned columns; False -> evaluate on the raw text.
pre = True

text_column, summary_column = ('clean_text', 'clean_summary') if pre else ('text', 'summary')
inputs = list(test_df[text_column].values)
targets = list(test_df[summary_column].values)


In [None]:
from tqdm import tqdm

# Summaries from the pretrained baseline (model2 / tokenizer2).
# Each entry is the list of decoded strings returned by batch_decode for one article.
predictions = []

# Slice rather than index with range(100) so the loop cannot run past the end of `inputs`.
for article in tqdm(inputs[:100]):
  batch = tokenizer2(article, truncation=True, padding='longest', return_tensors="pt").to(torch_device)
  sums = model2.generate(**batch)
  output = tokenizer2.batch_decode(sums, skip_special_tokens=True)
  predictions.append(output)

100%|██████████| 100/100 [03:39<00:00,  2.20s/it]


In [None]:
from tqdm import tqdm

# Summaries from the fine-tuned model (model / tokenizer).
# Each entry is the list of decoded strings returned by batch_decode for one article.
preds = []

# Slice rather than index with range(100) so the loop cannot run past the end of `inputs`.
for article in tqdm(inputs[:100]):
  batch = tokenizer(article, truncation=True, padding='longest', return_tensors="pt").to(torch_device)
  sums = model.generate(**batch)
  output = tokenizer.batch_decode(sums, skip_special_tokens=True)
  preds.append(output)

  0%|          | 0/100 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
100%|██████████| 100/100 [01:04<00:00,  1.55it/s]


In [None]:
# Show the first five articles next to their reference and generated summaries.
for idx in range(5):
  print(
    'Article:', inputs[idx],
    'TARGET:', targets[idx],
    'pred:', preds[idx],
    '\n',
    sep='\n',
  )

Article:
would like full house reunion spinoff got dude co star john stamos announced monday night jimmy kimmel live netflix ordered reunion special followed spinoff series called fuller house show feature candace cameron bure played eldest daughter tanner original series aired recently widowed mother three boys sort role reversal turn house stamos told kimmel jodie sweetin played stephanie tanner original series andrea barber portrayed best friend kimmy gibbler return new series netflix said stamos produce guest star talks co starsbob saget mary kate ashley olsen dave coulier lori loughlin ongoing netflix said show available next year netflix said big fans original full house thrilled able introduce fuller house new narrative existing fans worldwide grew original well new generation global viewers grown tanners syndication netflix vice president original content cindy holland said statement show starts tanner named tanner fuller pregnant recently widowed living san francisco younger s

In [None]:
# Side-by-side table of model input, reference summary and generated output (first 100 examples).
pred_columns = {
    'article': inputs[:100],
    'reference': targets[:100],
    'decoded': preds,
}
pred_df = pd.DataFrame(pred_columns)

In [None]:
# batch_decode returns a list per example; keep its single string. The isinstance guard makes the
# cell safe to re-run (a second pass would otherwise cut every summary down to its first character).
pred_df['decoded'] = pred_df['decoded'].apply(lambda x: x[0] if isinstance(x, (list, tuple)) else x)

In [None]:
pred_df.head()

Unnamed: 0,article,reference,decoded
0,would like full house reunion spinoff got dude...,show return one hour special followed spinoff ...,candace cameron bure played eldest daughter ta...
1,used guarding florida prisons ones behind bars...,men current former florida prison guards charg...,court documents say former inmate plotted murd...
2,photo baby boy pulled rubble nepal earthquake ...,baby sonit awal found rubble nepal earthquake ...,baby boy pulled rubble nepal earthquake death ...
3,call little piece heaven family torn apart tra...,sierra sharry eight months pregnant son father...,sharry lane smith become parents sharry eight ...
4,marvel comics superhero hawkeye master bow arr...,renner showed vocal skills sang ed sheeran hit,marvel comics superhero hawkeye master bow arr...


In [None]:
# Persist the prediction table to Drive as a pickle.
pred_df.to_pickle("/content/drive/MyDrive/Utrecht/AI/PatternRecognition/project/pre_pred_df100")
# pred_df = pd.read_pickle("/content/drive/MyDrive/Utrecht/AI/PatternRecognition/project/pre_pred_df")

In [None]:
pred_json = pred_df.to_json(orient='records', lines=True)

In [None]:
import json

# `pred_json` is already JSON-lines text produced by DataFrame.to_json; passing it through
# json.dump would re-encode the whole thing as one quoted, escaped string. Write it as-is.
with open('/content/drive/MyDrive/Utrecht/AI/PatternRecognition/project/pre_predictions.json', 'w') as outfile:
    outfile.write(pred_json)

#### Save & Load predictions

In [None]:
# Save the baseline model's `predictions` list to Drive.
# NOTE(review): this directory differs from the ".../Utrecht/AI/PatternRecognition/project/"
# directory used by the other save cells — confirm it is intended.
with open('/content/drive/MyDrive/PatternRecognition/Project/predictions.pkl', 'wb') as f:
  pickle.dump(predictions, f)

In [None]:
# Reload the pickled `predictions` list.
# NOTE(review): pickle.load can execute arbitrary code; only load files you created yourself.
with open('/content/drive/MyDrive/PatternRecognition/Project/predictions.pkl', 'rb') as f:
  predictions = pickle.load(f)

# SummEval

In [None]:
!pip install summ-eval



In [None]:
 import os
 # Set ROUGE_HOME to the ROUGE-1.5.5 directory inside the installed summ_eval package.
 # NOTE(review): the path hard-codes python3.7 dist-packages; confirm it matches the runtime.
 os.environ['ROUGE_HOME']='/usr/local/lib/python3.7/dist-packages/summ_eval/ROUGE-1.5.5/'

In [None]:
 !pip install -U  git+https://github.com/bheinzerling/pyrouge.git

Collecting git+https://github.com/bheinzerling/pyrouge.git
  Cloning https://github.com/bheinzerling/pyrouge.git to /tmp/pip-req-build-8w533ucs
  Running command git clone -q https://github.com/bheinzerling/pyrouge.git /tmp/pip-req-build-8w533ucs


In [None]:
from summ_eval.rouge_metric import RougeMetric
rouge = RougeMetric()

Preparing ROUGE Perl script - this will take a few seconds


In [None]:
# SummEval's evaluate_batch takes (generated summaries, references), in that order.
# `predictions` holds one list per article (the batch_decode output), so take its single string.
# Score against `targets`, which is row-aligned with the `inputs` the predictions were generated
# from; `summaries[:100]` are the first 100 unfiltered test rows, i.e. different articles.
generated = [p[0] if isinstance(p, (list, tuple)) else p for p in predictions]
rouge_dict = rouge.evaluate_batch(generated, targets[:len(generated)])

CalledProcessError: ignored

In [None]:
!git clone https://github.com/andersjo/pyrouge.git rouge
!git clone https://github.com/bheinzerling/pyrouge

Cloning into 'rouge'...
remote: Enumerating objects: 393, done.[K
remote: Total 393 (delta 0), reused 0 (delta 0), pack-reused 393[K
Receiving objects: 100% (393/393), 298.74 KiB | 11.95 MiB/s, done.
Resolving deltas: 100% (109/109), done.
Cloning into 'pyrouge'...
remote: Enumerating objects: 551, done.[K
remote: Total 551 (delta 0), reused 0 (delta 0), pack-reused 551[K
Receiving objects: 100% (551/551), 123.17 KiB | 6.16 MiB/s, done.
Resolving deltas: 100% (198/198), done.


In [None]:
cd pyrouge/

/content/pyrouge


In [None]:
!python setup.py install
!pyrouge_set_rouge_path '/content/rouge/tools/ROUGE-1.5.5'

running install
running bdist_egg
running egg_info
creating pyrouge.egg-info
writing pyrouge.egg-info/PKG-INFO
writing dependency_links to pyrouge.egg-info/dependency_links.txt
writing top-level names to pyrouge.egg-info/top_level.txt
writing manifest file 'pyrouge.egg-info/SOURCES.txt'
adding license file 'LICENSE.txt'
writing manifest file 'pyrouge.egg-info/SOURCES.txt'
installing library code to build/bdist.linux-x86_64/egg
running install_lib
running build_py
creating build
creating build/lib
creating build/lib/pyrouge
copying pyrouge/test.py -> build/lib/pyrouge
copying pyrouge/Rouge155.py -> build/lib/pyrouge
copying pyrouge/__init__.py -> build/lib/pyrouge
creating build/lib/pyrouge/utils
copying pyrouge/utils/log.py -> build/lib/pyrouge/utils
copying pyrouge/utils/argparsers.py -> build/lib/pyrouge/utils
copying pyrouge/utils/file_utils.py -> build/lib/pyrouge/utils
copying pyrouge/utils/__init__.py -> build/lib/pyrouge/utils
copying pyrouge/utils/string_utils.py -> build/lib/p