In [None]:
import pathlib
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from datasets import load_dataset
import re

## Loading Data

## Sajia loading more data/combine with train data below

In [None]:
# Load every split once. `dataset` is the DatasetDict that the exploration
# cells further down index as dataset['train'] / dataset['test']; the
# per-split names are what the tokenisation and training cells consume.
dataset = load_dataset("cnn_dailymail", "3.0.0")
train_data = dataset["train"]
val_data = dataset["validation"]
test_data = dataset["test"]

In [None]:
def _split_into_words(example):
    """Whitespace-tokenise the article and its reference summary of one example."""
    return {
        "article": example["article"].split(),
        "highlights": example["highlights"].split(),
    }


# A Dataset cannot be edited through item assignment: indexing a column
# returns a fresh copy, so writes to it are discarded. `map` builds a new
# dataset instead (and shows its own progress bar). The result gets its own
# name so that `test_data` keeps the raw strings the tokenizer needs later.
test_data_words = test_data.map(_split_into_words)

## Data Preprocessing

In [None]:
#The following may not need to be used 
def clean_text(text):
    """Normalise raw text for the summarizer comparison.

    Steps, in order:
      1. replace @mentions with a space,
      2. replace http/https URLs with a space,
      3. replace every character that is not an ASCII letter, digit or one of
         ``. ! ? '`` with a space (this also covers tabs and newlines),
      4. collapse runs of spaces into a single space.

    Args:
        text: the string to clean.

    Returns:
        The cleaned string. Leading/trailing single spaces are not stripped.
    """
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    # The range must be A-Z: "A-z" also spans the ASCII punctuation that sits
    # between 'Z' and 'a' ([, \, ], ^, _, `) and would let it through.
    text = re.sub(r"[^a-zA-Z.!?'0-9]", ' ', text)
    text = re.sub(r" +", ' ', text)
    return text

def load_data(path):
    """Read and clean the first line of every regular file directly in ``path``.

    Prints how many files were found, then returns one cleaned string per
    file, in the order ``os.listdir`` reports them. Sub-directories are
    ignored. Only the first line of each file is used.
    """
    filenames = []
    for entry in os.listdir(path):
        if os.path.isfile(os.path.join(path, entry)):
            filenames.append(entry)
    print('found {} files'.format(len(filenames)))

    cleaned = []
    for filename in filenames:
        with open('{}/{}'.format(path, filename)) as handle:
            first_line = handle.readlines()[0]
            cleaned.append(clean_text(first_line))

    return cleaned

In [None]:
def overlapping_subsection(text, window=512, overlap=100):
    """Split every article into fixed-size windows that overlap their neighbour.

    Consecutive windows start ``window - overlap`` items apart, so each window
    repeats the last ``overlap`` items of the previous one. The final window
    may be shorter than ``window``. An article that already fits in one window
    yields a single subsection containing the whole article.

    Args:
        text: iterable of articles; each article is any sliceable sequence
            (a list of words, or a string for character-level windows).
        window: maximum number of items per subsection.
        overlap: number of items shared between consecutive subsections.

    Returns:
        A list with one entry per article; each entry is the list of that
        article's subsections, in order.

    Raises:
        ValueError: if ``overlap`` is negative or not smaller than ``window``.
    """
    if not 0 <= overlap < window:
        raise ValueError("overlap must satisfy 0 <= overlap < window")

    stride = window - overlap
    all_subsections = []
    for article in text:
        # Stop once the remaining tail would consist only of items already
        # covered by the previous window; max(..., 1) keeps one (possibly
        # empty) subsection for very short articles.
        last_start = max(len(article) - overlap, 1)
        all_subsections.append(
            [article[start:start + window] for start in range(0, last_start, stride)]
        )
    return all_subsections

In [None]:
# Chunk the whitespace-tokenised test articles. `test_data` is the split
# loaded above; splitting here keeps this cell independent of other state.
article_subsections = overlapping_subsection(
    [article.split() for article in test_data['article']]
)

## Create Bert model

In [None]:
import transformers
from transformers import BertTokenizer, BertModel, EncoderDecoderModel
import torch
from tqdm import tqdm_notebook as tqdm


## Encode Data with Bert

In [None]:
# WordPiece tokenizer shared by the encoding, metric and evaluation cells.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)

In [None]:
# Batch size used for dataset mapping and for training/eval batches.
batch_size = 16
# Token budgets: articles feed the encoder, highlights feed the decoder.
encoder_max_length = 512
decoder_max_length = 128


def convert_data_to_model_inputs(batch):
    """Tokenise a batch of articles/highlights into encoder-decoder inputs.

    Adds ``input_ids``/``attention_mask`` (from the article),
    ``decoder_input_ids``/``decoder_attention_mask`` (from the highlights) and
    ``labels`` (the decoder ids with every padding id replaced by -100) to
    ``batch`` and returns it.
    """
    article_enc = tokenizer(
        batch["article"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
    )
    summary_enc = tokenizer(
        batch["highlights"],
        padding="max_length",
        truncation=True,
        max_length=decoder_max_length,
    )

    batch["input_ids"] = article_enc.input_ids
    batch["attention_mask"] = article_enc.attention_mask
    batch["decoder_input_ids"] = summary_enc.input_ids
    batch["decoder_attention_mask"] = summary_enc.attention_mask

    # Padding positions are relabelled -100 in the label sequences.
    pad_id = tokenizer.pad_token_id
    batch["labels"] = [
        [-100 if token == pad_id else token for token in sequence]
        for sequence in summary_enc.input_ids
    ]

    return batch
    
    

## Setup training

In [None]:
# Columns produced by convert_data_to_model_inputs that the model consumes.
MODEL_INPUT_COLUMNS = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"]
# Raw text columns that are dropped once a split has been tokenised.
RAW_TEXT_COLUMNS = ["article", "highlights", "id"]


def _encode_split(split):
    """Tokenise one raw split and expose its model inputs as torch tensors."""
    encoded = split.map(
        convert_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=RAW_TEXT_COLUMNS,
    )
    encoded.set_format(type="torch", columns=MODEL_INPUT_COLUMNS)
    return encoded


# NOTE: train_data / val_data are replaced by their tokenised versions, so
# this cell cannot be re-run without reloading the raw splits first.
train_data = _encode_split(train_data)
val_data = _encode_split(val_data)

# The evaluation cell reads the raw `article` / `highlights` text of the test
# split, so `test_data` is left untouched and the tokenised copy gets its own name.
test_data_encoded = _encode_split(test_data)

In [None]:
# Warm-start both encoder and decoder from the same BERT checkpoint.
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased","bert-base-uncased")

# set special tokens
# BERT's tokenizer defines no BOS/EOS token (bos_token_id / eos_token_id are
# None); [CLS] and [SEP] play those roles for a BERT decoder.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# sensible parameters for beam search
model.config.vocab_size = model.config.decoder.vocab_size
model.config.max_length = 128
model.config.min_length = 64
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True
model.config.length_penalty = 2.0
model.config.num_beams = 4

In [None]:
!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/seq2seq/seq2seq_trainer.py
!wget https://raw.githubusercontent.com/huggingface/transformers/master/examples/seq2seq/seq2seq_training_args.py
!pip install git-python==1.0.3
!pip install rouge_score
!pip install sacrebleu

In [None]:
# from seq2seq_trainer import Seq2SeqTrainer
from transformers import TrainingArguments
from transformers import Trainer
from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments as _HFSeq2SeqTrainingArguments
from dataclasses import dataclass, field
from typing import Optional

@dataclass
class Seq2SeqTrainingArguments(_HFSeq2SeqTrainingArguments):
    """Training arguments for the seq2seq trainer used in this notebook.

    Extends the library's own ``Seq2SeqTrainingArguments`` instead of plain
    ``TrainingArguments``: the ``Seq2SeqTrainer`` imported from ``transformers``
    reads generation-related fields from its args object, and a class derived
    only from ``TrainingArguments`` does not carry them. All fields the
    notebook declared before are kept, with the same defaults.
    """
    label_smoothing: Optional[float] = field(default=0.0)
    sortish_sampler: bool = field(default=False)
    predict_with_generate: bool = field(default=False)
    adafactor: bool = field(default=False)
    encoder_layerdrop: Optional[float] = field(default=None)
    decoder_layerdrop: Optional[float] = field(default=None)
    dropout: Optional[float] = field(default=None)
    attention_dropout: Optional[float] = field(default=None)
    lr_scheduler: Optional[str] = field(default="linear")

In [None]:
import datasets
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    """Compute ROUGE-2 precision/recall/F-measure for one evaluation pass.

    Args:
        pred: prediction object exposing ``predictions`` (generated token ids)
            and ``label_ids`` (reference token ids, with -100 at ignored positions).

    Returns:
        Dict with ``rouge2_precision``, ``rouge2_recall`` and
        ``rouge2_fmeasure``, each rounded to four decimals.
    """
    pad_id = tokenizer.pad_token_id

    # -100 is not a vocabulary id and cannot be decoded, so map it to the pad
    # token. np.where builds new arrays, leaving the arrays held by `pred`
    # untouched.
    pred_ids = np.where(pred.predictions == -100, pad_id, pred.predictions)
    labels_ids = np.where(pred.label_ids == -100, pad_id, pred.label_ids)

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

    return {
        "rouge2_precision": round(rouge_output.precision, 4),
        "rouge2_recall": round(rouge_output.recall, 4),
        "rouge2_fmeasure": round(rouge_output.fmeasure, 4),
    }

## Train the model

In [None]:
training_args = Seq2SeqTrainingArguments(
    # --- what to run ---
    do_train=True,
    do_eval=True,
    predict_with_generate=True,
    # --- batching ---
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- schedule / logging ---
    warmup_steps=2000,
    logging_steps=1000,
    # NOTE(review): evaluate_during_training is no longer passed; confirm that
    # eval_steps still triggers evaluation in the installed transformers version.
    eval_steps=8000,
    # --- checkpoints (written into the current directory) ---
    output_dir="./",
    overwrite_output_dir=True,
    save_steps=500,
    save_total_limit=3,
    # fp16=True,  # left disabled
)

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
)
trainer.train()

## Evaluation

In [None]:
def _latest_checkpoint(output_dir):
    """Return the highest-numbered ``checkpoint-<step>`` directory in ``output_dir``, or None."""
    found = []
    for entry in os.listdir(output_dir):
        match = re.fullmatch(r"checkpoint-(\d+)", entry)
        if match and os.path.isdir(os.path.join(output_dir, entry)):
            found.append((int(match.group(1)), entry))
    if not found:
        return None
    return os.path.join(output_dir, max(found)[1])


# Load the most recent checkpoint the trainer wrote (an empty path cannot be loaded).
checkpoint_path = _latest_checkpoint(training_args.output_dir)
if checkpoint_path is None:
    raise FileNotFoundError(
        "no checkpoint-<step> directory found in {!r}; train the model first".format(training_args.output_dir)
    )

# Fall back to CPU when no GPU is available instead of failing on .to("cuda").
device = "cuda" if torch.cuda.is_available() else "cpu"
model = EncoderDecoderModel.from_pretrained(checkpoint_path)
model.to(device)
batch_size = 64

def evaluate_test_data(batch):
    """Generate a summary for every article in ``batch`` and store it under ``pred``."""
    inputs = tokenizer(batch["article"], padding="max_length", truncation=True, max_length = encoder_max_length,return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    outputs = model.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    batch["pred"] = output_str

    return batch


# Generation needs the raw article text and scoring needs the raw highlights.
# If `test_data` was already tokenised (which drops both columns), reload the
# raw test split.
if {"article", "highlights"} <= set(test_data.column_names):
    raw_test_data = test_data
else:
    raw_test_data = load_dataset("cnn_dailymail", "3.0.0", split="test")

results = raw_test_data.map(evaluate_test_data, batched=True, batch_size=batch_size, remove_columns=["article"])

pred_str = results["pred"]
label_str = results["highlights"]

rouge_output = rouge.compute(predictions=pred_str, references=label_str, rouge_types=["rouge2"])["rouge2"].mid

print(rouge_output)

## Testing on different data

In [None]:
article = "I like your take on facts vs opinions, it's certainly a logical stance to take. Sure you may have an opinion on a matter and have some reasonable justification for that opinion, but you should always remember to consider the facts against your opinion to ensure your decisions actually make sense because after all facts don't care about your opinion. In regards to your question I sort of have mixed feelings,certainly more often than not facts should overrule opinions, but sometimes in situations and scenarios you don't have all the necessary information and details to make decisions based on facts alone so you ultimately have to make an opinion based decision. For instance I personally won't be getting the COVID-19 vaccine at least not the first version or two of it. The reason being is because I have a severe peanut allergy and I am also seriously allergic to dairy products and some other things. While it may be a fact that the vaccine contains neither of those two substances, there have however been numerous cases of people who have similar allergies as I do who have in fact had serious reactions to the vaccine. While it's not a guarantee that this will happen to everyone who has allergies, therefore it's certainly not a fact that the virus will cause an allergic reaction in everyone who has allergies, yet I still will make the opinion based decision to not get the current version of the vaccine even if it's offered to me for free. In my opinion I much rather get the virus as I'm a healthy in shape adult then risk dying from a vaccine. Call me an idiot if you will but I'm not gonna risk it. Other than that, like Bruins suggested the only thing I can think of where facts are disregarded over opinions would be religion. No where else will you find a bunch of psychos blinding following  some random idea then at a church or at a psych ward with a bunch of schizophrenics."
# Truncate to the encoder's length limit and move the tensors to wherever the
# model lives; generate() fails if inputs and weights sit on different devices.
encoded = tokenizer(article, truncation=True, max_length=encoder_max_length, return_tensors="pt")
input_ids = encoded.input_ids.to(model.device)
attention_mask = encoded.attention_mask.to(model.device)
output_ids = model.generate(input_ids, attention_mask=attention_mask)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

## Loader for quick model comparisons

# Pre-trained model comparison

In [None]:
# Number of training examples used for the quick model comparison.
N_COMPARISON_EXAMPLES = 1

# Load just the slice that is needed, straight from the raw corpus: this cell
# must not depend on `train_data`, whose text columns are dropped during
# tokenisation, and indexing a full column per example would re-materialise
# the whole column on every iteration.
comparison_sample = load_dataset(
    "cnn_dailymail", "3.0.0", split="train[:{}]".format(N_COMPARISON_EXAMPLES)
)

clean_data = []
clean_sum = []
for raw_article, raw_highlights in tqdm(
    zip(comparison_sample["article"], comparison_sample["highlights"]),
    total=len(comparison_sample),
):
    clean_data.append(clean_text(raw_article))
    clean_sum.append(clean_text(raw_highlights))

In [None]:
from summarizer import Summarizer
from transformers import pipeline

In [None]:
# Summarizer from the `summarizer` package, used by the prediction loop below
# as a fallback pre-processing step when a pipeline call raises.
# NOTE(review): this rebinds `model`, which earlier cells use for the
# EncoderDecoderModel - re-running those cells after this one will break.
model = Summarizer()

In [None]:
# Hub checkpoints loaded as summarization pipelines for the comparison.
sum_list = [
    "google/pegasus-cnn_dailymail",
    "t5-base",
    "sshleifer/distilbart-cnn-12-6",
    "facebook/bart-large-cnn",
    "nsi319/legal-led-base-16384",
    "google/pegasus-newsroom",
    "google/pegasus-wikihow",
    "ml6team/mt5-small-german-finetune-mlsum",
]

In [None]:
# One summarization pipeline per checkpoint, keyed by checkpoint name; each
# checkpoint supplies both the model and its tokenizer.
summarizers = {
    name: pipeline("summarization", model=name, tokenizer=name)
    for name in tqdm(sum_list)
}

In [None]:
# Generation settings shared by every pipeline call.
SUMMARY_KWARGS = dict(max_length=100, min_length=5, do_sample=False)

# sum_preds maps each checkpoint name to its list of predicted summaries,
# aligned with clean_data.
sum_preds = {}
for m in tqdm(summarizers):
    summarize = summarizers[m]
    summary = []
    for article in tqdm(clean_data):
        # Only the article is passed in: the reference summary is not an input
        # to the summarizer and must not be handed to the pipeline.
        try:
            output = summarize(article, **SUMMARY_KWARGS)
        except Exception as err:
            # Fallback: shorten the article with the `model` summarizer first
            # and retry. The failure is reported instead of being silently hidden.
            print('{}: direct summarization failed ({!r}); retrying on condensed text'.format(m, err))
            output = summarize(model(article), **SUMMARY_KWARGS)
        summary.append(output[0]['summary_text'])
    sum_preds[m] = summary

In [None]:
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib similarity ratio (0.0 to 1.0) between sequences ``a`` and ``b``."""
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [None]:
# A prediction counts as "good" when its similarity to the reference exceeds this.
SIMILARITY_THRESHOLD = .1

# sum_scores maps each checkpoint name to the fraction of its predictions
# that are "good".
sum_scores = {}
for model_name in tqdm(sum_preds):
    # Use this model's own predictions for the loop bounds rather than a
    # variable left over from another cell.
    preds = sum_preds[model_name]
    good = sum(
        1
        for pred, gold in zip(preds, clean_sum)
        if similar(pred, gold) > SIMILARITY_THRESHOLD
    )
    # With no predictions the fraction is undefined; report NaN instead of
    # raising ZeroDivisionError.
    sum_scores[model_name] = good / len(preds) if preds else float("nan")


In [None]:
# Display the per-model scores computed in the previous cell.
sum_scores

## Create Bert Decoder

## Train Model

## Validation

## Testing on the Test Set

## Results

## Future Work