# NLP Final Project
## Josh Coward, Ryan Pacheco, Sajia Zafreen

In [None]:
import pathlib
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from datasets import load_dataset
import re
import transformers
from transformers import BertTokenizer, BertModel, EncoderDecoderModel
import torch
from tqdm import tqdm_notebook as tqdm
import datasets
from transformers import TrainingArguments
from transformers import Trainer
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from dataclasses import dataclass, field
from typing import Optional
import datasets
from difflib import SequenceMatcher

# Begin here if you wish to run the fine-tuned BERT model. Otherwise skip to `Pre trained comp`

## Loading Data

In [None]:
# CNN/DailyMail, config "3.0.0": once with every split together, then each
# split on its own for training, validation and testing.
dataset = load_dataset("cnn_dailymail", "3.0.0")
train_data, val_data, test_data = (
    load_dataset("cnn_dailymail", "3.0.0", split=split_name)
    for split_name in ("train", "validation", "test")
)

## Data Preprocessing

In [None]:
def clean_text(text):
    """Normalise raw text before tokenization.

    Steps, in order:
      1. replace ``@mention`` handles with a space,
      2. replace ``http://`` / ``https://`` URLs with a space,
      3. replace every character that is not an ASCII letter, a digit or
         one of ``. ! ? '`` with a space (this also covers tabs and newlines),
      4. collapse runs of spaces into a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Leading/trailing single spaces are kept.
    """
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    # The range must be A-Z: "A-z" also spans the ASCII characters between
    # 'Z' and 'a' ([ \ ] ^ _ `), which would then survive the cleaning.
    text = re.sub(r"[^a-zA-Z.!?'0-9]", ' ', text)
    return re.sub(r" +", ' ', text)

def load_data(path):
    """Read and clean the first line of every regular file in a directory.

    Args:
        path: Directory to scan (not recursive; sub-directories are ignored).

    Returns:
        A list with one cleaned string per file, in ``os.listdir`` order.
        An empty file contributes an empty string.
    """
    file_paths = [
        os.path.join(path, name)
        for name in os.listdir(path)
        if os.path.isfile(os.path.join(path, name))
    ]
    print('found {} files'.format(len(file_paths)))
    all_text = []
    for file_path in file_paths:
        with open(file_path) as handle:
            # Only the first line is used, so read just that line; readline()
            # returns '' for an empty file where readlines()[0] would raise
            # IndexError.
            all_text.append(clean_text(handle.readline()))

    return all_text

In [None]:
def overlapping_subsection(dataset, window=512, overlap=100):
    """Split each article into overlapping sections of whitespace-separated words.

    Every article is split on whitespace and cut into sections of at most
    ``window`` words. Consecutive sections share ``overlap`` words, and the
    last section always runs to the end of the article so no words are lost.

    Args:
        dataset: Iterable of article strings.
        window: Maximum number of words per section.
        overlap: Number of words repeated at the start of the next section.

    Returns:
        A list with one entry per article; each entry is a list of section
        strings. An article of at most ``window`` words yields a single
        section, and an article without any words yields an empty list.

    Raises:
        ValueError: If ``overlap`` is negative or not smaller than ``window``.
    """
    if not 0 <= overlap < window:
        raise ValueError("overlap must satisfy 0 <= overlap < window")

    stride = window - overlap
    articles = []
    for article in tqdm(dataset):
        words = article.split()
        sections = []
        for start in range(0, len(words), stride):
            sections.append(' '.join(words[start:start + window]))
            # Stop once a section reaches the end of the article; another
            # step would only repeat words that are already covered.
            if start + window >= len(words):
                break
        articles.append(sections)
    return articles

In [None]:
# The original data after running it through overlapping_subsection(),
# stored as one pandas DataFrame per split.
train_df, test_df, val_df = (
    pd.DataFrame({
        'article': overlapping_subsection(split['article']),
        'highlights': split['highlights'],
        "id": split['id'],
    })
    for split in (train_data, test_data, val_data)
)

## Create Tokenizer using Bert

In [None]:
# Tokenizer shared by the articles (encoder side) and the summaries (decoder side).
# It produces the input ids and attention masks used in this notebook, not embeddings.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:
# Used both as the Dataset.map batch size and as the per-device train/eval batch size.
batch_size = 16 # Change batch size to 4 for faster but less accurate training
encoder_max_length = 512  # max_length when tokenizing articles
decoder_max_length = 128  # max_length when tokenizing highlights (summaries)

def convert_data_to_model_inputs(batch):
    """Tokenize a batch of articles and highlights into model inputs.

    Adds ``input_ids`` / ``attention_mask`` (from the articles),
    ``decoder_input_ids`` / ``decoder_attention_mask`` (from the highlights)
    and ``labels`` to ``batch``. ``labels`` is a copy of the highlight ids in
    which every padding id is replaced by -100.

    Args:
        batch: Mapping with ``"article"`` and ``"highlights"`` lists.

    Returns:
        The same ``batch`` object with the new keys set.
    """
    pad_id = tokenizer.pad_token_id

    # Articles feed the encoder, highlights feed the decoder.
    article_enc = tokenizer(
        batch["article"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
    )
    summary_enc = tokenizer(
        batch["highlights"],
        padding="max_length",
        truncation=True,
        max_length=decoder_max_length,
    )

    batch["input_ids"] = article_enc.input_ids
    batch["attention_mask"] = article_enc.attention_mask
    batch["decoder_input_ids"] = summary_enc.input_ids
    batch["decoder_attention_mask"] = summary_enc.attention_mask
    batch["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in summary_enc.input_ids
    ]

    return batch
    
    

## Setup training

In [None]:
#Note: If data has already been mapped and saved to file DO NOT run this cell

# Uncomment the following two lines to train on a small subset
# train_data = train_data.select(range(32))
# val_data = val_data.select(range(32))

def _to_model_inputs(split):
    """Tokenize one split and expose only the model columns as torch tensors."""
    model_columns = ["input_ids", "attention_mask", "decoder_input_ids", "decoder_attention_mask", "labels"]
    encoded = split.map(
        convert_data_to_model_inputs,
        batched=True,
        batch_size=batch_size,
        remove_columns=["article", "highlights", "id"],
    )
    encoded.set_format(type="torch", columns=model_columns)
    return encoded


train_data = _to_model_inputs(train_data)
val_data = _to_model_inputs(val_data)

### Saves Mapped Data to File

In [None]:
# The following saves the training and validation data to file
# File size < 1.75 Gb
import pickle  # not among the notebook's top-level imports, so bring it in here

with open('train_data.pickle', 'wb') as handle:
    pickle.dump(train_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('val_data.pickle', 'wb') as handle:
    pickle.dump(val_data, handle, protocol=pickle.HIGHEST_PROTOCOL)

### Load Mapped Data from File

In [None]:
# The following loads pre-mapped train/validation data
import pickle  # not among the notebook's top-level imports, so bring it in here

# SECURITY: unpickling executes arbitrary code. Only load pickle files that
# this notebook wrote itself, never files obtained from elsewhere.
with open('train_data.pickle', 'rb') as handle:
    train_data = pickle.load(handle)

with open('val_data.pickle', 'rb') as handle:
    val_data = pickle.load(handle)

In [None]:
# Creates the base encoder-decoder model; encoder and decoder both start
# from the "bert-base-uncased" weights.
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased","bert-base-uncased")

# set model configuration
# BertTokenizer defines no bos/eos tokens, so tokenizer.bos_token_id and
# tokenizer.eos_token_id are None. BERT's [CLS] and [SEP] tokens fill those
# roles: decoding starts on [CLS] and stops on [SEP].
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
model.config.vocab_size = model.config.decoder.vocab_size

# sensible parameters for beam search
model.config.max_length = 128
model.config.min_length = 64
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True
model.config.length_penalty = 2.0
model.config.num_beams = 4

In [None]:
import datasets
rouge = datasets.load_metric("rouge")

def compute_metrics(pred):
    """Compute ROUGE-1 and ROUGE-2 for a batch of generated summaries.

    Args:
        pred: Object with ``predictions`` (generated token ids) and
            ``label_ids`` (reference token ids, with -100 marking positions
            that were masked out of the labels).

    Returns:
        Dict with precision, recall and f-measure (the ``mid`` aggregate,
        rounded to 4 decimals) for ``rouge1`` and ``rouge2``.
    """
    pred_ids = pred.predictions
    # Build a new array instead of assigning into pred.label_ids, so the
    # caller's labels are left untouched. -100 is not a real token id and
    # must become the pad id before decoding.
    labels_ids = np.where(pred.label_ids == -100, tokenizer.pad_token_id, pred.label_ids)

    # all unnecessary tokens are removed
    pred_str = tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels_ids, skip_special_tokens=True)

    # One call scores both ROUGE variants, so the strings are processed once.
    rouge_types = ("rouge1", "rouge2")
    scores = rouge.compute(predictions=pred_str, references=label_str, rouge_types=list(rouge_types))

    metrics = {}
    for rouge_type in rouge_types:
        mid = scores[rouge_type].mid
        metrics[rouge_type + "_precision"] = round(mid.precision, 4)
        metrics[rouge_type + "_recall"] = round(mid.recall, 4)
        metrics[rouge_type + "_fmeasure"] = round(mid.fmeasure, 4)
    return metrics

## Train the model

In [None]:
# Frees up unused memory
import gc
# Collect unreachable Python objects first, so that memory they were holding
# can be handed back when the CUDA cache is emptied afterwards.
gc.collect()
torch.cuda.empty_cache()

In [None]:
training_args = Seq2SeqTrainingArguments(
    # where checkpoints are written
    output_dir="./",
    overwrite_output_dir=True,
    save_steps=500,
    save_total_limit=3,
    # what to run
    do_train=True,
    do_eval=True,
    evaluation_strategy='steps',
    predict_with_generate=True,
    # batch sizes
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # schedule (all values are step counts)
    warmup_steps=2000,
    logging_steps=1000,
    eval_steps=8000,
    fp16=True,  # Comment out this line if training on a non-CUDA device
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    compute_metrics=compute_metrics,
)
trainer.train()

## Evaluation

In [None]:
# Checkpoint directory written by the training cell (output_dir "./", save_steps 500).
model = EncoderDecoderModel.from_pretrained("checkpoint-500")
# Use the GPU when one is available, otherwise keep the model on the CPU.
model.to("cuda" if torch.cuda.is_available() else "cpu")
batch_size = 64

def evaluate_test_data(batch):
    """Generate a summary for every article in a batch.

    Args:
        batch: Mapping with an ``"article"`` list of strings.

    Returns:
        The same ``batch`` with a ``"pred"`` list holding the decoded
        summaries (special tokens removed).
    """
    inputs = tokenizer(
        batch["article"],
        padding="max_length",
        truncation=True,
        max_length=encoder_max_length,
        return_tensors="pt",
    )
    # Follow the model to whatever device it lives on, so the same code runs
    # with or without CUDA.
    device = model.device
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    outputs = model.generate(input_ids, attention_mask=attention_mask)
    batch["pred"] = tokenizer.batch_decode(outputs, skip_special_tokens=True)

    return batch


# Generate a summary per test article, then score them against the reference highlights.
results = test_data.map(
    evaluate_test_data,
    batched=True,
    batch_size=batch_size,
    remove_columns=["article"],
)

pred_str, label_str = results["pred"], results["highlights"]

rouge_output1, rouge_output2 = (
    rouge.compute(predictions=pred_str, references=label_str, rouge_types=[rouge_type])[rouge_type].mid
    for rouge_type in ("rouge1", "rouge2")
)

print(rouge_output1)
print(rouge_output2)

## Testing on different data

In [None]:
# Sample text for a manual test. It contains one line break, written as "\n"
# because a double-quoted string literal cannot span physical lines.
article = (
    "Our proposal looks into the reasonability, feasibility, and benefits of introducing plant-based dining options to the Boise State community. Currently, Boise State University lacks a vegan exclusive dining option on campus. This creates a variety of problems not only at the University, but at an ecological scale. With a current campus population of 22,064, there are an array of dietary restrictions, allergens, and lifestyles. Adding a plant based restaurant could help appease those students and also offer a healthier dining option. Nutrition is crucial for students and their success with studying, work ethic, and attendance. Extensive nutritional research says that plant based foods can directly affect mental capacity among school-aged children. As an example, iron deficiency can decrease dopamine transmission, thus negatively impacting a students cognition. A balanced, plant based diet can also induce better learning behaviors and learning environments. Lastly, researchers have found that plant based foods have impacted student’s exam scores and more positive school related outcomes. This also creates a convenience for students at BSU, who otherwise would search for food options off-campus. We believe there is extreme significance in this proposal that most others are unaware of. By introducing vegan options, there would be an improvement in student and faculty health for those customers. In an omnivorous diet, there are many factors that are destructive to our health. Heart disease, cancer, diabetes, obesity and strokes are just some of the few. With an ever changing environment that is rapidly increasing in warmth, humans are front and center for the cause of another imminent, mass extinction. Waste from animals whether nitrous oxide, ammonia, methane, or feces & urine, the waste amasses a total weight of 7,742,000,000 (or 7.742 billion) pounds. While the gases emitted attributes to air pollution. \n"
    "Research suggests that roughly 80% of ammonia emissions in the U.S comes from animal waste and it’s estimated that over 50 percent of world greenhouse gases. More than a third of all raw materials and fossil fuels consumed in the U.S. are used in animal agriculture. Researchers also state that animal agriculture is responsible for 9 percent of global carbon dioxide emissions, 35 percent to 40 percent of global methane emissions, and 65 percent of nitrous oxide emissions. These pollutants affect natural environments such as the amazon rainforest, the great barrier reef, and even places in our own backyard such as the Boise greenbelt. Not only would vegan dining options help combat climate change or health related illnesses, it would help combat future pandemics and animal cruelty. As COVID-19 continues to roar through the United States taking the lives of many, the source of the virus has been in question by conspirists yet, scientists have stated that it likely came from a wet market in Wuhan, China. Wet markets are similar to the likes of factory farms that house billions of animals worldwide annually. These compact and dirty environments that animals are bred in, and exploited, have been the source of three-fourths of the most recent pandemics in history. Diseases like SARS, HIV, COVID, Swine flu and bird flu all originated from animals being in close contact with humans. World renown doctors have compiled research and substantial evidence that if we do not change our food system, and work towards a plant based system, pandemics will become inevitable. Furthermore, authors behind a new doctor-backed white paper say they have tied most, if not all of the worlds most major outbreaks to animal exploitation since 1900. The evidence is damning, and if we do not begin working towards a plant based future, the feasibility of humankind attending universities, public gatherings, or being in physical contact with one another, becomes bleak. "
)
# Cut the text to the encoder's maximum length and move the tensors to the
# model's device; an over-long or wrong-device input makes generate() fail.
inputs = tokenizer(article, truncation=True, max_length=encoder_max_length, return_tensors="pt")
input_ids = inputs.input_ids.to(model.device)
output_ids = model.generate(input_ids, attention_mask=inputs.attention_mask.to(model.device))
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))

# Begin here if you only wish to run the pre-trained model comparisons

# Pre trained comp

* Load Datasets here
* https://huggingface.co/datasets?filter=task_ids:summarization,languages:en

In [None]:
# Summarization corpora used in the comparison; each tuple holds the
# positional arguments for load_dataset (name, optional config).
dataset_bill, dataset_cnn, dataset_sam = (
    load_dataset(*dataset_spec)
    for dataset_spec in (("billsum",), ("cnn_dailymail", "3.0.0"), ("samsum",))
)

### Adjust the number of articles you wish to summarize

In [None]:
# Number of test examples taken from each dataset; adjust to summarize more or fewer.
num_articles = 200


def _clean_pairs(split, text_column, summary_column, limit):
    """Return cleaned (documents, summaries) for the first ``limit`` rows of a split."""
    # Slice the rows once up front instead of looking the column up again
    # for every single row inside the loop.
    rows = split[:limit]
    documents = [clean_text(document) for document in tqdm(rows[text_column])]
    summaries = [clean_text(summary) for summary in rows[summary_column]]
    return documents, summaries


clean_data_cnn, clean_sum_cnn = _clean_pairs(dataset_cnn['test'], 'article', 'highlights', num_articles)
clean_data_bill, clean_sum_bill = _clean_pairs(dataset_bill['test'], 'text', 'summary', num_articles)
clean_data_sam, clean_sum_sam = _clean_pairs(dataset_sam['test'], 'dialogue', 'summary', num_articles)

In [None]:
from difflib import SequenceMatcher

def similar(a, b):
    """Return the difflib similarity ratio of ``a`` and ``b`` (no junk filter)."""
    matcher = SequenceMatcher(None, a, b)
    return matcher.ratio()

In [None]:
from summarizer import Summarizer
from transformers import pipeline

In [None]:
# Passed to summarize_models() as `model`, where it is called on a document
# before a retry when the direct pipeline call raises.
# NOTE(review): presumably the extractive summarizer from the `summarizer`
# package - confirm. This also rebinds the global `model` used by the
# fine-tuned BERT cells earlier in the notebook.
model = Summarizer()

### Add more models to this list if you wish to add them to the comparison
* https://huggingface.co/models?pipeline_tag=summarization

In [None]:
# Model names to compare; each one is loaded as a summarization pipeline.
sum_list = [
    "google/pegasus-cnn_dailymail",
    "t5-base",
    "sshleifer/distilbart-cnn-12-6",
    "facebook/bart-large-cnn",
    "nsi319/legal-led-base-16384",
    "google/pegasus-newsroom",
    "google/pegasus-wikihow",
    "ml6team/mt5-small-german-finetune-mlsum",
]

In [None]:
# One summarization pipeline per entry of sum_list, keyed by model name.
summarizers = {
    checkpoint: pipeline("summarization", model=checkpoint, tokenizer=checkpoint)
    for checkpoint in tqdm(sum_list)
}

In [None]:
def summarize_models(clean_data, clean_sum, summarizers, model):
    """Summarize every document with every summarizer and score the results.

    Args:
        clean_data: Cleaned source documents.
        clean_sum: Cleaned reference summaries, aligned with ``clean_data``.
        summarizers: Mapping of model name to a summarization callable whose
            result is read as ``result[0]['summary_text']``.
        model: Callable applied to a document to shorten it before a retry
            when the direct summarization raises.

    Returns:
        ``{model_name: {'precision', 'recall', 'fmeasure', 'similar'}}``.
        The first three are the ROUGE-2 ``mid`` values (0.0 when ROUGE could
        not be computed). ``'similar'`` is the fraction of produced summaries
        whose ``similar()`` ratio to their reference exceeds 0.1.
    """
    import warnings

    generate_kwargs = {'max_length': 100, 'min_length': 5, 'do_sample': False}

    new_scores = {}
    for model_name in tqdm(summarizers):
        summarizer = summarizers[model_name]
        pred = []
        gold = []
        for document, reference in zip(tqdm(clean_data), clean_sum):
            try:
                # Only the document is passed in; the reference summary is
                # not an input to the summarizer.
                summary_text = summarizer(document, **generate_kwargs)[0]['summary_text']
            except IndexError:
                # The document is skipped, and its reference with it, so
                # pred and gold stay aligned.
                continue
            except Exception:
                # Retry on a shortened version of the document.
                summary_text = summarizer(model(document), **generate_kwargs)[0]['summary_text']
            pred.append(summary_text)
            gold.append(reference)

        good_score = sum(
            1 for prediction, reference in zip(pred, gold)
            if similar(prediction, reference) > .1
        )

        try:
            rouge2 = rouge.compute(predictions=pred, references=gold, rouge_types=["rouge2"])["rouge2"].mid
            precision, recall, fmeasure = rouge2[0], rouge2[1], rouge2[2]
        except Exception as error:
            # Keep the zero scores as a fallback, but say so: silent zeros
            # look exactly like a model that scored nothing.
            warnings.warn("ROUGE could not be computed for {}: {!r}".format(model_name, error))
            precision = recall = fmeasure = 0.0

        new_scores[model_name] = {
            'precision': precision,
            'recall': recall,
            'fmeasure': fmeasure,
            # Divide by this model's own summary count; 0.0 when every
            # document was skipped.
            'similar': good_score / len(pred) if pred else 0.0,
        }
    return new_scores

## Call summarize_models() to summarize each cleaned dataset with every model in `summarizers`

In [None]:
# Score every pipeline in `summarizers` on the cleaned CNN/DailyMail test subset.
cnn_scores = summarize_models(clean_data_cnn, clean_sum_cnn, summarizers, model)

In [None]:
# Score every pipeline in `summarizers` on the cleaned billsum test subset.
bill_scores = summarize_models(clean_data_bill, clean_sum_bill, summarizers, model)

In [None]:
# Score every pipeline in `summarizers` on the cleaned samsum test subset.
sam_scores = summarize_models(clean_data_sam, clean_sum_sam, summarizers, model)

* Precision = $\frac{TP}{TP + FP}$
* Recall = $\frac{TP}{TP + FN}$
* Fmeasure = 2 * $\frac{Precision * Recall}{Precision + Recall}$

## Results
### If more models have been added copy the next two cells to show results, replacing `cnn_scores` with `{VALUE RETURNED FROM summarize_models()}`

In [None]:
# Per-model report for the CNN/DailyMail subset.
for model_name, metrics in cnn_scores.items():
    print("{}:\n\tPrecision: {}\n\tRecall: {}\n\tFmeasure: {}\n\tSimilar: {}\n\t".format(
        model_name,
        metrics['precision'],
        metrics['recall'],
        metrics['fmeasure'],
        metrics['similar'],
    ))

In [None]:
# Bar chart of the CNN/DailyMail scores: one bar group per metric, one bar per model.
pd.DataFrame(data=cnn_scores).plot(
    figsize=(10, 10),
    kind='bar',
)
plt.show()

In [None]:
# Per-model report for the billsum subset.
for model_name, metrics in bill_scores.items():
    print("{}:\n\tPrecision: {}\n\tRecall: {}\n\tFmeasure: {}\n\tSimilar: {}\n\t".format(
        model_name,
        metrics['precision'],
        metrics['recall'],
        metrics['fmeasure'],
        metrics['similar'],
    ))

In [None]:
# Bar chart of the billsum scores: one bar group per metric, one bar per model.
pd.DataFrame(data=bill_scores).plot(
    figsize=(10, 10),
    kind='bar',
)
plt.show()

In [None]:
# Per-model report for the samsum subset.
for model_name, metrics in sam_scores.items():
    print("{}:\n\tPrecision: {}\n\tRecall: {}\n\tFmeasure: {}\n\tSimilar: {}\n\t".format(
        model_name,
        metrics['precision'],
        metrics['recall'],
        metrics['fmeasure'],
        metrics['similar'],
    ))

In [None]:
# Bar chart of the samsum scores: one bar group per metric, one bar per model.
pd.DataFrame(data=sam_scores).plot(
    figsize=(10, 10),
    kind='bar',
)
plt.show()