In [None]:
import pathlib
import os
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from datasets import load_dataset
import re

## Loading Data

In [None]:
# Fetch the CNN/DailyMail corpus, configuration "3.0.0" (splits are indexed by name below).
dataset = load_dataset(path="cnn_dailymail", name="3.0.0")

In [None]:
# Peek at the first test article. Indexing the row first avoids building the
# entire 'article' column in memory just to read a single element.
dataset['test'][0]['article']
# len(dataset['test'][0]['highlights'])

## Data Preprocessing

In [None]:
#The following may not need to be used 
def clean_text(text):
    """Normalise a raw string into plain, space-separated ASCII text.

    The substitutions run in this order:

    1. ``@handle`` mentions are replaced by a space.
    2. ``http://`` / ``https://`` URLs are replaced by a space.
    3. Every character that is not an ASCII letter, a digit or one of
       ``. ! ? '`` is replaced by a space (this covers tabs and newlines).
    4. Runs of spaces are collapsed to a single space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Leading/trailing spaces are not stripped.
    """
    text = re.sub(r"@[A-Za-z0-9]+", ' ', text)
    text = re.sub(r"https?://[A-Za-z0-9./]+", ' ', text)
    # The upper-case range must be ``A-Z``: ``A-z`` also spans the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), which would then be kept.
    text = re.sub(r"[^a-zA-Z.!?'0-9]", ' ', text)
    # No separate tab substitution is needed; the class above already maps
    # tabs (and every other whitespace character) to a space.
    text = re.sub(r" +", ' ', text)
    return text

def load_data(path):
    """Read and clean the first line of every regular file directly under ``path``.

    Sub-directories are ignored. The number of files found is printed.

    Args:
        path: Directory containing the text files.

    Returns:
        A list with one cleaned string per file, in ``os.listdir`` order.
        An empty file contributes an empty string.
    """
    onlyfiles = [f for f in os.listdir(path) if os.path.isfile(os.path.join(path, f))]
    print('found {} files'.format(len(onlyfiles)))
    all_text = []
    for f in onlyfiles:
        with open(os.path.join(path, f)) as handle:
            # readline() yields '' for an empty file (readlines()[0] raised
            # IndexError) and does not load the rest of the file.
            all_text.append(clean_text(handle.readline()))

    return all_text

In [None]:
# Sanity-check the cleaner on the first test article; row-first indexing
# avoids loading the whole 'article' column for a single example.
clean_text(dataset['test'][0]['article'])

In [None]:
def overlapping_subsection(text, window=512, overlap=100, max_articles=1):
    """Split cleaned articles into fixed-size, overlapping character windows.

    Each article is passed through ``clean_text`` and then cut into slices of
    at most ``window`` characters, where every slice after the first starts
    ``overlap`` characters before the end of the previous one. An article no
    longer than ``window`` yields a single slice.

    Note that sizes are measured in characters, not tokens.

    Args:
        text: Iterable of raw article strings.
        window: Maximum length of each subsection, in characters.
        overlap: Number of characters shared by consecutive subsections.
        max_articles: How many articles to process from the start of ``text``.
            The default of 1 keeps the earlier "first article only" behaviour;
            pass ``None`` to process everything.

    Returns:
        A list with one entry per processed article; each entry is the list of
        that article's subsections. (Earlier versions printed the first
        article's subsections and returned nothing.)

    Raises:
        ValueError: If ``window`` is not positive or ``overlap`` is not in
            ``[0, window)``.
    """
    if window <= 0 or not 0 <= overlap < window:
        raise ValueError('window must be positive and 0 <= overlap < window')

    stride = window - overlap
    all_subsections = []
    for index, article in enumerate(text):
        if max_articles is not None and index >= max_articles:
            break
        article = clean_text(article)
        # A window is only worth starting if it contains at least one
        # character beyond the overlap with its predecessor; max(..., 1)
        # still yields one (possibly empty) slice for very short articles.
        last_start = max(len(article) - overlap, 1)
        all_subsections.append(
            [article[start:start + window] for start in range(0, last_start, stride)]
        )
    return all_subsections

In [None]:
# Exercise the chunking helper on the test-split articles.
overlapping_subsection(dataset['test']['article'])

## Create BERT Model

In [None]:
import transformers
from transformers import BertTokenizer, BertModel, EncoderDecoderModel
import torch
from tqdm import tqdm_notebook as tqdm


## Encode Data with BERT

In [None]:
# The tokenizer used to turn raw text into BERT input ids
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert from pre-trained checkpoints

# BERT checkpoints accept at most 512 positions, so longer inputs must be truncated.
MAX_INPUT_TOKENS = 512

# forward
# Encode ONE article. Passing the whole column (a list of strings) makes the
# tokenizer treat each article as a single pre-split "word" instead of
# tokenizing its text.
sample_article = dataset['test'][0]['article']
input_ids = torch.tensor(
    tokenizer.encode(
        sample_article,
        add_special_tokens=True,
        truncation=True,
        max_length=MAX_INPUT_TOKENS,
    )
).unsqueeze(0)  # Batch size 1
# Plain inference: no gradients are needed for this smoke test.
with torch.no_grad():
    outputs = model(input_ids=input_ids, decoder_input_ids=input_ids)

# # training
# outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
# loss, logits = outputs.loss, outputs.logits

# # save and load from pretrained
# model.save_pretrained("bert2bert")
# model = EncoderDecoderModel.from_pretrained("bert2bert")

# # generation
# generated = model.generate(input_ids, decoder_start_token_id=model.config.decoder.pad_token_id)

In [None]:
class BERTClass(torch.nn.Module):
    """Classifier head on top of the BERT encoder of a Bert2Bert model.

    The first-token hidden state produced by the encoder is fed through a
    linear layer and a softmax over the ``NUM_OUT`` outputs.

    Args:
        NUM_OUT: Number of output classes.
    """

    def __init__(self, NUM_OUT):
        super().__init__()

        self.l1 = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
        # Read the width from the encoder config rather than hard-coding 768
        # (the two agree for bert-base-uncased).
        hidden_size = self.l1.encoder.config.hidden_size
        self.classifier = torch.nn.Linear(hidden_size, NUM_OUT)
        self.softmax = torch.nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Return class probabilities for a batch of token-id sequences.

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Mask matching ``input_ids``.
            token_type_ids: Optional segment ids forwarded to the encoder.

        Returns:
            Tensor of shape (batch, NUM_OUT) whose rows sum to 1.
        """
        # Only the encoder is run: calling the full encoder-decoder needs
        # decoder inputs, and its first output is vocabulary logits rather
        # than hidden states, which would not fit the classifier.
        encoder_out = self.l1.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        hidden_state = encoder_out[0]
        # Hidden state at the first position of every sequence.
        pooler = hidden_state[:, 0]
        output = self.classifier(pooler)
        # NOTE(review): if this is trained with CrossEntropyLoss, return the
        # raw logits instead; that loss applies log-softmax itself.
        output = self.softmax(output)
        return output



In [None]:
# tokenized_dataset = []
# model = BERTClass(NUM_OUT)
# model.to(device)
# for i in range(len(dataset['test'])):
#     # forward
#     input_ids = torch.tensor(tokenizer.encode(dataset['test']['highlights'][i], add_special_tokens=True)).unsqueeze(0)  # Batch size 1
#     tokenized_dataset.append(model(input_ids=input_ids, decoder_input_ids=input_ids))

# # training
# outputs = model(input_ids=input_ids, decoder_input_ids=input_ids, labels=input_ids)
# loss, logits = outputs.loss, outputs.logits

# Pre-trained Model Comparison

In [None]:
# Number of training examples to clean and summarise. Kept tiny so the
# comparison below runs quickly; raise it for a more meaningful evaluation.
N_SAMPLES = 1

clean_data = []
clean_sum = []
train_split = dataset['train']
for idx in tqdm(range(min(N_SAMPLES, len(train_split)))):
    # Fetch one row at a time; indexing the column first would load every
    # article of the split on each access.
    example = train_split[idx]
    clean_data.append(clean_text(example['article']))
    clean_sum.append(clean_text(example['highlights']))

In [None]:
from summarizer import Summarizer
from transformers import pipeline

In [None]:
# Instantiate the Summarizer imported from `summarizer`. This rebinds `model`,
# which an earlier cell assigned to the Bert2Bert EncoderDecoderModel; the
# prediction loop below calls `model(...)` as its fallback.
model = Summarizer()

In [None]:
# Checkpoint names of the pre-trained summarization models to compare.
sum_list = [
    "google/pegasus-cnn_dailymail",
    "t5-base",
    "sshleifer/distilbart-cnn-12-6",
    "facebook/bart-large-cnn",
    "nsi319/legal-led-base-16384",
    "google/pegasus-newsroom",
    "google/pegasus-wikihow",
    "ml6team/mt5-small-german-finetune-mlsum",
]

In [None]:
# Build one summarization pipeline per checkpoint, keyed by checkpoint name.
summarizers = {
    checkpoint: pipeline("summarization", model=checkpoint, tokenizer=checkpoint)
    for checkpoint in tqdm(sum_list)
}

In [None]:
# Generation settings shared by every summarizer call.
GEN_KWARGS = dict(max_length=100, min_length=5, do_sample=False)

sum_preds = {}
for m in tqdm(summarizers):
    summary = []
    for article in tqdm(clean_data):
        # Only the article is passed in: the gold summary must never be given
        # to the model whose output is later scored against it.
        try:
            result = summarizers[m](article, **GEN_KWARGS)
        except Exception:
            # Fallback: shorten the article with the `model` Summarizer first,
            # then summarise that shorter text. `Exception` (not a bare
            # except) lets KeyboardInterrupt stop a long run.
            result = summarizers[m](model(article), **GEN_KWARGS)
        summary.append(result[0]['summary_text'])
    sum_preds[m] = summary

In [None]:
from difflib import SequenceMatcher

def similar(a, b):
    """Return difflib's similarity ratio between ``a`` and ``b``.

    The value is a float in [0, 1]; 1.0 means the sequences are identical.
    """
    matcher = SequenceMatcher(isjunk=None, a=a, b=b)
    return matcher.ratio()

In [None]:
# A prediction counts as "good" when its similarity to the reference summary
# exceeds this ratio.
SIMILARITY_THRESHOLD = .1

sum_scores = {}
for model_name in tqdm(sum_preds):
    # Use this model's own predictions for the count; do not depend on a loop
    # variable left behind by another cell.
    predictions = sum_preds[model_name]
    good = sum(
        1
        for pred, gold in zip(predictions, clean_sum)
        if similar(pred, gold) > SIMILARITY_THRESHOLD
    )
    # Guard against an empty prediction list instead of dividing by zero.
    sum_scores[model_name] = good / len(predictions) if predictions else 0.0


In [None]:
# Per-model fraction of predictions whose similarity to the reference exceeded the cut-off.
sum_scores

## Create BERT Decoder

## Train Model

## Validation

## Testing on the Test Set

## Results

## Future Work