# Extractive Summary as Text Matching on *FairySum*  🧚

## Imports & Downloads

In [None]:
# install the requirements (pip output is redirected to /dev/null to keep the cell quiet)
%pip install -r requirements.txt > /dev/null
# set to false if you already have the dataset
download_dataset = False 
if download_dataset:
    # run the download script from inside the FairySum folder, then step back up one directory
    %cd FairySum
    !bash download_dataset.sh
    %cd ..

In [None]:
from src.hyperparameters import Hparams
from sbert.baseline import SentenceBERT
from sbert.regression_model import execute_booknlp_pipeline
from sbert.regression_model import count_event_sentence
from sbert.regression_model import LengthRegressionModel
from src.data_module import FairySum_Dataset, FairySum_DataModule
from src.model import MatchSum
from src.train import train_model

import dataclasses
from dataclasses import asdict
import matplotlib.pyplot as plt
import wandb
import pprint
import json
import torchvision
import pytorch_lightning as pl
import gc
from collections import Counter
import seaborn as sns
from tqdm import tqdm
import os
import pandas as pd
import numpy as np
from math import comb
import random
from datasets import load_metric
# reproducibility stuff
# NOTE(review): numpy and random are already imported earlier in this cell; the re-imports below are harmless.
import numpy as np
import random
import torch
# seed every random number generator used in the notebook with the same fixed value (0)
np.random.seed(0)
random.seed(0)
torch.cuda.manual_seed(0)
torch.manual_seed(0)
torch.backends.cudnn.deterministic = True  # Note that this Deterministic mode can have a performance impact
torch.backends.cudnn.benchmark = False
# NOTE(review): seed_everything presumably re-seeds python/numpy/torch on its own, which would make
# the explicit calls above redundant — confirm against the Lightning docs before removing them.
_ = pl.seed_everything(0)
# to have a better workflow using notebook https://stackoverflow.com/questions/5364050/reloading-submodules-in-ipython
# these commands reload the imported .py modules when they change, instead of re-importing everything every time.
%load_ext autoreload
%autoreload 2
#%env WANDB_NOTEBOOK_NAME = ./notebook.ipynb
# run a garbage-collection pass before the heavy cells below
gc.collect()

In [None]:
# login wandb to have the online logger. It is really useful since it stores all the plots and evolution of the model
# check also https://docs.wandb.ai/guides/integrations/lightning
wandb.login()

## Utilities

In [None]:
# order a dictionary based on its keys in order to manipulate it better
def order_dict(d):
    """Return a new dictionary holding the items of ``d`` in ascending key order.

    The input dictionary is left untouched; only the insertion order of the
    returned copy differs.
    """
    return {key: d[key] for key in sorted(d)}

# evaluate performance of SBERT
def evaluate_performance_baseline(summaries): # the input is only a dictionary with the indices of the summary sentences for each story
    """Print per-story and average ROUGE-1/2/L F-measures for the baseline summaries.

    Args:
        summaries: dictionary mapping a story id to the list of sentence indices
            selected as its summary. The indices are looked up in
            ``FairySum_DataModule.texts[story]``.

    For every story the predicted summary is compared against each gold summary
    in ``FairySum_DataModule.gold_test[story]`` and the scores are averaged over
    the gold summaries. Nothing is returned; results are printed.
    """
    # (name used in our score dictionaries, key used by the rouge metric)
    metric_keys = (("rouge_1", "rouge1"), ("rouge_2", "rouge2"), ("rouge_L", "rougeL"))
    rouge = load_metric("rouge")
    rouge_score_list = []
    for story, indices_list in summaries.items():
        sentences = FairySum_DataModule.texts[story]
        summary = ' '.join([sentences[i] for i in indices_list])
        gold_list = FairySum_DataModule.gold_test[story]
        num_gold = len(gold_list)
        story_score = {name: 0 for name, _ in metric_keys}
        for g in gold_list:
            gold_summary = ' '.join([sentences[i] for i in g])
            results = rouge.compute(predictions=[summary], references=[gold_summary])
            for name, rouge_key in metric_keys:
                aggregate = results[rouge_key]
                story_score[name] += ((aggregate.low.fmeasure+aggregate.mid.fmeasure+aggregate.high.fmeasure)/3)
        # average over the gold summaries exactly once: the previous code divided the
        # already averaged values by num_gold a second time before printing them,
        # so the per-story lines under-reported the scores (the global averages were fine)
        for name in story_score:
            story_score[name] = story_score[name] / num_gold

        print(f"Story: {story}")
        print(f"ROUGE-1: {story_score['rouge_1']:.3f}")
        print(f"ROUGE-2: {story_score['rouge_2']:.3f}")
        print(f"ROUGE-L: {story_score['rouge_L']:.3f}")
        print("-------------------------------------------------")
        rouge_score_list.append(story_score)
    print()
    all_rouge_1 = [e["rouge_1"] for e in rouge_score_list]
    print(f"AVERAGE ROUGE-1 for all the stories: {np.array(all_rouge_1).mean()}")
    print()
    all_rouge_2 = [e["rouge_2"] for e in rouge_score_list]
    print(f"AVERAGE ROUGE-2 for all the stories: {np.array(all_rouge_2).mean()}")
    print()
    all_rouge_L = [e["rouge_L"] for e in rouge_score_list]
    print(f"AVERAGE ROUGE-L for all the stories: {np.array(all_rouge_L).mean()}")
    print()

# evaluate performance of my models
def evaluate_performance(model, data):
    """Print per-story and average ROUGE-1/2/L F-measures for a trained model.

    Args:
        model: model exposing ``eval()``, ``device`` and ``predict(batch)``;
            ``predict`` is expected to return one best candidate summary per
            story in the batch (presumably as text — confirm against the model).
        data: data module whose ``val_dataloader()`` yields batches with the
            keys ``"text"``, ``"candidates"`` and ``"id"``.

    Each predicted summary is compared against every gold summary in
    ``FairySum_DataModule.gold_test[story]`` and the scores are averaged over
    the gold summaries. Nothing is returned; results are printed.
    """
    # (name used in our score dictionaries, key used by the rouge metric)
    metric_keys = (("rouge_1", "rouge1"), ("rouge_2", "rouge2"), ("rouge_L", "rougeL"))
    model.eval()
    device = model.device
    dataset = data.val_dataloader()
    # load the metric once instead of once per batch
    rouge = load_metric("rouge")

    rouge_score_list = []
    with torch.no_grad():
        for batch in tqdm(dataset): # tqdm lets us visualise the progress while reading the dataset
            # move the tokenized text and every candidate onto the model's device
            batch["text"] = {k : v.to(device) for k,v in batch["text"].items()}
            batch["candidates"] = [{k : v.to(device) for k,v in e.items()} for e in batch["candidates"]]
            best_candidates_list = model.predict(batch)

            # COMPUTE ROUGE SCORES
            for story, best_candidate in zip(batch["id"], best_candidates_list):
                sentences = FairySum_DataModule.texts[story]
                gold_list = FairySum_DataModule.gold_test[story]
                num_gold = len(gold_list)
                story_score = {name: 0 for name, _ in metric_keys}
                for g in gold_list:
                    gold_summary = ' '.join([sentences[i] for i in g])
                    results = rouge.compute(predictions=[best_candidate], references=[gold_summary])
                    for name, rouge_key in metric_keys:
                        aggregate = results[rouge_key]
                        story_score[name] += ((aggregate.low.fmeasure+aggregate.mid.fmeasure+aggregate.high.fmeasure)/3)
                # average over the gold summaries exactly once: the previous code divided the
                # already averaged values by num_gold a second time before printing them,
                # so the per-story lines under-reported the scores (the global averages were fine)
                for name in story_score:
                    story_score[name] = story_score[name] / num_gold

                print(f"Story: {story}")
                print(f"ROUGE-1: {story_score['rouge_1']:.3f}")
                print(f"ROUGE-2: {story_score['rouge_2']:.3f}")
                print(f"ROUGE-L: {story_score['rouge_L']:.3f}")
                print("-------------------------------------------------")
                rouge_score_list.append(story_score)
    print()
    all_rouge_1 = [e["rouge_1"] for e in rouge_score_list]
    print(f"AVERAGE ROUGE-1 for all the stories: {np.array(all_rouge_1).mean()}")
    print()
    all_rouge_2 = [e["rouge_2"] for e in rouge_score_list]
    print(f"AVERAGE ROUGE-2 for all the stories: {np.array(all_rouge_2).mean()}")
    print()
    all_rouge_L = [e["rouge_L"] for e in rouge_score_list]
    print(f"AVERAGE ROUGE-L for all the stories: {np.array(all_rouge_L).mean()}")
    print()
                
            

## Preprocessing

### Candidates Extraction

#### **Phase 1** - *output summaries length regression*

In [None]:
# training the regression model for predicting the best summary length (i.e. the number of sentences to be extracted)

"""
    - The original texts length are saved in 'original_length.json' file. 
    - The golden output lengths (that are the computed average lenghts of the manually annotated summaries by the students) are saved 
      in the 'gold_length.json' file.
      We now need to compute a quantity which quantifies somehow the concentration of events in each story. We leverage the BookNLP library 
      to do so. The adopted strategy is very simple: once the library has detected the events, we count the number of unique sentences that
      contain at least one EVENT (it means that these sentences are relevant for the story).
"""

texts = json.load(open("data/texts.json","r"))

booknlp_already_computed = True if os.path.isdir("data/booknlp_processed_texts")==True else False
if not booknlp_already_computed:
  from booknlp.booknlp import BookNLP
  !python -m spacy download en_core_web_sm > /dev/null # needed for the BookNLP library
  model_params={"pipeline":"entity,event", "model":"big"}
  booknlp = BookNLP("en", model_params)
  texts = json.load(open("data/texts.json","r"))
  execute_booknlp_pipeline(booknlp, texts)
  !rm "data/current_story.txt"
  !rm -r "data/current"
  
events_already_computed = True
if not events_already_computed:
  count_event_sentence("data/booknlp_processed_texts/", texts)

# starting the regression phase
# since we miss one golden summary --> 'bn_02975525n_Rothschild_s Violin', we need to remove it from 'events' and from 'original_lenghts'
events = json.load(open("data/events.json","r"))
del events["bn_02975525n"]
events = order_dict(events)

gold_lengths = json.load(open("data/gold_length.json","r"))
gold_lengths = order_dict(gold_lengths)
original_lenghts = json.load(open("data/original_length.json","r"))
del original_lenghts["bn_02975525n"]
original_lenghts = order_dict(original_lenghts)

# instantiate the regression model for predicting  the output length of our generated extractive summaries
LengthRegressionModel = LengthRegressionModel(gold_lengths, original_lenghts, events)
# LengthRegressionModel.plot() # if we want to plot the regression curve
LengthRegressionModel.fit()
predictions = LengthRegressionModel.predict()


#### **Phase 2** - *sentences extraction*

In [None]:
# configure SBERT for the extraction phase, passing the predicted summary lengths
hparams = asdict(Hparams())
hparams["sbert_mode"] = "extraction"
sbert = SentenceBERT(hparams, predictions)

# read the stories through a context manager so the file handle is closed right away
with open("data/texts.json","r") as fp:
    texts = json.load(fp)
extracted_sentences = sbert(texts) # we receive a dictionary with the extracted sentences indices for each story

#### **Phase 3** - *sentences selection*

In [None]:
# selection strategy

k_range = hparams["k_range"]
pick_random_n = hparams["pick_random_n"]

candidates_dict = {}
for story, sentence_ids in extracted_sentences.items():
    n = len(sentence_ids)
    # the full set of extracted sentences is always the first candidate
    candidates_list = [sentence_ids]
    # then, for every size from n-k_range up to n-1, draw 'pick_random_n' random subsets of that size
    # NOTE(review): when n <= k_range the size reaches 0 (empty candidate) or goes negative
    # (random.sample raises ValueError) — confirm every story has more than k_range sentences
    for size in range(n-k_range, n):
        #c = comb(n, size) # total number of combinations
        for _ in range(pick_random_n): # how many random combinations?
            # sample positions, then sort them so the candidate keeps the original sentence order
            positions = sorted(random.sample(range(n), size))
            candidates_list.append([sentence_ids[p] for p in positions])

    candidates_dict[story] = candidates_list

# save them
with open("data/candidates/candidates.json", "w") as fp:
    json.dump(candidates_dict, fp)

#### Compute candidates ROUGE scores
*We need it for the training phase.*

In [None]:
# we need to know which are the training texts:
# a story key is made of the first two "_"-separated fields of its file name
train_keys = ["_".join(file_name.split("_")[:2]) for file_name in os.listdir("FairySum/texts/train/")]

In [None]:
# starting from the candidates.json file
with open("data/candidates/candidates.json", "r") as fp:
    candidates_dict = json.load(fp)

with open("data/gold/gold.json", "r") as fp:
    gold_dict = json.load(fp)
with open("data/texts.json", "r") as fp:
    texts = json.load(fp)

# we need to build these two dictionaries (story key -> list of summary texts)
ROUGE_predictions = {}
ROUGE_references = {}

# candidate summaries, only for the training stories
for k,v in candidates_dict.items():
    if k not in train_keys:
        continue
    original_story = texts[k]
    for candidate_indices in v:
        text_candidate = [original_story[i] for i in candidate_indices]
        ROUGE_predictions.setdefault(k, []).append(" ".join(text_candidate))

# gold summaries
for k,v in gold_dict.items():
    original_story = texts[k]
    for gold_indices in v:
        text_gold = [original_story[i] for i in gold_indices]
        ROUGE_references.setdefault(k, []).append(" ".join(text_gold))

# now we can start computing the ROUGE scores:
# each candidate gets its ROUGE-L F-measure averaged over all the gold summaries of its story
rouge = load_metric("rouge")
scores = {}
for story, candidates in ROUGE_predictions.items():
    references = ROUGE_references[story]
    for candidate in tqdm(candidates):
        score = 0
        for gold in references:
            results = rouge.compute(predictions=[candidate], references=[gold])
            rouge_l = results["rougeL"]
            score += ((rouge_l.low.fmeasure+rouge_l.mid.fmeasure+rouge_l.high.fmeasure)/3)
        score = score/len(references)
        scores.setdefault(story, []).append(score)

# save the scores dictionary into the 'candidates' folder
with open("data/candidates/scores.json", "w") as fp:
    json.dump(scores, fp)

### Abstractive Summaries

We use the SOTA **PEGASUS model** for computing the abstractive summaries needed for training. We simply download a pretrained model and use it as it is (*plug-and-play*).

In [None]:
from transformers import PegasusForConditionalGeneration, PegasusTokenizer

# run on the GPU when one is available, otherwise fall back to the CPU
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'

# tokenizer of the pretrained checkpoint
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-large")

# pretrained weights, then moved onto the selected device
model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-large")
model = model.to(torch_device)

In [None]:
# we need to know which are the training texts:
# the key of a story is the first two "_"-separated fields of its file name
train_keys = [
    "_".join(train_file.split("_")[:2])
    for train_file in os.listdir("FairySum/texts/train/")
]

In [None]:
with open("data/texts.json","r") as fp:
    texts = json.load(fp)
abstractives = {}

# we make inference on this model
model.eval()
with torch.no_grad():
    # iterating the items directly lets tqdm show the total number of stories
    for k, v in tqdm(texts.items()):
        # abstractive summaries are only needed for the training stories
        if k not in train_keys:
            continue
        # we need to have the text available as a whole
        text = " ".join([e+"\n" for e in v])
        # create tokens batch (calling the tokenizer directly replaces the deprecated
        # 'prepare_seq2seq_batch'; the input is truncated to the model maximum length)
        batch = tokenizer(text, truncation=True, padding="longest", return_tensors="pt").to(torch_device)
        # summary in tokens
        summary = model.generate(**batch)
        # (we need to decode it)
        abstractive_output = tokenizer.batch_decode(summary, skip_special_tokens=True)[0]
        abstractives[k] = abstractive_output
        ##########################
        ####   free GPU RAM   ####
        ##########################
        del batch
        del summary
        del abstractive_output
        torch.cuda.empty_cache()
        ##########################
        
# save the dictionary
with open("data/abstractives/abstractives.json", "w") as fp:
    json.dump(abstractives, fp)

> We use the *pegasus-large* pretrained model because it outputs longer summaries; however, since it is trained mainly on news articles, which are shorter than the stories in our dataset, the output summaries are certainly not ideal. This can be a research direction to follow for future improvements of the overall method. For this mini-project we keep the model as it is, hoping to achieve decent results.

### Gold Summaries processing

In [None]:
# we need to know which are the training texts
# (story key = first two "_"-separated fields of the file name)
train_keys = ["_".join(name.split("_")[:2]) for name in os.listdir("FairySum/texts/train/")]

In [None]:
# we want to create a dictionary for each gold summary with only the sentence indices:
#   - 'gold' for the TRAIN texts (training phase)
#   - 'gold_test' for the TEST texts (evaluation performances pipeline)
# every gold file is read once and routed to the right dictionary
gold = {}
gold_test = {}
for gold_file in os.listdir("FairySum/gold/"):
    # story key = first two "_"-separated fields of the file name
    key = "_".join(gold_file.split("_")[:2])
    # the context manager closes each file as soon as it has been read
    with open("FairySum/gold/"+gold_file, 'r') as f:
        # every line starts with "<sentence index>:"; blank lines are skipped
        text = [int(l[:l.index(":")]) for l in f if l.strip()]
    target = gold if key in train_keys else gold_test
    # if we have more than one gold summary for a story we append to its list
    target.setdefault(key, []).append(text)

# save the dictionaries
with open("data/gold/gold.json", "w") as f:
    json.dump(gold, f)
with open("data/gold/gold_test.json", "w") as f:
    json.dump(gold_test, f)

## Dataset

In [None]:
hparams = asdict(Hparams())

In [None]:
FairySum_Data = FairySum_DataModule(hparams)
# to setup it takes 0.2s (but because the most demanding operations are made at "batch time")
FairySum_Data.setup()
# size of the two splits
n_train = len(FairySum_Data.data_train)
n_test = len(FairySum_Data.data_test)
print(n_train) # -->  75 stories
print(n_test) # -->  16 stories
print("TOTAL: "+str(n_train+n_test)+" Fairy Tales and Short Stories")

In [None]:
batch1 = next(iter(FairySum_Data.train_dataloader()))
batch2 = next(iter(FairySum_Data.val_dataloader()))

## Model

### Finetuning on FairySum

In [None]:
# identifiers of the wandb run that logs this experiment
user_name = "lavallone"
project_name = "NUANS_project"
version_name = "prova"
run = wandb.init(
    entity=user_name,
    project=project_name,
    name=version_name,
    mode="online",
)

# fresh hyperparameters, data module and model for the fine-tuning run
hparams = asdict(Hparams())
data = FairySum_DataModule(hparams)
model = MatchSum(hparams)
trainer = train_model(
    data,
    model,
    experiment_name=version_name,
    patience=5,
    metric_to_monitor="val_ROUGE",
    mode="max",
    epochs=1,
)

# close the wandb run once training is over
wandb.finish()

## Evaluation

### Baseline
> Computing the performances of the *baseline* model. These will be the values to beat.

In [None]:
# configure SBERT for the evaluation phase
hparams = asdict(Hparams())
hparams["sbert_mode"] = "evaluation"
sbert = SentenceBERT(hparams)

# read the stories through a context manager so the file handle is closed right away
with open("data/texts.json","r") as fp:
    texts = json.load(fp)
# we only give the texts from the TEST set
# (story key = first two "_"-separated fields of the file name)
test_keys = ["_".join(f.split("_")[:2]) for f in os.listdir("FairySum/texts/test/")]
test_texts = {k:v for k,v in texts.items() if k in test_keys}

summaries = sbert(test_texts)
evaluate_performance_baseline(summaries)

### MatchSum
> Computing the performances of my solutions.

In [None]:
# load the model from a saved checkpoint instead of using the one trained in this session
# NOTE(review): when load_ckpt is False this cell relies on a 'model' variable created by an earlier cell.
load_ckpt = True
if load_ckpt:
    best_ckpt = "models/prova-epoch=00-val_ROUGE=0.6224.ckpt"
    # NOTE(review): 'device' is forwarded as an extra keyword argument; presumably it is consumed by
    # MatchSum's constructor — confirm (Lightning's own placement argument is 'map_location').
    model = MatchSum.load_from_checkpoint(best_ckpt, strict=False, device = "cuda" if torch.cuda.is_available() else "cpu")
# if we want to test without training we need to setup the data
trained = False 
if not trained:
    hparams = asdict(Hparams())
    data = FairySum_DataModule(hparams)
    data.setup()

# print the per-story and average ROUGE scores of the model
evaluate_performance(model, data)

In [11]:
# try to open the MatchSum checkpoint file
# NOTE(review): the recorded output of this cell is "ModuleNotFoundError: No module named 'model'";
# presumably the checkpoint pickles objects from a module called 'model' that is not importable
# here — confirm. torch.load unpickles its input, so only load checkpoints from a trusted source.
import torch
ckpt = torch.load('MatchSum_cnndm_bert.ckpt')

ModuleNotFoundError: No module named 'model'