The function *Summarize_texts* is called in this notebook.

This function contains the following input parameters: 

- `max_d_list`: A list of values for the hyperparameter *max_d* (between [0, 1])
- `diff_bits_list`: List of values for the *diff_bits* hyperparameter (between [0,5000])
- `max_d_sent_list`: List of values for the  *max_d_sent* hyperparameter (between [0, 5000])
- `size_summary`: Summary size, by default 200.
- `texts_list`: Number of documents to summarize from 1 to 57 (all)
- `Directory`: Location of the files to summarize
- `Precalc_Directory`: Location where precomputed distances are stored, so they can be reused to save time
- `output_dir`: Final location of summary files


## Calling summarization function

An example of how `Summarize_texts` can be called to summarize all texts in a given directory:

In [9]:
# Execute the summarizer notebook first; presumably this is what defines
# Summarize_texts in the current session.
%run Summarize_Texts.ipynb

# Summarize every text found in Directory and write the results to output_dir.
Summarize_texts(max_d_list = [0.76],        # Values in [0, 1]
              diff_bits_list = [2000],      # Values in [0, 5000]
              max_d_sent_list = [4000],     # Values in [0, 5000]
              size_summary = 200,           # Summary size (100, 200 or 400)
               Directory = 'Texts/',        # Text directory
               output_dir = 'Summaries/')   # Output directory

d082a.txt
d083a.txt
d084a.txt
d085d.txt
d086d.txt
d087d.txt
d089d.txt
d090d.txt
d091c.txt
d092c.txt
d093c.txt
d094c.txt
d095c.txt
d096c.txt
d097e.txt
d099e.txt
d100e.txt
d101e.txt
d102e.txt
d103g.txt
d104g.txt
d105g.txt
d106g.txt
d107g.txt
d108g.txt
d109h.txt
d110h.txt
d111h.txt
d112h.txt
d113h.txt
d114h.txt
d115i.txt
d116i.txt
d117i.txt
d118i.txt
d119i.txt
d120i.txt


## Evaluation Functions

The following functions require pyrouge, a Python wrapper for the ROUGE-1.5.5 evaluation package, to be installed.

In [None]:
# Python wrapper for the ROUGE summarization evaluation package
from pyrouge import Rouge155

# Miscellaneous operating system interfaces
import os

# High-level file operations
import shutil


In [None]:
def evaluate_rouge_one_file(rg_path, ps, pm, name_summmary, name_goold_std, year, folder_results, folder_experiment, ID_names_summaries, ID_names_gold_std, gen_summaries, ID_gold_std):
    """Run ROUGE on one summary / gold-standard pair and return the scores.

    Args:
        rg_path: Base directory that holds the ROUGE input folders.
        ps: Suffix appended to ``rg_path`` to get the system-summaries folder.
        pm: Suffix appended to ``rg_path`` to get the gold-standard folder.
        name_summmary: Filename suffix of the generated summary.
        name_goold_std: Filename suffix of the gold-standard document.
        ID_names_summaries: Filename prefix of the generated summary.
        ID_names_gold_std: Filename prefix of the gold-standard document.
        year, folder_results, folder_experiment, gen_summaries, ID_gold_std:
            Accepted but not used by this function.

    Returns:
        The dictionary produced by ``Rouge155.output_to_dict``.
    """
    rouge = Rouge155()

    # Folders pyrouge reads the system and model (gold-standard) files from
    rouge.system_dir = rg_path + ps
    rouge.model_dir = rg_path + pm

    # Filename patterns: a single group wrapping "<prefix><suffix>", then ".txt"
    rouge.system_filename_pattern = f'({ID_names_summaries}{name_summmary}).txt'
    rouge.model_filename_pattern = f'({ID_names_gold_std}{name_goold_std}).txt'

    raw_output = rouge.convert_and_evaluate()
    return rouge.output_to_dict(raw_output)

In [None]:
def prom_rouge_golds(n_gs, rouge_meas_list):
    """Average a ROUGE measure across all gold standards.

    ``rouge_meas_list`` is read as ``n_gs`` consecutive groups of equal
    length ``s = len(rouge_meas_list) // n_gs``, one group per gold standard.
    The value returned is the mean, over the ``n_gs`` groups, of the last
    entry of each group. In the common case of one entry per gold standard
    (``s == 1``) this is simply the mean of the list.

    Args:
        n_gs: Number of gold standards (must be positive).
        rouge_meas_list: Measures laid out as described above.

    Returns:
        The averaged measure as a single number.

    Raises:
        ValueError: If ``n_gs`` is not positive or the list holds fewer than
            ``n_gs`` values.
    """
    if n_gs <= 0:
        raise ValueError('n_gs must be a positive number of gold standards')

    group_size = len(rouge_meas_list) // n_gs
    if group_size == 0:
        raise ValueError('rouge_meas_list must hold at least one value per gold standard')

    # The previous implementation always added exactly two groups, which broke
    # for one gold standard and was wrong for three or more; sum over all of
    # them instead. For n_gs == 2 the result is unchanged.
    last = group_size - 1
    total = sum(rouge_meas_list[group * group_size + last] for group in range(n_gs))
    return total / n_gs

In [None]:
def ROUGE_measures_summaries(ID_gold_std,
                            ID_names_summaries,
                            ID_names_gold_std, name_goold_std,
                            sc_gold_std_path, name_summmary,
                            rg_gold_std_path, sc_summaries_path,
                            rg_summaries_path, rg_path, ps, pm,
                            year, folder_results, folder_experiment,
                            gold_std_path_template='/Users/flintlock/Desktop/model_summaries{}_2002_unsup'):
    """Average ROUGE-1 and ROUGE-2 recall of the generated summaries.

    For every gold-standard ID and every summary, the gold-standard document
    and the generated summary are copied to their secondary folders, moved
    into the ROUGE folders, scored with ``evaluate_rouge_one_file`` and then
    removed from the ROUGE folders again. Recall is averaged over the
    summaries for each gold standard and then combined across gold standards
    with ``prom_rouge_golds``.

    NOTE(review): this function reads the module-level name ``current_dir``
    and calls ``folders_2_list``; neither is defined here, so both are assumed
    to exist in the notebook namespace.

    Args:
        ID_gold_std: Gold-standard IDs; each is formatted into
            ``gold_std_path_template``.
        ID_names_summaries: Filename prefixes of the generated summaries.
        ID_names_gold_std: Filename prefixes of the gold-standard documents,
            index-aligned with ``ID_names_summaries``.
        name_goold_std: Filename suffix of the gold-standard documents.
        sc_gold_std_path: Secondary (staging) folder for gold standards.
        name_summmary: Filename suffix of the generated summaries.
        rg_gold_std_path: ROUGE folder for gold standards.
        sc_summaries_path: Secondary (staging) folder for summaries.
        rg_summaries_path: ROUGE folder for summaries.
        rg_path, ps, pm, year, folder_results: Forwarded unchanged to
            ``evaluate_rouge_one_file``.
        folder_experiment: Passed to ``folders_2_list`` and forwarded to
            ``evaluate_rouge_one_file``.
        gold_std_path_template: ``str.format`` template of the folder holding
            the original gold-standard documents; ``{}`` receives the
            gold-standard ID. Defaults to the previously hard-coded location.

    Returns:
        ``[rouge_1_recall, rouge_2_recall]``, each averaged over all summaries
        and gold standards.

    Raises:
        ValueError: If ``ID_names_summaries`` is empty.
    """
    if len(ID_names_summaries) == 0:
        raise ValueError('ID_names_summaries is empty: there is nothing to evaluate')

    # Only recall is returned, so only recall is collected.
    recall_keys = ('rouge_1_recall', 'rouge_2_recall')
    avg_per_gold = {key: [] for key in recall_keys}

    # Iterate over the gold-standard sets
    for id_gold in ID_gold_std:

        # Principal folder of the gold-standard documents for this ID
        pr_gold_std_path = gold_std_path_template.format(id_gold)

        folder_gen_summaries, gen_summaries = folders_2_list(current_dir, folder_experiment)

        scores = {key: [] for key in recall_keys}

        # Iterate over the summaries in the folder
        for i, summary_id in enumerate(ID_names_summaries):
            gold_id = ID_names_gold_std[i]
            gold_file = '{}{}.txt'.format(gold_id, name_goold_std)
            summary_file = '{}{}.txt'.format(summary_id, name_summmary)

            staged_gold = '{}/{}'.format(sc_gold_std_path, gold_file)
            staged_summary = '{}/{}'.format(sc_summaries_path, summary_file)
            rouge_gold = '{}/{}'.format(rg_gold_std_path, gold_file)
            rouge_summary = '{}/{}'.format(rg_summaries_path, summary_file)

            # Files this iteration has placed in the ROUGE folders; they are
            # removed even when staging or evaluation fails.
            placed = []
            try:
                # Copy both documents to their secondary folders
                shutil.copy('{}/{}'.format(pr_gold_std_path, gold_file), staged_gold)
                shutil.copy('{}/{}'.format(folder_gen_summaries, summary_file), staged_summary)

                # Move both files into the ROUGE folders
                shutil.move(staged_gold, rouge_gold)
                placed.append(rouge_gold)
                shutil.move(staged_summary, rouge_summary)
                placed.append(rouge_summary)

                # ROUGE measures for this pair
                output_dict = evaluate_rouge_one_file(rg_path, ps, pm, name_summmary, name_goold_std, year, folder_results, folder_experiment, summary_id, gold_id, gen_summaries, id_gold)
            finally:
                # Summary first, then gold standard
                for path in reversed(placed):
                    os.remove(path)

            for key in recall_keys:
                scores[key].append(output_dict[key])

        # Average over all summaries for this gold standard
        for key in recall_keys:
            avg_per_gold[key].append(sum(scores[key]) / len(scores[key]))

    # Combine the per-gold-standard averages
    n_golds = len(ID_gold_std)
    return [prom_rouge_golds(n_golds, avg_per_gold[key]) for key in recall_keys]

#### Evaluating with ROUGE

In [None]:


# returns current working directory of a process
# Working directory of the running process; project-relative paths hang off it
current_dir = os.getcwd()

# Name of the experiment folder
folder_experiment = '/Summaries'

# Root folder under which ROUGE looks for summaries and gold standards
rg_path = '/Users/flintlock/pyrouge/build/lib/pyrouge/tests/data'

# Sub-folders of rg_path: system summaries (ps) and gold standards (pm)
ps = '/system_summaries_2004_labelling'
pm = '/model_summariesA_2004_sup'

# Folder in which ROUGE results are saved
folder_results = f'{current_dir}/experiments'

# IDs of the DUC 2002 multi-document sets, with four of them left out, sorted
ID_names_summaries2 = np.load('folder_name_2002.npy')
_excluded_ids = ('d082a', 'd094c', 'd099e', 'd105g')
ID_names_summaries = sorted(
    item for item in ID_names_summaries2 if item not in _excluded_ids
)

# Gold-standard documents use the same IDs (same list object, not a copy)
ID_names_gold_std = ID_names_summaries

# DUC year
year = '2002'

# Filename suffixes of the summaries and of the gold-standard documents
name_summmary = '_englishSyssym1'
name_goold_std = '_englishReference1'

# IDs of the gold-standard sets
ID_gold_std = ['A', 'B']

gs = 'all_golds'

# Secondary folders (under the working directory) used for staging
sc_gold_std_path = f'{current_dir}/model_summariesA_2004_sup'
sc_summaries_path = f'{current_dir}/DUC2004_for_labelling'

# Folders ROUGE reads the gold standards and the summaries from
rg_gold_std_path = f'{rg_path}/model_summariesA_2004_sup'
rg_summaries_path = f'{rg_path}/system_summaries_2004_labelling'

In [None]:
def summarize_and_evaluate(max_d_list, diff_bits_list, max_d_sent_list, size_summary, texts_list):
    
    %run Summarize_Texts.ipynb

    Resume_Textos(max_d_list = max_d_list,  # Entre [0,1] -> 0.9 buen valor
                  diff_bits_list = diff_bits_list,  # Entre [0, 5000] -> 2500 buen valor
                  #max_d_sent_list = [4000, 4100, 4250, 4300, 4500, 4600, 4750], # Entre [0, 5000] -> 4500 buen valor
                  max_d_sent_list = max_d_sent_list,
                  size_summary = size_summary,   # Tamaño del texto [100, 200, 400])
                  texts_list = texts_list)  # Por default 57 (todos)
    
    eva = ROUGE_measures_summaries(ID_gold_std, ID_names_summaries, ID_names_gold_std, name_goold_std, sc_gold_std_path, name_summmary, rg_gold_std_path, sc_summaries_path, rg_summaries_path, rg_path, ps, pm, year, folder_results, folder_experiment)    

    res=[1-x for x in eva]
    return res

In [None]:
# Summarize all 57 texts with one hyperparameter setting and score the result.
# The function defined in this notebook is summarize_and_evaluate; the former
# name resume_textos_y_evalua is not defined.
res = summarize_and_evaluate([0.985], [1750], [4375], 200, 57)