# We start by loading an example set of articles

In [None]:
%load_ext line_profiler
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
from joblib import Parallel, delayed
import networkx as nx
import pickle

In [None]:
ts_matrix_files = sorted(
    glob.glob('/home/joao/Thesis/ts_matrices_original/*.p'))
example_file = ts_matrix_files[0]

In [None]:
def get_coverage(example_bin, word, word_set):
    if (not word_set):
        return 0
    else:
        # we take the subset of phrases containing the word set
        # and the word
        if (len(word_set) > 1):
            sentence_subset = example_bin.loc[:, word_set]
            word_vec = example_bin.loc[:, word]
            repeated_vector = np.tile(
                word_vec, (sentence_subset.shape[1], 1)).transpose()
            coverage = np.logical_and(sentence_subset, repeated_vector)
            coverage = (coverage.sum(axis=1) > 0).sum() / word_vec.sum()
        else:
            sentence_subset = example_bin.loc[:, word_set[0]]
            word_vec = example_bin.loc[:, word]
            coverage = np.logical_and(sentence_subset,
                                      word_vec).sum() / word_vec.sum()

        return coverage

In [None]:
def get_subsuming_graphs(example_file):
    example = pd.read_pickle(example_file)
    keys = example[['this_file_name', 'sentence_order', 'word_count']]
    example.drop(
        columns=['this_file_name', 'sentence_order', 'word_count'],
        inplace=True)

    example_bin = example > 0

    # we then define the SPAN as being the number of
    # sentences containing a given word
    SPAN = example_bin.sum(axis=0)

    #we then define a function that gets the coverage
    #of a word w given a word set word_set:

    # we must now define the subsuming relationships
    #between the variables:
    # We then sort the words by SPAN
    sorted_span = SPAN.sort_values(ascending=False)
    subsuming_candidates = sorted_span[sorted_span > 1].index.values
    subsuming_candidates_no = subsuming_candidates.shape[0]
    lambda_2 = 0.75
    lambda_1_bar = 0.55

    # we now start defining subsuming relationships for all the
    # words in the file
    #initially hecka fucking slow - so let's do some heuristics
    """Corolary: No word with SPAN <= 1 can possibly subsume another.
    """
    subsuming_dict = {sorted_span.index[0]: []}
    for i in tqdm(range(1, sorted_span.shape[0])):
        current_word = sorted_span.index[i]
        max_cov = []
        max_cov = Parallel(n_jobs=8)(
            delayed(get_coverage)(example_bin, current_word, [candidate_word])
            for candidate_word in
            subsuming_candidates[0:min(i, subsuming_candidates_no)])
        max_cov = max(max_cov)
        lambda_1 = lambda_1_bar * max_cov
        for j in range(0, min(i, subsuming_candidates_no)):
            candidate_word = sorted_span.index[j]
            subsuming_j = subsuming_dict[candidate_word]
            condition_1 = get_coverage(example_bin, current_word,
                                       [candidate_word]) >= lambda_1
            condition_2 = get_coverage(example_bin, current_word,
                                       subsuming_j) < lambda_2
            if (condition_1 & condition_2):
                subsuming_j.append(current_word)
                subsuming_dict[candidate_word] = subsuming_j
        subsuming_dict.update({current_word: []})
    filename = ('/home/joao/Thesis/progressive/subsuming_graphs/' +
                example_file.split('/')[-1])
    pickle.dump(subsuming_dict, open(filename, 'wb'))
    return 0

In [None]:
Parallel(n_jobs=8)(delayed(get_subsuming_graphs)(example_file)
                   for example_file in ts_matrix_files)

## Time needed for execution
CPU times: user 698 ms, sys: 536 ms, total: 1.23 s
Wall time: 9h 19min 25s

## Second part: Building the summaries

In [None]:
# loading an example graph:
%load_ext line_profiler
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
from joblib import Parallel, delayed
import networkx as nx
import pickle
a = glob.glob('/home/joao/Thesis/progressive/subsuming_graphs/*')

In [None]:
example_graph = pickle.load(open(a[0], 'rb'))

# we then create the subsuming graph
DG = nx.DiGraph()

DG = nx.DiGraph(example_graph)
# we must also load the ts matrix:
ts_matrix_file = (
    '/home/joao/Thesis/ts_matrices_original/' + a[0].split('/')[-1])
example = pd.read_pickle(ts_matrix_file)
keys = example[['this_file_name', 'sentence_order']]
example.drop(columns=['this_file_name', 'sentence_order'], inplace=True)

example_bin = example > 0

# we then define the SPAN as being the number of sentences
# containing a given word
SPAN = example_bin.sum(axis=0)
# we also load the full sentence bank:
sentence_bank_file = ('/home/joao/Thesis/sentence_bank/' + a[0].split('/')[-1])
sentence_bank = pd.read_pickle(sentence_bank_file)

# we then define the damping factor, alfa
alfa = 0.5

# we now start defining the sentence selection algorithm
#first, we define the basic words - nodes with indegree = 0
full_word_list = example_bin.columns.values
degree_dict = dict(DG.in_degree(full_word_list))
degree_df = pd.DataFrame({
    'word': degree_dict.keys(),
    'in_degree': degree_dict.values()
})
general_words = degree_df.loc[degree_df.in_degree == 0, 'word']
# we then define the sentence_sets:
sentence_sets = []
for i in example_bin.index:
    tmp = example_bin.loc[i, :]
    sentence_sets.append(tmp[tmp > 0].index.values)


# we then define the function that gets the conditional
# saliency for a sentence given another:
def get_conditional_saliency(word_set_1, word_set_2, SPAN, DG):
    cs = 0
    sub_df = list(word_set_1) + list(word_set_2)
    subset_dg = DG.subgraph(sub_df)
    for i in word_set_1:
        for j in word_set_2:
            if (nx.has_path(subset_dg, j, i)):
                cs += np.log(SPAN[i])
                break
    return cs


total_cs = []
sentence_nums = []
for sentence_num in tqdm(range(len(sentence_sets))):
    this_sentence = sentence_sets[sentence_num]
    total_cs.append(
        get_conditional_saliency(this_sentence, general_words, SPAN, DG))
    sentence_nums.append(sentence_num)
final_cs = pd.DataFrame({'sentence_num': sentence_nums, 'cs': total_cs})

first_sentence = final_cs[final_cs.cs == final_cs.cs.max()].sentence_num

selected_sentence_keys = keys.iloc[first_sentence]

selected_sentence_keys
this_text = sentence_bank[sentence_bank.filename == selected_sentence_keys.
                          this_file_name.values[0]]

this_sentence = this_text[this_text.sentence_order == selected_sentence_keys.
                          sentence_order.values[0]].sentence.values

abstract = list(this_sentence)
selected_sentence_nums = [first_sentence.values[0]]
selected_sentences = [sentence_sets[first_sentence.values[0]]]

total_abstract_length = len(abstract[-1][0].split(' '))
# we then update SPAN
SPAN[selected_sentences[-1]] = SPAN[selected_sentences[-1]] * alfa

# we may then start the progressive summarization procedure -
# using only 400 words
while (total_abstract_length < 400):
    cs = []
    sentence_nums = []
    for sentence_num in range(len(sentence_sets)):
        this_sentence = sentence_sets[sentence_num]
        potential_cs = []
        for present_sentence in selected_sentences:
            potential_cs.append(
                get_conditional_saliency(this_sentence, present_sentence, SPAN,
                                         DG))
        cs.append(max(potential_cs))
        sentence_nums.append(sentence_num)
    final_cs = pd.DataFrame({'sentence_num': sentence_nums, 'cs': total_cs})
    #we ignore already selected sentences
    final_cs = final_cs[~final_cs.sentence_num.isin(selected_sentence_nums)]
    next_sentence = final_cs[final_cs.cs == final_cs.cs.max()].sentence_num
    if (next_sentence.shape[0] != 1):
        selected_sentence_keys = keys.iloc[next_sentence.values]
        # we select the sentence that came first
        selected_sentence_keys = selected_sentence_keys[
            selected_sentence_keys.sentence_order == selected_sentence_keys.
            sentence_order.min()]
        next_sentence = selected_sentence_keys.index.values[0]
    else:
        next_sentence = next_sentence.values[0]
        selected_sentence_keys = keys.iloc[next_sentence]
        selected_sentence_keys = keys.iloc[next_sentence]
    print(next_sentence)
    this_text = sentence_bank[sentence_bank.filename == selected_sentence_keys.
                              this_file_name]
    this_sentence = this_text[this_text.sentence_order ==
                              selected_sentence_keys.
                              sentence_order].sentence.values[0]
    remaining_words = 400 - total_abstract_length
    if (len(this_sentence.split(' ')) > remaining_words):
        break
    else:
        abstract.append(this_sentence)
        selected_sentence_nums.append(next_sentence)
        selected_sentences.append(sentence_sets[next_sentence])
        total_abstract_length += len(abstract[-1].split(' '))
        SPAN[selected_sentences[-1]] = SPAN[selected_sentences[-1]] * alfa

In [None]:
#saving the abstract:
final_filename = ('/home/joao/Thesis/progressive/abstracts/' +
                  a[0].split('/')[-1][:-2] + '.txt')
with open(final_filename, 'wb') as f:
    for i in abstract:
        f.write(str(i))

## We now turn it into a function and paralellize its execution

In [None]:
# loading an example graph:
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
from joblib import Parallel, delayed
import networkx as nx
import pickle
graph_files = pd.Series(
    sorted(glob.glob('/home/joao/Thesis/progressive/subsuming_graphs/*')))
done_files = pd.Series(
    sorted(glob.glob('/home/joao/Thesis/progressive/abstracts/*')))


In [None]:
def create_abstract(graph_file):
    example_graph = pickle.load(open(graph_file, 'rb'))

    # we then create the subsuming graph
    DG = nx.DiGraph()

    DG = nx.DiGraph(example_graph)
    # we must also load the ts matrix:
    ts_matrix_file = '/home/joao/Thesis/ts_matrices_original/' + graph_file.split(
        '/')[-1]
    example = pd.read_pickle(ts_matrix_file)
    keys = example[['this_file_name', 'sentence_order', 'word_count']]
    example.drop(
        columns=['this_file_name', 'sentence_order', 'word_count'],
        inplace=True)

    example_bin = example > 0

    # we then define the SPAN as being the number of sentences containing a given word
    SPAN = example_bin.sum(axis=0)
    # we also load the full sentence bank:
    sentence_bank_file = '/home/joao/Thesis/sentence_bank/' + graph_file.split(
        '/')[-1]
    sentence_bank = pd.read_pickle(sentence_bank_file)

    # we then define the damping factor, alfa
    alfa = 0.5

    # we now start defining the sentence selection algorithm
    #first, we define the basic words - nodes with indegree = 0
    full_word_list = example_bin.columns.values
    degree_dict = dict(DG.in_degree(full_word_list))
    degree_df = pd.DataFrame({
        'word': degree_dict.keys(),
        'in_degree': degree_dict.values()
    })
    general_words = degree_df.loc[degree_df.in_degree == 0, 'word']
    # we then define the sentence_sets:
    sentence_sets = []
    for i in example_bin.index:
        tmp = example_bin.loc[i, :]
        sentence_sets.append(tmp[tmp > 0].index.values)

    # we then define the function that gets the conditional saliency
    #for a sentence given another:
    def get_conditional_saliency(word_set_1, word_set_2, SPAN, DG):
        cs = 0
        sub_df = list(word_set_1) + list(word_set_2)
        subset_dg = DG.subgraph(sub_df)
        for i in word_set_1:
            for j in word_set_2:
                if (nx.has_path(subset_dg, j, i)):
                    cs += np.log(SPAN[i])
                    break
        return cs

    total_cs = []
    sentence_nums = []
    for sentence_num in tqdm(range(len(sentence_sets))):
        this_sentence = sentence_sets[sentence_num]
        total_cs.append(
            get_conditional_saliency(this_sentence, general_words, SPAN, DG))
        sentence_nums.append(sentence_num)
    final_cs = pd.DataFrame({'sentence_num': sentence_nums, 'cs': total_cs})

    first_sentence = final_cs[final_cs.cs == final_cs.cs.max()].sentence_num

    selected_sentence_keys = keys.iloc[first_sentence]

    selected_sentence_keys
    this_text = sentence_bank[sentence_bank.filename == selected_sentence_keys.
                              this_file_name.values[0]]

    this_sentence = this_text[this_text.sentence_order ==
                              selected_sentence_keys.sentence_order.
                              values[0]].sentence.values

    abstract = list(this_sentence)
    selected_sentence_nums = [first_sentence.values[0]]
    selected_sentences = [sentence_sets[first_sentence.values[0]]]

    total_abstract_length = len(abstract[-1][0].split(' '))
    # we then update SPAN
    SPAN[selected_sentences[-1]] = SPAN[selected_sentences[-1]] * alfa

    # we may then start the progressive summarization procedure -
    # using only 400 words
    while (total_abstract_length < 400):
        cs = []
        sentence_nums = []
        for sentence_num in range(len(sentence_sets)):
            this_sentence = sentence_sets[sentence_num]
            potential_cs = []
            for present_sentence in selected_sentences:
                potential_cs.append(
                    get_conditional_saliency(this_sentence, present_sentence,
                                             SPAN, DG))
            cs.append(max(potential_cs))
            sentence_nums.append(sentence_num)
        final_cs = pd.DataFrame({
            'sentence_num': sentence_nums,
            'cs': total_cs
        })
        #we ignore already selected sentences
        final_cs = final_cs[~final_cs.sentence_num.
                            isin(selected_sentence_nums)]
        next_sentence = final_cs[final_cs.cs == final_cs.cs.max()].sentence_num
        if (next_sentence.shape[0] != 1):
            selected_sentence_keys = keys.iloc[next_sentence.values]
            # we select the sentence that came first
            selected_sentence_keys = selected_sentence_keys[
                selected_sentence_keys.sentence_order ==
                selected_sentence_keys.sentence_order.min()]
            next_sentence = selected_sentence_keys.index.values[0]
            selected_sentence_keys = keys.iloc[next_sentence]

        else:
            next_sentence = next_sentence.values[0]
            selected_sentence_keys = keys.iloc[next_sentence]


        this_text = sentence_bank[sentence_bank.filename ==
                                  selected_sentence_keys.this_file_name]
        this_sentence = this_text[this_text.sentence_order ==
                                  selected_sentence_keys.
                                  sentence_order].sentence.values[0]
        remaining_words = 400 - total_abstract_length
        if (len(this_sentence.split(' ')) > remaining_words):
            break
        else:
            abstract.append(this_sentence)
            selected_sentence_nums.append(next_sentence)
            selected_sentences.append(sentence_sets[next_sentence])
            total_abstract_length += len(abstract[-1].split(' '))
            SPAN[selected_sentences[-1]] = SPAN[selected_sentences[-1]] * alfa
    #saving the abstract:
    final_filename = '/home/joao/Thesis/progressive/abstracts/' + graph_file.split(
        '/')[-1][:-2] + '.txt'
    with open(final_filename, 'wb') as f:
        for i in abstract:
            f.write(str(i).strip())
            if (i != abstract[-1]):
                f.write('\r\n')
    return 0

In [None]:
Parallel(
    n_jobs=8, verbose=11)(
        delayed(create_abstract)(graph_file) for graph_file in graph_files)

### Abstract Generation time : 45.1 minutes

In [None]:
#putting the abstracts in the right name format:
a = glob.glob('/home/joao/Thesis/progressive/abstracts/*.txt')

In [None]:
for i in a:
    this_file = '/home/joao/Thesis/progressive/right_name/abstract.' + i.split(
        '/')[-1][1:4] + '.txt'
    with open(i, 'rb') as f:
        abstract = f.read()
    with open(this_file, 'wb') as f:
        f.write(abstract.stri)

In [None]:
#removing the trailing \r\n
b = glob.glob('/home/joao/Thesis/progressive/right_name/*.txt')
for i in b:
    with open(i, 'rb') as f:
        abstract = f.read()
    with open(i, 'wb') as f:
        f.write(abstract[:-2])

## evaluating results

In [None]:
from rouge import Rouge, FilesRouge
import pandas as pd
import glob
from tqdm import tqdm
import numpy as np

In [None]:
# we then load all summaries and candidate summaries:
auto_summaries = glob.glob('/home/joao/Thesis/progressive/right_name/*')
true_summaries = pd.Series(
    sorted(glob.glob('/home/joao/Thesis/simplified_abstracts/*')))

In [None]:
total_scores = []
for i in tqdm(auto_summaries):
    this_file = i.split('/')[-1]
    this_file_num = this_file.split('.')[-2]
    ground_truths = true_summaries[true_summaries.str[-7:-4] == this_file_num]
    scores = []
    rouge = Rouge()
    with open(i, 'rb') as f:
        auto_summary = f.read()
    for j in ground_truths:
        with open(j, 'rb') as f:
            ground_truth = f.read()
            tmp_scores = rouge.get_scores(auto_summary, ground_truth, avg=True)
        scores.append(tmp_scores['rouge-2']['p'])
    total_scores.append(np.mean(scores))

In [None]:
np.mean(scores)

# we now transform the function to allow it to be implemented for the papers

In [None]:
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
from joblib import Parallel, delayed
import networkx as nx
import pickle

ts_matrix_files = sorted(
    glob.glob('/home/joao/Thesis/test_set/ts_matrices/*.p'))
example_file = ts_matrix_files[0]


def get_coverage(example_bin, word, word_set):
    if (not word_set):
        return 0
    else:
        # we take the subset of phrases containing the word
        # set and the word
        if (len(word_set) > 1):
            sentence_subset = example_bin.loc[:, word_set]
            word_vec = example_bin.loc[:, word]
            repeated_vector = np.tile(
                word_vec, (sentence_subset.shape[1], 1)).transpose()
            coverage = np.logical_and(sentence_subset, repeated_vector)
            coverage = (coverage.sum(axis=1) > 0).sum() / word_vec.sum()
        else:
            sentence_subset = example_bin.loc[:, word_set[0]]
            word_vec = example_bin.loc[:, word]
            coverage = np.logical_and(sentence_subset,
                                      word_vec).sum() / word_vec.sum()

        return coverage


def get_subsuming_graphs(example_file):
    example = pd.read_pickle(example_file)
    keys = example[['this_file_name', 'sentence_order', 'word_count']]
    example.drop(
        columns=['this_file_name', 'sentence_order', 'word_count'],
        inplace=True)

    example_bin = example > 0

    # we then define the SPAN as being the number of sentences containing a given word
    SPAN = example_bin.sum(axis=0)

    #we then define a function that gets the coverage of a word w given a word set word_set:

    # we must now define the subsuming relationships between the variables:
    # We then sort the words by SPAN
    sorted_span = SPAN.sort_values(ascending=False)
    subsuming_candidates = sorted_span[sorted_span > 1].index.values
    subsuming_candidates_no = subsuming_candidates.shape[0]
    lambda_2 = 0.75
    lambda_1_bar = 0.55

    # we now start defining subsuming relationships for all the words in the file
    #initially hecka fucking slow - so let's do some heuristics
    """Corolary: No word with SPAN <= 1 can possibly subsume another.
    """
    subsuming_dict = {sorted_span.index[0]: []}
    for i in tqdm(range(1, sorted_span.shape[0])):
        current_word = sorted_span.index[i]
        max_cov = []
        max_cov = Parallel(n_jobs=8)(
            delayed(get_coverage)(example_bin, current_word, [candidate_word])
            for candidate_word in
            subsuming_candidates[0:min(i, subsuming_candidates_no)])
        max_cov = max(max_cov)
        lambda_1 = lambda_1_bar * max_cov
        for j in range(0, min(i, subsuming_candidates_no)):
            candidate_word = sorted_span.index[j]
            subsuming_j = subsuming_dict[candidate_word]
            condition_1 = get_coverage(example_bin, current_word,
                                       [candidate_word]) >= lambda_1
            condition_2 = get_coverage(example_bin, current_word,
                                       subsuming_j) < lambda_2
            if (condition_1 & condition_2):
                subsuming_j.append(current_word)
                subsuming_dict[candidate_word] = subsuming_j
        subsuming_dict.update({current_word: []})
    filename = '/home/joao/Thesis/progressive/test_subsuming_graphs/' + example_file.split(
        '/')[-1]
    pickle.dump(subsuming_dict, open(filename, 'wb'))
    return 0


Parallel(
    n_jobs=8, verbose=11)(delayed(get_subsuming_graphs)(example_file)
                          for example_file in ts_matrix_files)

## EXECUTION TIME: 1h 19min 34s

# Now creating the abstracts

In [None]:
# loading an example graph:
import pandas as pd
import numpy as np
import glob
from tqdm import tqdm
from joblib import Parallel, delayed
import networkx as nx
import pickle
graph_files = pd.Series(
    sorted(glob.glob('/home/joao/Thesis/progressive/test_subsuming_graphs/*')))
done_files = pd.Series(
    sorted(glob.glob('/home/joao/Thesis/progressive/test_abstracts/*')))



def create_abstract(graph_file):
    example_graph = pickle.load(open(graph_file, 'rb'))

    # we then create the subsuming graph
    DG = nx.DiGraph()

    DG = nx.DiGraph(example_graph)
    # we must also load the ts matrix:
    ts_matrix_file = '/home/joao/Thesis/test_set/ts_matrices/' + graph_file.split(
        '/')[-1]
    example = pd.read_pickle(ts_matrix_file)
    keys = example[['this_file_name', 'sentence_order', 'word_count']]
    example.drop(
        columns=['this_file_name', 'sentence_order', 'word_count'],
        inplace=True)

    example_bin = example > 0

    # we then define the SPAN as being the number of sentences containing a given word
    SPAN = example_bin.sum(axis=0)
    # we also load the full sentence bank:
    sentence_bank_file = '/home/joao/Thesis/test_set/sentence_banks/' + graph_file.split(
        '/')[-1]
    sentence_bank = pd.read_pickle(sentence_bank_file)

    # we then define the damping factor, alfa
    alfa = 0.5

    # we now start defining the sentence selection algorithm
    #first, we define the basic words - nodes with indegree = 0
    full_word_list = example_bin.columns.values
    degree_dict = dict(DG.in_degree(full_word_list))
    degree_df = pd.DataFrame({
        'word': degree_dict.keys(),
        'in_degree': degree_dict.values()
    })
    general_words = degree_df.loc[degree_df.in_degree == 0, 'word']
    # we then define the sentence_sets:
    sentence_sets = []
    for i in example_bin.index:
        tmp = example_bin.loc[i, :]
        sentence_sets.append(tmp[tmp > 0].index.values)

    # we then define the function that gets the conditional
    # saliency for a sentence given another:
    def get_conditional_saliency(word_set_1, word_set_2, SPAN, DG):
        cs = 0
        sub_df = list(word_set_1) + list(word_set_2)
        subset_dg = DG.subgraph(sub_df)
        for i in word_set_1:
            for j in word_set_2:
                if (nx.has_path(subset_dg, j, i)):
                    cs += np.log(SPAN[i])
                    break
        return cs

    total_cs = []
    sentence_nums = []
    for sentence_num in tqdm(range(len(sentence_sets))):
        this_sentence = sentence_sets[sentence_num]
        total_cs.append(
            get_conditional_saliency(this_sentence, general_words, SPAN, DG))
        sentence_nums.append(sentence_num)
    final_cs = pd.DataFrame({'sentence_num': sentence_nums, 'cs': total_cs})

    first_sentence = final_cs[final_cs.cs == final_cs.cs.max()].sentence_num

    selected_sentence_keys = keys.iloc[first_sentence]

    selected_sentence_keys
    this_text = sentence_bank[sentence_bank.filename == selected_sentence_keys.
                              this_file_name.values[0]]

    this_sentence = this_text[this_text.sentence_order ==
                              selected_sentence_keys.sentence_order.
                              values[0]].sentence.values

    abstract = list(this_sentence)
    selected_sentence_nums = [first_sentence.values[0]]
    selected_sentences = [sentence_sets[first_sentence.values[0]]]

    total_abstract_length = len(abstract[-1][0].split(' '))
    # we then update SPAN
    SPAN[selected_sentences[-1]] = SPAN[selected_sentences[-1]] * alfa

    # we may then start the progressive summarization procedure - using only 400 words
    while (total_abstract_length < 250):
        cs = []
        sentence_nums = []
        for sentence_num in range(len(sentence_sets)):
            this_sentence = sentence_sets[sentence_num]
            potential_cs = []
            for present_sentence in selected_sentences:
                potential_cs.append(
                    get_conditional_saliency(this_sentence, present_sentence,
                                             SPAN, DG))
            cs.append(max(potential_cs))
            sentence_nums.append(sentence_num)
        final_cs = pd.DataFrame({
            'sentence_num': sentence_nums,
            'cs': total_cs
        })
        #we ignore already selected sentences
        final_cs = final_cs[~final_cs.sentence_num.
                            isin(selected_sentence_nums)]
        next_sentence = final_cs[final_cs.cs == final_cs.cs.max()].sentence_num
        if (next_sentence.shape[0] != 1):
            selected_sentence_keys = keys.iloc[next_sentence.values]
            # we select the sentence that came first
            selected_sentence_keys = selected_sentence_keys[
                selected_sentence_keys.sentence_order ==
                selected_sentence_keys.sentence_order.min()]
            next_sentence = selected_sentence_keys.index.values[0]
            selected_sentence_keys = keys.iloc[next_sentence]

        else:
            next_sentence = next_sentence.values[0]
            selected_sentence_keys = keys.iloc[next_sentence]


        this_text = sentence_bank[sentence_bank.filename ==
                                  selected_sentence_keys.this_file_name]
        this_sentence = this_text[this_text.sentence_order ==
                                  selected_sentence_keys.
                                  sentence_order].sentence.values[0]
        remaining_words = 250 - total_abstract_length
        if (len(this_sentence.split(' ')) > remaining_words):
            break
        else:
            abstract.append(this_sentence)
            selected_sentence_nums.append(next_sentence)
            selected_sentences.append(sentence_sets[next_sentence])
            total_abstract_length += len(abstract[-1].split(' '))
            SPAN[selected_sentences[-1]] = SPAN[selected_sentences[-1]] * alfa
    #saving the abstract:
    final_filename = '/home/joao/Thesis/progressive/abstracts/' + graph_file.split(
        '/')[-1][:-2] + '.txt'
    with open(final_filename, 'wb') as f:
        for i in abstract:
            f.write(str(i).strip())
            if (i != abstract[-1]):
                f.write('\r\n')
    final_abstract = ''
    for i in abstract:
        final_abstract += i
    return (ts_matrix_file, final_abstract)

In [None]:
final_results = Parallel(
    n_jobs=8, verbose=11)(
        delayed(create_abstract)(graph_file) for graph_file in graph_files)

# Execution time:  5min 59s

In [None]:
filenames = []
abstracts = []
for i in final_results:
    filenames.append(i[0])
    abstracts.append(i[1])
final_df = pd.DataFrame({'filename': filenames, 'abstracts': abstracts})

In [None]:
final_df.to_pickle('/home/joao/Thesis/progressive/final_test_results.p')

# Final Scoring

In [5]:
from rouge import Rouge, FilesRouge
import pandas as pd
import glob
from tqdm import tqdm
import numpy as np
from unidecode import unidecode

results_df = pd.read_pickle(
    '/home/joao/Thesis/progressive/final_test_results.p')

results_df.filename = results_df.filename.str.split('/', expand=True)[6]


def remove_non_ascii(text):
    """This function removes all non-ascii characters from text and replaces 
    them with their closest ascii representation"""
    return unidecode(unicode(text, encoding="utf-8"))


# we then load all summaries and candidate summaries:

total_scores = []
scores = []
r1 = []
r2 = []
rl = []
for i in tqdm(results_df.index):
    ground_truth = '/home/joao/Thesis/test_set/abstracts/ground_truths/' + results_df.loc[
        i, 'filename'][:-2] + '.txt'
    rouge = Rouge()
    with open(ground_truth, 'rb') as f:
        ground_truth = f.read()
    ground_truth = remove_non_ascii(ground_truth)
    tmp_scores = rouge.get_scores(
        results_df.loc[i, 'abstracts'], ground_truth, avg=True)
    r2.append(tmp_scores['rouge-2']['f'])
    r1.append(tmp_scores['rouge-1']['f'])
    rl.append(tmp_scores['rouge-l']['f'])

100%|██████████| 69/69 [00:03<00:00, 21.52it/s]


In [None]:
print('r1', np.mean(r1), np.std(r1, ddof=1))
print('r2', np.mean(r2), np.std(r2, ddof=1))
print('rl', np.mean(rl), np.std(rl, ddof=1))

In [17]:
from rouge import Rouge, FilesRouge
import pandas as pd
import glob
from tqdm import tqdm
import numpy as np
from unidecode import unidecode
results_df = pd.read_pickle(
    '/home/joao/Thesis/progressive/final_test_results.p')
results_df['r1'] = r1
results_df['r2'] = r2
results_df['rl'] = rl
results_df.abstracts = results_df.abstracts.str.replace('\n',' ').str.replace('\r',' ')

In [18]:
results_df.to_csv('/home/joao/Thesis/progressive/final_test_results.csv', sep = '|')